Source code for open_atp.provers.base

"""Base prover abstraction.

An :class:`AutomatedProver` is a *candidate generator*: it takes a
:class:`~open_atp.lean.ProofTask` and a caller-chosen output directory, fills
the project's ``sorry``\\s, verifies the result in a shared sandbox, and returns a
:class:`ProofResult`. The base class owns the shared lifecycle
-- stage the output layout, generate, then verify -- so subclasses only implement
``_generate`` and every prover (including Aristotle) gets the same final check for
free. Concrete provers live alongside this base in ``open_atp.provers``.

The public entry point is :meth:`AutomatedProver.prove`. A caller constructs a prover
directly (or via :func:`open_atp.standard_prover`) and calls it::

    result = prover.prove(task, output_dir)

``prove`` populates ``output_dir/{wd,logs}/``: ``wd`` is the completed lake project
and ``logs`` is the run record.
"""

from __future__ import annotations

import abc
import json
import time
from dataclasses import dataclass, field
from pathlib import Path

from open_atp.backends.base import ComputeBackend
from open_atp.lean import LeanProject, ProofTask
from open_atp.verify import VerificationReport, Verifier

#: Heading the optional per-task ``user_prompt`` is appended under.
_ADDITIONAL_INSTRUCTIONS = "\n\n# Additional instructions\n\n{user_prompt}"


def compose_prompt(prover_prompt: str, user_prompt: str | None) -> str:
    """Combine a prover's own prompt with the task's optional user prompt.

    ``prover_prompt`` is the prover-specific instruction set handed to the agent;
    ``user_prompt`` is the optional per-task guidance from
    :attr:`~open_atp.lean.ProofTask.user_prompt`. When present it is appended under an
    ``# Additional instructions`` heading; when absent (the common case) the prover
    prompt is returned unchanged.
    """
    if user_prompt:
        return prover_prompt + _ADDITIONAL_INSTRUCTIONS.format(user_prompt=user_prompt)
    return prover_prompt


[docs] @dataclass class ProofResult: """What a prover returns from :meth:`AutomatedProver.prove`. The prover writes its artifacts into the caller-chosen :attr:`output_dir`, laid out as ``output_dir/{wd,logs}/``: ``wd`` is the completed lake project (the proof output) and ``logs`` is the run record (the streamed agent ``stdout.txt``, ``stderr.txt``, ``result.json``, and any harness-specific rich logs). This object just records where those live, plus the verification verdict and run metadata. Attributes ---------- prover : str Name of the prover that produced this result. verification : VerificationReport or None The shared verification of the completed project, or ``None`` when the run failed before a candidate could be verified (see :attr:`error`). output_dir : pathlib.Path The run's output directory. Holds the :attr:`wd` (proof project) and :attr:`logs_dir` (run record) subdirectories the prover populated. completed_files : dict[str, str] The completed ``.lean`` sources, keyed by file path relative to the project root. cost_usd : float, optional Estimated USD cost of the run. ``None`` when the prover does not report cost. duration_s : float, optional Wall-clock duration of the run, in seconds. metadata : dict[str, object] Harness-specific run metadata (token counts, run summaries, ...). error : str, optional Set when the prover raised before producing a result (Docker down, API error, toolchain mismatch). :attr:`verification` is ``None`` and :attr:`success` is ``False``. wd : pathlib.Path The completed working directory (``output_dir/wd``) -- a complete lake project with the completed ``.lean`` files. The proof output. logs_dir : pathlib.Path The run's logs directory (``output_dir/logs``) -- the captured agent stream (``stdout.txt``), ``stderr.txt``, ``result.json``, and any harness-specific rich record (Vibe's session log, ax-prover's per-target logs, Aristotle's events). success : bool True iff :attr:`verification` exists and is :attr:`~open_atp.verify.VerificationReport.verified`. """ prover: str verification: VerificationReport | None output_dir: Path completed_files: dict[str, str] = field(default_factory=dict) cost_usd: float | None = None duration_s: float | None = None metadata: dict[str, object] = field(default_factory=dict) error: str | None = None @property def wd(self) -> Path: """The completed proof project (``output_dir/wd``).""" return self.output_dir / "wd" @property def logs_dir(self) -> Path: """The run's logs directory (``output_dir/logs``).""" return self.output_dir / "logs" @property def success(self) -> bool: return bool(self.verification and self.verification.verified)
[docs] def to_dict(self) -> dict[str, object]: """JSON-ready view: inline files, verification, cost, and artifact paths.""" return { "prover": self.prover, "success": self.success, "error": self.error, "verification": self.verification.to_dict() if self.verification else None, "completed_files": dict(self.completed_files), "cost_usd": self.cost_usd, "duration_s": self.duration_s, "output_dir": str(self.output_dir), "wd": str(self.wd), "logs_dir": str(self.logs_dir), "metadata": dict(self.metadata), }
[docs] class AutomatedProver(abc.ABC): """Generate candidate proofs, then verify them in a shared sandbox. The sandbox image (its tag plus the Lean toolchain + Mathlib pins the shared verifier checks every project against) comes from ``backend`` -- a prover inherits whatever image its backend runs. Parameters ---------- backend : ComputeBackend The one backend for this prover. Agentic provers reuse it (via a live session) for generation, then verify in that hot sandbox; Aristotle uses it only for the final check. timeout_s : int Wall-clock budget for the generation run, in seconds. Default ``1800``. """ name: str = "base" def __init__( self, *, backend: ComputeBackend, timeout_s: int = 1800, ) -> None: # Documented as a class Parameter above; not a separately documented member. self.timeout_s = timeout_s # The backend's image carries the toolchain + Mathlib pins the verifier rejects # mismatched projects against. Agentic provers reuse this same backend (via a # live session over it) for generation, then verify in that hot sandbox -- see # ``AgentProver._generate``. self.verifier = Verifier(backend) @abc.abstractmethod def _generate( self, task: ProofTask, wd: Path, logs_dir: Path, result: ProofResult ) -> None: """Generate the completed project in ``wd`` and record the run in ``result``. Implementations must leave ``wd`` containing the full completed project so the verifier can compile it in place, write the run's logs into ``logs_dir``, and fill ``result`` (``completed_files``, ``cost_usd``, ``metadata``). A prover that already verified the candidate in its own live sandbox sets ``result.verification`` itself; otherwise :meth:`prove` runs the shared check. """
[docs] def prove(self, task: ProofTask, output_dir: Path | str) -> ProofResult: """Full lifecycle: reject-on-mismatch, generate, verify, write the result. Parameters ---------- task : ~open_atp.lean.ProofTask The unit of work: the lake project to complete, the optional :attr:`~open_atp.lean.ProofTask.targets` to focus on, and any :attr:`~open_atp.lean.ProofTask.user_prompt` guidance. output_dir : pathlib.Path or str Caller-chosen output directory, populated as ``output_dir/{wd,logs}/``: ``wd`` is the completed lake project (the proof output) and ``logs`` is the run record (the agent ``stdout.txt``/``stderr.txt``, ``result.json``, and any harness-specific rich logs). Returns ------- ProofResult The verification verdict and run metadata, pointing at the populated :attr:`~ProofResult.wd` and :attr:`~ProofResult.logs_dir`. """ self.verifier.check_compatible(task.project) output_dir = Path(output_dir) wd = output_dir / "wd" logs_dir = output_dir / "logs" wd.mkdir(parents=True, exist_ok=True) logs_dir.mkdir(parents=True, exist_ok=True) result = ProofResult(prover=self.name, verification=None, output_dir=output_dir) start = time.monotonic() self._generate(task, wd, logs_dir, result) result.duration_s = time.monotonic() - start # An agentic prover ran generation and the final check in one live session and # set ``result.verification`` itself; reuse it rather than spinning a second # sandbox. Only Aristotle (network generation, no session) lands here and gets # the standalone check. if result.verification is None: result.verification = self.verifier.verify(LeanProject(wd)) # A self-describing summary so a downloaded logs dir stands on its own. (logs_dir / "result.json").write_text( json.dumps(result.to_dict(), indent=2, default=str) ) return result