Source code for open_atp.provers.base

"""Base prover abstraction.

An :class:`AutomatedProver` is a *candidate generator*: it takes a
:class:`~open_atp.lean.ProofTask` and a caller-chosen output directory, fills
the project's ``sorry``\\s, verifies the result in a shared sandbox, and returns a
:class:`~open_atp.verify.ProofResult`. The base class owns the shared lifecycle
-- stage the output layout, generate, then verify -- so subclasses only implement
``_generate`` and every prover (including Aristotle) gets the same final check for
free. Concrete provers live alongside this base in ``open_atp.provers``.

The public entry point is :meth:`AutomatedProver.prove`. A caller constructs a prover
directly (or via :func:`open_atp.provers.get_prover`) and calls it::

    result = prover.prove(task, output_dir)

``prove`` populates ``output_dir/{wd,logs}/``: ``wd`` is the completed lake project
and ``logs`` is the run record.
"""

from __future__ import annotations

import abc
import json
import time
from collections.abc import Mapping
from dataclasses import dataclass, field, fields
from pathlib import Path
from typing import Any, Self

from open_atp.backends.base import ComputeBackend
from open_atp.lean import LeanProject, ProofTask
from open_atp.verify import ProofResult, Verifier


[docs] @dataclass class AutomatedProverConfig: """Base configuration shared by all provers. Subclasses extend with their own knobs: * :class:`~open_atp.provers.agent_prover.AgentProverConfig`: harness (claude/opencode/codex), effort, skills, MCP. * :class:`~open_atp.provers.numina.NuminaProverConfig`: extends the agent config + max_rounds, helper API keys. * :class:`~open_atp.provers.aristotle.AristotleProverConfig`: api key, mode, poll interval. The sandbox image (its tag plus the Lean toolchain + Mathlib pins the shared verifier checks every project against) lives on the backend's :class:`~open_atp.backends.base.BackendConfig`, not here -- a prover inherits whatever image its verification backend runs. Attributes ---------- timeout_s : int Wall-clock budget for the generation run, in seconds. Default ``1800``. env : dict[str, str] Extra environment variables exported into the run. Default empty. """ timeout_s: int = 1800 env: dict[str, str] = field(default_factory=dict)
[docs] @classmethod def from_dict(cls, data: Mapping[str, object]) -> Self: """Build a config from a mapping (e.g. parsed JSON), ignoring unknown keys. The inverse of :func:`dataclasses.asdict`: any tuple-typed field is restored from the list JSON round-trips it to. Unknown keys are dropped so a serialized superset (or a sibling subclass's extra knobs) loads cleanly. Subclasses inherit this unchanged -- ``cls`` resolves to the concrete config, so its own fields are honoured. """ known = {f.name: f for f in fields(cls)} kwargs: dict[str, Any] = {} for key, value in data.items(): f = known.get(key) if f is None: continue if ( isinstance(value, list) and isinstance(f.type, str) and f.type.startswith("tuple") ): value = tuple(tuple(v) if isinstance(v, list) else v for v in value) kwargs[key] = value return cls(**kwargs)
[docs] class AutomatedProver(abc.ABC): """Generate candidate proofs, then verify them in a shared sandbox.""" name: str = "base" def __init__( self, config: AutomatedProverConfig, verification_backend: ComputeBackend ) -> None: self.config = config # The backend used for the *final check*. Its image carries the toolchain + # Mathlib pins the verifier rejects mismatched projects against. Agentic # provers additionally run their generation in a backend; that is the # subclass's concern. self.verifier = Verifier(verification_backend) @abc.abstractmethod def _generate( self, task: ProofTask, wd: Path, logs_dir: Path, result: ProofResult ) -> None: """Generate the completed project in ``wd`` and record the run in ``result``. Implementations must leave ``wd`` containing the full completed project so the verifier can compile it in place, write the run's logs into ``logs_dir``, and fill ``result`` (``completed_files``, ``cost_usd``, ``metadata``). A prover that already verified the candidate in its own live sandbox sets ``result.verification`` itself; otherwise :meth:`prove` runs the shared check. """
[docs] def prove(self, task: ProofTask, output_dir: Path | str) -> ProofResult: """Full lifecycle: reject-on-mismatch, generate, verify, write the result. ``output_dir`` is caller-chosen and is populated as ``output_dir/{wd,logs}/``: ``wd`` is the completed lake project (the proof output) and ``logs`` is the run record (the agent ``stdout.txt``/``stderr.txt``, ``result.json``, and any harness-specific rich logs). """ self.verifier.check_compatible(task.project) output_dir = Path(output_dir) wd = output_dir / "wd" logs_dir = output_dir / "logs" wd.mkdir(parents=True, exist_ok=True) logs_dir.mkdir(parents=True, exist_ok=True) result = ProofResult(prover=self.name, verification=None, output_dir=output_dir) start = time.monotonic() self._generate(task, wd, logs_dir, result) result.duration_s = time.monotonic() - start # A prover that ran generation and the final check in one shared sandbox (the # agent/verify backend-reuse path) set ``result.verification`` itself; reuse it # rather than spinning a second sandbox. Otherwise verify standalone -- # Aristotle (no sandbox) and the split-backend case land here. if result.verification is None: result.verification = self.verifier.verify(LeanProject(wd)) # A self-describing summary so a downloaded logs dir stands on its own. (logs_dir / "result.json").write_text( json.dumps(result.to_dict(), indent=2, default=str) ) return result