"""Verification: the verification report and the shared verifier.
Every prover -- agentic, Numina, *and* Aristotle -- funnels its output through the
:class:`Verifier`. The checks are lifted from milp_flare's ``entrypoint.sh`` +
``FLARE._evaluate``:
1. Compile each target file with ``lake env lean <file>``.
2. Scan the compile log for ``sorry`` warnings.
3. Extract the axiom dependency list via ``#print axioms`` and compare against
:data:`STANDARD_AXIOMS`.
It also enforces the input contract: the project's toolchain (and locked Mathlib
revision) must match the backend image's pins, else we reject before spending
compute.
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from open_atp.backends.base import ComputeBackend, ComputeSession
from open_atp.backends.docker import DockerBackend
from open_atp.backends.modal import ModalBackend
from open_atp.images import DEFAULT_IMAGE, Image
from open_atp.lean import LeanProject, MathlibRevMismatch, ToolchainMismatch
# Axioms that a "clean" Mathlib proof is allowed to depend on. Anything else
# (notably ``sorryAx``) means the proof is not actually complete.
STANDARD_AXIOMS = frozenset({"propext", "Classical.choice", "Quot.sound"})
_SORRY_RE = re.compile(r"declaration uses 'sorry'|uses `sorry`")
_AXIOMS_RE = re.compile(r"depends on axioms: \[([^\]]*)\]")
[docs]
@dataclass(frozen=True)
class VerificationReport:
"""Result of compiling a candidate project in a sandbox.
Produced by :class:`Verifier` and shared by every prover, including Aristotle.
Attributes
----------
compiles : bool
Whether the whole project built successfully.
sorry_free : bool
Whether the build is free of ``sorry`` (no incomplete proofs remain).
axioms : tuple[str, ...]
Every axiom the compiled project depends on, as reported by Lean.
compile_log : str
The full build log. Omitted from :meth:`to_dict`.
per_file : dict[str, bool]
Per-file compile status, keyed by file path relative to the project root.
non_standard_axioms : tuple[str, ...]
The axioms outside :data:`STANDARD_AXIOMS` -- notably ``sorryAx``, which
means the proof is not actually complete.
verified : bool
True iff the project compiles, has no ``sorry``, and uses no axioms
outside :data:`STANDARD_AXIOMS`.
"""
compiles: bool
sorry_free: bool
axioms: tuple[str, ...] = ()
compile_log: str = ""
per_file: dict[str, bool] = field(default_factory=dict)
@property
def non_standard_axioms(self) -> tuple[str, ...]:
"""The depended-on axioms outside :data:`STANDARD_AXIOMS`."""
return tuple(a for a in self.axioms if a not in STANDARD_AXIOMS)
@property
def verified(self) -> bool:
"""True iff the project compiles, has no sorry, and no foreign axioms."""
return self.compiles and self.sorry_free and not self.non_standard_axioms
[docs]
def to_dict(self) -> dict[str, object]:
"""JSON-ready summary (the full ``compile_log`` is intentionally omitted).
Returns
-------
dict[str, object]
The verdict (``verified``/``compiles``/``sorry_free``), the axiom lists,
and ``per_file``; ``compile_log`` is excluded.
"""
return {
"verified": self.verified,
"compiles": self.compiles,
"sorry_free": self.sorry_free,
"axioms": list(self.axioms),
"non_standard_axioms": list(self.non_standard_axioms),
"per_file": dict(self.per_file),
}
[docs]
def docker_verifier(image: Image = DEFAULT_IMAGE) -> Verifier:
"""A :class:`Verifier` backed by a local Docker sandbox running ``image``.
Parameters
----------
image : ~open_atp.images.Image, optional
The image whose Lean toolchain + Mathlib pins projects are checked against.
Default :data:`~open_atp.images.DEFAULT_IMAGE`.
Returns
-------
Verifier
A verifier over a :class:`~open_atp.backends.docker.DockerBackend`.
"""
return Verifier(DockerBackend(image=image))
[docs]
def modal_verifier(image: Image = DEFAULT_IMAGE) -> Verifier:
"""A :class:`Verifier` backed by a Modal Sandbox running the published ``image``.
Needs Modal credentials and the image published via ``open-atp
build-modal-image``. The image's ``:tag`` is dropped for the Modal name lookup.
Parameters
----------
image : ~open_atp.images.Image, optional
The published image whose Lean toolchain + Mathlib pins projects are checked
against. Default :data:`~open_atp.images.DEFAULT_IMAGE`.
Returns
-------
Verifier
A verifier over a :class:`~open_atp.backends.modal.ModalBackend`.
"""
return Verifier(ModalBackend(image=image))
[docs]
class Verifier:
"""Compiles projects in a :class:`ComputeBackend` and reports their status.
Shared by every prover for the final compile/sorry/axiom check. Use
:func:`docker_verifier` or :func:`modal_verifier` for the common cases.
Parameters
----------
backend : ComputeBackend
The sandbox the candidate project is compiled in. Its image carries the
Lean toolchain + Mathlib pins every project is checked against.
Attributes
----------
image : ~open_atp.images.Image
The image ``backend`` runs -- the compatibility contract projects must
match (toolchain + locked Mathlib revision).
"""
def __init__(self, backend: ComputeBackend) -> None:
self.backend = backend
@property
def image(self) -> Image:
"""The image the backend runs -- the compatibility contract projects match."""
return self.backend.image
[docs]
def check_compatible(self, project: LeanProject) -> None:
"""Reject a project whose pins differ from the backend image's.
Matches the project's :attr:`~open_atp.lean.LeanProject.lean_toolchain`
against the image's :attr:`~open_atp.images.Image.lean_toolchain`, and its
locked :attr:`~open_atp.lean.LeanProject.mathlib_rev` (when the project
records one) against the image's
:attr:`~open_atp.images.Image.mathlib_rev`. Raises
:class:`~open_atp.lean.ToolchainMismatch` or
:class:`~open_atp.lean.MathlibRevMismatch` on the first mismatch.
Parameters
----------
project : ~open_atp.lean.LeanProject
The candidate project whose toolchain (and locked Mathlib revision, when
recorded) must match the backend image's pins.
Raises
------
~open_atp.lean.ToolchainMismatch
If the project's toolchain differs from the image's.
~open_atp.lean.MathlibRevMismatch
If the project records a Mathlib revision that differs from the image's.
Examples
--------
>>> import tempfile
>>> from pathlib import Path
>>> from open_atp.backends.docker import DockerBackend
>>> from open_atp.images import Image
>>> from open_atp.lean import LeanProject
>>> from open_atp.verify import Verifier
>>> root = Path(tempfile.mkdtemp())
>>> _ = (root / "lakefile.toml").write_text('name = "demo"\\n')
>>> _ = (root / "lean-toolchain").write_text("leanprover/lean4:v4.31.0\\n")
>>> project = LeanProject(root)
A matching pin passes silently:
>>> image = Image(lean_toolchain="leanprover/lean4:v4.31.0")
>>> ok = Verifier(DockerBackend(image=image))
>>> ok.check_compatible(project)
A differing pin is rejected up front:
>>> bad = Verifier(DockerBackend(image=Image()))
>>> bad.check_compatible(project) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
open_atp.lean.ToolchainMismatch: Project pins ...
"""
image = self.image
if project.lean_toolchain != image.lean_toolchain:
raise ToolchainMismatch(
f"Project pins toolchain {project.lean_toolchain!r} but backend image "
f"supports {image.lean_toolchain!r}. Re-submit against a matching "
"image."
)
if project.mathlib_rev is not None and project.mathlib_rev != image.mathlib_rev:
raise MathlibRevMismatch(
f"Project pins Mathlib {project.mathlib_rev!r} but backend image "
f"supports {image.mathlib_rev!r}. Re-submit against a matching image."
)
[docs]
def verify(
self, project: LeanProject, *, session: ComputeSession | None = None
) -> VerificationReport:
"""Compile ``project`` and return a :class:`VerificationReport`.
Standalone (``session is None``) the compile spins up its own sandbox via
``backend.run`` -- the path Aristotle and the split-backend case take. When a
caller passes a live ``session`` (the agent/verify backend-reuse path), the
compile runs in that already-hot sandbox instead, avoiding a second spin-up.
Parameters
----------
project : ~open_atp.lean.LeanProject
The candidate project to compile. Checked for compatibility first.
session : ~open_atp.backends.base.ComputeSession, optional
A live, already-hot sandbox to compile in. When ``None`` (the default),
``backend.run`` spins up a fresh sandbox for this compile.
Returns
-------
VerificationReport
The compile/``sorry``/axiom verdict. A project with no ``.lean`` files
short-circuits to a trivial passing report without touching the sandbox.
Raises
------
~open_atp.lean.ToolchainMismatch
If the project's pins differ from the backend image's (via
:meth:`check_compatible`).
~open_atp.lean.MathlibRevMismatch
If the project's locked Mathlib revision differs from the image's.
Examples
--------
A real project compiles in the backend; a project with no ``.lean`` files
short-circuits to a trivial passing report without touching the sandbox:
>>> import tempfile
>>> from pathlib import Path
>>> from open_atp.backends.docker import DockerBackend
>>> from open_atp.images import Image
>>> from open_atp.lean import LeanProject
>>> from open_atp.verify import Verifier
>>> root = Path(tempfile.mkdtemp())
>>> _ = (root / "lakefile.toml").write_text('name = "demo"\\n')
>>> _ = (root / "lean-toolchain").write_text("leanprover/lean4:v4.31.0\\n")
>>> image = Image(lean_toolchain="leanprover/lean4:v4.31.0")
>>> verifier = Verifier(DockerBackend(image=image))
>>> report = verifier.verify(LeanProject(root))
>>> report.verified
True
"""
self.check_compatible(project)
rel = [t.relative_to(project.root).as_posix() for t in project.lean_files()]
if not rel:
return VerificationReport(compiles=True, sorry_free=True)
script = self._compile_script(rel)
if session is None:
result = self.backend.run(project.root, script)
else:
with session.exec(script) as handle:
result = handle.wait()
log = result.stdout + ("\n" + result.stderr if result.stderr else "")
per_file = self._parse_per_file(log, rel)
compiles = result.exit_code == 0 and all(per_file.values())
return VerificationReport(
compiles=compiles,
sorry_free=not _SORRY_RE.search(log),
axioms=self._parse_axioms(log),
compile_log=log,
per_file=per_file,
)
@staticmethod
def _compile_script(rel: list[str]) -> str:
"""Compile each file, bracketing it with markers so we can read exit codes.
Mirrors milp_flare's ``entrypoint.sh``: every file runs (``;`` not ``&&``) so
one failure doesn't mask the rest, and the overall status is the OR of the
per-file exit codes.
"""
lines = ["fail=0"]
for f in rel:
lines += [
f'echo "=== FILE {f} ==="',
f'lake env lean "{f}" 2>&1',
"rc=$?",
'echo "=== EXIT $rc ==="',
'[ "$rc" -ne 0 ] && fail=1',
]
lines.append("exit $fail")
return "\n".join(lines)
@staticmethod
def _parse_per_file(log: str, rel: list[str]) -> dict[str, bool]:
# Walk the FILE/EXIT markers in order; a file passes iff its exit code is 0.
per_file: dict[str, bool] = {}
current: str | None = None
for line in log.splitlines():
if line.startswith("=== FILE "):
current = line[len("=== FILE ") : -len(" ===")]
elif line.startswith("=== EXIT ") and current is not None:
per_file[current] = line[len("=== EXIT ") : -len(" ===")].strip() == "0"
current = None
# Files with no recorded marker (shouldn't happen) default to failing.
for f in rel:
per_file.setdefault(f, False)
return per_file
@staticmethod
def _parse_axioms(log: str) -> tuple[str, ...]:
found: list[str] = []
for m in _AXIOMS_RE.finditer(log):
found.extend(a.strip() for a in m.group(1).split(",") if a.strip())
return tuple(dict.fromkeys(found)) # dedupe, preserve order