Source code for open_atp.harness.axprover

"""ax-prover-base harness (LangGraph Lean agent driven by ``ax-prover prove``)."""

from __future__ import annotations

import json
import os
import shutil
from pathlib import Path
from typing import Any, ClassVar

from open_atp.harness._paths import _SCRIPTS
from open_atp.harness.base import AuthSpec, Harness, HarnessRunResult, _infer_provider
from open_atp.harness.bundles import AssetBundle

#: Cap on ax-prover's per-call LLM retries (its ``DEFAULT_LLM_RETRY_CONFIG`` ships
#: ``stop_after_attempt=10000`` -- ~8h20m). That ``with_retry`` fires on *any* exception
#: and ignores Anthropic's ``x-should-retry: false``, so a non-retryable ``400`` (e.g. a
#: bad request from a model/langchain-anthropic mismatch) gets retried 10k times and the
#: run silently hangs for hours instead of failing fast. Three is plenty to ride out a
#: transient blip while turning any hard error into a prompt, visible failure.
_LLM_MAX_RETRIES = 3


[docs] class AxProverHarness(Harness): """ax-prover-base (LangGraph Lean agent), driven by ``ax-prover prove`` in-sandbox. ax-prover is a self-contained proving agent (its own proposer->builder->reviewer->memory loop) that edits the target ``.lean`` file in place. It slots in as a harness rather than a standalone prover because :meth:`AgentProver.prove` already supplies everything around the edges (staging, snapshot/diff, sandbox run, key forwarding) and the shared ``Verifier`` -- not ax-prover's own reviewer -- remains the source of truth for compile/sorry/axiom. Two things differ from the CLI harnesses (it mirrors :class:`VibeHarness` here): * **Config lives in a workdir YAML, not flags.** :meth:`configure_wd` writes ``axprover.yaml`` selecting the model/effort/iterations; it layers on top of ax-prover's bundled ``default.yaml`` (auto-prepended by the CLI), so it only needs to override the deltas. * **Cost is not on stdout.** ax-prover streams human-readable logs; token usage comes from its ``-o`` JSON (the per-target ``ax_output.<target>.json`` files the launch script writes), which carries ``input_tokens``/``output_tokens`` alongside ``{success, error, summary}`` as of the pinned fork commit (see ``AX_PROVER_REF`` in ``__main__.py``). :meth:`parse` sums those across every target and leaves ``cost_usd`` ``None`` so the prover converts tokens->USD via the fallback table, exactly like :class:`CodexHarness`. (On an ax-prover build without those fields the tokens are simply absent and the run reports zero cost.) """ name = "axprover" #: open-atp provider name -> ax-prover's LangChain ``provider:model`` prefix. _AX_PROVIDER_PREFIX: ClassVar[dict[str, str]] = { "anthropic": "anthropic", "openai": "openai", "google": "google_genai", "deepseek": "deepseek", } def __init__( self, model: str, effort: str = "medium", *, max_iterations: int | None = None, assets: AssetBundle | None = None, ) -> None: super().__init__(model, effort, assets) self.max_iterations = max_iterations #: Set in :meth:`configure_wd`; where :meth:`parse` looks for usage files. self._wd: Path | None = None
[docs] @classmethod def from_config(cls, config: Any, *, assets: AssetBundle | None = None) -> Harness: return cls( config.model, config.effort, max_iterations=getattr(config, "max_iterations", None), assets=assets, )
[docs] def auth_spec(self) -> AuthSpec: # Raw provider keys, exactly like OpenCodeHarness; ax-prover reads them from # the process env (ANTHROPIC_API_KEY / OPENAI_API_KEY / GOOGLE_API_KEY). env = [ key for key in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY") if key in os.environ ] if not env: raise RuntimeError( "axprover harness requires one of ANTHROPIC_API_KEY / " "OPENAI_API_KEY / GOOGLE_API_KEY" ) return AuthSpec(env=env)
def configure_wd(self, wd: Path, prompt: str) -> None: # The free-text prompt is ignored: ax-prover has its own prompts. We still let # the base write agent.sh + agent_prompt.txt for a uniform contract. super().configure_wd(wd, prompt) (wd / "axprover.yaml").write_text(self._render_config()) self._wd = wd def _ax_model(self) -> str: """``self.model`` as ax-prover's ``provider:model`` string.""" provider = _infer_provider(self.model) prefix = self._AX_PROVIDER_PREFIX.get(provider, "openai") return f"{prefix}:{self.model}" def _provider_config(self) -> dict[str, Any]: """Provider-specific LLM kwargs mapping ``effort`` to each API's knob.""" provider = _infer_provider(self.model) if provider == "anthropic": return { "temperature": 1.0, # required when thinking is enabled "max_tokens": None, "effort": self.effort, "thinking": {"type": "adaptive"}, } if provider == "google": return { "temperature": 1.0, "max_tokens": None, "include_thoughts": True, "thinking_level": self.effort, } return { # openai / deepseek (OpenAI-compatible) "temperature": None, "max_tokens": None, "reasoning": {"effort": self.effort}, } def _render_config(self) -> str: """The ``axprover.yaml`` overrides layered over ax-prover's ``default.yaml``. JSON is valid YAML, so we emit JSON to avoid a YAML dependency. Only the deltas are set: ``prover_llm`` (which the bundled config's ``memory_config`` and ``summarize_output`` interpolate from) and, when capped, ``max_iterations``. The bundled ``proposer_tools`` (lean + web search) are left untouched -- a missing TAVILY_API_KEY or blocked egress degrades a tool to a no-op rather than failing the run. The LLM is defined under a *fresh* ``llm_configs.open_atp`` key and ``prover_llm`` points at it via interpolation, rather than inlining the dict. ax-prover's ``--config`` argparse flag *appends* to its ``["default.yaml"]`` default (it does not replace it), so default.yaml's ``prover_llm: ${llm_configs.claude_opus_4_5}`` is always in the merge. An inline ``prover_llm`` dict would then OmegaConf-*deep-merge* onto that resolved config and silently inherit stale keys -- notably ``thinking.budget_tokens: 10000`` from ``claude_opus_4_5``'s ``thinking.type: enabled``, which the API rejects ("Extra inputs are not permitted") under our ``thinking.type: adaptive`` and which sends every request into a retry storm (no usage, just 400s). A brand-new key has nothing to merge with, so our provider config is used verbatim; ``${llm_configs.open_atp}`` (a string node) cleanly *replaces* default.yaml's interpolation, and ``llm_configs`` is stripped after resolution as a non-Config temporary key. ``retry_config`` overrides only ``stop_after_attempt`` (see :data:`_LLM_MAX_RETRIES`); merge_configs' final merge onto the structured ``LLMConfig`` schema deep-merges it over ax-prover's ``DEFAULT_LLM_RETRY_CONFIG``, so the exponential-jitter wait is preserved -- we just refuse to retry 10k times. """ config: dict[str, Any] = { "llm_configs": { "open_atp": { "model": self._ax_model(), "provider_config": self._provider_config(), "retry_config": {"stop_after_attempt": _LLM_MAX_RETRIES}, } }, "prover": {"prover_llm": "${llm_configs.open_atp}"}, } if self.max_iterations is not None: config["prover"]["max_iterations"] = int(self.max_iterations) return json.dumps(config, indent=2) def _agent_command(self) -> str: # No <<MODEL>>/<<EFFORT>> substitution: those live in axprover.yaml. return (_SCRIPTS / "axprover_agent.sh").read_text()
[docs] def parse(self, lines: list[str]) -> HarnessRunResult: # Tokens come from the per-target ``-o`` files (ax_output.<target>.json), each # a ``{location: {success, ..., input_tokens, output_tokens, ...}}`` map written # by the launch script. Sum across every target in every file; the stream itself # carries no usage. Leave cost_usd None so the prover derives USD from the token # table (like CodexHarness). result = self._parse_lines(lines) if self._wd is not None and self._wd.is_dir(): for path in sorted(self._wd.glob("ax_output.*.json")): try: data = json.loads(path.read_text()) except (OSError, json.JSONDecodeError): continue entries = data.values() if isinstance(data, dict) else [] for entry in entries: if not isinstance(entry, dict): continue result.input_tokens += int(entry.get("input_tokens", 0) or 0) result.output_tokens += int(entry.get("output_tokens", 0) or 0) return result
[docs] def collect_logs(self, wd: Path, logs_dir: Path) -> None: # ax-prover's rich record is the per-target ``ax_output.<target>.json`` (usage # + outcome) and the teed ``ax_prover.<target>.log`` (human-readable run log), # both written into the workdir. Move them out to ``logs/`` -- ``parse`` has # already summed the token fields, so relocating is safe. moved = sorted(wd.glob("ax_output.*.json")) + sorted(wd.glob("ax_prover.*.log")) if moved: logs_dir.mkdir(parents=True, exist_ok=True) for path in moved: shutil.move(str(path), str(logs_dir / path.name))
def _parse_lines(self, lines: list[str]) -> HarnessRunResult: # ax-prover's stdout is human-readable logs, not a JSON event stream; keep the # last non-empty line as result text for debugging and read tokens elsewhere. result = HarnessRunResult() for line in lines: stripped = line.strip() if stripped: result.result_text = stripped return result