Source code for open_atp.harness.base

"""The harness contract: the *agent* concern of :class:`~...agent_prover.AgentProver`.

A :class:`Harness` knows, for one agent CLI (Claude Code / Codex / OpenCode):

* how to populate the working directory (launch script, MCP config, skills,
  prompt) -- :meth:`Harness.configure_wd`;
* the bash command that launches the agent -- :attr:`Harness.command`;
* which credentials to forward into the sandbox -- :meth:`Harness.auth_spec`; and
* how to read token/cost totals out of the agent's streamed JSON
  -- :meth:`Harness.parse`.

The *compute* concern (where that command runs, with Lean+Mathlib and a warm
cache) lives in the injected :class:`~open_atp.backends.base.ComputeBackend`.

Ported from milp_flare's ``harness/`` package; skills/plugins to mount are
carried by the injected :class:`~open_atp.harness.bundles.AssetBundle` (the
default mounts the vendored ``lean-proof`` skill, plus the ``lean4`` plugin for
the Claude harness).
"""

from __future__ import annotations

import shutil
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, ClassVar

from open_atp.harness.bundles import DEFAULT_BUNDLE, AssetBundle

#: Files the harness writes into the workdir; named so they never collide with a
#: project's own sources.
SCRIPT_FILE = "agent.sh"
PROMPT_FILE = "agent_prompt.txt"


[docs] @dataclass(frozen=True) class AuthSpec: """Compute-agnostic description of the credentials a harness needs. Attributes ---------- env: Host environment-variable names to forward into the sandbox. home_dirs: Host directories to expose under the sandbox's ``$HOME``, as ``(host_dir, dest_basename)`` pairs (e.g. ``(~/.codex, ".codex")``). """ env: list[str] = field(default_factory=list) home_dirs: list[tuple[Path, str]] = field(default_factory=list)
[docs] @dataclass class HarnessRunResult: """Token totals and cost parsed from an agent's streamed output.""" input_tokens: int = 0 output_tokens: int = 0 stop_reason: str | None = None #: USD cost if the harness self-reports it (Claude Code, OpenCode); ``None`` #: when it must be estimated from token counts (Codex). cost_usd: float | None = None #: Final ``type:"result"`` subtype (Claude Code: ``success`` / #: ``error_max_turns`` / ``error_during_execution``). Used by NuminaProver's #: round loop to decide continue-vs-stop when no END_REASON marker is present. subtype: str | None = None #: The agent's final result text (Claude Code's ``result`` field), where the #: Numina coordinator prints its ``END_REASON:<reason>`` marker. result_text: str | None = None
[docs] class Harness(ABC): """Base class for an agent CLI harness.""" name: ClassVar[str] def __init__( self, model: str, effort: str = "medium", assets: AssetBundle | None = None ) -> None: self.model = model self.effort = effort self.assets = assets or DEFAULT_BUNDLE
[docs] @classmethod def from_config(cls, config: Any, *, assets: AssetBundle | None = None) -> Harness: """Build a harness from an :class:`AgentProverConfig`. The default reads only the shared ``model``/``effort`` knobs; harnesses with extra config (e.g. :class:`VibeHarness`'s agent/turn/price) override this. """ return cls(config.model, config.effort, assets=assets)
@property def command(self) -> str: """Bash command the backend runs to launch the agent. The backend has already ``cd``'d into the workdir and symlinked ``.lake``; we export ``$PROMPT`` from the written prompt file (the launch scripts reference it) and run the rendered script. """ return f'export PROMPT="$(cat {PROMPT_FILE})" && bash {SCRIPT_FILE}'
[docs] def static_env(self) -> dict[str, str]: """Non-secret env vars to set for this harness (e.g. ``IS_SANDBOX``).""" return {}
[docs] def auth_spec(self) -> AuthSpec: """Credentials to forward into the sandbox for this harness.""" return AuthSpec()
[docs] def configure_wd(self, wd: Path, prompt: str) -> None: """Populate ``wd`` with the launch script, prompt, MCP config, and skills.""" if not wd.exists(): raise RuntimeError("The agent working directory must be created first.") (wd / SCRIPT_FILE).write_text(self._agent_command()) (wd / PROMPT_FILE).write_text(prompt) self._copy_extra_dirs(wd)
def _copy_extra_dirs(self, wd: Path) -> None: """Copy the bundle's extra asset trees (e.g. Numina's prompts) into ``wd``.""" for src, dest in self.assets.extra_dirs: target = wd / dest target.parent.mkdir(parents=True, exist_ok=True) shutil.copytree(src, target, dirs_exist_ok=True)
[docs] def parse(self, lines: list[str]) -> HarnessRunResult: """Parse the agent's streamed JSON lines into a :class:`HarnessRunResult`.""" return self._parse_lines(lines)
[docs] def collect_logs(self, wd: Path, logs_dir: Path) -> None: """Move this harness's rich log files out of ``wd`` into ``logs_dir``. The streamed event JSONL the prover captures from stdout *is* the agent's transcript for every CLI harness, so the default does nothing. Harnesses that *also* drop a richer record inside the workdir (Vibe's session log, ax-prover's per-target logs) override this to relocate those files -- so ``download_wd`` stays the proof project and ``download_logs`` carries the full record. Called after :meth:`parse` (which may read those files for cost), so moving them is safe. """
def _copy_skills(self, wd: Path, dest: str) -> None: """Copy the selected bundle's skills into ``wd/<dest>``. Two mount modes (a bundle may use either or both): * each dir in ``skills`` -> ``wd/<dest>/<dir-name>/`` (ordinary named skills; an upstream ``tests/`` fixture dir is dropped); and * the legacy ``skills_dir`` -> its *contents* to ``wd/<dest>/`` (a single root-mounted skill bundle, e.g. Numina). A no-op when the bundle mounts no skills. """ if self.assets.skills_dir is None and not self.assets.skills: return target = wd / dest target.mkdir(parents=True, exist_ok=True) if self.assets.skills_dir is not None: shutil.copytree(self.assets.skills_dir, target, dirs_exist_ok=True) for skill in self.assets.skills: shutil.copytree( skill, target / skill.name, ignore=shutil.ignore_patterns("tests"), dirs_exist_ok=True, ) def _render(self, template: str) -> str: """Substitute ``<<MODEL>>``/``<<EFFORT>>`` into a launch-script template.""" return template.replace("<<MODEL>>", self.model).replace( "<<EFFORT>>", self.effort ) @abstractmethod def _agent_command(self) -> str: """The rendered contents of the workdir's ``agent.sh``.""" @abstractmethod def _parse_lines(self, lines: list[str]) -> HarnessRunResult: ...
def _infer_provider(model: str) -> str: if model.startswith("claude"): return "anthropic" if model.startswith("deepseek"): return "deepseek" if model.startswith("gemini"): return "google" return "openai"