Source code for open_atp.harness.base

"""The harness contract: the *agent* concern of :class:`~...agent_prover.AgentProver`.

A :class:`Harness` knows, for one agent CLI (Claude Code / Codex / OpenCode):

* how to populate the working directory from its assets (launch script, MCP
  config, skills) -- :meth:`Harness.stage_wd` -- and where to write the prompt the
  prover hands it -- :meth:`Harness.write_prompt`;
* the bash command that launches the agent -- :attr:`Harness.command`;
* which credentials to resolve and forward -- :meth:`Harness.agent_auth`; and
* how to read token/cost totals out of the agent's streamed JSON
  -- :meth:`Harness.parse_result`.

The *compute* concern (where that command runs, with Lean+Mathlib and a warm
cache) lives in the injected :class:`~open_atp.backends.base.ComputeBackend`.

Ported from milp_flare's ``harness/`` package. The skills to mount are owned by
the prover (``AgentProver.skills``, resolved to source dirs and handed to
:meth:`Harness.stage_skills`); plugins are Claude-only and live on
``ClaudeCodeHarness.plugins``.
"""

from __future__ import annotations

import os
import shutil
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import ClassVar

#: Provider name (see :func:`_infer_provider`) -> the canonical env var the agent
#: CLI reads its key from. OpenCode/ax-prover forward the selected provider's key
#: under this name.
_PROVIDER_ENV = {
    "anthropic": "ANTHROPIC_API_KEY",
    "openai": "OPENAI_API_KEY",
    "google": "GOOGLE_API_KEY",
    "deepseek": "DEEPSEEK_API_KEY",
}

#: Files the harness writes into the workdir; named so they never collide with a
#: project's own sources.
SCRIPT_FILE = "agent.sh"
PROMPT_FILE = "agent_prompt.txt"


[docs] @dataclass(frozen=True) class AgentAuth: """Resolved credentials a harness hands the prover to wire into the sandbox. Unlike a declarative spec, ``env`` here holds resolved name->**value** pairs -- the harness has already read the host environment (and any explicit overrides) and validated that required credentials are present. The prover only forwards them; it never touches ``os.environ``. Attributes ---------- env: Environment variables (name -> value) to forward into the sandbox. mounts: Host directories to expose under the sandbox's ``$HOME``, as ``(host_dir, dest_basename)`` pairs (e.g. ``(~/.codex, ".codex")``). """ env: dict[str, str] = field(default_factory=dict) mounts: list[tuple[Path, str]] = field(default_factory=list)
[docs] @dataclass class HarnessRunResult: """Token totals and cost parsed from an agent's streamed output. Attributes ---------- input_tokens : int Total input (prompt) tokens the run consumed. output_tokens : int Total output (completion) tokens the run produced. stop_reason : str, optional Why the agent stopped, when the stream reports it; ``None`` otherwise. cost_usd : float, optional USD cost if the harness self-reports it (Claude Code, OpenCode); ``None`` when it must be estimated from token counts (Codex, ax-prover). subtype : str, optional Final ``type:"result"`` subtype (Claude Code: ``success`` / ``error_max_turns`` / ``error_during_execution``). Used by NuminaProver's round loop to decide continue-vs-stop when no END_REASON marker is present. result_text : str, optional The agent's final result text (Claude Code's ``result`` field), where the Numina coordinator prints its ``END_REASON:<reason>`` marker. """ input_tokens: int = 0 output_tokens: int = 0 stop_reason: str | None = None cost_usd: float | None = None subtype: str | None = None result_text: str | None = None
[docs] class Harness(ABC): """Base class for an agent CLI harness. Parameters ---------- model : str Model id the agent runs. Default ``"claude-opus-4-8"`` (Codex defaults to ``"gpt-5.5"``, Vibe to ``"magistral-medium-latest"``). effort : str Reasoning-effort level passed to harnesses that support it. Default ``"high"``. Attributes ---------- command : str Bash command the backend runs to launch the agent (read-only property). """ name: ClassVar[str] #: Workdir-relative directory this harness mounts skills into (e.g. #: ``.claude/skills``), or ``None`` if the harness doesn't consume skills #: (ax-prover ships its own). Read by :meth:`stage_skills` and the prover. skills_dest: ClassVar[str | None] = None def __init__( self, *, model: str = "claude-opus-4-8", effort: str = "high", ) -> None: # model/effort documented as class Parameters/Attributes above. self.model = model self.effort = effort @property def command(self) -> str: """Bash command the backend runs to launch the agent. The backend has already ``cd``'d into the workdir and symlinked ``.lake``; we export ``$PROMPT`` from the written prompt file (the launch scripts reference it) and run the rendered script. Returns ------- str The bash one-liner that exports ``$PROMPT`` and runs ``agent.sh``. """ return f'export PROMPT="$(cat {PROMPT_FILE})" && bash {SCRIPT_FILE}'
[docs] def agent_auth(self) -> AgentAuth: """Resolve this harness's credentials into a ready-to-forward auth bundle. Merges, in order: non-secret constants (:meth:`_static_env`) and resolved required credentials (:meth:`_required_env`, which raises if a needed key is absent). Mount dirs come from :meth:`_home_dirs`. Subclasses needing best-effort host passthrough override this (see ``NuminaHarness``). Returns ------- AgentAuth Resolved env (name -> value) and ``(host_dir, dest_basename)`` mounts the prover forwards into the sandbox. Examples -------- An explicit ``oauth_token`` is resolved into the forwarded env: >>> from open_atp.harness import ClaudeCodeHarness >>> harness = ClaudeCodeHarness(oauth_token="sk-ant-oat-fake") >>> harness.agent_auth().env["CLAUDE_CODE_OAUTH_TOKEN"] 'sk-ant-oat-fake' """ env: dict[str, str] = {} env.update(self._static_env()) env.update(self._required_env()) return AgentAuth(env=env, mounts=self._home_dirs())
def _static_env(self) -> dict[str, str]: """Non-secret env vars to set for this harness (e.g. ``IS_SANDBOX``). Returns ------- dict[str, str] Constant name -> value pairs; empty for the base class. """ return {} def _required_env(self) -> dict[str, str]: """Resolve the harness's required credentials (name -> value). Override to read explicit constructor overrides or fall back to the host environment, raising if a required key is absent. Returns ------- dict[str, str] Resolved credential name -> value pairs; empty for the base class. """ return {} def _home_dirs(self) -> list[tuple[Path, str]]: """``(src, basename)`` dirs to mount under the sandbox ``$HOME``. Returns ------- list[tuple[pathlib.Path, str]] ``(host_dir, dest_basename)`` mount pairs; empty for the base class. """ return [] def _provider_key_env(self, provider: str, explicit: str | None) -> dict[str, str]: """Resolve a provider API key, forwarded under its canonical env-var name. ``explicit`` (a constructor override) wins; otherwise read the host env under :data:`_PROVIDER_ENV`'s name for ``provider``. Raise if neither is set. No format check -- OpenAI and DeepSeek keys are both ``sk-...`` and indistinguishable, so the key is assumed correct for the selected provider. Parameters ---------- provider : str Provider name (see :func:`_infer_provider`) whose canonical env var the key is forwarded under. explicit : str, optional A constructor override that takes precedence over the host environment; ``None`` falls back to reading the host env. Returns ------- dict[str, str] A single ``{env_name: key}`` pair to forward into the sandbox. Raises ------ RuntimeError If neither ``explicit`` nor the host env supplies the key. """ env_name = _PROVIDER_ENV[provider] key = explicit or os.environ.get(env_name) if not key: raise RuntimeError( f"{self.name} harness requires {env_name} for provider {provider!r}" ) return {env_name: key}
[docs] def stage_wd(self, wd: Path) -> None: """Populate ``wd`` with the harness's launch script. Everything the harness itself owns -- *not* the skills list (the prover stages it via :meth:`stage_skills`) and *not* the prompt (the prover and task own it, written via :meth:`write_prompt`). Subclasses that need more (Vibe's VIBE_HOME, ax-prover's per-target setup) override and call ``super().stage_wd``. Parameters ---------- wd : pathlib.Path The agent working directory to populate; must already exist. Raises ------ RuntimeError If ``wd`` does not exist. """ if not wd.exists(): raise RuntimeError("The agent working directory must be created first.") (wd / SCRIPT_FILE).write_text(self._agent_command())
[docs] def write_prompt(self, wd: Path, prompt: str) -> None: """Write the composed prompt where this harness's launch script reads it. The prompt's *content* is owned by the prover (its prover prompt) and the task (the optional user prompt); the harness owns only the file location and the ``cat $PROMPT`` launch contract, so it provides the write mechanism. Parameters ---------- wd : pathlib.Path The agent working directory the launch script reads the prompt from. prompt : str The composed prompt text to write to :data:`PROMPT_FILE`. """ (wd / PROMPT_FILE).write_text(prompt)
[docs] def parse_result(self, lines: list[str]) -> HarnessRunResult: """Parse the agent's streamed JSON lines into a :class:`HarnessRunResult`. Parameters ---------- lines : list[str] The agent's streamed stdout, one JSON object per line. Returns ------- HarnessRunResult Token totals, cost, and stop metadata parsed from the stream. """ return self._parse_lines(lines)
[docs] def collect_logs(self, wd: Path, logs_dir: Path) -> None: """Move this harness's rich log files out of ``wd`` into ``logs_dir``. The streamed event JSONL the prover captures from stdout *is* the agent's transcript for every CLI harness, so the default does nothing. Harnesses that *also* drop a richer record inside the workdir (Vibe's session log, ax-prover's per-target logs) override this to relocate those files -- so ``download_wd`` stays the proof project and ``download_logs`` carries the full record. Called after :meth:`parse_result` (which may read those files for cost), so moving them is safe. Parameters ---------- wd : pathlib.Path The agent working directory rich log files are moved out of. logs_dir : pathlib.Path The run's log directory the files are relocated into. """
[docs] def stage_skills(self, wd: Path, skill_dirs: list[Path]) -> None: """Copy resolved skill source dirs into this harness's skill location. Each ``<name>/SKILL.md`` tree lands at ``wd/<skills_dest>/<dir-name>/`` (an upstream ``tests/`` fixture dir is dropped). A no-op for a harness that does not consume skills (``skills_dest is None``, e.g. ax-prover). The prover owns the *list* (``AgentProver.skills``, resolved by ``resolve_skill``); the harness owns *where* it goes. Parameters ---------- wd : pathlib.Path The agent working directory the skills are copied into (under :attr:`skills_dest`). skill_dirs : list[pathlib.Path] Resolved skill source dirs (each a ``<name>/SKILL.md`` tree) to copy. """ if self.skills_dest is None or not skill_dirs: return target = wd / self.skills_dest target.mkdir(parents=True, exist_ok=True) for skill in skill_dirs: shutil.copytree( skill, target / skill.name, ignore=shutil.ignore_patterns("tests"), dirs_exist_ok=True, )
def _render(self, template: str) -> str: """Substitute ``<<MODEL>>``/``<<EFFORT>>`` into a launch-script template. Parameters ---------- template : str A launch-script template with ``<<MODEL>>``/``<<EFFORT>>`` placeholders. Returns ------- str ``template`` with the placeholders replaced by :attr:`model`/:attr:`effort`. """ return template.replace("<<MODEL>>", self.model).replace( "<<EFFORT>>", self.effort ) @abstractmethod def _agent_command(self) -> str: """The rendered contents of the workdir's ``agent.sh``. Returns ------- str The fully rendered launch script for this harness. """ @abstractmethod def _parse_lines(self, lines: list[str]) -> HarnessRunResult: """Parse the agent's streamed JSON lines into a :class:`HarnessRunResult`. Parameters ---------- lines : list[str] The agent's streamed stdout, one JSON object per line. Returns ------- HarnessRunResult Token totals, cost, and stop metadata parsed from the stream. """
def _infer_provider(model: str) -> str: if model.startswith("claude"): return "anthropic" if model.startswith("deepseek"): return "deepseek" if model.startswith("gemini"): return "google" return "openai"