Skip to content

Agents

Agents are the core abstraction for interacting with tasks. Every agent receives a task instruction and a tmux session, and must return an AgentResult.

Base Agent

BaseAgent

BaseAgent(**kwargs)

Bases: ABC

Source code in fceval/agents/base_agent.py
def __init__(self, **kwargs):
    """Initialize the shared agent state from optional keyword arguments.

    Args:
        **kwargs: Optional settings. Only ``version`` and
            ``prompt_template`` are read here; each is stored as ``None``
            when it is not supplied.
    """
    super().__init__()
    # dict.get already yields None for a missing key.
    self._prompt_template = kwargs.get("prompt_template")
    self._version = kwargs.get("version")

version property

version

The version of the agent. It can be any string (e.g., a date, a semantic version, or a single digit).

Can be dynamic based on a kwarg in the constructor.

Examples:

Default version (latest, or undefined)

agent = ClaudeCodeAgent()
agent.version  # -> "latest"

Custom version

agent = ClaudeCodeAgent(version="1.2.3")
agent.version  # -> "1.2.3"

With other parameters

agent = ClaudeCodeAgent(
    model_name="anthropic/claude-3-sonnet-20240229",
    version="0.5.0",
)
agent.version  # -> "0.5.0"

The version, if defined, is used in the installation script: npm install -g @anthropic-ai/claude-code@{version}

prompt_template property

prompt_template

The path to a custom prompt template file. If specified, this template will be used to render the instruction before passing it to the agent.

The template must be a Jinja2 template that includes an "instruction" variable.

Examples:

Default behavior (no template)

agent = OpenHandsAgent()
agent.prompt_template  # -> None

Custom template

agent = OpenHandsAgent(prompt_template="./custom_prompt.j2")
agent.prompt_template  # -> "./custom_prompt.j2"

Usage with CLI

uv run tb run --agent openhands --task-id hello-world --agent-kwarg prompt_template=./openhands_template.j2

name abstractmethod staticmethod

name()
Source code in fceval/agents/base_agent.py
@staticmethod
@abstractmethod
def name() -> str:
    """Return the agent's name.

    Raises:
        NotImplementedError: Always; this abstract placeholder must be
            overridden by concrete agents.
    """
    message = "Agents must implement this method."
    raise NotImplementedError(message)

perform_task abstractmethod

perform_task(instruction, session, logging_dir=None, portkey_metadata=None, portkey_trace_id=None)
Source code in fceval/agents/base_agent.py
@abstractmethod
def perform_task(
    self,
    instruction: str,
    session: TmuxSession,
    logging_dir: Path | None = None,
    portkey_metadata: dict[str, str] | None = None,
    portkey_trace_id: str | None = None,
) -> AgentResult:
    """Carry out a task in the given tmux session and report the outcome.

    Args:
        instruction: The task instruction text.
        session: The tmux session the agent works in.
        logging_dir: Optional directory for agent logs.
        portkey_metadata: Optional Portkey metadata mapping.
        portkey_trace_id: Optional Portkey trace identifier.

    Returns:
        An AgentResult describing the run.

    Raises:
        NotImplementedError: Always; this abstract placeholder must be
            overridden by concrete agents.
    """
    message = "Agents must implement this method."
    raise NotImplementedError(message)

AgentResult

Bases: BaseModel

total_input_tokens class-attribute instance-attribute

total_input_tokens = Field(default=0, description='The total number of input tokens used by the agent to complete the task.')

total_output_tokens class-attribute instance-attribute

total_output_tokens = Field(default=0, description='The total number of output tokens used by the agent to complete the task.')

total_cost class-attribute instance-attribute

total_cost = Field(default=0.0, description='The total cost in USD of all LLM calls made by the agent.')

failure_mode class-attribute instance-attribute

failure_mode = Field(default=NONE, description="The failure mode of the agent's execution, if any.")

timestamped_markers class-attribute instance-attribute

timestamped_markers = Field(default=[], description="A list of timestamped markers from the agent's execution of the task. Each marker is a tuple of (timestamp, marker_text). The timestamp is the time in seconds since the start of the task, and the marker_text is the text of the marker. The timestamp can be found by calling `get_asciinema_timestamp()` on the `TmuxSession` object.")

Agent Factory

AgentFactory

get_agent staticmethod

get_agent(agent_name=None, import_path=None, **kwargs)
Source code in fceval/agents/agent_factory.py
@staticmethod
def get_agent(
    agent_name: AgentName | None = None, import_path: str | None = None, **kwargs
) -> BaseAgent:
    """Instantiate an agent selected either by name or by import path.

    Args:
        agent_name: Registered agent name; mutually exclusive with
            ``import_path``.
        import_path: Import path of a custom agent class; mutually
            exclusive with ``agent_name``.
        **kwargs: Forwarded unchanged to the agent class constructor.

    Returns:
        A new instance of the resolved agent class.

    Raises:
        ValueError: If both or neither of ``agent_name`` and
            ``import_path`` are provided.
    """
    name_given = agent_name is not None
    path_given = import_path is not None
    # Both flags equal means either none or both selectors were supplied.
    if name_given == path_given:
        raise ValueError(
            "Exactly one of agent_name or import_path must be provided"
        )

    resolved_class = AgentFactory.get_agent_class(agent_name, import_path)
    return resolved_class(**kwargs)

get_agent_class staticmethod

get_agent_class(agent_name=None, import_path=None)
Source code in fceval/agents/agent_factory.py
@staticmethod
def get_agent_class(
    agent_name: AgentName | None = None, import_path: str | None = None
) -> type[BaseAgent]:
    """Resolve an agent class from a registered name or an import path.

    Args:
        agent_name: Registered agent name; mutually exclusive with
            ``import_path``.
        import_path: Import path of a custom agent class; mutually
            exclusive with ``agent_name``.

    Returns:
        The agent class (not an instance).

    Raises:
        ValueError: If both or neither selector is provided, or if
            ``agent_name`` has no entry in ``AGENT_NAME_TO_CLASS``.
    """
    name_given = agent_name is not None
    path_given = import_path is not None
    if name_given == path_given:
        raise ValueError(
            "Exactly one of agent_name or import_path must be provided"
        )

    if import_path is not None:
        return AgentFactory.get_agent_from_import_path(import_path)

    # Not reachable after the checks above; retained as an explicit guard.
    if agent_name is None:
        raise ValueError("agent_name must be provided if import_path is not")

    registry = AgentFactory.AGENT_NAME_TO_CLASS
    registry_key = agent_name.value
    if registry_key in registry:
        return registry[registry_key]

    raise ValueError(
        f"Unknown agent: {agent_name}. "
        f"Available agents: {registry.keys()}"
    )

get_agent_from_import_path staticmethod

get_agent_from_import_path(import_path)
Source code in fceval/agents/agent_factory.py
@staticmethod
def get_agent_from_import_path(import_path: str) -> type[BaseAgent]:
    """Load an agent class from a ``module:class`` import path.

    Args:
        import_path: Dotted module name and attribute name separated by
            the first ``:`` (e.g. ``"my_pkg.my_agent:MyAgent"``).

    Returns:
        The resolved class, which is a subclass of BaseAgent.

    Raises:
        ValueError: If the path contains no ``:``, if the module cannot be
            imported, if the attribute does not exist, or if the resolved
            object is not a class derived from BaseAgent.
    """
    if ":" not in import_path:
        raise ValueError("Import path must be in format 'module:class'")

    module_name, class_name = import_path.split(":", 1)

    # Keep the try body limited to the two lookups that can fail.
    try:
        module = importlib.import_module(module_name)
        agent_class = getattr(module, class_name)
    except (ImportError, AttributeError) as e:
        # Chain the cause so the original import failure stays visible.
        raise ValueError(
            f"Failed to import agent from {import_path}: {e}"
        ) from e

    # issubclass() raises TypeError for non-class objects (functions,
    # constants, modules); check for a class first so callers always get
    # a ValueError for an unusable target.
    if not (isinstance(agent_class, type) and issubclass(agent_class, BaseAgent)):
        raise ValueError(f"Class {class_name} is not a subclass of BaseAgent")

    return agent_class

Agent Name

AgentName

Bases: Enum

ORACLE class-attribute instance-attribute

ORACLE = 'oracle'

NAIVE class-attribute instance-attribute

NAIVE = 'naive'

TERMINUS class-attribute instance-attribute

TERMINUS = 'terminus'

TERMINUS_1 class-attribute instance-attribute

TERMINUS_1 = 'terminus-1'

TERMINUS_2 class-attribute instance-attribute

TERMINUS_2 = 'terminus-2'

MCP_TERMINUS class-attribute instance-attribute

MCP_TERMINUS = 'mcp-terminus'

CLAUDE_CODE class-attribute instance-attribute

CLAUDE_CODE = 'claude-code'

AIDER class-attribute instance-attribute

AIDER = 'aider'

CODEX class-attribute instance-attribute

CODEX = 'codex'

GOOSE_MCP class-attribute instance-attribute

GOOSE_MCP = 'goose-mcp'

GOOSE class-attribute instance-attribute

GOOSE = 'goose'

GEMINI_CLI class-attribute instance-attribute

GEMINI_CLI = 'gemini-cli'

NOP class-attribute instance-attribute

NOP = 'nop'

OPENHANDS class-attribute instance-attribute

OPENHANDS = 'openhands'

GROK_CLI class-attribute instance-attribute

GROK_CLI = 'grok-cli'

MINI_SWE_AGENT class-attribute instance-attribute

MINI_SWE_AGENT = 'mini-swe-agent'

OPENCODE class-attribute instance-attribute

OPENCODE = 'opencode'

QWEN_CODER class-attribute instance-attribute

QWEN_CODER = 'qwen-coder'

CURSOR_CLI class-attribute instance-attribute

CURSOR_CLI = 'cursor-cli'

Failure Modes

FailureMode

Bases: Enum

UNSET class-attribute instance-attribute

UNSET = 'unset'

NONE class-attribute instance-attribute

NONE = 'none'

UNKNOWN class-attribute instance-attribute

UNKNOWN = 'unknown'

TEST_TIMEOUT class-attribute instance-attribute

TEST_TIMEOUT = 'test_timeout'

SETUP_TIMEOUT class-attribute instance-attribute

SETUP_TIMEOUT = 'setup_timeout'

AGENT_TIMEOUT class-attribute instance-attribute

AGENT_TIMEOUT = 'agent_timeout'

UNKNOWN_AGENT_ERROR class-attribute instance-attribute

UNKNOWN_AGENT_ERROR = 'unknown_agent_error'

PARSE_ERROR class-attribute instance-attribute

PARSE_ERROR = 'parse_error'

FATAL_LLM_PARSE_ERROR class-attribute instance-attribute

FATAL_LLM_PARSE_ERROR = 'fatal_llm_parse_error'

CONTEXT_LENGTH_EXCEEDED class-attribute instance-attribute

CONTEXT_LENGTH_EXCEEDED = 'context_length_exceeded'

OUTPUT_LENGTH_EXCEEDED class-attribute instance-attribute

OUTPUT_LENGTH_EXCEEDED = 'output_length_exceeded'

AGENT_INSTALLATION_FAILED class-attribute instance-attribute

AGENT_INSTALLATION_FAILED = 'agent_installation_failed'

Built-in Agents

Oracle Agent

OracleAgent

OracleAgent(dataset_path, task_ids=None, n_tasks=None, **kwargs)

Bases: BaseAgent

Source code in fceval/agents/oracle_agent.py
def __init__(
    self,
    dataset_path: Path,
    task_ids: list[str] | None = None,
    n_tasks: int | None = None,
    **kwargs,
):
    """Create an oracle agent backed by a dataset.

    Args:
        dataset_path: Path passed to ``Dataset`` as ``path``.
        task_ids: Optional task id list passed through to ``Dataset``.
        n_tasks: Optional task count passed through to ``Dataset``.
        **kwargs: Forwarded to the parent initializer.
    """
    super().__init__(**kwargs)

    self._dataset = Dataset(
        path=dataset_path,
        task_ids=task_ids,
        n_tasks=n_tasks,
    )

    self._logger = logger.getChild(__name__)
    # Presumably populates self._solution_dict, which perform_task reads;
    # the helper is not shown here -- confirm against its definition.
    self._init_solution_dict()

name staticmethod

name()
Source code in fceval/agents/oracle_agent.py
@staticmethod
def name() -> str:
    """Return the string value of the ORACLE agent-name enum member."""
    oracle_member = AgentName.ORACLE
    return oracle_member.value

perform_task

perform_task(instruction, session, logging_dir=None, portkey_metadata=None, portkey_trace_id=None)
Source code in fceval/agents/oracle_agent.py
def perform_task(
    self,
    instruction: str,
    session: TmuxSession,
    logging_dir: Path | None = None,
    portkey_metadata: dict[str, str] | None = None,
    portkey_trace_id: str | None = None,
) -> AgentResult:
    """Replay the stored solution for ``instruction`` in the session.

    The solution is looked up in ``self._solution_dict`` using the
    instruction text as the key. A solution of type ``"sh"`` is copied into
    the container and run with bash; any other type has its ``"commands"``
    sent one by one.

    Args:
        instruction: Instruction text used as the lookup key.
        session: The tmux session in which the solution is executed.
        logging_dir: Not used by this method.
        portkey_metadata: Not used by this method.
        portkey_trace_id: Not used by this method.

    Returns:
        An AgentResult with zero token counts and FailureMode.NONE.

    Raises:
        KeyError: If ``instruction`` has no entry in the solution dict.
    """
    solution = self._solution_dict[instruction]
    is_shell_script = solution["type"] == "sh"

    if not is_shell_script:
        for command in solution["commands"]:
            session.send_command(command)
    else:
        session.copy_to_container(
            solution["path"],
            container_dir="/oracle",
            container_filename="solution.sh",
        )
        # Block with no timeout until the script finishes.
        run_script_keys = ["bash /oracle/solution.sh", "Enter"]
        session.send_keys(
            run_script_keys,
            max_timeout_sec=float("inf"),
            block=True,
        )

    oracle_result = AgentResult(
        total_input_tokens=0,
        total_output_tokens=0,
        failure_mode=FailureMode.NONE,
    )
    return oracle_result

Nop Agent

NopAgent

NopAgent(**kwargs)

Bases: BaseAgent

Source code in fceval/agents/null_agent.py
def __init__(self, **kwargs):
    """Initialize the agent by forwarding all keyword arguments to the parent.

    Args:
        **kwargs: Passed unchanged to the parent initializer.
    """
    super().__init__(**kwargs)

name staticmethod

name()
Source code in fceval/agents/null_agent.py
@staticmethod
def name() -> str:
    """Return the name of the no-op agent."""
    agent_name = "nop"
    return agent_name

perform_task

perform_task(instruction, session, logging_dir=None, portkey_metadata=None, portkey_trace_id=None)
Source code in fceval/agents/null_agent.py
def perform_task(
    self,
    instruction: str,
    session: TmuxSession,
    logging_dir: Path | None = None,
    portkey_metadata: dict[str, str] | None = None,
    portkey_trace_id: str | None = None,
) -> AgentResult:
    """Do nothing and report an empty, successful result.

    None of the arguments are used; the session is left untouched.

    Returns:
        An AgentResult with zero token counts and FailureMode.NONE.
    """
    empty_result = AgentResult(
        failure_mode=FailureMode.NONE,
        total_output_tokens=0,
        total_input_tokens=0,
    )
    return empty_result

Naive Agent

NaiveAgent

NaiveAgent(llm, **kwargs)

Bases: BaseAgent

Source code in fceval/agents/naive_agent.py
def __init__(
    self,
    llm: BaseLLM,
    **kwargs,
):
    """Create a naive agent that issues a single LLM call per task.

    Args:
        llm: The language model used to generate commands.
        **kwargs: Forwarded to the parent initializer.
    """
    super().__init__(**kwargs)
    self._llm = llm
    # NOTE(review): this reassigns the _prompt_template attribute after the
    # parent initializer has run; if the parent stores a user-supplied
    # template path under the same name, it is replaced by the template
    # text here -- confirm this is intended.
    template_text = self.PROMPT_TEMPLATE_PATH.read_text()
    self._prompt_template = template_text
    self._logger = logger.getChild(__name__)

    # Token counters; both start at zero.
    self._running_total_input_tokens = self._running_total_output_tokens = 0

name staticmethod

name()
Source code in fceval/agents/naive_agent.py
@staticmethod
def name() -> str:
    """Return the string value of the NAIVE agent-name enum member."""
    naive_member = AgentName.NAIVE
    return naive_member.value

perform_task

perform_task(instruction, session, logging_dir=None, portkey_metadata=None, portkey_trace_id=None)
Source code in fceval/agents/naive_agent.py
def perform_task(
    self,
    instruction: str,
    session: TmuxSession,
    logging_dir: Path | None = None,
    portkey_metadata: dict[str, str] | None = None,
    portkey_trace_id: str | None = None,
) -> AgentResult:
    """Ask the LLM once for a list of commands and send them to the session.

    The prompt template is formatted with the instruction, a single LLM call
    is made, and the response is validated as a ``CommandResponse``. When
    ``logging_dir`` is given, ``prompt.txt`` and ``response.json`` are
    written there and ``debug.json`` is passed to the LLM call as its
    logging path.

    Args:
        instruction: The task instruction inserted into the prompt.
        session: The tmux session that receives the generated commands.
        logging_dir: Optional directory for the prompt/response files.
        portkey_metadata: Forwarded to the LLM call.
        portkey_trace_id: Forwarded to the LLM call.

    Returns:
        An AgentResult with the counted tokens and the LLM's ``total_cost``
        (0.0 when the LLM has no such attribute). ``failure_mode`` is
        FATAL_LLM_PARSE_ERROR when the response cannot be validated,
        otherwise NONE.
    """
    self._running_total_input_tokens = 0
    self._running_total_output_tokens = 0

    prompt = self._prompt_template.format(instruction=instruction)

    log_enabled = logging_dir is not None
    logging_path = logging_dir / "debug.json" if log_enabled else None
    if log_enabled:
        (logging_dir / "prompt.txt").write_text(prompt)

    response = self._llm.call(
        prompt,
        response_format=CommandResponse,
        logging_path=logging_path,
        portkey_metadata=portkey_metadata,
        portkey_trace_id=portkey_trace_id,
    )

    user_messages = [{"role": "user", "content": prompt}]
    assistant_messages = [{"role": "assistant", "content": response}]
    self._running_total_input_tokens += self._llm.count_tokens(user_messages)
    self._running_total_output_tokens += self._llm.count_tokens(
        assistant_messages
    )

    if log_enabled:
        response_path = logging_dir / "response.json"
        try:
            # Pretty-print when the response is valid JSON.
            response_path.write_text(json.dumps(json.loads(response), indent=4))
        except Exception:
            # Otherwise store the raw response text as-is.
            response_path.write_text(response)

    outcome = FailureMode.NONE
    try:
        parsed_response = CommandResponse.model_validate_json(response)
    except Exception as e:
        self._logger.error(f"Failed to parse LLM response: {e}")
        outcome = FailureMode.FATAL_LLM_PARSE_ERROR
    else:
        for command in parsed_response.commands:
            # TODO (alexgshaw): This won't work for special keys like C-o.
            session.send_keys([command, "Enter"], block=True)

    cost = getattr(self._llm, "total_cost", 0.0)
    return AgentResult(
        total_input_tokens=self._running_total_input_tokens,
        total_output_tokens=self._running_total_output_tokens,
        total_cost=cost,
        failure_mode=outcome,
    )

Terminus-2

Terminus2

Terminus2(model_name, max_episodes=None, parser_name='json', api_base=None, temperature=0.7, model_kwargs=None, **kwargs)

Bases: BaseAgent

Source code in fceval/agents/terminus_2/terminus_2.py
def __init__(
    self,
    model_name: str,
    max_episodes: int | None = None,
    parser_name: str = "json",
    api_base: str | None = None,
    temperature: float = 0.7,
    model_kwargs: dict | None = None,
    **kwargs,
):
    """Configure the Terminus-2 agent and its LLM backend.

    Args:
        model_name: Model identifier passed to the LLM wrapper.
        max_episodes: Optional cap on episodes; when ``None`` a limit of
            1000000 is stored instead.
        parser_name: Name of the response parser to use.
        api_base: Optional custom API base URL. When truthy, ``LiteLLM`` is
            used; otherwise ``PortkeyLiteLLM`` is used.
        temperature: Sampling temperature passed to the LLM wrapper.
        model_kwargs: Optional extra model arguments passed to the LLM
            wrapper.
        **kwargs: Forwarded to the parent initializer.
    """
    super().__init__(**kwargs)
    self._model_name = model_name
    self._parser_name = parser_name

    # Arguments shared by both LLM wrappers.
    llm_options = {
        "model_name": model_name,
        "temperature": temperature,
        "model_kwargs": model_kwargs,
    }
    # Use LiteLLM directly when custom api_base is provided
    # (e.g., local vLLM server).
    # Otherwise use PortkeyLiteLLM for standard routing through Portkey gateway
    if api_base:
        self._llm = LiteLLM(api_base=api_base, **llm_options)
    else:
        self._llm = PortkeyLiteLLM(**llm_options)

    self._parser = self._get_parser()
    # NOTE(review): _prompt_template is reassigned here after the parent
    # initializer has run; if the parent keeps a user-supplied template path
    # under this name, it is replaced by the template text -- confirm.
    self._prompt_template = self._get_prompt_template_path().read_text()
    self._timeout_template = self._get_timeout_template_path().read_text()
    self._logger = logger.getChild(__name__)

    # Handle max_episodes setting
    if max_episodes is None:
        self._max_episodes = 1000000  # Effectively unlimited
    else:
        self._logger.warning(
            f"max_episodes artificially limited to {max_episodes}. "
            "Consider removing this limit for better task completion."
        )
        self._max_episodes = max_episodes

    self._chat: Chat | None = None
    self._timestamped_markers: list[tuple[float, str]] = []
    self._pending_completion = False

name staticmethod

name()
Source code in fceval/agents/terminus_2/terminus_2.py
@staticmethod
def name() -> str:
    """Return the name of the Terminus-2 agent."""
    agent_name = "terminus-2"
    return agent_name

perform_task

perform_task(instruction, session, logging_dir=None, time_limit_seconds=None, portkey_metadata=None, portkey_trace_id=None)
Source code in fceval/agents/terminus_2/terminus_2.py
def perform_task(
    self,
    instruction: str,
    session: TmuxSession,
    logging_dir: Path | None = None,
    time_limit_seconds: float | None = None,
    portkey_metadata: dict[str, str] | None = None,
    portkey_trace_id: str | None = None,
) -> AgentResult:
    """Run the agent loop for a task and summarize token usage and cost.

    A fresh ``Chat`` is created for the call, the prompt template is filled
    with the instruction and the current (length-limited) terminal output,
    and the agent loop is started with that prompt.

    Args:
        instruction: The task instruction.
        session: The tmux session the agent interacts with.
        logging_dir: Optional directory handed to the agent loop.
        time_limit_seconds: Accepted but not referenced in this method.
        portkey_metadata: Passed to the ``Chat``.
        portkey_trace_id: Passed to the ``Chat``.

    Returns:
        An AgentResult carrying the chat's token totals and cost, the
        agent's timestamped markers, and FailureMode.NONE.
    """
    chat = Chat(
        self._llm,
        portkey_metadata=portkey_metadata,
        portkey_trace_id=portkey_trace_id,
    )

    raw_terminal_output = session.get_incremental_output()
    terminal_state = self._limit_output_length(raw_terminal_output)
    initial_prompt = self._prompt_template.format(
        instruction=instruction,
        terminal_state=terminal_state,
    )

    self._run_agent_loop(initial_prompt, session, chat, logging_dir, instruction)

    task_result = AgentResult(
        total_input_tokens=chat.total_input_tokens,
        total_output_tokens=chat.total_output_tokens,
        total_cost=chat.total_cost,
        failure_mode=FailureMode.NONE,
        timestamped_markers=self._timestamped_markers,
    )
    return task_result

Installed Agents

AbstractInstalledAgent

AbstractInstalledAgent(*args, **kwargs)

Bases: BaseAgent, ABC

The container agent logs path is mounted to the host. Your agent can log to it if you want to view trajectories after the task finishes.

Source code in fceval/agents/installed_agents/abstract_installed_agent.py
def __init__(self, *args, **kwargs):
    """Initialize the agent and clear the stored Portkey context.

    Args:
        *args: Forwarded unchanged to the parent initializer.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    super().__init__(*args, **kwargs)
    # Portkey context placeholders; both start out unset (None).
    self._portkey_metadata: dict[str, str] | None = None
    self._portkey_trace_id: str | None = None

perform_task

perform_task(instruction, session, logging_dir=None, portkey_metadata=None, portkey_trace_id=None)
Source code in fceval/agents/installed_agents/abstract_installed_agent.py
def perform_task(
    self,
    instruction: str,
    session: TmuxSession,
    logging_dir: Path | None = None,
    portkey_metadata: dict[str, str] | None = None,
    portkey_trace_id: str | None = None,
) -> AgentResult:
    """Install the agent inside the container, run it, and report metrics.

    Steps: copy the install script into ``/installed-agent``, write the
    environment setup file and any extra setup files, source the
    environment file and the install script in the session, and - unless
    the install script reported failure - send the agent's run commands.
    The Portkey context set at the start is always cleared on exit.

    Args:
        instruction: The task instruction; rendered via
            ``_render_instruction`` before being turned into run commands.
        session: The tmux session (and its container) to operate on.
        logging_dir: Passed to ``_parse_agent_metrics``.
        portkey_metadata: Stored as Portkey context for the duration of
            the call.
        portkey_trace_id: Stored as Portkey context for the duration of
            the call.

    Returns:
        An AgentResult with the parsed token counts and cost. When the
        install script fails, ``failure_mode`` is
        FailureMode.AGENT_INSTALLATION_FAILED; otherwise ``failure_mode``
        is left at the AgentResult default.
    """

    def write_container_file(target_path: str, content: str) -> None:
        # printf '%s\n' writes the content verbatim followed by a newline.
        # echo is avoided because its treatment of backslash sequences and
        # of a leading "-n" differs between shells, which could corrupt
        # the written content.
        session.container.exec_run(
            [
                "sh",
                "-c",
                (
                    f"printf '%s\\n' {shlex.quote(content)} > "
                    f"{shlex.quote(target_path)}"
                ),
            ]
        )

    self._set_portkey_context(portkey_metadata, portkey_trace_id)
    try:
        session.copy_to_container(
            self._install_agent_script_path,
            container_dir="/installed-agent",
            container_filename="install-agent.sh",
        )

        # Execute outside the session to avoid exposing the env variables.
        env_setup_content = self._create_env_setup_file()
        write_container_file("/installed-agent/setup-env.sh", env_setup_content)

        extra_files = self._extra_setup_files()
        for file_path, content in extra_files.items():
            parent_dir = str(Path(file_path).parent)
            session.container.exec_run(
                ["sh", "-c", f"mkdir -p {shlex.quote(parent_dir)}"]
            )
            write_container_file(file_path, content)

        # Set up environment variables
        session.send_keys(
            [
                "source /installed-agent/setup-env.sh",
                "Enter",
            ],
            block=True,
            max_timeout_sec=float("inf"),
        )

        # Execute installation script and capture failure status if it fails
        session.send_keys(
            [
                (
                    "source /installed-agent/install-agent.sh || "
                    "echo 'INSTALL_FAIL_STATUS'"
                ),
                "Enter",
            ],
            block=True,
            max_timeout_sec=float("inf"),
        )

        # Check if installation failed by looking for the failure status.
        # A whole-line comparison keeps the typed command line (which merely
        # contains the marker) from matching; stripping tolerates trailing
        # whitespace or carriage returns in the captured pane.
        installation_output = session.capture_pane()
        installation_failed = any(
            line.strip() == "INSTALL_FAIL_STATUS"
            for line in installation_output.splitlines()
        )

        if installation_failed:
            # Parse metrics even on failure - agent may have logged before failing
            input_tokens, output_tokens, cost = self._parse_agent_metrics(
                logging_dir
            )
            return AgentResult(
                total_input_tokens=input_tokens,
                total_output_tokens=output_tokens,
                total_cost=cost,
                failure_mode=FailureMode.AGENT_INSTALLATION_FAILED,
            )

        # If installation succeeded, run the agent commands
        # Use prompt template to render instruction if provided
        rendered_instruction = self._render_instruction(instruction)
        run_agent_commands = self._run_agent_commands(rendered_instruction)
        for command in run_agent_commands:
            session.send_command(command)

        # Parse token usage and cost from agent logs
        input_tokens, output_tokens, cost = self._parse_agent_metrics(logging_dir)

        return AgentResult(
            total_input_tokens=input_tokens,
            total_output_tokens=output_tokens,
            total_cost=cost,
        )
    finally:
        self._clear_portkey_context()