Skip to content

Harness

The harness is the core orchestrator of FC-Eval. It manages the full lifecycle of a benchmark run: loading datasets, spinning up containers, running agents, executing tests, parsing results, and aggregating metrics.

Harness

Harness

Harness(output_path, run_id, agent_name=None, agent_import_path=None, agent_configs=None, dataset_name=None, dataset_version=None, dataset_path=None, dataset_config=None, registry_url=None, local_registry_path=None, model_name=None, agent_kwargs=None, no_rebuild=False, cleanup=False, log_level=INFO, task_ids=None, n_tasks=None, livestream=False, upload_results=False, n_concurrent_trials=4, exclude_task_ids=None, n_attempts=1, global_timeout_multiplier=1.0, global_agent_timeout_sec=None, global_test_timeout_sec=None, global_setup_timeout_sec=None, remote_build=False, ec2_instance_type='c6id.xlarge', ec2_instance_ami=None, ec2_use_nvme_storage=True, ec2_root_volume_size=50, history_limit=None)

Runs the FC-Eval harness.

Parameters:

Name Type Description Default
output_path Path

The path to the output directory. Results and logs will be written here.

required
run_id str

The ID of the run.

required
agent_name AgentName | None

The name of the agent to use to generate commands.

None
agent_import_path str | None

The import path of the agent.

None
agent_configs list[tuple[AgentName, str | None] | MultiAgentConfig | dict[str, Any]] | None

List of per-agent configs for multi-agent mode.

None
dataset_name str | None

The name of the dataset to use.

None
dataset_version str | None

The version of the dataset to use.

None
dataset_path Path | None

The path to the dataset.

None
dataset_config Path | None

Path to a YAML configuration file for the dataset. If provided, this will override dataset_name, dataset_version, dataset_path, task_ids, n_tasks, and exclude_task_ids.

None
registry_url str | None

The URL of the registry to use for the dataset.

None
local_registry_path Path | None

The path to the local registry file to use for the dataset. If provided, will use the local registry instead of the remote registry.

None
model_name str | None

The name of the model to use for the agent.

None
agent_kwargs dict[str, Any] | None

Additional keyword arguments to pass to the agent.

None
no_rebuild bool

Whether to avoid rebuilding the Docker image.

False
cleanup bool

Whether to remove the Docker image after the run.

False
log_level int

The logging level to use (debug, info, warning, error, critical).

INFO
task_ids list[str] | None

The IDs of the tasks to run. If None, all tasks will be run.

None
n_tasks int | None

The number of tasks to run. If None, all tasks will be run.

None
livestream bool

Whether to livestream the log file. Turns off all other logging to the console.

False
upload_results bool

Whether to upload results to S3. The bucket name is read from config.

False
n_concurrent_trials int

Maximum number of tasks to run concurrently.

4
exclude_task_ids list[str] | None

List of task IDs or regex patterns to exclude from the run. Supports both exact matches and regex patterns.

None
n_attempts int

Number of attempts to make for each task.

1
global_timeout_multiplier float

Multiplier for the global timeout for agent runs.

1.0
global_agent_timeout_sec float | None

Global timeout for agent runs in seconds.

None
global_test_timeout_sec float | None

Global timeout for test runs in seconds.

None
global_setup_timeout_sec float | None

Global timeout for setup runs in seconds.

None
remote_build bool

Whether to run containers on AWS EC2 instances instead of locally.

False
ec2_instance_type str

EC2 instance type for remote builds.

'c6id.xlarge'
ec2_instance_ami str | None

EC2 AMI ID for remote builds.

None
ec2_use_nvme_storage bool

Whether to enable NVMe-backed Docker data-root optimization for remote EC2 builds.

True
ec2_root_volume_size int

Root EBS volume size in GB for remote EC2 builds.

50
history_limit int | None

Optional tmux scrollback history limit (lines).

None
Source code in fceval/harness/harness.py
def __init__(
    self,
    output_path: Path,
    run_id: str,
    agent_name: AgentName | None = None,
    agent_import_path: str | None = None,
    agent_configs: (
        list[tuple[AgentName, str | None] | MultiAgentConfig | dict[str, Any]]
        | None
    ) = None,
    dataset_name: str | None = None,
    dataset_version: str | None = None,
    dataset_path: Path | None = None,
    dataset_config: Path | None = None,
    registry_url: str | None = None,
    local_registry_path: Path | None = None,
    model_name: str | None = None,
    agent_kwargs: dict[str, Any] | None = None,
    no_rebuild: bool = False,
    cleanup: bool = False,
    log_level: int = logging.INFO,
    task_ids: list[str] | None = None,
    n_tasks: int | None = None,
    livestream: bool = False,
    upload_results: bool = False,
    n_concurrent_trials: int = 4,
    exclude_task_ids: list[str] | None = None,
    n_attempts: int = 1,
    global_timeout_multiplier: float = 1.0,
    global_agent_timeout_sec: float | None = None,
    global_test_timeout_sec: float | None = None,
    global_setup_timeout_sec: float | None = None,
    remote_build: bool = False,
    ec2_instance_type: str = "c6id.xlarge",
    ec2_instance_ami: str | None = None,
    ec2_use_nvme_storage: bool = True,
    ec2_root_volume_size: int = 50,
    history_limit: int | None = None,
) -> None:
    """
    Runs the FC-Eval harness.

    Args:
        output_path: The path to the output directory. Results and logs will be
            written here.
        run_id: The ID of the run.
        agent_name: The name of the agent to use to generate commands.
        agent_import_path: The import path of the agent.
        agent_configs: List of per-agent configs for multi-agent mode. Mutually
            exclusive with agent_name/agent_import_path.
        dataset_name: The name of the dataset to use.
        dataset_version: The version of the dataset to use.
        dataset_path: The path to the dataset.
        dataset_config: Path to a YAML configuration file for the dataset.
            If provided, this will override dataset_name, dataset_version,
            dataset_path, task_ids, n_tasks, and exclude_task_ids.
        registry_url: The URL of the registry to use for the dataset.
        local_registry_path: The path to the local registry file to use for the
            dataset. If provided, will use the local registry instead of the remote
            registry.
        model_name: The name of the model to use for the agent.
        agent_kwargs: Additional keyword arguments to pass to the agent.
        no_rebuild: Whether to avoid rebuilding the Docker image.
        cleanup: Whether to remove the Docker image after the run.
        log_level: The logging level to use (debug, info, warning, error, critical).
        task_ids: The IDs of the tasks to run. If None, all tasks will be run.
        n_tasks: The number of tasks to run. If None, all tasks will be run.
        livestream: Whether to livestream the log file. Turns off all other logging
            to the console. Only honored when at most one trial runs at a time.
        upload_results: Whether to upload results to S3.
            The bucket name is read from config.
        n_concurrent_trials: Maximum number of tasks to run concurrently.
        exclude_task_ids: List of task IDs or regex patterns to exclude from the
            run. Supports both exact matches and regex patterns.
        n_attempts: Number of attempts to make for each task.
        global_timeout_multiplier: Multiplier for the global timeout for agent runs.
        global_agent_timeout_sec: Global timeout for agent runs in seconds.
        global_test_timeout_sec: Global timeout for test runs in seconds.
        global_setup_timeout_sec: Global timeout for setup runs in seconds.
        remote_build: Whether to run containers on AWS EC2 instances instead
            of locally.
        ec2_instance_type: EC2 instance type for remote builds.
        ec2_instance_ami: EC2 AMI ID for remote builds.
        ec2_use_nvme_storage: Whether to enable NVMe-backed Docker data-root
            optimization for remote EC2 builds.
        ec2_root_volume_size: Root EBS volume size in GB for remote EC2 builds.
        history_limit: Optional tmux scrollback history limit (lines).

    Raises:
        ValueError: If agent_configs is combined with single-agent parameters,
            or if agent_configs is an empty list.
    """
    # Run bookkeeping. The UUID is left unset here (None until assigned
    # elsewhere in the run lifecycle); the start time is a timezone-aware
    # UTC timestamp in ISO-8601 form.
    self._run_uuid = None
    self._start_time = datetime.now(timezone.utc).isoformat()

    # Dataset selection inputs (resolved later by _init_dataset()).
    self._dataset_path = dataset_path
    self._dataset_name = dataset_name
    self._dataset_version = dataset_version
    self._dataset_config = dataset_config
    self._n_tasks = n_tasks
    self._task_ids = task_ids
    self._exclude_task_ids = exclude_task_ids
    self._registry_url = registry_url
    self._local_registry_path = local_registry_path

    # Agent configuration. agent_configs entries may be tuples, dicts, or
    # MultiAgentConfig objects; normalize them into one canonical form.
    self._output_path = output_path
    self._agent_name = agent_name
    self._agent_import_path = agent_import_path
    self._agent_configs = self._normalize_multi_agent_configs(agent_configs)
    # The Oracle agent has a fixed model label regardless of model_name.
    self._model_name = model_name if agent_name != AgentName.ORACLE else "Oracle"
    self._agent_kwargs = agent_kwargs or {}

    # Validate agent configuration: multi-agent mode is mutually exclusive
    # with the single-agent parameters, and must name at least one agent.
    if self._agent_configs is not None:
        if agent_name is not None or agent_import_path is not None:
            raise ValueError(
                "Cannot specify both agent_configs and single agent parameters"
            )
        if len(self._agent_configs) == 0:
            raise ValueError("agent_configs cannot be empty")
        self._is_multi_agent = True
    else:
        self._is_multi_agent = False
    self._run_id = run_id
    self._no_rebuild = no_rebuild
    self._cleanup = cleanup
    self._log_level = log_level
    self._upload_results = upload_results
    # S3 bucket names come from the global fceval config, not from arguments.
    self._s3_bucket = fceval_config.s3_bucket_name
    self._evaluation_snapshots_bucket = (
        fceval_config.s3_evaluation_snapshots_bucket_name
    )
    self._n_concurrent_trials = n_concurrent_trials
    self._n_attempts = n_attempts
    self._global_timeout_multiplier = global_timeout_multiplier
    self._global_agent_timeout_sec = global_agent_timeout_sec
    self._global_test_timeout_sec = global_test_timeout_sec
    self._global_setup_timeout_sec = global_setup_timeout_sec

    # EC2 configuration
    self._remote_build = remote_build
    self._ec2_instance_type = ec2_instance_type
    self._ec2_instance_ami = ec2_instance_ami
    self._ec2_use_nvme_storage = ec2_use_nvme_storage
    self._ec2_root_volume_size = ec2_root_volume_size

    # Terminal configuration
    self._history_limit = history_limit

    # Check if we're resuming from an existing run: an already-existing run
    # directory is treated as a prior run to resume.
    self._is_resuming = self._run_path.exists()

    if not self._is_resuming:
        self._run_path.mkdir(parents=True, exist_ok=True)

    # Dataset must be initialized before the livestream check below, which
    # reads len(self._dataset).
    self._init_dataset()
    self._init_agent_class()

    # Livestreaming is only kept when at most one trial runs at a time
    # (sequential execution, or a single-task dataset).
    self._livestream = livestream and (
        self._n_concurrent_trials == 1 or len(self._dataset) == 1
    )

    if self._is_resuming:
        self._validate_resume_configuration()

    self._init_logger()
    self._dataset.sort_by_duration()

    if self._is_resuming:
        self._filter_completed_and_cleanup_incomplete_tasks()

    # NOTE(review): this warning fires whenever livestream was requested but
    # disabled; the message only mentions n_concurrent_trials, though a
    # single-task dataset would also have kept livestreaming on — confirm
    # whether the message should cover both conditions.
    if self._livestream != livestream:
        self._logger.warning(
            "Livestreaming has been turned off because n_concurrent_trials > 1."
        )

run

run()

Run the harness.

Returns:

Name Type Description
BenchmarkResults BenchmarkResults

The results of the harness run.

Source code in fceval/harness/harness.py
def run(self) -> BenchmarkResults:
    """Execute the benchmark run and return its aggregated results.

    On a fresh run, metadata and a lock file are written before task
    execution and the metadata is finalized afterwards; a resumed run
    skips both steps. Results are uploaded at the end either way.

    Returns:
        BenchmarkResults: The results of the harness run.
    """
    logger.info("Starting harness run")
    logger.info(f"Run ID: {self._run_id}")

    is_fresh_run = not self._is_resuming

    # A resumed run already has its metadata and lock files on disk.
    if is_fresh_run:
        self._write_run_metadata()
        self._create_run_lock()

    results = self._execute_tasks()

    # Finalize metadata only for runs we created ourselves.
    if is_fresh_run:
        self._update_metadata_on_end(results=results)

    self._handle_results_upload(results)

    return results

Results Models

BenchmarkResults

Bases: BaseModel

id class-attribute instance-attribute

id = Field(default_factory=uuid4)

results class-attribute instance-attribute

results = []

pass_at_k property

pass_at_k

Calculate pass@k metrics for different k values.

Uses the standard pass@k estimator per task and averages across tasks that have at least k attempts.

n_resolved property

n_resolved

n_unresolved property

n_unresolved

resolved_ids property

resolved_ids

unresolved_ids property

unresolved_ids

accuracy property

accuracy

mean_speedup_percentage property

mean_speedup_percentage

Mean of all speedup percentage metrics across trials (if any).

agent_advantages_gmean property

agent_advantages_gmean

Geometric mean of agent advantages per level for each non-nop agent.

Aggregates across all trials using geometric mean on speedup ratios.

Returns:

Type Description
dict[str, dict[str, float]]

{
    "agent_name": {
        "agent_advantage": gmean_value,
        "agent_advantage_level1": gmean_value,
        "agent_advantage_level2": gmean_value,
        "agent_advantage_level3": gmean_value,
        "agent_advantage_level4": gmean_value,
    }
}

agent_metrics_summary property

agent_metrics_summary

Aggregate per-agent task metrics across the run.

total_cost property

total_cost

Sum of total_cost across all trials.

mean_cost_per_task property

mean_cost_per_task

Mean cost per task across all trials.

cost_by_agent_model property

cost_by_agent_model

Aggregate costs per agent:model pair across all trials.

TrialResults

Bases: BaseModel

id class-attribute instance-attribute

id = Field(default_factory=uuid4)

trial_name instance-attribute

trial_name

task_id instance-attribute

task_id

instruction instance-attribute

instruction

is_resolved class-attribute instance-attribute

is_resolved = None

failure_mode class-attribute instance-attribute

failure_mode = UNSET

parser_results class-attribute instance-attribute

parser_results = None

parser_extra_metrics class-attribute instance-attribute

parser_extra_metrics = None

recording_path class-attribute instance-attribute

recording_path = None

evaluation_snapshot_bucket_name class-attribute instance-attribute

evaluation_snapshot_bucket_name = None

evaluation_snapshot_s3_keys class-attribute instance-attribute

evaluation_snapshot_s3_keys = Field(default_factory=dict)

total_input_tokens class-attribute instance-attribute

total_input_tokens = None

total_output_tokens class-attribute instance-attribute

total_output_tokens = None

total_cost class-attribute instance-attribute

total_cost = None

costs_by_agent_model class-attribute instance-attribute

costs_by_agent_model = None

trial_started_at class-attribute instance-attribute

trial_started_at = None

trial_ended_at class-attribute instance-attribute

trial_ended_at = None

agent_started_at class-attribute instance-attribute

agent_started_at = None

agent_ended_at class-attribute instance-attribute

agent_ended_at = None

test_started_at class-attribute instance-attribute

test_started_at = None

test_ended_at class-attribute instance-attribute

test_ended_at = None

setup_started_at class-attribute instance-attribute

setup_started_at = None

setup_ended_at class-attribute instance-attribute

setup_ended_at = None

RunMetadata

Bases: BaseModel

run_id instance-attribute

run_id

uuid class-attribute instance-attribute

uuid = Field(default_factory=lambda: str(uuid4()))

dataset_path class-attribute instance-attribute

dataset_path = None

dataset_name class-attribute instance-attribute

dataset_name = None

dataset_version class-attribute instance-attribute

dataset_version = None

output_path instance-attribute

output_path

agent_name instance-attribute

agent_name

no_rebuild instance-attribute

no_rebuild

cleanup instance-attribute

cleanup

log_level instance-attribute

log_level

task_ids class-attribute instance-attribute

task_ids = None

exclude_task_ids class-attribute instance-attribute

exclude_task_ids = None

n_tasks class-attribute instance-attribute

n_tasks = None

n_concurrent_trials class-attribute instance-attribute

n_concurrent_trials = 4

n_attempts class-attribute instance-attribute

n_attempts = 1

dataset_size class-attribute instance-attribute

dataset_size = 0

accuracy class-attribute instance-attribute

accuracy = None

model_name class-attribute instance-attribute

model_name = None

start_time class-attribute instance-attribute

start_time = None

end_time class-attribute instance-attribute

end_time = None

commit_hash class-attribute instance-attribute

commit_hash = 'unknown'

username class-attribute instance-attribute

username = 'unknown'

s3_bucket class-attribute instance-attribute

s3_bucket = None

agent_kwargs class-attribute instance-attribute

agent_kwargs = None

pass_at_k class-attribute instance-attribute

pass_at_k = None

resumed_at class-attribute instance-attribute

resumed_at = None