Skip to content

Terminal

The terminal module manages Docker containers and tmux sessions that provide the sandboxed execution environment for agents.

Terminal

Terminal

Terminal(client_container_name, client_image_name, docker_compose_path, docker_image_name_prefix=None, sessions_logs_path=None, agent_logs_path=None, agent_model_name=None, commands_path=None, no_rebuild=False, cleanup=False, livestream=False, disable_recording=False, history_limit=None)
Source code in fceval/terminal/terminal.py
def __init__(
    self,
    client_container_name: str,
    client_image_name: str,
    docker_compose_path: Path,
    docker_image_name_prefix: str | None = None,
    sessions_logs_path: Path | None = None,
    agent_logs_path: Path | None = None,
    agent_model_name: str | None = None,
    commands_path: Path | None = None,
    no_rebuild: bool = False,
    cleanup: bool = False,
    livestream: bool = False,
    disable_recording: bool = False,
    history_limit: int | None = None,
):
    """Configure a terminal backed by a docker compose managed container.

    Stores the configuration, builds the ``DockerComposeManager`` and
    initialises the livestreamer. No container is started here:
    ``self.container`` is left as ``None``.

    Args:
        client_container_name: Name of the client container.
        client_image_name: Name of the client image.
        docker_compose_path: Path to the docker compose file.
        docker_image_name_prefix: Optional prefix handed to the compose
            manager.
        sessions_logs_path: Optional host path for session logs.
        agent_logs_path: Optional host path for agent logs.
        agent_model_name: Optional model name handed to the compose manager.
        commands_path: Optional file passed on to every tmux session.
        no_rebuild: Forwarded to the compose manager.
        cleanup: Forwarded to the compose manager.
        livestream: Stored livestream flag.
        disable_recording: Passed on to every tmux session.
        history_limit: Passed on to every tmux session.
    """
    # Bookkeeping state.
    self._logger = logger.getChild(__name__)
    self._sessions: dict[str, TmuxSession] = {}

    # Docker related configuration.
    self._client_container_name = client_container_name
    self._docker_compose_path = docker_compose_path
    self._docker_image_name_prefix = docker_image_name_prefix

    # Logging and recording configuration.
    self._sessions_logs_path = sessions_logs_path
    self._agent_logs_path = agent_logs_path
    self._commands_path = commands_path
    self._disable_recording = disable_recording
    self._history_limit = history_limit
    self._livestream = livestream

    compose_options = {
        "client_container_name": client_container_name,
        "client_image_name": client_image_name,
        "docker_image_name_prefix": docker_image_name_prefix,
        "docker_compose_path": docker_compose_path,
        "no_rebuild": no_rebuild,
        "cleanup": cleanup,
        "sessions_logs_path": sessions_logs_path,
        "agent_logs_path": agent_logs_path,
        "agent_model_name": agent_model_name,
    }
    self._compose_manager = DockerComposeManager(**compose_options)

    self._init_livestreamer()
    # No container until the compose services are started.
    self.container = None

create_session

create_session(session_name, is_active_stream=False, as_configured_user=True, recording_filename=None)

Create a new tmux session with the given name.

Source code in fceval/terminal/terminal.py
def create_session(
    self,
    session_name: str,
    is_active_stream: bool = False,
    as_configured_user: bool = True,
    recording_filename: str | None = None,
) -> TmuxSession:
    """Create, register and start a new tmux session with the given name.

    Args:
        session_name: Unique name for the session.
        is_active_stream: When true, ``set_active_stream`` is called for the
            session before it is started.
        as_configured_user: When true, run as the ``User`` from the
            container's ``Config`` attributes (empty string if absent);
            otherwise run as ``root``.
        recording_filename: Optional recording filename passed to the session.

    Returns:
        The started ``TmuxSession``.

    Raises:
        ValueError: If the container has not been started or the session
            name is already registered.
    """
    container = self.container
    if container is None:
        raise ValueError("Container not started. Run start() first.")

    if session_name in self._sessions:
        raise ValueError(f"Session {session_name} already exists")

    user = (
        container.attrs["Config"].get("User", "")
        if as_configured_user
        else "root"
    )

    new_session = TmuxSession(
        session_name=session_name,
        container=container,
        commands_path=self._commands_path,
        disable_recording=self._disable_recording,
        user=user,
        recording_filename=recording_filename,
        history_limit=self._history_limit,
    )

    # The session is registered before it is started.
    self._sessions[session_name] = new_session

    if is_active_stream:
        self.set_active_stream(session_name)

    new_session.start()
    return new_session

get_session

get_session(session_name)

Get an existing tmux session by name.

Source code in fceval/terminal/terminal.py
def get_session(self, session_name: str) -> TmuxSession:
    """Return the registered tmux session called ``session_name``.

    Raises:
        ValueError: If no session with that name is registered.
    """
    registry = self._sessions
    if session_name in registry:
        return registry[session_name]
    raise ValueError(f"Session {session_name} does not exist")

close_session

close_session(session_name)

Stop and remove a tmux session if it exists.

Source code in fceval/terminal/terminal.py
def close_session(self, session_name: str) -> None:
    """Unregister a tmux session and stop it; unknown names are ignored.

    A failure while stopping the session is logged as a warning and not
    re-raised.
    """
    removed = self._sessions.pop(session_name, None)
    if removed:
        try:
            removed.stop()
        except Exception as error:  # pragma: no cover - best effort cleanup
            self._logger.warning(
                "Failed to stop tmux session %s: %s", session_name, error
            )

start

start()
Source code in fceval/terminal/terminal.py
def start(self) -> None:
    """Start the compose manager and keep the container it returns."""
    started_container = self._compose_manager.start()
    self.container = started_container

stop

stop()
Source code in fceval/terminal/terminal.py
def stop(self) -> None:
    """Close all tmux sessions, stop the compose services and the livestreamer.

    Every teardown step runs even when an earlier one raises, so a failure
    while closing sessions or stopping the compose services cannot leave the
    livestreamer running or stale sessions registered. The first exception
    still propagates to the caller once the remaining steps have run.
    """
    try:
        # Iterate over a copy: close_session removes entries as it goes.
        for session_name in list(self._sessions):
            self.close_session(session_name)
    finally:
        try:
            self._compose_manager.stop()
        finally:
            try:
                if self._livestreamer:
                    self._livestreamer.stop()
            finally:
                self._sessions.clear()

copy_to_container

copy_to_container(paths, container_dir=None, container_filename=None)
Source code in fceval/terminal/terminal.py
def copy_to_container(
    self,
    paths: list[Path] | Path,
    container_dir: str | None = None,
    container_filename: str | None = None,
):
    self._compose_manager.copy_to_container(
        container=self.container,
        paths=paths,
        container_dir=container_dir,
        container_filename=container_filename,
    )

save_container_image

save_container_image(output_path, snapshot_s3_key=None)

Save the current container as a gzipped docker image tarball.

Source code in fceval/terminal/terminal.py
def save_container_image(
    self, output_path: Path, snapshot_s3_key: str | None = None
) -> None:
    """Save the current container as a gzipped docker image tarball."""
    self._compose_manager.save_container_image(output_path)

spin_up_terminal

spin_up_terminal(client_container_name, client_image_name, docker_compose_path, docker_image_name_prefix=None, sessions_logs_path=None, agent_logs_path=None, agent_model_name=None, commands_path=None, no_rebuild=False, cleanup=False, livestream=False, disable_recording=False)
Source code in fceval/terminal/terminal.py
@contextmanager
def spin_up_terminal(
    client_container_name: str,
    client_image_name: str,
    docker_compose_path: Path,
    docker_image_name_prefix: str | None = None,
    sessions_logs_path: Path | None = None,
    agent_logs_path: Path | None = None,
    agent_model_name: str | None = None,
    commands_path: Path | None = None,
    no_rebuild: bool = False,
    cleanup: bool = False,
    livestream: bool = False,
    disable_recording: bool = False,
    history_limit: int | None = None,
) -> Generator[Terminal, None, None]:
    """Yield a started ``Terminal`` and always stop it on exit.

    Every argument is forwarded unchanged to ``Terminal``. ``history_limit``
    is exposed here so callers of this helper can set it too; its default of
    ``None`` matches the ``Terminal`` default, so existing callers are
    unaffected.

    Yields:
        The started ``Terminal``.
    """
    terminal = Terminal(
        client_container_name=client_container_name,
        client_image_name=client_image_name,
        docker_image_name_prefix=docker_image_name_prefix,
        docker_compose_path=docker_compose_path,
        sessions_logs_path=sessions_logs_path,
        agent_logs_path=agent_logs_path,
        agent_model_name=agent_model_name,
        commands_path=commands_path,
        no_rebuild=no_rebuild,
        cleanup=cleanup,
        livestream=livestream,
        disable_recording=disable_recording,
        history_limit=history_limit,
    )

    try:
        # start() sits inside the try so a failed start still triggers stop().
        terminal.start()
        yield terminal
    finally:
        terminal.stop()

Tmux Session

TmuxSession

TmuxSession(session_name, container, commands_path=None, disable_recording=False, user='', recording_filename=None, history_limit=None)
Source code in fceval/terminal/tmux_session.py
def __init__(
    self,
    session_name: str,
    container: Container,
    commands_path: Path | None = None,
    disable_recording: bool = False,
    user: str = "",
    recording_filename: str | None = None,
    history_limit: int | None = None,
):
    """Prepare a tmux session wrapper for ``container``.

    Stores the configuration, checks that ``tmux`` (and ``asciinema`` when a
    recording path is set) can be executed in the container, and copies the
    asciinema timestamp script into the container.

    Args:
        session_name: Name of the tmux session.
        container: Container the session runs in.
        commands_path: Optional file that ``send_keys`` appends keys to.
        disable_recording: Stored recording flag.
        user: User stored for command execution.
        recording_filename: Optional override for the recording filename.
        history_limit: Tmux scrollback size in lines; a falsy value selects
            ``DEFAULT_HISTORY_LIMIT``.

    Raises:
        RuntimeError: If ``tmux`` is missing, or recording is enabled and
            ``asciinema`` is missing.
    """
    self._logger = logger.getChild(__name__)
    self.container = container
    self._user = user
    self._session_name = session_name
    self._commands_path = commands_path
    self._disable_recording = disable_recording
    # Optional override for the recording filename (e.g., per-agent cast name)
    self._recording_filename_override = recording_filename
    # Tmux scrollback buffer size (lines)
    self._history_limit = history_limit or self.DEFAULT_HISTORY_LIMIT
    self._asciinema_markers = []
    self._previous_buffer: Optional[str] = None

    tmux_probe = self._exec_run(["tmux", "-V"])
    if tmux_probe.exit_code != 0:
        raise RuntimeError(
            "tmux is not installed in the container. "
            "Please install tmux in the container to run Terminal-Bench."
        )

    if self._recording_path:
        asciinema_probe = self._exec_run(["asciinema", "--version"])
        if asciinema_probe.exit_code != 0:
            raise RuntimeError(
                "Recording is enabled but asciinema is not "
                "installed in the container. "
                "Please install asciinema in the container to run Terminal-Bench."
            )

    # The timestamp script is copied regardless of the recording setting.
    script_directory = self._GET_ASCIINEMA_TIMESTAMP_SCRIPT_CONTAINER_PATH.parent
    self.copy_to_container(
        paths=[self._GET_ASCIINEMA_TIMESTAMP_SCRIPT_HOST_PATH],
        container_dir=str(script_directory),
    )

start

start()
Source code in fceval/terminal/tmux_session.py
def start(self) -> None:
    """Start the tmux session and, if a recording path is set, the recording.

    With recording enabled, an ``asciinema rec`` command is typed into the
    session (waiting at least 3 seconds), followed by ``clear``.
    """
    self._exec_run(self._tmux_start_session)

    if not self._recording_path:
        return

    self._logger.debug("Starting recording.")

    record_keys = [
        f"asciinema rec --stdin {self._recording_path} --overwrite",
        "Enter",
    ]
    self.send_keys(keys=record_keys, min_timeout_sec=3.0)

    clear_keys = ["clear", "Enter"]
    self.send_keys(keys=clear_keys)

stop

stop()
Source code in fceval/terminal/tmux_session.py
def stop(self) -> None:
    """Stop the recording (if any) and kill the tmux session, best effort.

    Neither step raises: a failure to stop the recording is logged as a
    warning, an unexpected ``kill-session`` exit code (anything other than
    0 or 1) is logged as a warning, and an exception while killing the
    session is logged at debug level.
    """
    session_name = self._session_name

    if self._recording_path:
        try:
            self._logger.debug("Stopping recording.")
            # Ctrl-D is sent to the pane to end the recording.
            self.send_keys(keys=["C-d"], min_timeout_sec=0.1)
        except Exception as error:  # pragma: no cover - best effort cleanup
            self._logger.warning(
                "Failed to stop recording for tmux session %s: %s",
                session_name,
                error,
            )

    kill_command = ["tmux", "kill-session", "-t", session_name]
    try:
        exit_code = self._exec_run(kill_command).exit_code
        if exit_code != 0 and exit_code != 1:
            self._logger.warning(
                "Failed to kill tmux session %s, exit code %s",
                session_name,
                exit_code,
            )
    except Exception as error:  # pragma: no cover - best effort cleanup
        self._logger.debug(
            "Ignoring error while killing tmux session %s: %s",
            session_name,
            error,
        )

send_keys

send_keys(keys, block=False, min_timeout_sec=0.0, max_timeout_sec=180.0)

Execute a command in the tmux session.

Parameters:

Name Type Description Default
keys str | list[str]

The keys to send to the tmux session, given as a single string or a list of strings.

required
block bool

Whether to wait for the command to complete.

False
min_timeout_sec float

Minimum time in seconds to wait after executing. Defaults to 0.

0.0
max_timeout_sec float

Maximum time in seconds to wait for blocking commands. Defaults to 3 minutes.

180.0
Source code in fceval/terminal/tmux_session.py
def send_keys(
    self,
    keys: str | list[str],
    block: bool = False,
    min_timeout_sec: float = 0.0,
    max_timeout_sec: float = 180.0,
):
    """
    Execute a command in the tmux session.

    Args:
        keys (str): The keys to send to the tmux session.
        block (bool): Whether to wait for the command to complete.
        min_timeout_sec (float): Minimum time in seconds to wait after executing.
            Defaults to 0.
        max_timeout_sec (float): Maximum time in seconds to wait for blocking
            commands. Defaults to 3 minutes.
    """
    if block and min_timeout_sec > 0.0:
        self._logger.debug("min_timeout_sec will be ignored because block is True.")

    if self._commands_path:
        with self._commands_path.open("a") as f:
            f.write(f"{repr(keys)}\n")

    prepared_keys, is_blocking = self._prepare_keys(
        keys=keys,
        block=block,
    )

    self._logger.debug(
        f"Sending keys: {prepared_keys}"
        f" min_timeout_sec: {min_timeout_sec}"
        f" max_timeout_sec: {max_timeout_sec}"
    )

    if is_blocking:
        self._send_blocking_keys(
            keys=prepared_keys,
            max_timeout_sec=max_timeout_sec,
        )
    else:
        self._send_non_blocking_keys(
            keys=prepared_keys,
            min_timeout_sec=min_timeout_sec,
        )

send_command

send_command(command)
Source code in fceval/terminal/tmux_session.py
def send_command(self, command: TerminalCommand) -> None:
    """Send a ``TerminalCommand`` to the session through ``send_keys``.

    ``"Enter"` is appended after the command text when
    ``command.append_enter`` is set; blocking and timeout settings are taken
    from the command.
    """
    keys = [command.command]
    if command.append_enter:
        keys.append("Enter")

    self.send_keys(
        keys=keys,
        block=command.block,
        min_timeout_sec=command.min_timeout_sec,
        max_timeout_sec=command.max_timeout_sec,
    )

get_asciinema_timestamp

get_asciinema_timestamp()
Source code in fceval/terminal/tmux_session.py
def get_asciinema_timestamp(self) -> float:
    """Return the timestamp reported by the asciinema timestamp script.

    Returns 0.0 when there is no recording path. Otherwise the script is run
    with ``bash`` inside the container and its output is parsed as a float.
    """
    recording_path = self._recording_path
    if recording_path is None:
        return 0.0  # Return 0 when recording is disabled

    script_path = str(self._GET_ASCIINEMA_TIMESTAMP_SCRIPT_CONTAINER_PATH)
    command = ["bash", script_path, str(recording_path)]
    raw_output = self._exec_run(command).output
    return float(raw_output.decode())

copy_to_container

copy_to_container(paths, container_dir=None, container_filename=None)
Source code in fceval/terminal/tmux_session.py
def copy_to_container(
    self,
    paths: list[Path] | Path,
    container_dir: str | None = None,
    container_filename: str | None = None,
):
    """Copy host paths into this session's container.

    Delegates to ``DockerComposeManager.copy_to_container`` with
    ``self.container`` as the target.
    """
    copy_arguments = {
        "container": self.container,
        "paths": paths,
        "container_dir": container_dir,
        "container_filename": container_filename,
    }
    DockerComposeManager.copy_to_container(**copy_arguments)

Terminal Command

TerminalCommand

Bases: BaseModel

command instance-attribute

command

min_timeout_sec class-attribute instance-attribute

min_timeout_sec = 0.0

max_timeout_sec class-attribute instance-attribute

max_timeout_sec = 180.0

block class-attribute instance-attribute

block = False

append_enter class-attribute instance-attribute

append_enter = True

from_yaml_list classmethod

from_yaml_list(path)

Load a list of terminal commands from a YAML file.

Source code in fceval/terminal/models.py
@classmethod
def from_yaml_list(cls, path: Path) -> list["TerminalCommand"]:
    """Load a list of terminal commands from a YAML file.

    Args:
        path: YAML file whose top-level value is a list of command mappings.

    Returns:
        One validated instance per entry; an empty list when the file
        contains no YAML document.
    """
    data = yaml.safe_load(path.read_text())
    # safe_load returns None for an empty file; treat that as "no commands"
    # rather than failing with a TypeError when iterating.
    if data is None:
        return []
    return [cls.model_validate(obj) for obj in data]

Docker Compose Manager

DockerComposeManager

DockerComposeManager(client_container_name, client_image_name, docker_compose_path, docker_image_name_prefix=None, no_rebuild=False, cleanup=False, sessions_logs_path=None, agent_logs_path=None, agent_model_name=None)
Source code in fceval/terminal/docker_compose_manager.py
def __init__(
    self,
    client_container_name: str,
    client_image_name: str,
    docker_compose_path: Path,
    docker_image_name_prefix: str | None = None,
    no_rebuild: bool = False,
    cleanup: bool = False,
    sessions_logs_path: Path | None = None,
    agent_logs_path: Path | None = None,
    agent_model_name: str | None = None,
):
    """Create a docker client and the environment for docker compose.

    Args:
        client_container_name: Name of the client container; also used as
            the compose project name.
        client_image_name: Name of the client image.
        docker_compose_path: Path to the docker compose file.
        docker_image_name_prefix: Optional name prefix exported to the
            compose environment.
        no_rebuild: When true, ``start()`` skips the build step.
        cleanup: When true, ``stop()`` also removes images, volumes and the
            build cache.
        sessions_logs_path: Optional host path for session logs.
        agent_logs_path: Optional host path for agent logs.
        agent_model_name: Optional model name exported to the compose
            environment.

    Raises:
        RuntimeError: If the docker client cannot be created.
    """
    try:
        self._client = docker.from_env()
    except docker.errors.DockerException as e:
        # Build a single message string (the two parts used to be passed as
        # a tuple) and chain the original error for debugging.
        raise RuntimeError(
            f"Error creating docker client: {e}. "
            "Please ensure that Docker is installed and running."
        ) from e

    self._client_container_name = client_container_name
    self._client_image_name = client_image_name
    self._docker_name_prefix = docker_image_name_prefix
    self._docker_compose_path = docker_compose_path
    self._no_rebuild = no_rebuild
    self._cleanup = cleanup
    self._client_container: Container | None = None
    self._sessions_logs_path = sessions_logs_path
    self._agent_logs_path = agent_logs_path
    self._logger = logger.getChild(__name__)

    self.env = DockerComposeEnvVars(
        task_docker_client_image_name=self._client_image_name,
        task_docker_client_container_name=self._client_container_name,
        task_docker_name_prefix=self._docker_name_prefix,
        container_logs_path=self.CONTAINER_SESSION_LOGS_PATH,
        container_agent_logs_path=self.CONTAINER_AGENT_LOGS_PATH,
        test_dir=str(self.CONTAINER_TEST_DIR),
        task_logs_path=(
            str(self._sessions_logs_path.absolute())
            if self._sessions_logs_path is not None
            else None
        ),
        task_agent_logs_path=(
            str(self._agent_logs_path.absolute())
            if self._agent_logs_path is not None
            else None
        ),
        agent_model_name=agent_model_name,
    ).to_env_dict(include_os_env=True)

    # Add benchmark tuning environment variables for single-threaded
    # library operation. These ensure consistent performance by
    # preventing automatic parallelization
    benchmark_tuning_vars = {
        "OPENBLAS_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "OMP_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
        "VECLIB_MAXIMUM_THREADS": "1",
    }

    # Merge benchmark tuning vars (allow user overrides from existing env)
    for key, value in benchmark_tuning_vars.items():
        if key not in self.env:
            self.env[key] = value

CONTAINER_SESSION_LOGS_PATH class-attribute instance-attribute

CONTAINER_SESSION_LOGS_PATH = '/logs'

CONTAINER_AGENT_LOGS_PATH class-attribute instance-attribute

CONTAINER_AGENT_LOGS_PATH = '/agent-logs'

CONTAINER_TEST_DIR class-attribute instance-attribute

CONTAINER_TEST_DIR = Path('/tests')

env instance-attribute

env = to_env_dict(include_os_env=True)

get_docker_compose_command

get_docker_compose_command(command)
Source code in fceval/terminal/docker_compose_manager.py
def get_docker_compose_command(self, command: list[str]) -> list[str]:
    """Return the full ``docker compose`` argv for ``command``.

    The client container name is used as the project name (``-p``) and the
    resolved compose file path is passed with ``-f``.
    """
    compose_file = str(self._docker_compose_path.resolve().absolute())
    prefix = [
        "docker",
        "compose",
        "-p",
        self._client_container_name,
        "-f",
        compose_file,
    ]
    return prefix + list(command)

start

start()
Source code in fceval/terminal/docker_compose_manager.py
def start(self) -> Container:
    """Build (unless ``no_rebuild`` is set) and start the compose services.

    Returns:
        The client container, looked up by name after ``up -d``; it is also
        stored on the manager.
    """
    rebuild = not self._no_rebuild
    if rebuild:
        self._run_docker_compose_command(["build"])

    self._run_docker_compose_command(["up", "-d"])

    client_container = self._client.containers.get(self._client_container_name)
    self._client_container = client_container
    return client_container

stop

stop()

Stop and remove the docker compose services.

Source code in fceval/terminal/docker_compose_manager.py
def stop(self) -> None:
    """Bring the compose services down, with extra cleanup when enabled.

    With ``cleanup`` set, a second ``down`` removes images and volumes and
    the build cache is cleaned. Any exception is logged and not re-raised.
    """
    try:
        self._run_docker_compose_command(["down"])

        if not self._cleanup:
            return

        remove_everything = ["down", "--rmi", "all", "--volumes"]
        self._run_docker_compose_command(remove_everything)
        self._cleanup_build_cache()
    except Exception as e:
        self._logger.error(f"Error cleaning up docker compose services: {e}")

build

build()

Build the docker compose services.

Source code in fceval/terminal/docker_compose_manager.py
def build(self) -> None:
    """Run ``docker compose build`` for the configured compose file."""
    build_arguments = ["build"]
    self._run_docker_compose_command(build_arguments)

save_container_image

save_container_image(output_path)

Commit the running container and save it as a gzipped image tarball.

Source code in fceval/terminal/docker_compose_manager.py
def save_container_image(self, output_path: Path) -> None:
    """Commit the running container and save it as a gzipped image tarball.

    The parent directory of ``output_path`` is created if needed. The
    committed snapshot image is removed afterwards whether or not saving
    succeeded; a failure to remove it is only logged.

    Raises:
        RuntimeError: If the container has not been started.
    """
    container = self._client_container
    if container is None:
        raise RuntimeError("Container not started. Run start() first.")

    output_path.parent.mkdir(parents=True, exist_ok=True)

    snapshot_id = container.commit().id

    try:
        self._save_image_to_gzip(snapshot_id, output_path)
    finally:
        try:
            self._client.images.remove(snapshot_id, force=True)
        except Exception as error:  # pragma: no cover - best effort cleanup
            self._logger.warning(
                "Failed to remove snapshot image %s: %s", snapshot_id, error
            )

copy_to_container staticmethod

copy_to_container(container, paths, container_dir=None, container_filename=None)

Copy files or directories to a running docker container.

Source code in fceval/terminal/docker_compose_manager.py
@staticmethod
def copy_to_container(
    container: Container,
    paths: list[Path] | Path,
    container_dir: str | None = None,
    container_filename: str | None = None,
) -> None:
    """Copy files or directories to a running docker container.

    Args:
        container: Target container.
        paths: A single path or a list of paths to copy.
        container_dir: Destination directory; defaults to the ``WorkingDir``
            from the container's ``Config`` attributes.
        container_filename: Optional filename forwarded to the tar archive
            builder.

    Raises:
        ValueError: If no destination directory is given and the container
            reports no working directory.
    """
    container_dir = container_dir or (
        container.attrs.get("Config", {}).get("WorkingDir")
        if container.attrs
        else None
    )

    # The reported WorkingDir may be an empty string rather than missing,
    # so reject any falsy value instead of only None.
    if not container_dir:
        raise ValueError("Container working directory not found")

    if isinstance(paths, Path):
        paths = [paths]

    # Pass argv as a list so a directory containing spaces is not split
    # into several arguments.
    container.exec_run(["mkdir", "-p", container_dir])

    tar_stream = DockerComposeManager._create_tar_archive(paths, container_filename)
    try:
        container.put_archive(container_dir, tar_stream.read())
    finally:
        # Release the archive buffer even when the upload fails.
        tar_stream.close()

copy_to_client_container

copy_to_client_container(paths, container_dir=None, container_filename=None)

Copy files or directories to the client container.

Source code in fceval/terminal/docker_compose_manager.py
def copy_to_client_container(
    self,
    paths: list[Path] | Path,
    container_dir: str | None = None,
    container_filename: str | None = None,
) -> None:
    """Copy files or directories to the client container."""

    if self._client_container is None:
        raise ValueError("Client container not started")

    return self.copy_to_container(
        container=self._client_container,
        paths=paths,
        container_dir=container_dir,
        container_filename=container_filename,
    )