Skip to content

Dataset

The dataset module handles loading, filtering, and iterating over task directories.

Dataset

Dataset

Dataset(name=None, version=None, path=None, task_ids=None, n_tasks=None, exclude_task_ids=None, registry_url=None, local_registry_path=None)

A class for loading and iterating over tasks in a dataset.

Initialize the dataset.

Parameters:

Name Type Description Default
name str | None

Name of the dataset to cache from the fc-eval server

None
version str | None

Version of the dataset to cache

None
path Path | None

Path to the dataset directory

None
task_ids list[str] | None

Optional list of specific task IDs or glob patterns to load.

None
n_tasks int | None

Optional number of tasks to load

None
exclude_task_ids list[str] | None

Optional list of task IDs or glob patterns to exclude.

None
registry_url str | None

The URL of the registry to use for the dataset.

None
local_registry_path Path | None

The path to the local registry file to use for the dataset. If provided, will use the local registry instead of the remote registry.

None
Source code in fceval/dataset/dataset.py
def __init__(
    self,
    name: str | None = None,
    version: str | None = None,
    path: Path | None = None,
    task_ids: list[str] | None = None,
    n_tasks: int | None = None,
    exclude_task_ids: list[str] | None = None,
    registry_url: str | None = None,
    local_registry_path: Path | None = None,
):
    """Initialize the dataset.

    Args:
        name: Name of the dataset to cache from fc-eval server
        version: Version of the dataset to cache
        path: Path to the dataset directory
        task_ids: Optional list of specific task IDs or glob patterns to load.
        n_tasks: Optional number of tasks to load
        exclude_task_ids: Optional list of task IDs or glob patterns to exclude.
        registry_url: The URL of the registry to use for the dataset.
        local_registry_path: The path to the local registry file to use for the
            dataset. If provided, will use the local registry instead of the remote
            registry.
    """
    # Option validation (path vs. name/version, task_ids vs. n_tasks) is
    # enforced by DatasetConfig's model validator.
    self.config = DatasetConfig(
        name=name,
        version=version,
        path=path,
        task_ids=task_ids,
        n_tasks=n_tasks,
        exclude_task_ids=[] if exclude_task_ids is None else exclude_task_ids,
        registry_url=registry_url,
        local_registry_path=local_registry_path,
    )

    self._logger = logger.getChild(__name__)
    # Resolved task directories; populated by _init_dataset below.
    self._tasks: list[Path] = []

    # Fetch the dataset if needed, then resolve and sanity-check task paths.
    self._maybe_cache_dataset()
    self._init_dataset()
    self._validate_task_paths()

config instance-attribute

config = DatasetConfig(name=name, version=version, path=path, task_ids=task_ids, n_tasks=n_tasks, exclude_task_ids=exclude_task_ids or [], registry_url=registry_url, local_registry_path=local_registry_path)

tasks property

tasks

Get the list of tasks in the dataset.

task_ids property

task_ids

Get the list of task IDs in the dataset.

from_config classmethod

from_config(config)

Create a Dataset instance from a DatasetConfig.

Parameters:

Name Type Description Default
config DatasetConfig

DatasetConfig instance

required

Returns:

Type Description
Dataset

A new Dataset instance

Source code in fceval/dataset/dataset.py
@classmethod
def from_config(cls, config: DatasetConfig) -> "Dataset":
    """Create a Dataset instance from a DatasetConfig.

    Args:
        config: DatasetConfig instance

    Returns:
        A new Dataset instance
    """
    return cls(
        name=config.name,
        version=config.version,
        path=config.path,
        task_ids=config.task_ids,
        n_tasks=config.n_tasks,
        exclude_task_ids=config.exclude_task_ids,
        # Forward the registry settings too; previously these were dropped,
        # so a config selecting a custom or local registry silently fell
        # back to the defaults.
        registry_url=config.registry_url,
        local_registry_path=config.local_registry_path,
    )

from_yaml classmethod

from_yaml(yaml_path)

Create a Dataset instance from a YAML configuration file.

Parameters:

Name Type Description Default
yaml_path Path | str

Path to the YAML configuration file

required

Returns:

Type Description
Dataset

A new Dataset instance

Source code in fceval/dataset/dataset.py
@classmethod
def from_yaml(cls, yaml_path: Path | str) -> "Dataset":
    """Create a Dataset instance from a YAML configuration file.

    Args:
        yaml_path: Path to the YAML configuration file

    Returns:
        A new Dataset instance
    """
    raw = Path(yaml_path).read_text()
    config = DatasetConfig(**yaml.safe_load(raw))
    return cls.from_config(config)

__iter__

__iter__()

Iterate over the tasks in the dataset.

Source code in fceval/dataset/dataset.py
def __iter__(self) -> Iterator[Path]:
    """Yield each task directory in the dataset, in order."""
    yield from self._tasks

__len__

__len__()

Get the number of tasks in the dataset.

Source code in fceval/dataset/dataset.py
def __len__(self) -> int:
    """Return the number of tasks in the dataset."""
    task_count = len(self._tasks)
    return task_count

sort_by_duration

sort_by_duration()

Sort tasks by estimated duration (longest first) for optimal concurrent execution.

Source code in fceval/dataset/dataset.py
def sort_by_duration(self) -> None:
    """
    Sort tasks by estimated duration (longest first) for optimal concurrent
    execution.

    Tasks whose config fails to load fall back to a fixed duration estimate
    so a single broken task cannot abort the sort.
    """
    # Duration assumed for tasks whose config cannot be loaded.
    fallback_duration_sec = 210.0

    # Load each task config exactly once, recording (path, duration, source).
    # The original implementation re-parsed every task.yaml a second time
    # just to label the duration's source in the log table.
    task_infos: list[tuple[Path, float, str]] = []
    for task_path in self._tasks:
        try:
            task_paths_obj = TaskPaths(task_path)
            task = Task.from_yaml(task_paths_obj.task_config_path)
            duration = task.effective_estimated_duration_sec
            if task.estimated_duration_sec is not None:
                source = "historical"
            else:
                source = "calculated"
        except Exception as e:
            self._logger.warning(
                f"Failed to load task {task_path.name}: {e}. Using fallback "
                "duration."
            )
            duration = fallback_duration_sec
            source = "fallback"
        task_infos.append((task_path, duration, source))

    # Longest first so the slowest work starts as early as possible.
    task_infos.sort(key=lambda info: info[1], reverse=True)
    self._tasks = [task_path for task_path, _, _ in task_infos]

    # Log the task execution order
    table_data = []
    for i, (task_path, duration, source) in enumerate(task_infos, 1):
        minutes = int(duration // 60)
        seconds = int(duration % 60)
        duration_str = f"{minutes}m {seconds}s"
        table_data.append([i, task_path.name, duration_str, source])

    headers = ["#", "Task Name", "Duration", "Source"]
    table = tabulate(table_data, headers=headers, tablefmt="grid")

    self._logger.debug("Processing tasks in duration order (longest first):")
    self._logger.debug(table)
    self._logger.debug(f"Total tasks: {len(task_infos)}")

DatasetConfig

Bases: BaseModel

Configuration for loading a dataset.

name class-attribute instance-attribute

name = None

version class-attribute instance-attribute

version = None

registry_url class-attribute instance-attribute

registry_url = None

local_registry_path class-attribute instance-attribute

local_registry_path = None

path class-attribute instance-attribute

path = None

task_ids class-attribute instance-attribute

task_ids = None

n_tasks class-attribute instance-attribute

n_tasks = None

exclude_task_ids class-attribute instance-attribute

exclude_task_ids = Field(default_factory=list)

validate_config

validate_config()
Source code in fceval/dataset/dataset.py
@model_validator(mode="after")
def validate_config(self) -> Self:
    """Enforce mutually-exclusive dataset-loading options.

    Raises:
        ValueError: If both task_ids and n_tasks are set, or if path and
            name/version are combined inconsistently.
    """
    if self.task_ids is not None and self.n_tasks is not None:
        raise ValueError("Cannot specify both task_ids and n_tasks")

    has_path = self.path is not None
    has_name = self.name is not None
    has_version = self.version is not None

    # Exactly one addressing scheme is allowed: either an explicit path,
    # or a (name, version) pair resolved through the registry.
    if not has_path and not (has_version and has_name):
        raise ValueError("If path is not set, both version and name must be set")
    if has_path and (has_version or has_name):
        raise ValueError("If path is set, version and name should not be set")

    return self

Dataset Metadata

DatasetMetadata

Bases: BaseModel

Metadata for a registered dataset.

name instance-attribute

name

version instance-attribute

version

created_at instance-attribute

created_at

updated_at instance-attribute

updated_at

fc_eval_version instance-attribute

fc_eval_version

source_url class-attribute instance-attribute

source_url = None

from_yaml classmethod

from_yaml(yaml_path)
Source code in fceval/dataset/dataset_metadata.py
@classmethod
def from_yaml(cls, yaml_path: Path) -> "DatasetMetadata":
    """Load and validate dataset metadata from a YAML file."""
    raw = yaml.safe_load(yaml_path.read_text())
    return cls.model_validate(raw)

to_yaml

to_yaml(yaml_path)
Source code in fceval/dataset/dataset_metadata.py
def to_yaml(self, yaml_path: Path) -> None:
    """Write this metadata to *yaml_path* as YAML, omitting None fields."""
    # Round-trip through JSON so every value is a plain serializable type.
    serializable = json.loads(self.model_dump_json(exclude_none=True))
    with yaml_path.open("w") as f:
        yaml.dump(serializable, f)

Task Models

Task

Bases: BaseModel

instruction instance-attribute

instruction

author_name class-attribute instance-attribute

author_name = Field(default='unknown', description='Name of the author of the task')

author_email class-attribute instance-attribute

author_email = Field(default='unknown', description='Email of the author of the task')

difficulty instance-attribute

difficulty

category class-attribute instance-attribute

category = Field(default='software_engineering', description='High-level category that describes the type of task this is.')

tags class-attribute instance-attribute

tags = Field(default=[], description='Tags that describe the type of task this is. Reference other tasks to see examples.')

parser_name class-attribute instance-attribute

parser_name = Field(default=PYTEST, description='Name of the parser to use for test results')

max_agent_timeout_sec class-attribute instance-attribute

max_agent_timeout_sec = Field(default=360.0, description='Maximum timeout in seconds for the agent to run.')

max_test_timeout_sec class-attribute instance-attribute

max_test_timeout_sec = Field(default=60.0, description='Maximum timeout in seconds for each individual test')

max_setup_timeout_sec class-attribute instance-attribute

max_setup_timeout_sec = Field(default=600.0, description='Maximum timeout in seconds for the setup script')

run_tests_in_same_shell class-attribute instance-attribute

run_tests_in_same_shell = Field(default=False, description='Run the tests in the same shell as the agent. This is useful if you need to test shell-scoped attributes.')

disable_asciinema class-attribute instance-attribute

disable_asciinema = Field(default=False, description='Disable asciinema recording for this task while keeping Docker containerization. When enabled, the task runs in Docker but without terminal recording.')

estimated_duration_sec class-attribute instance-attribute

estimated_duration_sec = Field(default=None, description='Estimated duration in seconds for this task based on historical data.')

expert_time_estimate_min class-attribute instance-attribute

expert_time_estimate_min = Field(default=None, description='Estimated time in minutes for an expert in the field to solve this task using the Internet but no LLMs or oracle solutions.')

junior_time_estimate_min class-attribute instance-attribute

junior_time_estimate_min = Field(default=None, description='Estimated time in minutes for an average junior engineer to solve this task using the Internet but no LLMs or oracle solutions.')

effective_estimated_duration_sec property

effective_estimated_duration_sec

Get the estimated duration, using a calculated default if not specified.

from_yaml classmethod

from_yaml(path)
Source code in fceval/handlers/trial_handler.py
@classmethod
def from_yaml(cls, path: Path) -> "Task":
    """Load and validate a Task from a YAML file.

    Args:
        path: Path to the task YAML file.

    Raises:
        Exception: Re-raises any parsing/validation error after noting the
            offending path.
    """
    data = yaml.safe_load(path.read_text())
    try:
        return cls.model_validate(data)
    # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit are
    # not intercepted; the failing path is noted before re-raising.
    except Exception:
        print("Error validating task", path)
        raise

to_yaml

to_yaml(path, canary_string=None)
Source code in fceval/handlers/trial_handler.py
def to_yaml(self, path: Path, canary_string: str | None = None) -> None:
    """Write this task to *path* as YAML, optionally prefixed by a canary string."""
    # Local name `writer` avoids shadowing the module-level `yaml` import.
    writer = YAML()
    writer.indent(mapping=2, sequence=4, offset=2)
    writer.preserve_quotes = True
    writer.default_flow_style = False
    writer.width = 4096  # Prevent line wrapping

    # Round-trip through JSON so all values are plainly serializable.
    task_dict = json.loads(self.model_dump_json())
    # Keep the (often multi-line) instruction as a literal block scalar.
    task_dict["instruction"] = LiteralScalarString(task_dict["instruction"])

    with open(path, "w") as f:
        if canary_string is not None:
            f.write(canary_string)
        writer.dump(task_dict, f)

TaskPaths

TaskPaths(input_path)

Manages paths for task input files and directories.

Folder structure:

input_path/
├── task.yaml            # Task configuration
├── solution.sh          # Solution script (or solution.yaml)
├── run-tests.sh         # Test runner script
├── run-setup.sh         # Setup script
├── docker-compose.yaml  # Docker configuration
└── tests/               # Test directory

Source code in fceval/handlers/trial_handler.py
def __init__(self, input_path: Path):
    """Store the root directory containing the task's input files."""
    self.input_path = input_path

input_path instance-attribute

input_path = input_path

task_config_path property

task_config_path

solution_path property

solution_path

test_dir property

test_dir

run_tests_path property

run_tests_path

run_setup_path property

run_setup_path

Path to the optional run-setup.sh script.

docker_compose_path property

docker_compose_path

TrialPaths

TrialPaths(output_path, task_id, trial_name)

Manages paths for trial output files and directories.

Folder structure:

output_path/
└── {task_id}/
    └── {trial_name}/
        ├── sessions/        # Session data
        ├── panes/           # Terminal pane outputs
        │   ├── pre-agent.txt
        │   ├── post-agent.txt
        │   └── post-test.txt
        ├── commands.txt     # Command history
        ├── results.json     # Test results
        └── agent-logs/      # Agent logging directory

Source code in fceval/handlers/trial_handler.py
def __init__(self, output_path: Path, task_id: str, trial_name: str):
    """Record the identifiers from which all trial output paths are derived."""
    self.output_path = output_path
    self.task_id = task_id
    self.trial_name = trial_name

output_path instance-attribute

output_path = output_path

task_id instance-attribute

task_id = task_id

trial_name instance-attribute

trial_name = trial_name

task_output_path property

task_output_path

sessions_path property

sessions_path

panes_path property

panes_path

post_setup_pane_path property

post_setup_pane_path

pre_agent_pane_path property

pre_agent_pane_path

post_agent_pane_path property

post_agent_pane_path

post_test_pane_path property

post_test_pane_path

commands_path property

commands_path

results_path property

results_path

agent_logging_dir property

agent_logging_dir

mkdir

mkdir()
Source code in fceval/handlers/trial_handler.py
def mkdir(self) -> None:
    """Create the trial output directory tree (idempotent)."""
    for directory in (
        self.task_output_path,
        self.panes_path,
        self.sessions_path,
        self.agent_logging_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)