Skip to content

Dataset

The dataset module handles loading, filtering, and iterating over task directories.

Dataset

Dataset

Dataset(name=None, version=None, path=None, task_ids=None, n_tasks=None, exclude_task_ids=None, registry_url=None, local_registry_path=None)

A class for loading and iterating over tasks in a dataset.

Initialize the dataset.

Parameters:

Name Type Description Default
name str | None

Name of the dataset to cache from the fc-eval server

None
version str | None

Version of the dataset to cache

None
path Path | None

Path to the dataset directory

None
task_ids list[str] | None

Optional list of specific task IDs or glob patterns to load.

None
n_tasks int | None

Optional number of tasks to load

None
exclude_task_ids list[str] | None

Optional list of task IDs or glob patterns to exclude.

None
registry_url str | None

The URL of the registry to use for the dataset.

None
local_registry_path Path | None

The path to the local registry file to use for the dataset. If provided, will use the local registry instead of the remote registry.

None
Source code in fceval/dataset/dataset.py
def __init__(
    self,
    name: str | None = None,
    version: str | None = None,
    path: Path | None = None,
    task_ids: list[str] | None = None,
    n_tasks: int | None = None,
    exclude_task_ids: list[str] | None = None,
    registry_url: str | None = None,
    local_registry_path: Path | None = None,
):
    """Initialize the dataset.

    Args:
        name: Name of the dataset to cache from fc-eval server
        version: Version of the dataset to cache
        path: Path to the dataset directory
        task_ids: Optional list of specific task IDs or glob patterns to load.
        n_tasks: Optional number of tasks to load
        exclude_task_ids: Optional list of task IDs or glob patterns to exclude.
        registry_url: The URL of the registry to use for the dataset.
        local_registry_path: The path to the local registry file to use for the
            dataset. If provided, will use the local registry instead of the remote
            registry.
    """
    # Option validation (path vs. name/version, task_ids vs. n_tasks) is
    # enforced by DatasetConfig's model validator.
    self.config = DatasetConfig(
        name=name,
        version=version,
        path=path,
        task_ids=task_ids,
        n_tasks=n_tasks,
        exclude_task_ids=[] if exclude_task_ids is None else exclude_task_ids,
        registry_url=registry_url,
        local_registry_path=local_registry_path,
    )

    self._logger = logger.getChild(__name__)
    # Resolved task directories; populated by _init_dataset below.
    self._tasks: list[Path] = []

    # Fetch the dataset if needed, then resolve and sanity-check task paths.
    self._maybe_cache_dataset()
    self._init_dataset()
    self._validate_task_paths()

config instance-attribute

config = DatasetConfig(name=name, version=version, path=path, task_ids=task_ids, n_tasks=n_tasks, exclude_task_ids=exclude_task_ids or [], registry_url=registry_url, local_registry_path=local_registry_path)

tasks property

tasks

Get the list of tasks in the dataset.

task_ids property

task_ids

Get the list of task IDs in the dataset.

from_config classmethod

from_config(config)

Create a Dataset instance from a DatasetConfig.

Parameters:

Name Type Description Default
config DatasetConfig

DatasetConfig instance

required

Returns:

Type Description
Dataset

A new Dataset instance

Source code in fceval/dataset/dataset.py
@classmethod
def from_config(cls, config: DatasetConfig) -> "Dataset":
    """Create a Dataset instance from a DatasetConfig.

    Args:
        config: DatasetConfig instance

    Returns:
        A new Dataset instance
    """
    return cls(
        name=config.name,
        version=config.version,
        path=config.path,
        task_ids=config.task_ids,
        n_tasks=config.n_tasks,
        exclude_task_ids=config.exclude_task_ids,
        # Forward the registry settings too; previously these were dropped,
        # so a config selecting a custom or local registry silently fell
        # back to the defaults.
        registry_url=config.registry_url,
        local_registry_path=config.local_registry_path,
    )

from_yaml classmethod

from_yaml(yaml_path)

Create a Dataset instance from a YAML configuration file.

Parameters:

Name Type Description Default
yaml_path Path | str

Path to the YAML configuration file

required

Returns:

Type Description
Dataset

A new Dataset instance

Source code in fceval/dataset/dataset.py
@classmethod
def from_yaml(cls, yaml_path: Path | str) -> "Dataset":
    """Create a Dataset instance from a YAML configuration file.

    Args:
        yaml_path: Path to the YAML configuration file

    Returns:
        A new Dataset instance
    """
    raw = Path(yaml_path).read_text()
    config = DatasetConfig(**yaml.safe_load(raw))
    return cls.from_config(config)

__iter__

__iter__()

Iterate over the tasks in the dataset.

Source code in fceval/dataset/dataset.py
def __iter__(self) -> Iterator[Path]:
    """Yield each task directory in the dataset, in order."""
    yield from self._tasks

__len__

__len__()

Get the number of tasks in the dataset.

Source code in fceval/dataset/dataset.py
def __len__(self) -> int:
    """Return the number of tasks in the dataset."""
    task_count = len(self._tasks)
    return task_count

sort_by_duration

sort_by_duration()

Sort tasks by estimated duration (longest first) for optimal concurrent execution.

Source code in fceval/dataset/dataset.py
def sort_by_duration(self) -> None:
    """
    Sort tasks by estimated duration (longest first) for optimal concurrent
    execution.

    Tasks whose config fails to load fall back to a fixed duration estimate
    so a single broken task cannot abort the sort.
    """
    # Duration assumed for tasks whose config cannot be loaded.
    fallback_duration_sec = 210.0

    # Load each task config exactly once, recording (path, duration, source).
    # The original implementation re-parsed every task.yaml a second time
    # just to label the duration's source in the log table.
    task_infos: list[tuple[Path, float, str]] = []
    for task_path in self._tasks:
        try:
            task_paths_obj = TaskPaths(task_path)
            task = Task.from_yaml(task_paths_obj.task_config_path)
            duration = task.effective_estimated_duration_sec
            if task.estimated_duration_sec is not None:
                source = "historical"
            else:
                source = "calculated"
        except Exception as e:
            self._logger.warning(
                f"Failed to load task {task_path.name}: {e}. Using fallback "
                "duration."
            )
            duration = fallback_duration_sec
            source = "fallback"
        task_infos.append((task_path, duration, source))

    # Longest first so the slowest work starts as early as possible.
    task_infos.sort(key=lambda info: info[1], reverse=True)
    self._tasks = [task_path for task_path, _, _ in task_infos]

    # Log the task execution order
    table_data = []
    for i, (task_path, duration, source) in enumerate(task_infos, 1):
        minutes = int(duration // 60)
        seconds = int(duration % 60)
        duration_str = f"{minutes}m {seconds}s"
        table_data.append([i, task_path.name, duration_str, source])

    headers = ["#", "Task Name", "Duration", "Source"]
    table = tabulate(table_data, headers=headers, tablefmt="grid")

    self._logger.debug("Processing tasks in duration order (longest first):")
    self._logger.debug(table)
    self._logger.debug(f"Total tasks: {len(task_infos)}")

DatasetConfig

Bases: BaseModel

Configuration for loading a dataset.

name class-attribute instance-attribute

name = None

version class-attribute instance-attribute

version = None

registry_url class-attribute instance-attribute

registry_url = None

local_registry_path class-attribute instance-attribute

local_registry_path = None

path class-attribute instance-attribute

path = None

task_ids class-attribute instance-attribute

task_ids = None

n_tasks class-attribute instance-attribute

n_tasks = None

exclude_task_ids class-attribute instance-attribute

exclude_task_ids = Field(default_factory=list)

validate_config

validate_config()
Source code in fceval/dataset/dataset.py
@model_validator(mode="after")
def validate_config(self) -> Self:
    """Enforce mutually-exclusive dataset-loading options.

    Raises:
        ValueError: If both task_ids and n_tasks are set, or if path and
            name/version are combined inconsistently.
    """
    if self.task_ids is not None and self.n_tasks is not None:
        raise ValueError("Cannot specify both task_ids and n_tasks")

    has_path = self.path is not None
    has_name = self.name is not None
    has_version = self.version is not None

    # Exactly one addressing scheme is allowed: either an explicit path,
    # or a (name, version) pair resolved through the registry.
    if not has_path and not (has_version and has_name):
        raise ValueError("If path is not set, both version and name must be set")
    if has_path and (has_version or has_name):
        raise ValueError("If path is set, version and name should not be set")

    return self

Dataset Metadata

DatasetMetadata

Bases: BaseModel

Metadata for a registered dataset.

name instance-attribute

name

version instance-attribute

version

created_at instance-attribute

created_at

updated_at instance-attribute

updated_at

fc_eval_version instance-attribute

fc_eval_version

source_url class-attribute instance-attribute

source_url = None

from_yaml classmethod

from_yaml(yaml_path)
Source code in fceval/dataset/dataset_metadata.py
@classmethod
def from_yaml(cls, yaml_path: Path) -> "DatasetMetadata":
    """Load and validate dataset metadata from a YAML file."""
    raw = yaml.safe_load(yaml_path.read_text())
    return cls.model_validate(raw)

to_yaml

to_yaml(yaml_path)
Source code in fceval/dataset/dataset_metadata.py
def to_yaml(self, yaml_path: Path) -> None:
    """Write this metadata to *yaml_path* as YAML, omitting None fields."""
    # Round-trip through JSON so every value is a plain serializable type.
    serializable = json.loads(self.model_dump_json(exclude_none=True))
    with yaml_path.open("w") as f:
        yaml.dump(serializable, f)

Task Models

Task

Bases: BaseModel

instruction instance-attribute

instruction

author_name class-attribute instance-attribute

author_name = Field(default='unknown', description='Name of the author of the task')

author_email class-attribute instance-attribute

author_email = Field(default='unknown', description='Email of the author of the task')

difficulty instance-attribute

difficulty

category class-attribute instance-attribute

category = Field(default='software_engineering', description='High-level category that describes the type of task this is.')

tags class-attribute instance-attribute

tags = Field(default=[], description='Tags that describe the type of task this is. Reference other tasks to see examples.')

parser_name class-attribute instance-attribute

parser_name = Field(default=PYTEST, description='Name of the parser to use for test results')

max_agent_timeout_sec class-attribute instance-attribute

max_agent_timeout_sec = Field(default=360.0, description='Maximum timeout in seconds for the agent to run.')

max_test_timeout_sec class-attribute instance-attribute

max_test_timeout_sec = Field(default=60.0, description='Maximum timeout in seconds for each individual test')

max_setup_timeout_sec class-attribute instance-attribute

max_setup_timeout_sec = Field(default=600.0, description='Maximum timeout in seconds for the setup script')

run_tests_in_same_shell class-attribute instance-attribute

run_tests_in_same_shell = Field(default=False, description='Run the tests in the same shell as the agent. This is useful if you need to test shell-scoped attributes.')

disable_asciinema class-attribute instance-attribute

disable_asciinema = Field(default=False, description='Disable asciinema recording for this task while keeping Docker containerization. When enabled, the task runs in Docker but without terminal recording.')

estimated_duration_sec class-attribute instance-attribute

estimated_duration_sec = Field(default=None, description='Estimated duration in seconds for this task based on historical data.')

expert_time_estimate_min class-attribute instance-attribute

expert_time_estimate_min = Field(default=None, description='Estimated time in minutes for an expert in the field to solve this task using the Internet but no LLMs or oracle solutions.')

junior_time_estimate_min class-attribute instance-attribute

junior_time_estimate_min = Field(default=None, description='Estimated time in minutes for an average junior engineer to solve this task using the Internet but no LLMs or oracle solutions.')

effective_estimated_duration_sec property

effective_estimated_duration_sec

Get the estimated duration, using a calculated default if not specified.

from_yaml classmethod

from_yaml(path)
Source code in fceval/handlers/trial_handler.py
@classmethod
def from_yaml(cls, path: Path) -> "Task":
    """Load and validate a Task from a YAML file.

    Args:
        path: Path to the task YAML file.

    Raises:
        Exception: Re-raises any parsing/validation error after noting the
            offending path.
    """
    data = yaml.safe_load(path.read_text())
    try:
        return cls.model_validate(data)
    # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit are
    # not intercepted; the failing path is noted before re-raising.
    except Exception:
        print("Error validating task", path)
        raise

to_yaml

to_yaml(path, canary_string=None)
Source code in fceval/handlers/trial_handler.py
def to_yaml(self, path: Path, canary_string: str | None = None) -> None:
    """Write this task to *path* as YAML, optionally prefixed by a canary string."""
    # Local name `writer` avoids shadowing the module-level `yaml` import.
    writer = YAML()
    writer.indent(mapping=2, sequence=4, offset=2)
    writer.preserve_quotes = True
    writer.default_flow_style = False
    writer.width = 4096  # Prevent line wrapping

    # Round-trip through JSON so all values are plainly serializable.
    task_dict = json.loads(self.model_dump_json())
    # Keep the (often multi-line) instruction as a literal block scalar.
    task_dict["instruction"] = LiteralScalarString(task_dict["instruction"])

    with open(path, "w") as f:
        if canary_string is not None:
            f.write(canary_string)
        writer.dump(task_dict, f)

TaskPaths

TaskPaths(input_path)

Manages paths for task input files and directories.

Folder structure:

input_path/
├── task.yaml            # Task configuration
├── solution.sh          # Solution script (or solution.yaml)
├── run-tests.sh         # Test runner script
├── run-setup.sh         # Setup script
├── docker-compose.yaml  # Docker configuration
└── tests/               # Test directory

Source code in fceval/handlers/trial_handler.py
def __init__(self, input_path: Path):
    """Store the root directory containing the task's input files."""
    self.input_path = input_path

input_path instance-attribute

input_path = input_path

task_config_path property

task_config_path

solution_path property

solution_path

test_dir property

test_dir

run_tests_path property

run_tests_path

run_setup_path property

run_setup_path

Path to the optional run-setup.sh script.

docker_compose_path property

docker_compose_path

TrialPaths

TrialPaths(output_path, task_id, trial_name)

Manages paths for trial output files and directories.

Folder structure:

output_path/
└── {task_id}/
    └── {trial_name}/
        ├── sessions/        # Session data
        ├── panes/           # Terminal pane outputs
        │   ├── pre-agent.txt
        │   ├── post-agent.txt
        │   └── post-test.txt
        ├── commands.txt     # Command history
        ├── results.json     # Test results
        └── agent-logs/      # Agent logging directory

Source code in fceval/handlers/trial_handler.py
def __init__(self, output_path: Path, task_id: str, trial_name: str):
    """Record the identifiers from which all trial output paths are derived."""
    self.output_path = output_path
    self.task_id = task_id
    self.trial_name = trial_name

output_path instance-attribute

output_path = output_path

task_id instance-attribute

task_id = task_id

trial_name instance-attribute

trial_name = trial_name

task_output_path property

task_output_path

sessions_path property

sessions_path

panes_path property

panes_path

post_setup_pane_path property

post_setup_pane_path

pre_agent_pane_path property

pre_agent_pane_path

post_agent_pane_path property

post_agent_pane_path

post_test_pane_path property

post_test_pane_path

commands_path property

commands_path

results_path property

results_path

agent_logging_dir property

agent_logging_dir

mkdir

mkdir()
Source code in fceval/handlers/trial_handler.py
def mkdir(self) -> None:
    """Create the trial output directory tree (idempotent)."""
    for directory in (
        self.task_output_path,
        self.panes_path,
        self.sessions_path,
        self.agent_logging_dir,
    ):
        directory.mkdir(parents=True, exist_ok=True)