Skip to content

Datasets API

culicidaelab.datasets

Dataset management components for the CulicidaeLab library.

This module provides the DatasetsManager, a high-level interface for accessing, loading, and managing datasets as defined in the application's configuration. It simplifies interactions with different data sources and providers.

__all__ = ['DatasetsManager'] module-attribute
DatasetsManager

Manages access, loading, and caching of configured datasets.

This manager provides a high-level interface that uses the global settings for configuration and a dedicated provider service for the actual data loading. This decouples the logic of what datasets are available from how they are loaded and sourced.

Attributes:

Name Type Description
settings

The main settings object for the library.

provider_service

The service for resolving and using data providers.

loaded_datasets dict[str, str | Path]

A cache for storing the paths of downloaded datasets.

Source code in culicidaelab\datasets\datasets_manager.py
class DatasetsManager:
    """High-level entry point for the datasets declared in the configuration.

    The manager reads *which* datasets exist from the global settings and
    delegates *how* they are fetched and opened to a provider obtained from
    the provider service, keeping the two concerns separate.

    Attributes:
        settings: The main settings object for the library.
        provider_service: The service for resolving and using data providers.
        loaded_datasets: Session cache mapping a dataset name to the path it
            was last loaded from. It is keyed by name only, so loading a
            different split of the same dataset replaces the earlier entry.
    """

    def __init__(self, settings: Settings):
        """Wires the manager to the settings and builds its provider service.

        Args:
            settings: The main Settings object for the library.
        """
        session_cache: dict[str, str | Path] = {}
        self.settings = settings
        self.provider_service = ProviderService(settings)
        self.loaded_datasets = session_cache

    def get_dataset_info(self, dataset_name: str) -> DatasetConfig:
        """Looks up the configuration entry of a single dataset.

        Example:
            >>> from culicidaelab.settings import Settings
            >>> from culicidaelab.datasets import DatasetsManager
            >>> settings = Settings()
            >>> manager = DatasetsManager(settings)
            >>> try:
            ...     info = manager.get_dataset_info('classification')
            ...     print(info.provider_name)
            ... except KeyError as e:
            ...     print(e)

        Args:
            dataset_name: The name of the dataset (e.g., 'classification').

        Returns:
            The object stored under ``datasets.<dataset_name>`` in the
            settings (a validated Pydantic configuration model).

        Raises:
            KeyError: If the settings return a falsy value for that key,
                i.e. the dataset is not configured.
        """
        config_key = f"datasets.{dataset_name}"
        found = self.settings.get_config(config_key)
        if found:
            return found
        raise KeyError(f"Dataset '{dataset_name}' not found in configuration.")

    def list_datasets(self) -> list[str]:
        """Returns the names of every dataset known to the configuration.

        Example:
            >>> from culicidaelab.settings import Settings
            >>> from culicidaelab.datasets import DatasetsManager
            >>> settings = Settings()
            >>> manager = DatasetsManager(settings)
            >>> available_datasets = manager.list_datasets()
            >>> print(available_datasets)

        Returns:
            A list of configured dataset names, exactly as reported by the
            settings object.
        """
        configured_names = self.settings.list_datasets()
        return configured_names

    def list_loaded_datasets(self) -> list[str]:
        """Returns the names of the datasets loaded so far in this session.

        Example:
            >>> from culicidaelab.settings import Settings
            >>> from culicidaelab.datasets import DatasetsManager
            >>> settings = Settings()
            >>> manager = DatasetsManager(settings)
            >>> _ = manager.load_dataset('classification', split='train')
            >>> loaded = manager.list_loaded_datasets()
            >>> print(loaded)
            ['classification']

        Returns:
            A list of names for datasets that are currently cached.
        """
        # Iterating a dict yields its keys in insertion order.
        return list(self.loaded_datasets)

    def load_dataset(
        self,
        name: str,
        split: str | list[str] | None = None,
        config_name: str | None = "default",
    ) -> Any:
        """Loads a dataset, downloading it first when it is not cached on disk.

        Example:
            >>> from culicidaelab.settings import Settings
            >>> from culicidaelab.datasets import DatasetsManager
            >>> # This example assumes you have a configured settings object
            >>> settings = Settings()
            >>> manager = DatasetsManager(settings)
            >>> # Load the training split of the classification dataset
            >>> train_dataset = manager.load_dataset('classification', split='train')
            >>> # Load all splits
            >>> all_splits = manager.load_dataset('classification')

        Args:
            name: The name of the dataset to load.
            split: The split(s) to load.
                - str: A single split name (e.g., "train", "test").
                - None: Loads ALL available splits into a `DatasetDict`.
                - Advanced: Can be a slice ("train[:100]") or a list for
                  cross-validation.
            config_name: The name of the dataset configuration to use.
                Defaults to "default". It is only forwarded to the provider
                when a download is needed.

        Returns:
            The loaded dataset object, which could be a `Dataset` or `DatasetDict`
            depending on the provider and splits requested.

        Raises:
            KeyError: If ``name`` is not a configured dataset.
        """
        # Resolve the dataset's configuration and the provider that serves it.
        dataset_config = self.get_dataset_info(name)
        data_provider = self.provider_service.get_provider(dataset_config.provider_name)

        # Location on disk reserved for this dataset/split combination.
        split_path = self.settings.get_dataset_path(dataset_type=name, split=split)

        if split_path.exists():
            print(f"Cache hit for split config: {split} {split_path}")
            source_path = split_path
        else:
            # Nothing on disk yet: have the provider download into the cache path.
            fetched_path = data_provider.download_dataset(
                dataset_name=name,
                config_name=config_name,
                save_dir=split_path,
                split=split,
            )
            # Prefer the path reported by the provider; fall back to the
            # requested cache path when it reports nothing.
            source_path = fetched_path or split_path

        dataset = data_provider.load_dataset(source_path)

        # Remember where this dataset was loaded from for the current session.
        self.loaded_datasets[name] = source_path

        return dataset
settings = settings instance-attribute
provider_service = ProviderService(settings) instance-attribute
loaded_datasets: dict[str, str | Path] = {} instance-attribute
__init__(settings: Settings)

Initializes the DatasetsManager with its dependencies.

Parameters:

Name Type Description Default
settings Settings

The main Settings object for the library.

required
Source code in culicidaelab\datasets\datasets_manager.py
def __init__(self, settings: Settings):
    """Wires the manager to the settings and builds its provider service.

    Args:
        settings: The main Settings object for the library.
    """
    # Starts empty; filled by name as datasets are loaded.
    session_cache: dict[str, str | Path] = {}
    self.settings = settings
    self.provider_service = ProviderService(settings)
    self.loaded_datasets = session_cache
get_dataset_info(dataset_name: str) -> DatasetConfig

Retrieves the configuration for a specific dataset.

Example

>>> from culicidaelab.settings import Settings
>>> from culicidaelab.datasets import DatasetsManager
>>> settings = Settings()
>>> manager = DatasetsManager(settings)
>>> try:
...     info = manager.get_dataset_info('classification')
...     print(info.provider_name)
... except KeyError as e:
...     print(e)

Parameters:

Name Type Description Default
dataset_name str

The name of the dataset (e.g., 'classification').

required

Returns:

Type Description
DatasetConfig

A Pydantic model instance containing the dataset's validated configuration.

Raises:

Type Description
KeyError

If the specified dataset is not found in the configuration.

Source code in culicidaelab\datasets\datasets_manager.py
def get_dataset_info(self, dataset_name: str) -> DatasetConfig:
    """Looks up the configuration entry of a single dataset.

    Example:
        >>> from culicidaelab.settings import Settings
        >>> from culicidaelab.datasets import DatasetsManager
        >>> settings = Settings()
        >>> manager = DatasetsManager(settings)
        >>> try:
        ...     info = manager.get_dataset_info('classification')
        ...     print(info.provider_name)
        ... except KeyError as e:
        ...     print(e)

    Args:
        dataset_name: The name of the dataset (e.g., 'classification').

    Returns:
        The object stored under ``datasets.<dataset_name>`` in the settings
        (a validated Pydantic configuration model).

    Raises:
        KeyError: If the settings return a falsy value for that key, i.e.
            the dataset is not configured.
    """
    config_key = f"datasets.{dataset_name}"
    found = self.settings.get_config(config_key)
    if found:
        return found
    raise KeyError(f"Dataset '{dataset_name}' not found in configuration.")
list_datasets() -> list[str]

Lists all available dataset names from the configuration.

Example

>>> from culicidaelab.settings import Settings
>>> from culicidaelab.datasets import DatasetsManager
>>> settings = Settings()
>>> manager = DatasetsManager(settings)
>>> available_datasets = manager.list_datasets()
>>> print(available_datasets)

Returns:

Type Description
list[str]

A list of configured dataset names.

Source code in culicidaelab\datasets\datasets_manager.py
def list_datasets(self) -> list[str]:
    """Returns the names of every dataset known to the configuration.

    Example:
        >>> from culicidaelab.settings import Settings
        >>> from culicidaelab.datasets import DatasetsManager
        >>> settings = Settings()
        >>> manager = DatasetsManager(settings)
        >>> available_datasets = manager.list_datasets()
        >>> print(available_datasets)

    Returns:
        A list of configured dataset names, exactly as reported by the
        settings object.
    """
    configured_names = self.settings.list_datasets()
    return configured_names
list_loaded_datasets() -> list[str]

Lists all datasets that have been loaded during the session.

Example

>>> from culicidaelab.settings import Settings
>>> from culicidaelab.datasets import DatasetsManager
>>> settings = Settings()
>>> manager = DatasetsManager(settings)
>>> _ = manager.load_dataset('classification', split='train')
>>> loaded = manager.list_loaded_datasets()
>>> print(loaded)
['classification']

Returns:

Type Description
list[str]

A list of names for datasets that are currently cached.

Source code in culicidaelab\datasets\datasets_manager.py
def list_loaded_datasets(self) -> list[str]:
    """Returns the names of the datasets loaded so far in this session.

    Example:
        >>> from culicidaelab.settings import Settings
        >>> from culicidaelab.datasets import DatasetsManager
        >>> settings = Settings()
        >>> manager = DatasetsManager(settings)
        >>> _ = manager.load_dataset('classification', split='train')
        >>> loaded = manager.list_loaded_datasets()
        >>> print(loaded)
        ['classification']

    Returns:
        A list of names for datasets that are currently cached.
    """
    # Iterating a dict yields its keys in insertion order.
    return list(self.loaded_datasets)
load_dataset(name: str, split: str | list[str] | None = None, config_name: str | None = 'default') -> Any

Loads a dataset, handling complex splits and caching automatically.

Example

>>> from culicidaelab.settings import Settings
>>> from culicidaelab.datasets import DatasetsManager
>>> # This example assumes you have a configured settings object
>>> settings = Settings()
>>> manager = DatasetsManager(settings)
>>> # Load the training split of the classification dataset
>>> train_dataset = manager.load_dataset('classification', split='train')
>>> # Load all splits
>>> all_splits = manager.load_dataset('classification')

Parameters:

Name Type Description Default
name str

The name of the dataset to load.

required
split str | list[str] | None

The split(s) to load.
- str: A single split name (e.g., "train", "test").
- None: Loads ALL available splits into a DatasetDict.
- Advanced: Can be a slice ("train[:100]") or a list for cross-validation.

None
config_name str | None

The name of the dataset configuration to use. Defaults to "default".

'default'

Returns:

Type Description
Any

The loaded dataset object, which could be a Dataset or DatasetDict depending on the provider and splits requested.

Source code in culicidaelab\datasets\datasets_manager.py
def load_dataset(
    self,
    name: str,
    split: str | list[str] | None = None,
    config_name: str | None = "default",
) -> Any:
    """Loads a dataset, handling complex splits and caching automatically.

    Example:
        >>> from culicidaelab.settings import Settings
        >>> from culicidaelab.datasets import DatasetsManager
        >>> # This example assumes you have a configured settings object
        >>> settings = Settings()
        >>> manager = DatasetsManager(settings)
        >>> # Load the training split of the classification dataset
        >>> train_dataset = manager.load_dataset('classification', split='train')
        >>> # Load all splits
        >>> all_splits = manager.load_dataset('classification')

    Args:
        name: The name of the dataset to load.
        split: The split(s) to load.
            - str: A single split name (e.g., "train", "test").
            - None: Loads ALL available splits into a `DatasetDict`.
            - Advanced: Can be a slice ("train[:100]") or a list for
              cross-validation.
        config_name: The name of the dataset configuration to use.
            Defaults to "default".

    Returns:
        The loaded dataset object, which could be a `Dataset` or `DatasetDict`
        depending on the provider and splits requested.
    """
    # 1. Get config and provider
    config = self.get_dataset_info(name)
    provider = self.provider_service.get_provider(config.provider_name)

    split_path = self.settings.get_dataset_path(
        dataset_type=name,
        split=split,
    )

    # Check cache, otherwise download
    downloaded_path = None
    if not split_path.exists():
        # Instruct the provider to download and save to the precise cache path
        downloaded_path = provider.download_dataset(
            dataset_name=name,
            config_name=config_name,
            save_dir=split_path,
            split=split,
        )
    else:
        print(f"Cache hit for split config: {split} {split_path}")

    # Instruct the provider to load from the appropriate path
    load_from = downloaded_path or split_path

    dataset = provider.load_dataset(load_from)

    # Update the session cache
    self.loaded_datasets[name] = load_from

    return dataset

handler: python selection: members: true