Skip to content

Core API

culicidaelab.core

Core components of the CulicidaeLab library.

This module provides the base classes, configuration management, and resource handling functionalities that form the foundation of the library. It exports key classes and functions for convenient access from other parts of the application.

Attributes:

Name Type Description
__all__ list[str]

A list of the public objects of this module.

__all__ = ['BasePredictor', 'BaseProvider', 'WeightsManagerProtocol', 'BaseInferenceBackend', 'ConfigManager', 'CulicidaeLabConfig', 'PredictorConfig', 'DatasetConfig', 'ProviderConfig', 'SpeciesModel', 'SpeciesConfig', 'BoundingBox', 'Detection', 'DetectionPrediction', 'SegmentationPrediction', 'Classification', 'ClassificationPrediction', 'ProviderService', 'ResourceManager', 'Settings', 'get_settings', 'download_file'] module-attribute
BasePredictor

Abstract base class for all predictors.

This class defines the common interface for all predictors (e.g., detector, segmenter, classifier). It relies on the main Settings object for configuration and a backend for model execution.

Attributes:

Name Type Description
settings Settings

The main settings object for the library.

predictor_type str

The key for this predictor in the configuration (e.g., 'classifier').

backend BaseInferenceBackend

An object that inherits from BaseInferenceBackend for model loading and inference.

Source code in culicidaelab\core\base_predictor.py
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
class BasePredictor(Generic[InputDataType, PredictionType, GroundTruthType], ABC):
    """Abstract base class for all predictors.

    This class defines the common interface for all predictors (e.g., detector,
    segmenter, classifier). It relies on the main Settings object for
    configuration and a backend for model execution.

    Subclasses must implement `_evaluate_from_prediction`,
    `_convert_raw_to_prediction` and `visualize`.

    Attributes:
        settings (Settings): The main settings object for the library.
        predictor_type (str): The key for this predictor in the configuration
            (e.g., 'classifier').
        backend (BaseInferenceBackend): An object that inherits from
            BaseInferenceBackend for model loading and inference.
    """

    def __init__(
        self,
        settings: Settings,
        predictor_type: str,
        backend: BaseInferenceBackend,
        load_model: bool = False,
    ):
        """Initializes the predictor.

        Args:
            settings (Settings): The main Settings object for the library.
            predictor_type (str): The key for this predictor in the configuration
                (e.g., 'classifier').
            backend (BaseInferenceBackend): An object that inherits from
                BaseInferenceBackend for model loading and inference.
            load_model (bool): If True, loads the model immediately upon
                initialization.

        Raises:
            ValueError: If no valid `PredictorConfig` is found for
                `predictor_type` (raised by `_get_predictor_config`).
        """
        self.settings = settings
        self.predictor_type = predictor_type
        self.backend = backend
        # The config lookup needs `settings` and `predictor_type`, so it must
        # run after those attributes are assigned.
        self._config: PredictorConfig = self._get_predictor_config()
        self._logger = logging.getLogger(
            f"culicidaelab.predictor.{self.predictor_type}",
        )

        if load_model:
            self.load_model()

    def __call__(self, input_data: InputDataType, **kwargs: Any) -> Any:
        """Convenience method that calls `predict()`.

        This allows the predictor instance to be called as a function. The
        model is loaded first if the backend reports it is not loaded.

        Args:
            input_data (InputDataType): The input data for the prediction.
            **kwargs (Any): Additional arguments to pass to the `predict` method.

        Returns:
            Any: The result of the prediction.
        """
        # Kept even though `predict` also loads lazily: a subclass may override
        # `predict` without that safeguard.
        if not self.backend.is_loaded:
            self.load_model()
        return self.predict(input_data, **kwargs)

    def __enter__(self):
        """Context manager entry.

        Loads the model if it is not already loaded.

        Returns:
            BasePredictor: The predictor instance.
        """
        if not self.backend.is_loaded:
            self.load_model()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit.

        This default implementation does nothing (in particular it does not
        unload the model and does not suppress exceptions), but can be
        overridden to handle resource cleanup. Use `model_context()` when the
        model should be unloaded on exit.

        Args:
            exc_type: The exception type raised inside the `with` body, if any.
            exc_val: The exception instance raised inside the `with` body, if any.
            exc_tb: The traceback associated with the exception, if any.
        """
        return None

    @property
    def config(self) -> PredictorConfig:
        """Get the predictor configuration Pydantic model.

        Returns:
            PredictorConfig: The configuration object for this predictor.
        """
        return self._config

    @property
    def model_loaded(self) -> bool:
        """Check if the model is loaded.

        Returns:
            bool: True if the model is loaded, False otherwise.
        """
        return self.backend.is_loaded

    @contextmanager
    def model_context(self):
        """A context manager for temporary model loading.

        Ensures the model is loaded upon entering the context and unloaded
        upon exiting if it was not loaded before. This is useful for managing
        memory in pipelines.

        Yields:
            BasePredictor: The predictor instance itself.

        Example:
            >>> with predictor.model_context():
            ...     predictions = predictor.predict(data)
        """
        # Remember the state on entry so a model the caller had already loaded
        # is never unloaded behind their back.
        was_loaded = self.backend.is_loaded
        try:
            if not was_loaded:
                self.load_model()
            yield self
        finally:
            if not was_loaded and self.backend.is_loaded:
                self.unload_model()

    def evaluate(
        self,
        ground_truth: GroundTruthType,
        prediction: PredictionType | None = None,
        input_data: InputDataType | None = None,
        **predict_kwargs: Any,
    ) -> dict[str, float]:
        """Evaluate a prediction against a ground truth.

        Either `prediction` or `input_data` must be provided. If `prediction`
        is provided, it is used directly. If `prediction` is None, `input_data`
        is used to generate a new prediction.

        Args:
            ground_truth (GroundTruthType): The ground truth annotation.
            prediction (PredictionType, optional): A pre-computed prediction.
            input_data (InputDataType, optional): Input data to generate a
                prediction from, if one isn't provided.
            **predict_kwargs (Any): Additional arguments passed to the `predict`
                method.

        Returns:
            dict[str, float]: Dictionary containing evaluation metrics for a
            single item.

        Raises:
            ValueError: If neither `prediction` nor `input_data` is provided.
        """
        if prediction is None:
            if input_data is not None:
                prediction = self.predict(input_data, **predict_kwargs)
            else:
                raise ValueError(
                    "Either 'prediction' or 'input_data' must be provided.",
                )
        return self._evaluate_from_prediction(
            prediction=prediction,
            ground_truth=ground_truth,
        )

    def evaluate_batch(
        self,
        ground_truth_batch: Sequence[GroundTruthType],
        predictions_batch: Sequence[PredictionType] | None = None,
        input_data_batch: Sequence[InputDataType] | None = None,
        num_workers: int = 1,
        show_progress: bool = False,
        **predict_kwargs: Any,
    ) -> dict[str, Any]:
        """Evaluate on a batch of items using parallel processing.

        Either `predictions_batch` or `input_data_batch` must be provided.

        Args:
            ground_truth_batch (Sequence[GroundTruthType]): List of corresponding
                ground truth annotations.
            predictions_batch (Sequence[PredictionType], optional): A pre-computed
                list of predictions.
            input_data_batch (Sequence[InputDataType], optional): List of input data
                to generate predictions from.
            num_workers (int): Number of parallel workers for calculating metrics.
            show_progress (bool): Whether to show a progress bar.
            **predict_kwargs (Any): Additional arguments passed to `predict_batch`.

        Returns:
            dict[str, Any]: Dictionary containing aggregated evaluation metrics.

        Raises:
            ValueError: If the number of predictions does not match the number
                of ground truths, or if required inputs are missing.
        """
        if predictions_batch is None:
            if input_data_batch is not None:
                predictions_batch = self.predict_batch(
                    input_data_batch,
                    show_progress=show_progress,
                    **predict_kwargs,
                )
            else:
                raise ValueError(
                    "Either 'predictions_batch' or 'input_data_batch' must be provided.",
                )

        if len(predictions_batch) != len(ground_truth_batch):
            raise ValueError(
                f"Number of predictions ({len(predictions_batch)}) must match "
                f"number of ground truths ({len(ground_truth_batch)}).",
            )

        per_item_metrics = self._calculate_metrics_parallel(
            predictions_batch,
            ground_truth_batch,
            num_workers,
            show_progress,
        )
        aggregated_metrics = self._aggregate_metrics(per_item_metrics)
        final_report = self._finalize_evaluation_report(
            aggregated_metrics,
            predictions_batch,
            ground_truth_batch,
        )
        return final_report

    def get_model_info(self) -> dict[str, Any]:
        """Gets information about the predictor and its model state.

        Returns:
            dict[str, Any]: A dictionary with the keys `predictor_type`,
            `model_loaded` and `config` (the dumped predictor configuration).
        """
        return {
            "predictor_type": self.predictor_type,
            "model_loaded": self.backend.is_loaded,
            "config": self.config.model_dump(),
        }

    def load_model(self) -> None:
        """Delegates model loading to the configured backend.

        Does nothing if the backend already reports the model as loaded.

        Raises:
            RuntimeError: If the backend fails to load the model. The original
                exception is attached as the cause.
        """
        if not self.backend.is_loaded:
            self._logger.info(
                f"Loading model for {self.predictor_type} using {self.backend.__class__.__name__}",
            )
            try:
                self.backend.load_model()
                self._logger.info(f"Successfully loaded model for {self.predictor_type}")
            except Exception as e:
                self._logger.error(f"Failed to load model for {self.predictor_type}: {e}")
                raise RuntimeError(f"Failed to load model for {self.predictor_type}: {e}") from e

    def predict(
        self,
        input_data: InputDataType,
        **kwargs: Any,
    ) -> PredictionType:
        """Makes a prediction on a single input data sample.

        The model is loaded on demand if the backend reports it is not loaded.

        Args:
            input_data (InputDataType): The input data (e.g., an image as a NumPy
                array) to make a prediction on.
            **kwargs (Any): Additional predictor-specific arguments.

        Returns:
            PredictionType: The prediction result, with a format specific to the
            predictor type.

        Raises:
            RuntimeError: If the model has to be loaded and loading fails.
            ValueError: If the input cannot be interpreted as an image.
            FileNotFoundError: If the input is a path that does not exist.
            TypeError: If the input type is not supported.
        """
        if not self.backend.is_loaded:
            try:
                self.load_model()
            except RuntimeError:
                # `load_model` already reports failures as RuntimeError with a
                # descriptive message; wrapping again only duplicates the text.
                raise
            except Exception as e:
                # An overriding `load_model` may raise something else; keep the
                # documented RuntimeError contract for callers.
                raise RuntimeError(f"Failed to load model: {e}") from e

        image = self._load_and_validate_image(input_data)

        raw_output = self.backend.predict(image, **kwargs)

        return self._convert_raw_to_prediction(raw_output)

    def predict_batch(
        self,
        input_data_batch: Sequence[InputDataType],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[PredictionType]:
        """Makes predictions on a batch of inputs by delegating to the backend.

        The inputs are handed to the backend as a list without image
        validation; each raw result is then converted with
        `_convert_raw_to_prediction`.

        Args:
            input_data_batch (Sequence[InputDataType]): A sequence of inputs.
            show_progress (bool): Accepted for interface compatibility. This
                base implementation does not display a progress bar itself.
            **kwargs (Any): Additional arguments for the backend's `predict_batch`.

        Returns:
            list[PredictionType]: A list of prediction results. Empty if
            `input_data_batch` is empty.
        """
        if not input_data_batch:
            return []

        if not self.backend.is_loaded:
            self.load_model()

        raw_predictions = self.backend.predict_batch(list(input_data_batch), **kwargs)
        final_predictions = [self._convert_raw_to_prediction(raw_pred) for raw_pred in raw_predictions]
        return final_predictions

    def unload_model(self) -> None:
        """Unloads the model to free memory.

        Does nothing if the backend reports that no model is loaded.
        """
        if self.backend.is_loaded:
            self.backend.unload_model()
            self._logger.info(f"Unloaded model for {self.predictor_type}")

    @abstractmethod
    def _evaluate_from_prediction(
        self,
        prediction: PredictionType,
        ground_truth: GroundTruthType,
    ) -> dict[str, float]:
        """The core metric calculation logic for a single item.

        This method must be implemented by subclasses to define how a prediction
        is evaluated against a ground truth.

        Args:
            prediction (PredictionType): Model prediction.
            ground_truth (GroundTruthType): Ground truth annotation.

        Returns:
            dict[str, float]: Dictionary containing evaluation metrics.
        """
        pass

    @abstractmethod
    def _convert_raw_to_prediction(self, raw_prediction: Any) -> PredictionType:
        """Converts raw backend output to a structured prediction model.

        Subclasses MUST implement this to convert raw output (e.g., a numpy array)
        from the backend into the final Pydantic prediction model.

        Args:
            raw_prediction (Any): The raw output from the inference backend.

        Returns:
            PredictionType: The structured prediction object.
        """
        pass

    @abstractmethod
    def visualize(
        self,
        input_data: InputDataType,
        predictions: PredictionType,
        save_path: str | Path | None = None,
    ) -> np.ndarray:
        """Visualizes the predictions on the input data.

        Args:
            input_data (InputDataType): The original input data (e.g., an image).
            predictions (PredictionType): The prediction result obtained from
                the `predict` method.
            save_path (str | Path, optional): An optional path to save the
                visualization to a file.

        Returns:
            np.ndarray: A NumPy array representing the visualized image.
        """
        pass

    def _aggregate_metrics(
        self,
        metrics_list: list[dict[str, float]],
    ) -> dict[str, float]:
        """Aggregates metrics from multiple evaluations.

        Calculates the mean and standard deviation for each metric across a list
        of evaluation results. Empty dictionaries (items whose evaluation
        failed) are ignored.

        Args:
            metrics_list (list[dict[str, float]]): A list of metric dictionaries.

        Returns:
            dict[str, float]: A dictionary with `<metric>_mean` and
            `<metric>_std` entries for every metric seen, plus a `count` entry
            holding the number of non-empty metric dictionaries. Empty if there
            is nothing to aggregate.
        """
        if not metrics_list:
            return {}

        valid_metrics = [m for m in metrics_list if m]
        if not valid_metrics:
            self._logger.warning("No valid metrics found for aggregation")
            return {}

        # `dict.fromkeys` de-duplicates while keeping first-seen order, so the
        # report's key order is reproducible (a set would not guarantee that).
        metric_names = dict.fromkeys(key for m in valid_metrics for key in m)
        aggregated = {}
        for key in metric_names:
            # Every name came from at least one dictionary, so `values` is
            # never empty here.
            values = [m[key] for m in valid_metrics if key in m]
            aggregated[f"{key}_mean"] = float(np.mean(values))
            aggregated[f"{key}_std"] = float(np.std(values))

        aggregated["count"] = len(valid_metrics)
        return aggregated

    def _calculate_metrics_parallel(
        self,
        predictions: Sequence[PredictionType],
        ground_truths: Sequence[GroundTruthType],
        num_workers: int = 4,
        show_progress: bool = True,
    ) -> list[dict[str, float]]:
        """Calculates metrics for individual items in parallel.

        Each (prediction, ground truth) pair is evaluated with
        `_evaluate_from_prediction` on a thread pool.

        Args:
            predictions (Sequence[PredictionType]): A sequence of predictions.
            ground_truths (Sequence[GroundTruthType]): A sequence of ground truths.
            num_workers (int): The number of threads to use.
            show_progress (bool): Whether to display a progress bar.

        Returns:
            list[dict[str, float]]: A list of metric dictionaries, one for each
            item and in the same order as `predictions`. An item whose
            evaluation raised is represented by an empty dictionary.
        """
        # Pre-fill one slot per item: results arrive in completion order, and
        # writing by index keeps the output aligned with the inputs.
        per_item_metrics: list[dict[str, float]] = [{} for _ in range(len(predictions))]

        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            future_to_idx = {
                executor.submit(
                    self._evaluate_from_prediction,
                    prediction,
                    ground_truths[idx],
                ): idx
                for idx, prediction in enumerate(predictions)
            }

            iterator = as_completed(future_to_idx)
            if show_progress:
                iterator = progress_bar(
                    iterator,
                    total=len(future_to_idx),
                )
            for future in iterator:
                idx = future_to_idx[future]
                try:
                    per_item_metrics[idx] = future.result()
                except Exception as e:
                    # Best effort: one failing item must not abort the batch.
                    # Its slot keeps the empty dictionary placeholder.
                    self._logger.error(
                        f"Error calculating metrics for item {idx}: {e}",
                    )
        return per_item_metrics

    def _finalize_evaluation_report(
        self,
        aggregated_metrics: dict[str, float],
        predictions: Sequence[PredictionType],
        ground_truths: Sequence[GroundTruthType],
    ) -> dict[str, Any]:
        """Optional hook to post-process the final evaluation report.

        This method can be overridden by subclasses to add more details to the
        final report. The default implementation returns `aggregated_metrics`
        unchanged.

        Args:
            aggregated_metrics (dict[str, float]): The aggregated metrics.
            predictions (Sequence[PredictionType]): The list of predictions.
            ground_truths (Sequence[GroundTruthType]): The list of ground truths.

        Returns:
            dict[str, Any]: The finalized evaluation report.
        """
        return aggregated_metrics

    def _get_predictor_config(self) -> PredictorConfig:
        """Gets the configuration for this predictor.

        Looks up `predictors.<predictor_type>` in the settings.

        Returns:
            PredictorConfig: A Pydantic `PredictorConfig` model for this
            predictor instance.

        Raises:
            ValueError: If the configuration is invalid or not found.
        """
        config = self.settings.get_config(f"predictors.{self.predictor_type}")
        if not isinstance(config, PredictorConfig):
            raise ValueError(
                f"Configuration for predictor '{self.predictor_type}' not found or is invalid.",
            )
        return config

    def _load_and_validate_image(self, input_data: InputDataType) -> Image.Image:
        """Loads and validates an input image from various formats.

        Args:
            input_data: input data type (numpy array, file path, PIL Image, bytes,
                or io.BytesIO).

        Returns:
            A validated PIL Image in RGB format.

        Raises:
            ValueError: If input format is invalid or image cannot be loaded.
            FileNotFoundError: If image file path does not exist.
            TypeError: If the input type is not supported.
        """
        if isinstance(input_data, (str, Path)):
            image_path = Path(input_data)
            if not image_path.exists():
                raise FileNotFoundError(f"Image file not found: {image_path}")
            try:
                # The `with` block closes the underlying file as soon as the
                # pixel data has been converted, instead of leaving the handle
                # open until garbage collection.
                with Image.open(image_path) as opened:
                    return opened.convert("RGB")
            except Exception as e:
                raise ValueError(f"Cannot load image from {image_path}: {e}") from e

        elif isinstance(input_data, Image.Image):
            return input_data.convert("RGB")

        elif isinstance(input_data, np.ndarray):
            # Only height x width x 3 arrays are accepted.
            if input_data.ndim != 3 or input_data.shape[2] != 3:
                raise ValueError(
                    f"Expected 3D RGB image, got shape: {input_data.shape}",
                )
            if input_data.dtype == np.uint8:
                return Image.fromarray(input_data)
            elif input_data.dtype in (np.float32, np.float64):
                if input_data.max() > 1.0 or input_data.min() < 0.0:
                    raise ValueError("Float images must be in range [0, 1]")
                # Scale [0, 1] floats to the 0-255 byte range.
                return Image.fromarray((input_data * 255).astype(np.uint8))
            else:
                raise ValueError(f"Unsupported numpy dtype: {input_data.dtype}")

        elif isinstance(input_data, bytes):
            try:
                return Image.open(io.BytesIO(input_data)).convert("RGB")
            except Exception as e:
                raise ValueError(f"Cannot load image from bytes: {e}") from e

        elif isinstance(input_data, io.BytesIO):
            try:
                # The stream belongs to the caller, so it is deliberately not
                # closed here.
                return Image.open(input_data).convert("RGB")
            except Exception as e:
                raise ValueError(f"Cannot load image from BytesIO stream: {e}") from e

        else:
            raise TypeError(
                f"Unsupported input type: {type(input_data)}. "
                f"Expected np.ndarray, str, pathlib.Path, PIL.Image.Image, bytes, or io.BytesIO",
            )

    def _prepare_batch_images(
        self,
        input_data_batch: Sequence[InputDataType],
    ) -> tuple[list[Image.Image], list[int]]:
        """Prepares and validates a batch of images for processing.

        Inputs that fail `_load_and_validate_image` are skipped with a warning
        rather than aborting the whole batch.

        Args:
            input_data_batch: A sequence of input images.

        Returns:
            A tuple of (valid_images, valid_indices) where valid_indices
            tracks the original position of each valid image.
        """
        valid_images = []
        valid_indices = []
        for idx, input_data in enumerate(input_data_batch):
            try:
                image = self._load_and_validate_image(input_data)
                valid_images.append(image)
                valid_indices.append(idx)
            except Exception as e:
                self._logger.warning(f"Skipping image at index {idx}: {e}")
        return valid_images, valid_indices
settings = settings instance-attribute
predictor_type = predictor_type instance-attribute
backend = backend instance-attribute
config: PredictorConfig property

Get the predictor configuration Pydantic model.

Returns:

Name Type Description
PredictorConfig PredictorConfig

The configuration object for this predictor.

model_loaded: bool property

Check if the model is loaded.

Returns:

Name Type Description
bool bool

True if the model is loaded, False otherwise.

__init__(settings: Settings, predictor_type: str, backend: BaseInferenceBackend, load_model: bool = False)

Initializes the predictor.

Parameters:

Name Type Description Default
settings Settings

The main Settings object for the library.

required
predictor_type str

The key for this predictor in the configuration (e.g., 'classifier').

required
backend BaseInferenceBackend

An object that inherits from BaseInferenceBackend for model loading and inference.

required
load_model bool

If True, loads the model immediately upon initialization.

False
Source code in culicidaelab\core\base_predictor.py
def __init__(
    self,
    settings: Settings,
    predictor_type: str,
    backend: BaseInferenceBackend,
    load_model: bool = False,
):
    """Set up a predictor from its settings, configuration key and backend.

    Args:
        settings (Settings): The main Settings object for the library.
        predictor_type (str): The key for this predictor in the configuration
            (e.g., 'classifier').
        backend (BaseInferenceBackend): An object that inherits from
            BaseInferenceBackend for model loading and inference.
        load_model (bool): If True, loads the model immediately upon
            initialization.
    """
    # Tuple assignment stores the three collaborators left to right.
    self.settings, self.predictor_type, self.backend = settings, predictor_type, backend

    # The configuration lookup runs once the attributes above are in place.
    self._config: PredictorConfig = self._get_predictor_config()

    logger_name = f"culicidaelab.predictor.{self.predictor_type}"
    self._logger = logging.getLogger(logger_name)

    # Eager loading is opt-in.
    if not load_model:
        return
    self.load_model()
__call__(input_data: InputDataType, **kwargs: Any) -> Any

Convenience method that calls predict().

This allows the predictor instance to be called as a function.

Parameters:

Name Type Description Default
input_data InputDataType

The input data for the prediction.

required
**kwargs Any

Additional arguments to pass to the predict method.

{}

Returns:

Name Type Description
Any Any

The result of the prediction.

Source code in culicidaelab\core\base_predictor.py
def __call__(self, input_data: InputDataType, **kwargs: Any) -> Any:
    """Run `predict()` when the predictor instance is called like a function.

    The model is loaded first if the backend reports it is not loaded.

    Args:
        input_data (InputDataType): The input data for the prediction.
        **kwargs (Any): Additional arguments to pass to the `predict` method.

    Returns:
        Any: The result of the prediction.
    """
    backend_ready = self.backend.is_loaded
    if not backend_ready:
        self.load_model()

    outcome = self.predict(input_data, **kwargs)
    return outcome
__enter__()

Context manager entry.

Loads the model if it is not already loaded.

Returns:

Name Type Description
BasePredictor

The predictor instance.

Source code in culicidaelab\core\base_predictor.py
def __enter__(self):
    """Enter the context, making sure the model is loaded.

    The model is loaded only when the backend reports it is not loaded yet.

    Returns:
        BasePredictor: The predictor instance.
    """
    already_loaded = self.backend.is_loaded
    if already_loaded:
        return self

    self.load_model()
    return self
__exit__(exc_type, exc_val, exc_tb)

Context manager exit.

This default implementation does nothing, but can be overridden to handle resource cleanup.

Source code in culicidaelab\core\base_predictor.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the context without performing any cleanup.

    The default implementation is a no-op; subclasses can override it to
    release resources. Returning None means exceptions raised inside the
    `with` body are not suppressed.
    """
    return None
model_context()

A context manager for temporary model loading.

Ensures the model is loaded upon entering the context and unloaded upon exiting if it was not loaded before. This is useful for managing memory in pipelines.

Yields:

Name Type Description
BasePredictor

The predictor instance itself.

Example

>>> with predictor.model_context():
...     predictions = predictor.predict(data)

Source code in culicidaelab\core\base_predictor.py
@contextmanager
def model_context(self):
    """Temporarily guarantee a loaded model for the duration of a `with` block.

    If the model was already loaded on entry it is left untouched. Otherwise
    it is loaded on entry and unloaded again on exit, which helps keep memory
    usage down in pipelines.

    Yields:
        BasePredictor: The predictor instance itself.

    Example:
        >>> with predictor.model_context():
        ...     predictions = predictor.predict(data)
    """
    loaded_on_entry = self.backend.is_loaded

    # Caller already owns a loaded model: hand it over and leave it alone.
    if loaded_on_entry:
        yield self
        return

    # We are responsible for the load, so we are responsible for the unload,
    # including when loading or the `with` body raises.
    try:
        self.load_model()
        yield self
    finally:
        if self.backend.is_loaded:
            self.unload_model()
evaluate(ground_truth: GroundTruthType, prediction: PredictionType | None = None, input_data: InputDataType | None = None, **predict_kwargs: Any) -> dict[str, float]

Evaluate a prediction against a ground truth.

Either prediction or input_data must be provided. If prediction is provided, it is used directly. If prediction is None, input_data is used to generate a new prediction.

Parameters:

Name Type Description Default
ground_truth GroundTruthType

The ground truth annotation.

required
prediction PredictionType

A pre-computed prediction.

None
input_data InputDataType

Input data to generate a prediction from, if one isn't provided.

None
**predict_kwargs Any

Additional arguments passed to the predict method.

{}

Returns:

Type Description
dict[str, float]

Dictionary containing evaluation metrics for a single item.

Raises:

Type Description
ValueError

If neither prediction nor input_data is provided.

Source code in culicidaelab\core\base_predictor.py
def evaluate(
    self,
    ground_truth: GroundTruthType,
    prediction: PredictionType | None = None,
    input_data: InputDataType | None = None,
    **predict_kwargs: Any,
) -> dict[str, float]:
    """Score a single prediction against its ground truth.

    A supplied `prediction` always takes precedence. When it is None, a
    prediction is computed by calling `predict` on `input_data`.

    Args:
        ground_truth (GroundTruthType): The ground truth annotation.
        prediction (PredictionType, optional): A pre-computed prediction.
        input_data (InputDataType, optional): Input used to compute a
            prediction when `prediction` is None.
        **predict_kwargs (Any): Forwarded to `predict`; only used when a
            prediction has to be computed.

    Returns:
        dict[str, float]: The metrics returned by
            `_evaluate_from_prediction` for this single item.

    Raises:
        ValueError: If both `prediction` and `input_data` are None.
    """
    nothing_to_score = prediction is None and input_data is None
    if nothing_to_score:
        raise ValueError(
            "Either 'prediction' or 'input_data' must be provided.",
        )

    if prediction is None:
        prediction = self.predict(input_data, **predict_kwargs)

    return self._evaluate_from_prediction(
        prediction=prediction,
        ground_truth=ground_truth,
    )
evaluate_batch(ground_truth_batch: Sequence[GroundTruthType], predictions_batch: Sequence[PredictionType] | None = None, input_data_batch: Sequence[InputDataType] | None = None, num_workers: int = 1, show_progress: bool = False, **predict_kwargs: Any) -> dict[str, Any]

Evaluate on a batch of items using parallel processing.

Either predictions_batch or input_data_batch must be provided.

Parameters:

Name Type Description Default
ground_truth_batch Sequence[GroundTruthType]

List of corresponding ground truth annotations.

required
predictions_batch Sequence[PredictionType]

A pre-computed list of predictions.

None
input_data_batch Sequence[InputDataType]

List of input data to generate predictions from.

None
num_workers int

Number of parallel workers for calculating metrics.

1
show_progress bool

Whether to show a progress bar.

False
**predict_kwargs Any

Additional arguments passed to predict_batch.

{}

Returns:

Type Description
dict[str, Any]

dict[str, Any]: Dictionary containing aggregated evaluation metrics.

Raises:

Type Description
ValueError

If the number of predictions does not match the number of ground truths, or if required inputs are missing.

Source code in culicidaelab\core\base_predictor.py
def evaluate_batch(
    self,
    ground_truth_batch: Sequence[GroundTruthType],
    predictions_batch: Sequence[PredictionType] | None = None,
    input_data_batch: Sequence[InputDataType] | None = None,
    num_workers: int = 1,
    show_progress: bool = False,
    **predict_kwargs: Any,
) -> dict[str, Any]:
    """Evaluate a batch of items and return an aggregated report.

    Supplied `predictions_batch` takes precedence. When it is None, the
    predictions are computed with `predict_batch` from `input_data_batch`.
    Per-item metrics come from `_calculate_metrics_parallel`, are combined
    by `_aggregate_metrics`, and the result is passed through
    `_finalize_evaluation_report`.

    Args:
        ground_truth_batch (Sequence[GroundTruthType]): Ground truth
            annotations, one per prediction.
        predictions_batch (Sequence[PredictionType], optional): A
            pre-computed list of predictions.
        input_data_batch (Sequence[InputDataType], optional): Inputs to
            generate predictions from when none are supplied.
        num_workers (int): Number of parallel workers for calculating
            metrics.
        show_progress (bool): Whether to show a progress bar. Passed to
            both `predict_batch` and the metric calculation.
        **predict_kwargs (Any): Additional arguments passed to
            `predict_batch`.

    Returns:
        dict[str, Any]: Dictionary containing aggregated evaluation
            metrics.

    Raises:
        ValueError: If both `predictions_batch` and `input_data_batch` are
            None, or if the number of predictions does not match the
            number of ground truths.
    """
    if predictions_batch is None and input_data_batch is None:
        raise ValueError(
            "Either 'predictions_batch' or 'input_data_batch' must be provided.",
        )

    if predictions_batch is None:
        predictions_batch = self.predict_batch(
            input_data_batch,
            show_progress=show_progress,
            **predict_kwargs,
        )

    sizes_agree = len(predictions_batch) == len(ground_truth_batch)
    if not sizes_agree:
        raise ValueError(
            f"Number of predictions ({len(predictions_batch)}) must match "
            f"number of ground truths ({len(ground_truth_batch)}).",
        )

    item_scores = self._calculate_metrics_parallel(
        predictions_batch,
        ground_truth_batch,
        num_workers,
        show_progress,
    )
    combined = self._aggregate_metrics(item_scores)
    return self._finalize_evaluation_report(
        combined,
        predictions_batch,
        ground_truth_batch,
    )
get_model_info() -> dict[str, Any]

Gets information about the loaded model.

Returns:

Type Description
dict[str, Any]

A dictionary with the keys predictor_type, model_loaded and config (the predictor's configuration as returned by model_dump()).

Source code in culicidaelab\core\base_predictor.py
def get_model_info(self) -> dict[str, Any]:
    """Summarize the predictor's type, load state and configuration.

    Returns:
        dict[str, Any]: A dictionary with three keys: ``predictor_type``
            (this predictor's type key), ``model_loaded`` (whether the
            backend currently reports a loaded model) and ``config`` (the
            result of ``self.config.model_dump()``).
    """
    info: dict[str, Any] = {}
    info["predictor_type"] = self.predictor_type
    info["model_loaded"] = self.backend.is_loaded
    info["config"] = self.config.model_dump()
    return info
load_model() -> None

Delegates model loading to the configured backend.

Source code in culicidaelab\core\base_predictor.py
def load_model(self) -> None:
    """Load the model through the configured backend, if not loaded yet.

    Does nothing when the backend already reports a loaded model.
    Otherwise the attempt and its outcome are logged and the work is
    delegated to ``self.backend.load_model()``.

    Raises:
        RuntimeError: If loading fails for any reason; the original
            exception is attached as the cause.
    """
    if self.backend.is_loaded:
        return

    self._logger.info(
        f"Loading model for {self.predictor_type} using {self.backend.__class__.__name__}",
    )
    try:
        self.backend.load_model()
        self._logger.info(f"Successfully loaded model for {self.predictor_type}")
    except Exception as e:
        # Log first so the failure is recorded even if the caller swallows
        # the re-raised error.
        self._logger.error(f"Failed to load model for {self.predictor_type}: {e}")
        raise RuntimeError(f"Failed to load model for {self.predictor_type}: {e}") from e
predict(input_data: InputDataType, **kwargs: Any) -> PredictionType

Makes a prediction on a single input data sample.

Parameters:

Name Type Description Default
input_data InputDataType

The input data (e.g., an image as a NumPy array) to make a prediction on.

required
**kwargs Any

Additional predictor-specific arguments.

{}

Returns:

Name Type Description
PredictionType PredictionType

The prediction result, with a format specific to the predictor type.

Raises:

Type Description
RuntimeError

If the model is not loaded yet and loading it fails (an unloaded model is loaded automatically before predicting).

Source code in culicidaelab\core\base_predictor.py
def predict(
    self,
    input_data: InputDataType,
    **kwargs: Any,
) -> PredictionType:
    """Run the full single-sample prediction pipeline.

    The model is loaded on demand when the backend does not report it as
    loaded. The input is then passed through `_load_and_validate_image`,
    handed to the backend's `predict`, and the raw backend output is
    converted with `_convert_raw_to_prediction`.

    Args:
        input_data (InputDataType): The input data (e.g., an image as a
            NumPy array) to make a prediction on.
        **kwargs (Any): Additional predictor-specific arguments, forwarded
            to the backend's `predict`.

    Returns:
        PredictionType: The prediction result, with a format specific to
            the predictor type.

    Raises:
        RuntimeError: If the model was not loaded and loading it failed.
    """
    needs_loading = not self.backend.is_loaded
    if needs_loading:
        try:
            self.load_model()
        except Exception as e:
            raise RuntimeError(f"Failed to load model: {e}") from e

    checked_image = self._load_and_validate_image(input_data)
    backend_output = self.backend.predict(checked_image, **kwargs)
    return self._convert_raw_to_prediction(backend_output)
predict_batch(input_data_batch: Sequence[InputDataType], show_progress: bool = False, **kwargs: Any) -> list[PredictionType]

Makes predictions on a batch of inputs by delegating to the backend.

Parameters:

Name Type Description Default
input_data_batch Sequence[InputDataType]

A sequence of inputs.

required
show_progress bool

If True, displays a progress bar.

False
**kwargs Any

Additional arguments for the backend's predict_batch.

{}

Returns:

Type Description
list[PredictionType]

list[PredictionType]: A list of prediction results.

Source code in culicidaelab\core\base_predictor.py
def predict_batch(
    self,
    input_data_batch: Sequence[InputDataType],
    show_progress: bool = False,
    **kwargs: Any,
) -> list[PredictionType]:
    """Makes predictions on a batch of inputs by delegating to the backend.

    An empty batch returns an empty list without loading the model.
    Otherwise the model is loaded on demand, the inputs are handed to the
    backend's `predict_batch` as a list, and every raw result is converted
    with `_convert_raw_to_prediction`. Unlike `predict`, the inputs are
    passed to the backend as given.

    Args:
        input_data_batch (Sequence[InputDataType]): A sequence of inputs.
        show_progress (bool): If True, displays a progress bar. The flag
            is forwarded to the backend's `predict_batch`.
        **kwargs (Any): Additional arguments for the backend's
            `predict_batch`.

    Returns:
        list[PredictionType]: A list of prediction results, in the order
            the backend returned them.
    """
    if not input_data_batch:
        return []

    if not self.backend.is_loaded:
        self.load_model()

    # Forward show_progress explicitly: it is a named parameter here, so it
    # is not part of **kwargs and would otherwise never reach the backend.
    raw_predictions = self.backend.predict_batch(
        list(input_data_batch),
        show_progress=show_progress,
        **kwargs,
    )
    return [self._convert_raw_to_prediction(raw_pred) for raw_pred in raw_predictions]
unload_model() -> None

Unloads the model to free memory.

Source code in culicidaelab\core\base_predictor.py
def unload_model(self) -> None:
    """Release the model held by the backend, if one is loaded.

    Does nothing when the backend reports no loaded model. Otherwise the
    backend's ``unload_model`` is called and the unload is logged.
    """
    if not self.backend.is_loaded:
        return

    self.backend.unload_model()
    self._logger.info(f"Unloaded model for {self.predictor_type}")
visualize(input_data: InputDataType, predictions: PredictionType, save_path: str | Path | None = None) -> np.ndarray abstractmethod

Visualizes the predictions on the input data.

Parameters:

Name Type Description Default
input_data InputDataType

The original input data (e.g., an image).

required
predictions PredictionType

The prediction result obtained from the predict method.

required
save_path str | Path

An optional path to save the visualization to a file.

None

Returns:

Type Description
ndarray

np.ndarray: A NumPy array representing the visualized image.

Source code in culicidaelab\core\base_predictor.py
@abstractmethod
def visualize(
    self,
    input_data: InputDataType,
    predictions: PredictionType,
    save_path: str | Path | None = None,
) -> np.ndarray:
    """Visualizes the predictions on the input data.

    This is an abstract method: the base implementation has no behavior and
    concrete predictors must override it.

    Args:
        input_data (InputDataType): The original input data (e.g., an image).
        predictions (PredictionType): The prediction result obtained from
            the `predict` method.
        save_path (str | Path, optional): An optional path to save the
            visualization to a file. Defaults to None.

    Returns:
        np.ndarray: A NumPy array representing the visualized image.
    """
    pass
BaseProvider

Abstract base class for all data and model providers.

This class defines the contract for providers that fetch resources like datasets and model weights.

Source code in culicidaelab\core\base_provider.py
class BaseProvider(ABC):
    """Abstract base class for all data and model providers.

    This class defines the contract for providers that fetch resources like
    datasets and model weights. All four methods are abstract, so the class
    cannot be instantiated directly. Every base implementation raises
    NotImplementedError, so a subclass that delegates to ``super()`` fails
    loudly instead of silently receiving None.
    """

    @abstractmethod
    def download_dataset(
        self,
        dataset_name: str,
        save_dir: Path | None = None,
        *args: Any,
        **kwargs: Any,
    ) -> Path:
        """Downloads a dataset from a source.

        Args:
            dataset_name (str): The name or identifier of the dataset to download.
            save_dir (Path | None, optional): The directory to save the dataset.
                If None, a default directory may be used. Defaults to None.
            *args: Additional positional arguments for the provider's implementation.
            **kwargs: Additional keyword arguments for the provider's implementation.

        Returns:
            Path: The path to the downloaded dataset directory or file.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def download_model_weights(
        self,
        repo_id: str,
        filename: str,
        local_dir: Path,
        *args: Any,
        **kwargs: Any,
    ) -> Path:
        """Downloads model weights and returns the path to them.

        Args:
            repo_id (str): The repository ID from which to download the model
                (e.g., 'culicidae/mosquito-detector').
            filename (str): The name of the weights file in the repository.
            local_dir (Path): The local directory to save the weights file.
            *args: Additional positional arguments for the provider's implementation.
            **kwargs: Additional keyword arguments for the provider's implementation.

        Returns:
            Path: The local path to the downloaded model weights file.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def get_provider_name(self) -> str:
        """Gets the unique name of the provider.

        Returns:
            str: A string representing the provider's name (e.g., 'huggingface').

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        # Raise like the sibling abstract methods instead of falling through
        # and returning None, which would violate the `-> str` contract.
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def load_dataset(
        self,
        dataset_path: str | Path,
        **kwargs: Any,
    ) -> Any:
        """Loads a dataset from a local path.

        This method is responsible for loading a dataset that has already been
        downloaded to the local filesystem.

        Args:
            dataset_path (str | Path): The local path to the dataset, typically
                a path returned by `download_dataset`.
            **kwargs: Additional keyword arguments for loading the dataset, which
                may vary by provider and dataset format.

        Returns:
            Any: The loaded dataset object, which could be a Hugging Face Dataset,
                a PyTorch Dataset, a Pandas DataFrame, or another format.

        Raises:
            NotImplementedError: If the method is not implemented by a subclass.
        """
        raise NotImplementedError("Subclasses must implement this method")
download_dataset(dataset_name: str, save_dir: Path | None = None, *args: Any, **kwargs: Any) -> Path abstractmethod

Downloads a dataset from a source.

Parameters:

Name Type Description Default
dataset_name str

The name or identifier of the dataset to download.

required
save_dir Path | None

The directory to save the dataset. If None, a default directory may be used. Defaults to None.

None
*args Any

Additional positional arguments for the provider's implementation.

()
**kwargs Any

Additional keyword arguments for the provider's implementation.

{}

Returns:

Name Type Description
Path Path

The path to the downloaded dataset directory or file.

Raises:

Type Description
NotImplementedError

If the method is not implemented by a subclass.

Source code in culicidaelab\core\base_provider.py
@abstractmethod
def download_dataset(
    self,
    dataset_name: str,
    save_dir: Path | None = None,
    *args: Any,
    **kwargs: Any,
) -> Path:
    """Downloads a dataset from a source.

    This is an abstract method; the base implementation only raises, so
    concrete providers must override it.

    Args:
        dataset_name (str): The name or identifier of the dataset to download.
        save_dir (Path | None, optional): The directory to save the dataset.
            If None, a default directory may be used. Defaults to None.
        *args: Additional positional arguments for the provider's implementation.
        **kwargs: Additional keyword arguments for the provider's implementation.

    Returns:
        Path: The path to the downloaded dataset directory or file.

    Raises:
        NotImplementedError: If the method is not implemented by a subclass
            (the base implementation always raises).
    """
    raise NotImplementedError("Subclasses must implement this method")
download_model_weights(repo_id: str, filename: str, local_dir: Path, *args: Any, **kwargs: Any) -> Path abstractmethod

Downloads model weights and returns the path to them.

Parameters:

Name Type Description Default
repo_id str

The repository ID from which to download the model (e.g., 'culicidae/mosquito-detector').

required
filename str

The name of the weights file in the repository.

required
local_dir Path

The local directory to save the weights file.

required
*args Any

Additional positional arguments for the provider's implementation.

()
**kwargs Any

Additional keyword arguments for the provider's implementation.

{}

Returns:

Name Type Description
Path Path

The local path to the downloaded model weights file.

Raises:

Type Description
NotImplementedError

If the method is not implemented by a subclass.

Source code in culicidaelab\core\base_provider.py
@abstractmethod
def download_model_weights(
    self,
    repo_id: str,
    filename: str,
    local_dir: Path,
    *args: Any,
    **kwargs: Any,
) -> Path:
    """Downloads model weights and returns the path to them.

    This is an abstract method; the base implementation only raises, so
    concrete providers must override it.

    Args:
        repo_id (str): The repository ID from which to download the model
            (e.g., 'culicidae/mosquito-detector').
        filename (str): The name of the weights file in the repository.
        local_dir (Path): The local directory to save the weights file.
        *args: Additional positional arguments for the provider's implementation.
        **kwargs: Additional keyword arguments for the provider's implementation.

    Returns:
        Path: The local path to the downloaded model weights file.

    Raises:
        NotImplementedError: If the method is not implemented by a subclass
            (the base implementation always raises).
    """
    raise NotImplementedError("Subclasses must implement this method")
get_provider_name() -> str abstractmethod

Gets the unique name of the provider.

Returns:

Name Type Description
str str

A string representing the provider's name (e.g., 'huggingface').

Source code in culicidaelab\core\base_provider.py
@abstractmethod
def get_provider_name(self) -> str:
    """Gets the unique name of the provider.

    Returns:
        str: A string representing the provider's name (e.g., 'huggingface').

    Raises:
        NotImplementedError: If the method is not implemented by a subclass.
    """
    # Raise like the other abstract provider methods instead of falling
    # through and returning None, which would violate the `-> str` contract.
    raise NotImplementedError("Subclasses must implement this method")
load_dataset(dataset_path: str | Path, **kwargs: Any) -> Any abstractmethod

Loads a dataset from a local path.

This method is responsible for loading a dataset that has already been downloaded to the local filesystem.

Parameters:

Name Type Description Default
dataset_path str | Path

The local path to the dataset, typically a path returned by download_dataset.

required
**kwargs Any

Additional keyword arguments for loading the dataset, which may vary by provider and dataset format.

{}

Returns:

Name Type Description
Any Any

The loaded dataset object, which could be a Hugging Face Dataset, a PyTorch Dataset, a Pandas DataFrame, or another format.

Raises:

Type Description
NotImplementedError

If the method is not implemented by a subclass.

Source code in culicidaelab\core\base_provider.py
@abstractmethod
def load_dataset(
    self,
    dataset_path: str | Path,
    **kwargs: Any,
) -> Any:
    """Loads a dataset from a local path.

    This method is responsible for loading a dataset that has already been
    downloaded to the local filesystem. It is abstract; the base
    implementation only raises, so concrete providers must override it.

    Args:
        dataset_path (str | Path): The local path to the dataset, typically
            a path returned by `download_dataset`.
        **kwargs: Additional keyword arguments for loading the dataset, which
            may vary by provider and dataset format.

    Returns:
        Any: The loaded dataset object, which could be a Hugging Face Dataset,
            a PyTorch Dataset, a Pandas DataFrame, or another format.

    Raises:
        NotImplementedError: If the method is not implemented by a subclass
            (the base implementation always raises).
    """
    raise NotImplementedError("Subclasses must implement this method")
WeightsManagerProtocol
Source code in culicidaelab\core\weights_manager_protocol.py
class WeightsManagerProtocol(Protocol):
    """Structural interface for objects that can provide model weight paths.

    Any object exposing a compatible `ensure_weights` method satisfies this
    protocol; no inheritance from this class is required.
    """

    def ensure_weights(self, predictor_type: str, backend_type: str) -> Path:
        """Ensures model weights are available locally and returns their path.

        This method is responsible for managing model weight files, including checking
        their existence, downloading if necessary, and providing the absolute path to
        the weights file. It abstracts away the details of weight file management from
        the rest of the system.

        Args:
            predictor_type (str): The type of predictor requiring the weights.
                Common values include 'classifier', 'detector', or 'segmenter'.
            backend_type (str): The backend framework for which the weights are needed.
                Examples include 'fastai', 'onnx', 'yolo', or 'sam'.

        Returns:
            Path: Absolute path to the model weights file. The returned path is
                guaranteed to exist and be accessible.

        Example:
            ```python
            # `your_module.WeightsManager` stands for any concrete
            # implementation of this protocol.
            from your_module import WeightsManager

            weights_manager = WeightsManager()

            # Get weights for a FastAI classifier
            classifier_weights = weights_manager.ensure_weights(
                predictor_type="classifier",
                backend_type="fastai"
            )

            # Use the weights in a model
            model.load_state_dict(torch.load(classifier_weights))
            ```

        Note:
            Implementations should handle various scenarios such as:

            - Checking if weights exist locally
            - Downloading weights from remote sources if needed
            - Validating weight file integrity
            - Managing weight file versions
            - Handling download failures and retry logic
        """
        ...
ensure_weights(predictor_type: str, backend_type: str) -> Path

Ensures model weights are available locally and returns their path.

This method is responsible for managing model weight files, including checking their existence, downloading if necessary, and providing the absolute path to the weights file. It abstracts away the details of weight file management from the rest of the system.

Parameters:

Name Type Description Default
predictor_type str

The type of predictor requiring the weights. Common values include 'classifier', 'detector', or 'segmenter'.

required
backend_type str

The backend framework for which the weights are needed. Examples include 'fastai', 'onnx', 'yolo', or 'sam'.

required

Returns:

Name Type Description
Path Path

Absolute path to the model weights file. The returned path is guaranteed to exist and be accessible.

Example
from your_module import WeightsManager

weights_manager = WeightsManager()

# Get weights for a FastAI classifier
classifier_weights = weights_manager.ensure_weights(
    predictor_type="classifier",
    backend_type="fastai"
)

# Use the weights in a model
model.load_state_dict(torch.load(classifier_weights))
Note

Implementations should handle various scenarios such as:

- Checking if weights exist locally
- Downloading weights from remote sources if needed
- Validating weight file integrity
- Managing weight file versions
- Handling download failures and retry logic

Source code in culicidaelab\core\weights_manager_protocol.py
def ensure_weights(self, predictor_type: str, backend_type: str) -> Path:
    """Ensures model weights are available locally and returns their path.

    This method is responsible for managing model weight files, including checking
    their existence, downloading if necessary, and providing the absolute path to
    the weights file. It abstracts away the details of weight file management from
    the rest of the system. The protocol itself carries no implementation.

    Args:
        predictor_type (str): The type of predictor requiring the weights.
            Common values include 'classifier', 'detector', or 'segmenter'.
        backend_type (str): The backend framework for which the weights are needed.
            Examples include 'fastai', 'onnx', 'yolo', or 'sam'.

    Returns:
        Path: Absolute path to the model weights file. The returned path is
            guaranteed to exist and be accessible.

    Example:
        ```python
        # `your_module.WeightsManager` stands for any concrete
        # implementation of this protocol.
        from your_module import WeightsManager

        weights_manager = WeightsManager()

        # Get weights for a FastAI classifier
        classifier_weights = weights_manager.ensure_weights(
            predictor_type="classifier",
            backend_type="fastai"
        )

        # Use the weights in a model
        model.load_state_dict(torch.load(classifier_weights))
        ```

    Note:
        Implementations should handle various scenarios such as:

        - Checking if weights exist locally
        - Downloading weights from remote sources if needed
        - Validating weight file integrity
        - Managing weight file versions
        - Handling download failures and retry logic
    """
    ...
BaseInferenceBackend

Abstract base class for an inference backend.

This class defines the required methods for an inference backend, which is responsible for loading a model and running predictions. It includes a default implementation for batch prediction that iterates through single predictions.

Attributes:

Name Type Description
predictor_type str

The type of predictor this backend serves (e.g., 'classifier').

model Any

The loaded model object. Initially None.

Source code in culicidaelab\core\base_inference_backend.py
class BaseInferenceBackend(Generic[InputDataType, PredictionType], ABC):
    """Common interface for inference backends.

    A backend owns a model object and knows how to load it and run
    predictions with it. Subclasses must implement `load_model` and
    `predict`; unloading, the loaded-state check and an item-by-item batch
    prediction are provided here.

    Attributes:
        predictor_type (str): The type of predictor this backend serves
            (e.g., 'classifier').
        model (Any): The loaded model object. None until a model is loaded
            and again after `unload_model`.
    """

    def __init__(
        self,
        predictor_type: str,
    ):
        """Store the predictor type and start with no model loaded.

        Args:
            predictor_type: The type of predictor (e.g., 'classifier', 'detector').
        """
        self.model: Any = None
        self.predictor_type = predictor_type

    @abstractmethod
    def load_model(self, **kwargs: Any) -> None:
        """Load the model into memory.

        Implementations should handle all aspects of model loading, such as
        reading weights from a file and preparing the model for inference.

        Args:
            **kwargs: Backend-specific arguments for model loading.
        """
        ...

    @abstractmethod
    def predict(self, input_data: InputDataType, **kwargs: Any) -> PredictionType:
        """Run a prediction on a single input.

        Args:
            input_data: The data to be processed by the model.
            **kwargs: Additional backend-specific arguments for prediction.

        Returns:
            The prediction result.
        """
        ...

    def unload_model(self) -> None:
        """Drop the reference to the model and log that it was unloaded.

        Setting `model` to None is intended to let the memory held by the
        model (especially GPU memory) be reclaimed.
        """
        self.model = None
        logger.info(f"Model for {self.predictor_type} has been unloaded.")

    @property
    def is_loaded(self) -> bool:
        """Whether a model object is currently held.

        Returns:
            True if `model` is not None, False otherwise.
        """
        return not (self.model is None)

    def predict_batch(
        self,
        input_data_batch: list[InputDataType],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> list[PredictionType]:
        """Predict every item of a batch, one `predict` call per item.

        This default implementation is purely iterative. Backends that
        support native batching should override it for better performance.

        Args:
            input_data_batch: A list of inputs to process.
            show_progress: If True, the batch is iterated through
                `progress_bar` so that progress is displayed.
            **kwargs: Additional arguments to pass to the `predict` method.
                When the model has to be loaded first, the same arguments
                are also passed to `load_model`.

        Returns:
            A list of prediction results, in input order. An empty batch
            yields an empty list without loading the model.
        """
        if not input_data_batch:
            return []

        if not self.is_loaded:
            self.load_model(**kwargs)

        if show_progress:
            batch_iter = progress_bar(input_data_batch, total=len(input_data_batch))
        else:
            batch_iter = input_data_batch

        # One backend call per sample; results keep the input order.
        collected = []
        for sample in batch_iter:
            collected.append(self.predict(sample, **kwargs))
        return collected
predictor_type = predictor_type instance-attribute
model: Any = None instance-attribute
is_loaded: bool property

Checks if the model is loaded into memory.

Returns:

Type Description
bool

True if the model is loaded, False otherwise.

__init__(predictor_type: str)

Initializes the BaseInferenceBackend.

Parameters:

Name Type Description Default
predictor_type str

The type of predictor (e.g., 'classifier', 'detector').

required
Source code in culicidaelab\core\base_inference_backend.py
def __init__(
    self,
    predictor_type: str,
) -> None:
    """Initializes the BaseInferenceBackend.

    Stores the predictor type and starts with no model loaded.

    Args:
        predictor_type: The type of predictor (e.g., 'classifier', 'detector').
    """
    self.predictor_type = predictor_type
    # No model is held until one is loaded.
    self.model: Any = None
load_model(**kwargs: Any) -> None abstractmethod

Loads the model into memory.

This method should handle all aspects of model loading, such as reading weights from a file and preparing the model for inference.

Parameters:

Name Type Description Default
**kwargs Any

Backend-specific arguments for model loading.

{}
Source code in culicidaelab\core\base_inference_backend.py
@abstractmethod
def load_model(self, **kwargs: Any) -> None:
    """Loads the model into memory.

    This method should handle all aspects of model loading, such as reading
    weights from a file and preparing the model for inference. It is
    abstract: the base class provides no implementation.

    Args:
        **kwargs: Backend-specific arguments for model loading.
    """
    ...
predict(input_data: InputDataType, **kwargs: Any) -> PredictionType abstractmethod

Runs a prediction on a single input.

Parameters:

Name Type Description Default
input_data InputDataType

The data to be processed by the model.

required
**kwargs Any

Additional backend-specific arguments for prediction.

{}

Returns:

Type Description
PredictionType

The prediction result.

Source code in culicidaelab\core\base_inference_backend.py
@abstractmethod
def predict(self, input_data: InputDataType, **kwargs: Any) -> PredictionType:
    """Runs a prediction on a single input.

    This method is abstract: the base class provides no implementation.

    Args:
        input_data: The data to be processed by the model.
        **kwargs: Additional backend-specific arguments for prediction.

    Returns:
        The prediction result.
    """
    ...
unload_model() -> None

Unloads the model and releases resources.

This method is intended to free up memory (especially GPU memory) by deleting the model instance.

Source code in culicidaelab\core\base_inference_backend.py
def unload_model(self) -> None:
    """Drops the reference to the loaded model and logs that it happened.

    Clearing `self.model` is meant to let the model's memory (especially GPU
    memory) be reclaimed.
    """
    self.model = None
    message = f"Model for {self.predictor_type} has been unloaded."
    logger.info(message)
predict_batch(input_data_batch: list[InputDataType], show_progress: bool = False, **kwargs: Any) -> list[PredictionType]

Makes predictions on a batch of inputs.

This method provides a default implementation that iterates through the batch and calls predict for each item. Backends that support native batching should override this method for better performance.

Parameters:

Name Type Description Default
input_data_batch list[InputDataType]

A list of inputs to process.

required
show_progress bool

If True, displays a progress bar.

False
**kwargs Any

Additional arguments to pass to the predict method.

{}

Returns:

Type Description
list[PredictionType]

A list of prediction results.

Source code in culicidaelab\core\base_inference_backend.py
def predict_batch(
    self,
    input_data_batch: list[InputDataType],
    show_progress: bool = False,
    **kwargs: Any,
) -> list[PredictionType]:
    """Runs `predict` over every item of a batch and collects the results.

    This is the generic fallback: items are processed one at a time, in order.
    Backends with native batching support should override it for speed.

    Args:
        input_data_batch: A list of inputs to process.
        show_progress: If True, displays a progress bar.
        **kwargs: Additional arguments to pass to the `predict` method. They
            are also forwarded to `load_model` when the model has to be
            loaded first.

    Returns:
        A list of prediction results, one per input, in input order.
    """
    # An empty batch short-circuits before the model is touched.
    if not input_data_batch:
        return []

    # Lazy loading: only load when nothing is loaded yet.
    if not self.is_loaded:
        self.load_model(**kwargs)

    if show_progress:
        items = progress_bar(input_data_batch, total=len(input_data_batch))
    else:
        items = input_data_batch

    results = []
    for item in items:
        results.append(self.predict(item, **kwargs))
    return results
ConfigManager

Handles loading, merging, and validating configurations for the library.

This manager implements a robust loading strategy:

1. Loads default YAML configurations bundled with the library.
2. Loads user-provided YAML configurations from a specified directory.
3. Merges the user's configuration on top of the defaults.
4. Validates the final merged configuration against Pydantic models.

Attributes:

Name Type Description
user_config_dir Path | None

The user configuration directory.

default_config_path Path

The path to the default config directory.

config CulicidaeLabConfig

The validated configuration object.

Source code in culicidaelab\core\config_manager.py
class ConfigManager:
    """Handles loading, merging, and validating configurations for the library.

    This manager implements a robust loading strategy:
    1. Loads default YAML configurations bundled with the library.
    2. Loads user-provided YAML configurations from a specified directory.
    3. Merges the user's configuration on top of the defaults.
    4. Validates the final merged configuration against Pydantic models.

    Attributes:
        user_config_dir (Path | None): The user configuration directory.
        default_config_path (Path): The path to the default config directory.
        config (CulicidaeLabConfig): The validated configuration object.
    """

    def __init__(self, user_config_dir: str | Path | None = None):
        """Initializes the ConfigManager.

        The configuration is loaded, merged and validated immediately, so a
        validation failure surfaces at construction time.

        Args:
            user_config_dir (str | Path, optional): Path to a directory containing
                user-defined YAML configuration files. These will override the
                defaults. Defaults to None.
        """
        self.user_config_dir = Path(user_config_dir) if user_config_dir else None
        self.default_config_path = self._get_default_config_path()
        self.config: CulicidaeLabConfig = self._load()

    def get_config(self) -> CulicidaeLabConfig:
        """Returns the fully validated Pydantic configuration object.

        Returns:
            CulicidaeLabConfig: The `CulicidaeLabConfig` Pydantic model instance.
        """
        return self.config

    def instantiate_from_config(
        self,
        config_obj: Any,
        extra_params: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> Any:
        """Instantiates a Python object from its Pydantic config model.

        The config model must have a `target` field specifying the fully
        qualified class path (e.g., 'my_module.my_class.MyClass').

        Constructor arguments are assembled with this precedence (lowest to
        highest): `extra_params`, then the fields dumped from `config_obj`,
        then `**kwargs`. If the target's signature does not accept `**kwargs`,
        any assembled argument that is not a named parameter is dropped.

        Args:
            config_obj (Any): A Pydantic model instance (e.g., a predictor config).
            extra_params (dict[str, Any] | None, optional): A dictionary of
                extra parameters to inject into the constructor. Defaults to None.
            **kwargs (Any): Additional keyword arguments to pass to the object's
                constructor, overriding any existing parameters in the config.

        Returns:
            Any: An instantiated Python object.

        Raises:
            ValueError: If the `target` key is not found in the config object.
            ImportError: If the class could not be imported and instantiated.
                The underlying exception is attached as `__cause__`.
        """
        if not hasattr(config_obj, "target"):
            raise ValueError("Target key 'target' not found in configuration object")

        targetpath = config_obj.target
        config_params = config_obj.model_dump()
        # `target` names the class to build; it is not a constructor argument.
        config_params.pop("target", None)

        # Later sources win: extra_params < config fields < explicit kwargs.
        final_params: dict[str, Any] = {
            **(extra_params or {}),
            **config_params,
            **kwargs,
        }

        try:
            module_path, class_name = targetpath.rsplit(".", 1)
            module = __import__(module_path, fromlist=[class_name])
            cls = getattr(module, class_name)

            sig = inspect.signature(cls)
            has_kwargs = any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values())

            if has_kwargs:
                filtered_params = final_params
            else:
                # Without a **kwargs catch-all, unknown keys would raise a
                # TypeError, so keep only the parameters the signature names.
                allowed_keys = set(sig.parameters)
                filtered_params = {k: v for k, v in final_params.items() if k in allowed_keys}

            return cls(**filtered_params)
        except (ValueError, ImportError, AttributeError, TypeError) as e:
            # Chain explicitly so the original failure stays reachable via
            # `__cause__` instead of only as implicit context.
            raise ImportError(
                f"Could not import and instantiate '{targetpath}': {e}",
            ) from e

    def save_config(self, file_path: str | Path) -> None:
        """Saves the current configuration state to a YAML file.

        This is useful for exporting the fully merged and validated config.
        Missing parent directories of `file_path` are created.

        Args:
            file_path (str | Path): The path where the YAML config will be saved.
        """
        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        config_dict = self.config.model_dump(mode="json")
        OmegaConf.save(config=config_dict, f=path)

    def _get_default_config_path(self) -> Path:
        """Reliably finds the path to the bundled 'conf' directory.

        Returns:
            Path: The absolute path to the default configuration directory.

        Raises:
            FileNotFoundError: If the default 'conf' directory cannot be found.
        """
        try:
            files = resources.files("culicidaelab")
            # Check for Traversable with _path (for installed packages).
            # NOTE(review): `_path` is a private attribute of the returned
            # object; confirm it is still present on supported Python versions.
            if hasattr(files, "_path"):
                return Path(files._path) / "conf"
            # Otherwise, use string representation (for zip files, etc.)
            return Path(str(files)) / "conf"
        except (ModuleNotFoundError, FileNotFoundError):
            # Fallback for development mode
            dev_path = Path(__file__).parent.parent / "conf"
            if dev_path.exists():
                return dev_path
            raise FileNotFoundError(
                "Could not find the default 'conf' directory. "
                "Ensure the 'culicidaelab' package is installed correctly or "
                "you are in the project root.",
            )

    def _load(self) -> CulicidaeLabConfig:
        """Executes the full load, merge, and validation process.

        Returns:
            CulicidaeLabConfig: The validated configuration object.

        Raises:
            ValidationError: If the merged configuration fails Pydantic validation.
        """
        default_config_dict = self._load_config_from_dir(
            cast(Path, self.default_config_path),
        )
        user_config_dict = self._load_config_from_dir(self.user_config_dir)

        # User configs override defaults
        merged_config = _deep_merge(user_config_dict, default_config_dict)

        try:
            return CulicidaeLabConfig(**merged_config)
        except ValidationError as e:
            # Report the failure, then re-raise so the caller still sees it.
            print(
                "FATAL: Configuration validation failed. Please check your "
                "YAML files or environment variables.",
            )
            print(e)
            raise

    def _load_config_from_dir(self, config_dir: Path | None) -> ConfigDict:
        """Loads all YAML files from a directory into a nested dictionary.

        The dictionary structure mirrors the directory structure: each
        sub-directory and each file stem becomes a key. Empty files are
        skipped. A file that cannot be read or parsed is reported with a
        warning and does not abort the load.

        Args:
            config_dir (Path | None): Directory containing YAML config files, or None.

        Returns:
            ConfigDict: A nested dictionary containing the loaded configuration.
                Empty if `config_dir` is None or is not a directory.
        """
        config_dict: ConfigDict = {}
        if config_dir is None or not config_dir.is_dir():
            return config_dict

        for yaml_file in config_dir.glob("**/*.yaml"):
            try:
                # Decode as UTF-8 explicitly; otherwise the platform locale
                # encoding is used and non-ASCII content may be misread.
                with yaml_file.open("r", encoding="utf-8") as f:
                    data = yaml.safe_load(f)
                if data is None:
                    continue

                relative_path = yaml_file.relative_to(config_dir)
                keys = list(relative_path.parts[:-1]) + [relative_path.stem]

                # Walk/create the nested dicts for the parent directories,
                # then store the file's content under its stem.
                d = config_dict
                for key in keys[:-1]:
                    d = d.setdefault(key, {})
                d[keys[-1]] = data
            except Exception as e:
                # Best effort by design: one bad file must not block the rest.
                print(f"Warning: Could not load or parse {yaml_file}: {e}")
        return config_dict
user_config_dir = Path(user_config_dir) if user_config_dir else None instance-attribute
default_config_path = self._get_default_config_path() instance-attribute
config: CulicidaeLabConfig = self._load() instance-attribute
__init__(user_config_dir: str | Path | None = None)

Initializes the ConfigManager.

Parameters:

Name Type Description Default
user_config_dir str | Path | None

Path to a directory containing user-defined YAML configuration files. These will override the defaults. Defaults to None.

None
Source code in culicidaelab\core\config_manager.py
def __init__(self, user_config_dir: str | Path | None = None):
    """Builds the manager and loads the configuration right away.

    Args:
        user_config_dir (str | Path, optional): Path to a directory containing
            user-defined YAML configuration files. These will override the
            defaults. Defaults to None.
    """
    # Any falsy value (None, empty string) means "no user directory".
    if user_config_dir:
        self.user_config_dir = Path(user_config_dir)
    else:
        self.user_config_dir = None

    # Both paths are set before `_load` is called.
    self.default_config_path = self._get_default_config_path()
    self.config: CulicidaeLabConfig = self._load()
get_config() -> CulicidaeLabConfig

Returns the fully validated Pydantic configuration object.

Returns:

Name Type Description
CulicidaeLabConfig CulicidaeLabConfig

The CulicidaeLabConfig Pydantic model instance.

Source code in culicidaelab\core\config_manager.py
def get_config(self) -> CulicidaeLabConfig:
    """Returns the fully validated Pydantic configuration object.

    This is a plain accessor: it hands back the object stored in
    `self.config`, not a copy.

    Returns:
        CulicidaeLabConfig: The `CulicidaeLabConfig` Pydantic model instance.
    """
    return self.config
instantiate_from_config(config_obj: Any, extra_params: dict[str, Any] | None = None, **kwargs: Any) -> Any

Instantiates a Python object from its Pydantic config model.

The config model must have a target field specifying the fully qualified class path (e.g., 'my_module.my_class.MyClass').

Parameters:

Name Type Description Default
config_obj Any

A Pydantic model instance (e.g., a predictor config).

required
extra_params dict[str, Any] | None

A dictionary of extra parameters to inject into the constructor. Defaults to None.

None
**kwargs Any

Additional keyword arguments to pass to the object's constructor, overriding any existing parameters in the config.

{}

Returns:

Name Type Description
Any Any

An instantiated Python object.

Raises:

Type Description
ValueError

If the target key is not found in the config object.

ImportError

If the class could not be imported and instantiated.

Source code in culicidaelab\core\config_manager.py
def instantiate_from_config(
    self,
    config_obj: Any,
    extra_params: dict[str, Any] | None = None,
    **kwargs: Any,
) -> Any:
    """Instantiates a Python object from its Pydantic config model.

    The config model must have a `target` field specifying the fully
    qualified class path (e.g., 'my_module.my_class.MyClass').

    Constructor arguments are assembled with this precedence (lowest to
    highest): `extra_params`, then the fields dumped from `config_obj`,
    then `**kwargs`. If the target's signature does not accept `**kwargs`,
    any assembled argument that is not a named parameter is dropped.

    Args:
        config_obj (Any): A Pydantic model instance (e.g., a predictor config).
        extra_params (dict[str, Any] | None, optional): A dictionary of
            extra parameters to inject into the constructor. Defaults to None.
        **kwargs (Any): Additional keyword arguments to pass to the object's
            constructor, overriding any existing parameters in the config.

    Returns:
        Any: An instantiated Python object.

    Raises:
        ValueError: If the `target` key is not found in the config object.
        ImportError: If the class could not be imported and instantiated.
            The underlying exception is attached as `__cause__`.
    """
    if not hasattr(config_obj, "target"):
        raise ValueError("Target key 'target' not found in configuration object")

    targetpath = config_obj.target
    config_params = config_obj.model_dump()
    # `target` names the class to build; it is not a constructor argument.
    config_params.pop("target", None)

    # Later sources win: extra_params < config fields < explicit kwargs.
    final_params: dict[str, Any] = {
        **(extra_params or {}),
        **config_params,
        **kwargs,
    }

    try:
        module_path, class_name = targetpath.rsplit(".", 1)
        module = __import__(module_path, fromlist=[class_name])
        cls = getattr(module, class_name)

        sig = inspect.signature(cls)
        has_kwargs = any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values())

        if has_kwargs:
            filtered_params = final_params
        else:
            # Without a **kwargs catch-all, unknown keys would raise a
            # TypeError, so keep only the parameters the signature names.
            allowed_keys = set(sig.parameters)
            filtered_params = {k: v for k, v in final_params.items() if k in allowed_keys}

        return cls(**filtered_params)
    except (ValueError, ImportError, AttributeError, TypeError) as e:
        # Chain explicitly so the original failure stays reachable via
        # `__cause__` instead of only as implicit context.
        raise ImportError(
            f"Could not import and instantiate '{targetpath}': {e}",
        ) from e
save_config(file_path: str | Path) -> None

Saves the current configuration state to a YAML file.

This is useful for exporting the fully merged and validated config.

Parameters:

Name Type Description Default
file_path str | Path

The path where the YAML config will be saved.

required
Source code in culicidaelab\core\config_manager.py
def save_config(self, file_path: str | Path) -> None:
    """Writes the current configuration to `file_path` as YAML.

    Handy for exporting the fully merged and validated config. Any missing
    parent directories of the destination are created first.

    Args:
        file_path (str | Path): The path where the YAML config will be saved.
    """
    destination = Path(file_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    OmegaConf.save(config=self.config.model_dump(mode="json"), f=destination)
CulicidaeLabConfig

The root Pydantic model for all CulicidaeLab configurations.

This model validates the entire configuration structure after it is loaded from YAML files, serving as the single source of truth for all settings.

Attributes:

Name Type Description
config_version str

The version of the configuration schema. This is used to ensure compatibility with the library version.

app_settings AppSettings

Core application settings.

processing ProcessingConfig

Default processing parameters.

datasets dict[str, DatasetConfig]

A mapping of dataset names to their configs.

predictors dict[str, PredictorConfig]

A mapping of predictor names to their configs.

providers dict[str, ProviderConfig]

A mapping of provider names to their configs.

species SpeciesModel

Configuration and metadata related to all species.

Source code in culicidaelab\core\config_models.py
class CulicidaeLabConfig(BaseModel):
    """The root Pydantic model for all CulicidaeLab configurations.

    This model validates the entire configuration structure after it is loaded
    from YAML files, serving as the single source of truth for all settings.

    Every field below declares a default value or a default factory, so this
    model itself does not require any particular top-level key to be present.

    Attributes:
        config_version (str): The version of the configuration schema. This is used
            to ensure compatibility with the library version.
        app_settings (AppSettings): Core application settings.
        processing (ProcessingConfig): Default processing parameters.
        datasets (dict[str, DatasetConfig]): A mapping of dataset names to their configs.
        predictors (dict[str, PredictorConfig]): A mapping of predictor names to their configs.
        providers (dict[str, ProviderConfig]): A mapping of provider names to their configs.
        species (SpeciesModel): Configuration and metadata related to all species.
    """

    # extra="allow": keys that are not declared as fields are accepted and
    # kept on the instance instead of failing validation.
    model_config = ConfigDict(extra="allow")
    # Falls back to the library's CONFIG_SCHEMA_VERSION when not provided.
    config_version: str = Field(default=CONFIG_SCHEMA_VERSION)
    # default_factory builds a fresh default object per model instance.
    app_settings: AppSettings = Field(default_factory=AppSettings)
    processing: ProcessingConfig = Field(default_factory=ProcessingConfig)
    # The three registries below default to empty dicts.
    datasets: dict[str, DatasetConfig] = Field(default_factory=dict)
    predictors: dict[str, PredictorConfig] = Field(default_factory=dict)
    providers: dict[str, ProviderConfig] = Field(default_factory=dict)
    species: SpeciesModel = Field(default_factory=SpeciesModel)
model_config = ConfigDict(extra='allow') class-attribute instance-attribute
config_version: str = Field(default=CONFIG_SCHEMA_VERSION) class-attribute instance-attribute
app_settings: AppSettings = Field(default_factory=AppSettings) class-attribute instance-attribute
processing: ProcessingConfig = Field(default_factory=ProcessingConfig) class-attribute instance-attribute
datasets: dict[str, DatasetConfig] = Field(default_factory=dict) class-attribute instance-attribute
predictors: dict[str, PredictorConfig] = Field(default_factory=dict) class-attribute instance-attribute
providers: dict[str, ProviderConfig] = Field(default_factory=dict) class-attribute instance-attribute
species: SpeciesModel = Field(default_factory=SpeciesModel) class-attribute instance-attribute
DatasetConfig

Configuration for a single dataset.

Attributes:

Name Type Description
name str

The unique internal name for the dataset.

path str

The local directory path for storing the dataset.

format str

The dataset format (e.g., "imagefolder", "coco", "yolo").

classes list[str]

A list of class names present in the dataset.

provider_name str

The name of the data provider (e.g., "huggingface").

repository str

The repository ID on the provider's platform.

config_name str | None

The specific configuration of a Hugging Face dataset.

derived_datasets list[str] | None

A list of Hugging Face repository IDs for datasets that were derived from this one. Defaults to None.

trained_models_repositories list[str] | None

A list of Hugging Face repository IDs for models trained on this dataset. Defaults to None.

Source code in culicidaelab\core\config_models.py
class DatasetConfig(BaseModel):
    """Configuration for a single dataset.

    The first six fields have no default and are therefore required; the
    remaining three are optional.

    Attributes:
        name (str): The unique internal name for the dataset.
        path (str): The local directory path for storing the dataset.
        format (str): The dataset format (e.g., "imagefolder", "coco", "yolo").
        classes (list[str]): A list of class names present in the dataset.
        provider_name (str): The name of the data provider (e.g., "huggingface").
        repository (str): The repository ID on the provider's platform.
        config_name (str | None): The specific configuration of a Hugging Face
            dataset. Defaults to "default".
        derived_datasets (list[str] | None): A list of Hugging Face repository IDs
            for datasets that were derived from this one. Defaults to None.
        trained_models_repositories (list[str] | None): A list of Hugging Face
            repository IDs for models trained on this dataset. Defaults to None.
    """

    # extra="allow": keys that are not declared as fields are accepted and
    # kept on the instance instead of failing validation.
    model_config = ConfigDict(extra="allow")
    # Required fields (no default).
    name: str
    path: str
    format: str
    classes: list[str]
    provider_name: str
    repository: str
    # Optional fields. Note that the default here is the string "default",
    # not None, even though None is an accepted value.
    config_name: str | None = "default"
    derived_datasets: list[str] | None = None
    trained_models_repositories: list[str] | None = None
model_config = ConfigDict(extra='allow') class-attribute instance-attribute
name: str instance-attribute
path: str instance-attribute
format: str instance-attribute
classes: list[str] instance-attribute
provider_name: str instance-attribute
repository: str instance-attribute
config_name: str | None = 'default' class-attribute instance-attribute
derived_datasets: list[str] | None = None class-attribute instance-attribute
trained_models_repositories: list[str] | None = None class-attribute instance-attribute
PredictorConfig

Configuration for a single inference predictor.

This model defines how to load and use a specific pre-trained model for inference.

Attributes:

Name Type Description
target str

The fully qualified import path to the predictor class (e.g., culicidaelab.models.YOLOv8Predictor).

confidence float

The default confidence threshold for this predictor.

device str

The compute device to use ("cpu" or "cuda").

backend str | None

The specific inference backend to use (e.g., 'yolo').

params dict[str, Any]

A dictionary of extra parameters to pass to the predictor's constructor.

repository_id str | None

The Hugging Face Hub repository ID for the model.

weights dict[str, WeightDetails] | None

A mapping of backend names to their weight details.

provider_name str | None

The name of the provider (e.g., "huggingface").

model_arch str | None

The model architecture name (e.g., "yolov8n-seg").

model_config_path str | None

The path to the model's specific config file.

model_config_filename str | None

The filename of the model's config.

visualization VisualizationConfig

Custom visualization settings for this predictor.

Source code in culicidaelab\core\config_models.py
class PredictorConfig(BaseModel):
    """Configuration for a single inference predictor.

    This model defines how to load and use a specific pre-trained model for inference.
    Only `target` is required; every other field has a default.

    Attributes:
        target (str): The fully qualified import path to the predictor class
            (e.g., `culicidaelab.models.YOLOv8Predictor`).
        confidence (float): The default confidence threshold for this predictor.
            Defaults to 0.5.
        device (str): The compute device to use ("cpu" or "cuda"). Defaults to "cpu".
        backend (str | None): The specific inference backend to use (e.g., 'yolo').
        params (dict[str, Any]): A dictionary of extra parameters to pass to the
            predictor's constructor.
        repository_id (str | None): The Hugging Face Hub repository ID for the model.
        weights (dict[str, WeightDetails] | None): A mapping of backend names to their
            weight details.
        provider_name (str | None): The name of the provider (e.g., "huggingface").
        model_arch (str | None): The model architecture name (e.g., "yolov8n-seg").
        model_config_path (str | None): The path to the model's specific config file.
        model_config_filename (str | None): The filename of the model's config.
        visualization (VisualizationConfig): Custom visualization settings for this predictor.
    """

    # extra="allow": undeclared keys are accepted and kept on the instance.
    # protected_namespaces=(): clears Pydantic's protected "model_" prefix so
    # that the `model_arch`, `model_config_path` and `model_config_filename`
    # fields below do not trigger a namespace-conflict warning.
    model_config = ConfigDict(extra="allow", protected_namespaces=())
    # Required (`...` means "no default"). The alias equals the field name.
    target: str = Field(..., alias="target")
    confidence: float = 0.5
    device: str = "cpu"
    backend: str | None = None
    # default_factory gives each instance its own dict.
    params: dict[str, Any] = Field(default_factory=dict)
    repository_id: str | None = None
    weights: dict[str, WeightDetails] | None = None
    provider_name: str | None = None
    model_arch: str | None = None
    model_config_path: str | None = None
    model_config_filename: str | None = None
    visualization: VisualizationConfig = Field(default_factory=VisualizationConfig)
model_config = ConfigDict(extra='allow', protected_namespaces=()) class-attribute instance-attribute
target: str = Field(..., alias='target') class-attribute instance-attribute
confidence: float = 0.5 class-attribute instance-attribute
device: str = 'cpu' class-attribute instance-attribute
backend: str | None = None class-attribute instance-attribute
params: dict[str, Any] = Field(default_factory=dict) class-attribute instance-attribute
repository_id: str | None = None class-attribute instance-attribute
weights: dict[str, WeightDetails] | None = None class-attribute instance-attribute
provider_name: str | None = None class-attribute instance-attribute
model_arch: str | None = None class-attribute instance-attribute
model_config_path: str | None = None class-attribute instance-attribute
model_config_filename: str | None = None class-attribute instance-attribute
visualization: VisualizationConfig = Field(default_factory=VisualizationConfig) class-attribute instance-attribute
ProviderConfig

Configuration for a data provider, such as Hugging Face.

Attributes:

Name Type Description
target str

The fully qualified import path to the provider's service class.

dataset_url str

The base URL for accessing datasets from this provider.

api_key str | None

An optional API key for authentication, if required.

Source code in culicidaelab\core\config_models.py
class ProviderConfig(BaseModel):
    """Configuration for a data provider, such as Hugging Face.

    `target` and `dataset_url` are required; `api_key` is optional.

    Attributes:
        target (str): The fully qualified import path to the provider's
            service class.
        dataset_url (str): The base URL for accessing datasets from this provider.
        api_key (str | None): An optional API key for authentication, if required.
            Defaults to None.
    """

    # extra="allow": keys that are not declared as fields are accepted and
    # kept on the instance instead of failing validation.
    model_config = ConfigDict(extra="allow")
    # Required (`...` means "no default"). The alias equals the field name.
    target: str = Field(..., alias="target")
    # Required (no default).
    dataset_url: str
    api_key: str | None = None
model_config = ConfigDict(extra='allow') class-attribute instance-attribute
target: str = Field(..., alias='target') class-attribute instance-attribute
dataset_url: str instance-attribute
api_key: str | None = None class-attribute instance-attribute
SpeciesModel

Configuration for the entire 'species' section of the config.

Attributes:

Name Type Description
species_classes dict[int, str]

A mapping of integer class IDs to string-based species names.

species_metadata SpeciesFiles

The aggregated species metadata loaded from the species directory.

Source code in culicidaelab\core\config_models.py
class SpeciesModel(BaseModel):
    """Configuration for the entire 'species' section of the config.

    Both fields have default factories, so this model declares no required keys.

    Attributes:
        species_classes (dict[int, str]): A mapping of integer class IDs to
            string-based species names. Defaults to an empty dict.
        species_metadata (SpeciesFiles): The aggregated species metadata loaded
            from the species directory. Defaults to `SpeciesFiles()`.
    """

    # extra="allow": keys that are not declared as fields are accepted and
    # kept on the instance instead of failing validation.
    model_config = ConfigDict(extra="allow")
    # default_factory builds a fresh default per model instance.
    species_classes: dict[int, str] = Field(default_factory=dict)
    species_metadata: SpeciesFiles = Field(default_factory=SpeciesFiles)
model_config = ConfigDict(extra='allow') class-attribute instance-attribute
species_classes: dict[int, str] = Field(default_factory=dict) class-attribute instance-attribute
species_metadata: SpeciesFiles = Field(default_factory=SpeciesFiles) class-attribute instance-attribute
SpeciesConfig

A user-friendly facade for accessing and managing species configuration data.

This class implements the Facade pattern to simplify access to species-related configuration data. It provides an intuitive interface for managing species information, including class mappings, metadata, and name translations.

Parameters:

Name Type Description Default
config SpeciesModel

A validated Pydantic model containing the complete species configuration data.

required

Attributes:

Name Type Description
_config SpeciesModel

The source configuration model containing raw data.

_species_map dict[int, str]

Maps numeric class indices to full species names.

_reverse_species_map dict[str, int]

Maps full species names to their numeric indices.

_metadata_store dict

Contains detailed metadata for each species.

class_to_full_name_map dict[str, str]

Maps short class names to full scientific names.

reverse_class_to_full_name_map dict[str, str]

Maps full scientific names to short class names.

Example
config = SpeciesModel(...)  # Your validated config
species_helper = SpeciesConfig(config)

# Get full name for a class index
species_name = species_helper.get_species_by_index(0)

# Get metadata for a species
metadata = species_helper.get_species_metadata(species_name)
Source code in culicidaelab\core\species_config.py
class SpeciesConfig:
    """Facade exposing species configuration data through simple lookups.

    Wraps a validated species model and precomputes the dictionaries needed to
    translate between numeric class indices, short class labels and full
    species names, and to fetch per-species metadata.

    Args:
        config (SpeciesModel): A validated Pydantic model holding the species
            classes and the aggregated species metadata.

    Attributes:
        _config (SpeciesModel): The configuration model given to the constructor.
        _species_map (dict[int, str]): Class index -> full species name.
        _reverse_species_map (dict[str, int]): Full species name -> class index.
        _metadata_store (dict): Per-species metadata models taken from the
            configuration.
        class_to_full_name_map (dict[str, str]): Short class label -> full name.
        reverse_class_to_full_name_map (dict[str, str]): Full name -> short label.

    Example:
        ```python
        config = SpeciesModel(...)  # Your validated config
        species_helper = SpeciesConfig(config)

        # Resolve a class index to a species name
        species_name = species_helper.get_species_by_index(0)

        # Fetch the metadata recorded for that species
        metadata = species_helper.get_species_metadata(species_name)
        ```
    """

    def __init__(self, config: SpeciesModel):
        """Builds the lookup tables from the given configuration.

        A class label that has no entry in the label-to-name mapping is kept
        as-is and used as the species name. When several keys share the same
        value, the inverted dictionaries keep the last one encountered.

        Args:
            config (SpeciesModel): The validated species configuration model.
        """
        self._config = config

        # Short label -> full name, shared (not copied) with the configuration.
        label_to_name = self._config.species_metadata.species_info_mapping
        self.class_to_full_name_map = label_to_name
        self.reverse_class_to_full_name_map = {
            full_name: label for label, full_name in label_to_name.items()
        }

        # Class index -> full name, falling back to the raw label when unmapped.
        self._species_map: dict[int, str] = {
            class_idx: label_to_name.get(label, label)
            for class_idx, label in self._config.species_classes.items()
        }
        self._reverse_species_map: dict[str, int] = {
            full_name: class_idx for class_idx, full_name in self._species_map.items()
        }

        self._metadata_store: dict[str, SingleSpeciesMetadataModel] = (
            self._config.species_metadata.species_metadata
        )

    @property
    def species_map(self) -> dict[int, str]:
        """Gets the class-index to species-name mapping.

        Returns:
            dict[int, str]: The internal dictionary (not a copy) mapping numeric
                class indices to species names.

        Example:
            ```python
            species_config = SpeciesConfig(config)
            mapping = species_config.species_map
            # Returns: {0: "Aedes aegypti", 1: "Aedes albopictus"}
            ```
        """
        return self._species_map

    def get_index_by_species(self, species_name: str) -> int | None:
        """Gets the numeric class index registered for a species name.

        Args:
            species_name (str): The full species name
                (e.g., "Aedes aegypti").

        Returns:
            int | None: The class index, or None when the name is unknown.

        Example:
            ```python
            index = species_config.get_index_by_species("Aedes aegypti")
            # Returns: 0
            ```
        """
        name_to_index = self._reverse_species_map
        return name_to_index.get(species_name, None)

    def get_species_by_index(self, index: int) -> str | None:
        """Gets the species name registered for a numeric class index.

        Args:
            index (int): The numeric class index used by the model.

        Returns:
            str | None: The species name, or None when the index is unknown.

        Example:
            ```python
            species = species_config.get_species_by_index(0)
            # Returns: "Aedes aegypti"
            ```
        """
        index_to_name = self._species_map
        return index_to_name.get(index, None)

    def get_species_label(self, species_name: str) -> str:
        """Gets the short class label that corresponds to a full species name.

        Args:
            species_name (str): The full species name
                (e.g., "Aedes aegypti").

        Returns:
            str: The short label/class name (e.g., "ae_aegypti").

        Raises:
            KeyError: If the species name has no registered short label.

        Example:
            ```python
            label = species_config.get_species_label("Aedes aegypti")
            # Returns: "ae_aegypti"
            ```
        """
        name_to_label = self.reverse_class_to_full_name_map
        short_label = name_to_label[species_name]
        return short_label

    def get_species_metadata(self, species_name: str) -> dict[str, Any] | None:
        """Gets the stored metadata of a species as a plain dictionary.

        Args:
            species_name (str): The full species name
                (e.g., "Aedes aegypti").

        Returns:
            dict[str, Any] | None: The result of ``model_dump()`` on the stored
                metadata model, or None when nothing is stored for the name.

        Example:
            ```python
            metadata = species_config.get_species_metadata("Aedes aegypti")
            # Returns: {
            #     "family": "Culicidae",
            #     "genus": "Aedes",
            #     "species": "aegypti",
            #     "common_name": "Yellow fever mosquito",
            #     ...
            # }
            ```
        """
        stored_model = self._metadata_store.get(species_name)
        if not stored_model:
            return None
        return stored_model.model_dump()

    def list_species_names(self) -> list[str]:
        """Returns every configured species name.

        Returns:
            list[str]: A new list with the keys of the name-to-index mapping,
                in that mapping's insertion order.

        Example:
            ```python
            species_list = species_config.list_species_names()
            # Returns: ["Aedes aegypti", "Aedes albopictus", ...]
            ```
        """
        return list(self._reverse_species_map)
class_to_full_name_map = self._config.species_metadata.species_info_mapping instance-attribute
reverse_class_to_full_name_map = {v: k for k, v in self.class_to_full_name_map.items()} instance-attribute
species_map: dict[int, str] property

Gets the mapping of class indices to full, human-readable species names.

Returns:

Type Description
dict[int, str]

dict[int, str]: A dictionary mapping numeric class indices to full scientific species names.

Example
species_config = SpeciesConfig(config)
mapping = species_config.species_map
# Returns: {0: "Aedes aegypti", 1: "Aedes albopictus"}
__init__(config: SpeciesModel)

Initializes the species configuration helper.

Sets up internal mappings and data structures for efficient species data access. Processes the input configuration to create bidirectional mappings between species names, class names, and indices.

Parameters:

Name Type Description Default
config SpeciesModel

The validated species configuration model.

required
Source code in culicidaelab\core\species_config.py
def __init__(self, config: SpeciesModel):
    """Builds the lookup tables from the given configuration.

    A class label that has no entry in the label-to-name mapping is kept
    as-is and used as the species name. When several keys share the same
    value, the inverted dictionaries keep the last one encountered.

    Args:
        config (SpeciesModel): The validated species configuration model.
    """
    self._config = config

    # Short label -> full name, shared (not copied) with the configuration.
    label_to_name = self._config.species_metadata.species_info_mapping
    self.class_to_full_name_map = label_to_name
    self.reverse_class_to_full_name_map = {
        full_name: label for label, full_name in label_to_name.items()
    }

    # Class index -> full name, falling back to the raw label when unmapped.
    self._species_map: dict[int, str] = {
        class_idx: label_to_name.get(label, label)
        for class_idx, label in self._config.species_classes.items()
    }
    self._reverse_species_map: dict[str, int] = {
        full_name: class_idx for class_idx, full_name in self._species_map.items()
    }

    self._metadata_store: dict[str, SingleSpeciesMetadataModel] = (
        self._config.species_metadata.species_metadata
    )
get_index_by_species(species_name: str) -> int | None

Gets the numeric class index for a given species name.

Looks up the numeric class index used by the model for a given full species name. This is useful for mapping between model predictions and species names.

Parameters:

Name Type Description Default
species_name str

The full scientific name of the species (e.g., "Aedes aegypti").

required

Returns:

Type Description
int | None

int | None: The numeric class index used by the model, or None if the species is not found in the configuration.

Example
index = species_config.get_index_by_species("Aedes aegypti")
# Returns: 0
Source code in culicidaelab\core\species_config.py
def get_index_by_species(self, species_name: str) -> int | None:
    """Gets the numeric class index for a given species name.

    Looks up the numeric class index used by the model for a given full
    species name. This is useful for mapping between model predictions
    and species names.

    Args:
        species_name (str): The full scientific name of the species
            (e.g., "Aedes aegypti").

    Returns:
        int | None: The numeric class index used by the model, or None if the
            species is not found in the configuration.

    Example:
        ```python
        index = species_config.get_index_by_species("Aedes aegypti")
        # Returns: 0
        ```
    """
    return self._reverse_species_map.get(species_name)
get_species_by_index(index: int) -> str | None

Gets the full scientific species name for a given class index.

Converts a numeric class index used by the model into the corresponding full scientific species name. This is particularly useful when processing model predictions.

Parameters:

Name Type Description Default
index int

The numeric class index used by the model.

required

Returns:

Type Description
str | None

str | None: The full scientific species name as a string, or None if the index is not found in the configuration.

Example
species = species_config.get_species_by_index(0)
# Returns: "Aedes aegypti"
Source code in culicidaelab\core\species_config.py
def get_species_by_index(self, index: int) -> str | None:
    """Gets the full scientific species name for a given class index.

    Converts a numeric class index used by the model into the corresponding
    full scientific species name. This is particularly useful when processing
    model predictions.

    Args:
        index (int): The numeric class index used by the model.

    Returns:
        str | None: The full scientific species name as a string, or None if the
            index is not found in the configuration.

    Example:
        ```python
        species = species_config.get_species_by_index(0)
        # Returns: "Aedes aegypti"
        ```
    """
    return self._species_map.get(index)
get_species_label(species_name: str) -> str

Gets the short label/class name for a given full species name.

Converts a full scientific species name to its corresponding short label used in the dataset and model classifications.

Parameters:

Name Type Description Default
species_name str

The full scientific name of the species (e.g., "Aedes aegypti").

required

Returns:

Name Type Description
str str

The short label/class name used in the dataset (e.g., "ae_aegypti").

Example
label = species_config.get_species_label("Aedes aegypti")
# Returns: "ae_aegypti"
Source code in culicidaelab\core\species_config.py
def get_species_label(self, species_name: str) -> str:
    """Gets the short class label that corresponds to a full species name.

    Args:
        species_name (str): The full species name
            (e.g., "Aedes aegypti").

    Returns:
        str: The short label/class name (e.g., "ae_aegypti").

    Raises:
        KeyError: If the species name has no registered short label.

    Example:
        ```python
        label = species_config.get_species_label("Aedes aegypti")
        # Returns: "ae_aegypti"
        ```
    """
    name_to_label = self.reverse_class_to_full_name_map
    short_label = name_to_label[species_name]
    return short_label
get_species_metadata(species_name: str) -> dict[str, Any] | None

Gets the detailed metadata for a specific species.

Retrieves comprehensive metadata about a species, including taxonomic information, characteristics, and any custom metadata fields defined in the configuration.

Parameters:

Name Type Description Default
species_name str

The full scientific name of the species (e.g., "Aedes aegypti").

required

Returns:

Type Description
dict[str, Any] | None

dict[str, Any] | None: A dictionary containing all metadata fields for the species, or None if the species is not found. The dictionary structure depends on the metadata fields defined in the configuration.

Example
metadata = species_config.get_species_metadata("Aedes aegypti")
# Returns: {
#     "family": "Culicidae",
#     "genus": "Aedes",
#     "species": "aegypti",
#     "common_name": "Yellow fever mosquito",
#     ...
# }
Source code in culicidaelab\core\species_config.py
def get_species_metadata(self, species_name: str) -> dict[str, Any] | None:
    """Gets the detailed metadata for a specific species.

    Retrieves comprehensive metadata about a species, including taxonomic
    information, characteristics, and any custom metadata fields defined
    in the configuration.

    Args:
        species_name (str): The full scientific name of the species
            (e.g., "Aedes aegypti").

    Returns:
        dict[str, Any] | None: A dictionary containing all metadata fields for the
            species, or None if the species is not found. The dictionary structure
            depends on the metadata fields defined in the configuration.

    Example:
        ```python
        metadata = species_config.get_species_metadata("Aedes aegypti")
        # Returns: {
        #     "family": "Culicidae",
        #     "genus": "Aedes",
        #     "species": "aegypti",
        #     "common_name": "Yellow fever mosquito",
        #     ...
        # }
        ```
    """
    model_object = self._metadata_store.get(species_name)
    return model_object.model_dump() if model_object else None
list_species_names() -> list[str]

Returns a list of all configured full species names.

Provides a complete list of all species names that are configured in the system. The names are returned in their full scientific format.

Returns:

Type Description
list[str]

list[str]: A list of full scientific species names configured in the system.

Example
species_list = species_config.list_species_names()
# Returns: ["Aedes aegypti", "Aedes albopictus", ...]
Source code in culicidaelab\core\species_config.py
def list_species_names(self) -> list[str]:
    """Returns every configured species name.

    Returns:
        list[str]: A new list with the keys of the name-to-index mapping,
            in that mapping's insertion order.

    Example:
        ```python
        species_list = species_config.list_species_names()
        # Returns: ["Aedes aegypti", "Aedes albopictus", ...]
        ```
    """
    return list(self._reverse_species_map)
BoundingBox

Represents a single bounding box with coordinates.

Attributes:

Name Type Description
x1 float

The top-left x-coordinate of the bounding box.

y1 float

The top-left y-coordinate of the bounding box.

x2 float

The bottom-right x-coordinate of the bounding box.

y2 float

The bottom-right y-coordinate of the bounding box.

Source code in culicidaelab\core\prediction_models.py
class BoundingBox(BaseModel):
    """An axis-aligned box described by its top-left and bottom-right corners.

    Attributes:
        x1 (float): The x-coordinate of the top-left corner.
        y1 (float): The y-coordinate of the top-left corner.
        x2 (float): The x-coordinate of the bottom-right corner.
        y2 (float): The y-coordinate of the bottom-right corner.
    """

    x1: float = Field(..., description="Top-left x-coordinate")
    y1: float = Field(..., description="Top-left y-coordinate")
    x2: float = Field(..., description="Bottom-right x-coordinate")
    y2: float = Field(..., description="Bottom-right y-coordinate")

    def to_numpy(self) -> np.ndarray:
        """Packs the four coordinates into a NumPy array.

        Returns:
            np.ndarray: An array of shape (4,) ordered as [x1, y1, x2, y2].
        """
        corners = [self.x1, self.y1, self.x2, self.y2]
        return np.array(corners)
x1: float = Field(..., description='Top-left x-coordinate') class-attribute instance-attribute
y1: float = Field(..., description='Top-left y-coordinate') class-attribute instance-attribute
x2: float = Field(..., description='Bottom-right x-coordinate') class-attribute instance-attribute
y2: float = Field(..., description='Bottom-right y-coordinate') class-attribute instance-attribute
to_numpy() -> np.ndarray

Converts the bounding box to a NumPy array.

Returns:

Type Description
ndarray

np.ndarray: A NumPy array of shape (4,) in the format [x1, y1, x2, y2].

Source code in culicidaelab\core\prediction_models.py
def to_numpy(self) -> np.ndarray:
    """Packs the four coordinates into a NumPy array.

    Returns:
        np.ndarray: An array of shape (4,) ordered as [x1, y1, x2, y2].
    """
    corners = [self.x1, self.y1, self.x2, self.y2]
    return np.array(corners)
Detection

Represents a single detected object, including its bounding box and confidence.

Attributes:

Name Type Description
box BoundingBox

The bounding box of the detected object.

confidence float

The confidence score of the prediction, between 0.0 and 1.0.

Source code in culicidaelab\core\prediction_models.py
class Detection(BaseModel):
    """Represents a single detected object: where it is and how confident the model is.

    Both fields are required; neither declares a default value.

    Attributes:
        box (BoundingBox): The bounding box of the detected object.
        confidence (float): The confidence score of the prediction, constrained
            by the field definition to lie between 0.0 and 1.0 (inclusive).
    """

    # Location of the detected object.
    box: BoundingBox
    # `ge`/`le` bound the score to the closed interval [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0, description="Prediction confidence score")
box: BoundingBox instance-attribute
confidence: float = Field(..., ge=0.0, le=1.0, description='Prediction confidence score') class-attribute instance-attribute
DetectionPrediction

Represents the output of a detection model for a single image.

Attributes:

Name Type Description
detections list[Detection]

A list of all objects detected in the image.

Source code in culicidaelab\core\prediction_models.py
class DetectionPrediction(BaseModel):
    """Represents the output of a detection model for a single image.

    Attributes:
        detections (list[Detection]): A list of all objects detected in the image.
            The field is required (no default is declared); the model itself
            imposes no ordering on the list.
    """

    # One entry per detected object.
    detections: list[Detection]
detections: list[Detection] instance-attribute
SegmentationPrediction

Represents the output of a segmentation model for a single image.

Attributes:

Name Type Description
mask ndarray

A 2D NumPy array (H, W) representing the binary segmentation mask, where non-zero values indicate the segmented object.

pixel_count int

The total number of positive (masked) pixels in the mask.

Source code in culicidaelab\core\prediction_models.py
class SegmentationPrediction(BaseModel):
    """Represents the output of a segmentation model for a single image.

    Attributes:
        mask (np.ndarray): A 2D NumPy array (H, W) representing the binary
            segmentation mask, where non-zero values indicate the segmented object.
        pixel_count (int): The total number of positive (masked) pixels in the mask.
            This is a required field supplied by the caller; this class does not
            compute it from ``mask`` or check that the two agree.
    """

    # Arbitrary types are enabled so that the np.ndarray-typed `mask` field is accepted.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    mask: np.ndarray = Field(..., description="Binary segmentation mask as a NumPy array (H, W)")
    pixel_count: int = Field(..., description="Number of positive (masked) pixels")
model_config = ConfigDict(arbitrary_types_allowed=True) class-attribute instance-attribute
mask: np.ndarray = Field(..., description='Binary segmentation mask as a NumPy array (H, W)') class-attribute instance-attribute
pixel_count: int = Field(..., description='Number of positive (masked) pixels') class-attribute instance-attribute
Classification

Represents a single classification result with species name and confidence.

Attributes:

Name Type Description
species_name str

The predicted species name.

confidence float

The confidence score of the prediction, between 0.0 and 1.0.

Source code in culicidaelab\core\prediction_models.py
class Classification(BaseModel):
    """Represents a single classification result: a species name and its confidence.

    Both fields are required; neither declares a default value.

    Attributes:
        species_name (str): The predicted species name.
        confidence (float): The confidence score of the prediction, constrained
            by the field definition to lie between 0.0 and 1.0 (inclusive).
    """

    # Name of the predicted species.
    species_name: str
    # `ge`/`le` bound the score to the closed interval [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0, description="Prediction confidence score")
species_name: str instance-attribute
confidence: float = Field(..., ge=0.0, le=1.0, description='Prediction confidence score') class-attribute instance-attribute
ClassificationPrediction

Represents the full output of a classification model for a single image.

The predictions are typically sorted by confidence in descending order.

Attributes:

Name Type Description
predictions list[Classification]

A list of classification results.

Source code in culicidaelab\core\prediction_models.py
class ClassificationPrediction(BaseModel):
    """Represents the full output of a classification model for a single image.

    The predictions are typically sorted by confidence in descending order,
    but this is not enforced by the model.

    Attributes:
        predictions (list[Classification]): A list of classification results.
    """

    predictions: list[Classification]

    def top_prediction(self) -> Classification | None:
        """Returns the top prediction (the one with the highest confidence).

        The list is scanned for the maximum confidence instead of assuming it
        is already sorted. When several entries share the highest confidence,
        the earliest one in the list is returned, so a list sorted in
        descending order still yields its first element.

        Returns:
            Classification | None: The top classification result, or None if there
            are no predictions.
        """
        if not self.predictions:
            return None
        # max() keeps the first maximal element, preserving the old result
        # for lists that are already sorted by descending confidence.
        return max(self.predictions, key=lambda item: item.confidence)
predictions: list[Classification] instance-attribute
top_prediction() -> Classification | None

Returns the top prediction (the one with the highest confidence).

Returns:

Type Description
Classification | None

The top classification result, or None if there are no predictions.

Source code in culicidaelab\core\prediction_models.py
def top_prediction(self) -> Classification | None:
    """Returns the top prediction (the one with the highest confidence).

    The list is scanned for the maximum confidence instead of assuming it
    is already sorted. When several entries share the highest confidence,
    the earliest one in the list is returned, so a list sorted in
    descending order still yields its first element.

    Returns:
        Classification | None: The top classification result, or None if there
        are no predictions.
    """
    if not self.predictions:
        return None
    # max() keeps the first maximal element, preserving the old result
    # for lists that are already sorted by descending confidence.
    return max(self.predictions, key=lambda item: item.confidence)
ProviderService

Manages the instantiation and lifecycle of data providers.

This service acts as a factory and cache for provider instances, ensuring that each provider is a singleton within the application context.

Attributes:

Name Type Description
_settings Settings

The settings instance.

_providers dict[str, BaseProvider]

A cache of instantiated providers, keyed by provider name.

Source code in culicidaelab\core\provider_service.py
class ProviderService:
    """Factory and cache for data provider instances.

    Each provider is instantiated at most once per service instance; later
    requests for the same name return the cached object.

    Attributes:
        _settings (Settings): The settings instance.
        _providers (dict[str, BaseProvider]): A cache of instantiated providers,
            keyed by provider name.
    """

    def __init__(self, settings: Settings):
        """Creates the service with an empty provider cache.

        Args:
            settings (Settings): The main `Settings` object for the library.
        """
        self._providers: dict[str, BaseProvider] = {}
        self._settings = settings

    def get_provider(self, provider_name: str) -> BaseProvider:
        """Returns the provider registered under the given name.

        A cached instance is returned when available. Otherwise the
        configuration at ``providers.<provider_name>`` is looked up, the
        provider is instantiated from it, type-checked, cached and returned.

        Args:
            provider_name (str): The name of the provider (e.g., 'huggingface').

        Returns:
            BaseProvider: An instance of a class that inherits from `BaseProvider`.

        Raises:
            ValueError: If the provider is not found in the configuration.
            TypeError: If the instantiated object is not a `BaseProvider`.
        """
        # Fast path: this provider has already been built.
        if provider_name in self._providers:
            return self._providers[provider_name]

        config_key = f"providers.{provider_name}"
        if not self._settings.get_config(config_key):
            raise ValueError(
                f"Provider '{provider_name}' not found in configuration.",
            )

        # Instantiation is delegated to `Settings.instantiate_from_config`.
        candidate = self._settings.instantiate_from_config(config_key)
        if not isinstance(candidate, BaseProvider):
            raise TypeError(
                f"Instantiated provider '{provider_name}' is not a valid BaseProvider",
            )

        self._providers[provider_name] = candidate
        return candidate
__init__(settings: Settings)

Initializes the ProviderService.

Parameters:

Name Type Description Default
settings Settings

The main Settings object for the library.

required
Source code in culicidaelab\core\provider_service.py
def __init__(self, settings: Settings):
    """Creates the service with an empty provider cache.

    Args:
        settings (Settings): The main `Settings` object for the library.
    """
    self._providers: dict[str, BaseProvider] = {}
    self._settings = settings
get_provider(provider_name: str) -> BaseProvider

Retrieves an instantiated provider by its name.

It looks up the provider's configuration, instantiates it if it hasn't been already, and caches it for future calls.

Parameters:

Name Type Description Default
provider_name str

The name of the provider (e.g., 'huggingface').

required

Returns:

Name Type Description
BaseProvider BaseProvider

An instance of a class that inherits from BaseProvider.

Raises:

Type Description
ValueError

If the provider is not found in the configuration.

TypeError

If the instantiated provider is not an instance of BaseProvider.

Source code in culicidaelab\core\provider_service.py
def get_provider(self, provider_name: str) -> BaseProvider:
    """Returns the provider registered under the given name.

    A cached instance is returned when available. Otherwise the
    configuration at ``providers.<provider_name>`` is looked up, the
    provider is instantiated from it, type-checked, cached and returned.

    Args:
        provider_name (str): The name of the provider (e.g., 'huggingface').

    Returns:
        BaseProvider: An instance of a class that inherits from `BaseProvider`.

    Raises:
        ValueError: If the provider is not found in the configuration.
        TypeError: If the instantiated object is not a `BaseProvider`.
    """
    # Fast path: this provider has already been built.
    if provider_name in self._providers:
        return self._providers[provider_name]

    config_key = f"providers.{provider_name}"
    if not self._settings.get_config(config_key):
        raise ValueError(
            f"Provider '{provider_name}' not found in configuration.",
        )

    # Instantiation is delegated to `Settings.instantiate_from_config`.
    candidate = self._settings.instantiate_from_config(config_key)
    if not isinstance(candidate, BaseProvider):
        raise TypeError(
            f"Instantiated provider '{provider_name}' is not a valid BaseProvider",
        )

    self._providers[provider_name] = candidate
    return candidate
ResourceManager

Centralized resource management for models, datasets, and temporary files.

This class provides thread-safe operations for managing application resources, including models, datasets, cache files, and temporary workspaces. It ensures that all file operations are handled in a consistent and safe manner.

Parameters:

Name Type Description Default
app_name str

The name of the application, used for creating dedicated directories. If not provided, it is inferred from the pyproject.toml file. Defaults to None.

None
custom_base_dir str | Path

A custom base directory for storing all resources. If None, system-appropriate default directories are used (e.g., AppData on Windows). Defaults to None.

None

Attributes:

Name Type Description
app_name str

The application name.

user_data_dir Path

The root directory for user-specific data.

user_cache_dir Path

The directory for user-specific cache files.

temp_dir Path

The directory for temporary runtime files.

model_dir Path

The directory where model files are stored.

dataset_dir Path

The directory where datasets are stored.

downloads_dir Path

The directory for downloaded files.

logs_dir Path

The directory for log files.

config_dir Path

The directory for configuration files.

Raises:

Type Description
OSError

If the resource directories cannot be created.

ValueError

If the application name cannot be determined.

Source code in culicidaelab\core\resource_manager.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
class ResourceManager:
    """Centralized resource management for models, datasets, and temporary files.

    This class provides thread-safe operations for managing application resources,
    including models, datasets, cache files, and temporary workspaces. It ensures
    that all file operations are handled in a consistent and safe manner.

    Args:
        app_name (str, optional): The name of the application, used for creating
            dedicated directories. If not provided, it is inferred from the
            `pyproject.toml` file. Defaults to None.
        custom_base_dir (str | Path, optional): A custom base directory for
            storing all resources. If None, system-appropriate default
            directories are used (e.g., AppData on Windows). Defaults to None.

    Attributes:
        app_name (str): The application name.
        user_data_dir (Path): The root directory for user-specific data.
        user_cache_dir (Path): The directory for user-specific cache files.
        temp_dir (Path): The directory for temporary runtime files.
        model_dir (Path): The directory where model files are stored.
        dataset_dir (Path): The directory where datasets are stored.
        downloads_dir (Path): The directory for downloaded files.
        logs_dir (Path): The directory for log files.
        config_dir (Path): The directory for configuration files.

    Raises:
        OSError: If the resource directories cannot be created.
        ValueError: If the application name cannot be determined.
    """

    def __init__(
        self,
        app_name: str | None = None,
        custom_base_dir: str | Path | None = None,
    ):
        """Initializes the ResourceManager with cross-platform compatibility.

        Sets up the necessary directory structure for the application's resources.
        """
        self._lock = Lock()
        self.app_name = self._determine_app_name(app_name)
        self._initialize_paths(custom_base_dir)
        self._initialize_directories()
        logger.info(f"ResourceManager initialized for app: {self.app_name}")
        logger.debug(f"Resource directories: {self.get_all_directories()}")

    def __repr__(self) -> str:
        """Returns a string representation of the ResourceManager instance.

        Returns:
            str: A string representation of the object.
        """
        return f"ResourceManager(app_name='{self.app_name}', " f"user_data_dir='{self.user_data_dir}')"

    @contextmanager
    def temp_workspace(self, prefix: str = "workspace", suffix: str = ""):
        """Provides a temporary workspace that is automatically cleaned up.

        This context manager creates a temporary directory and yields its path,
        ensuring the directory and its contents are removed upon exiting the
        context, even if errors occur.

        Args:
            prefix (str): A prefix for the temporary directory's name.
            suffix (str): A suffix for the temporary directory's name.

        Yields:
            Path: The path to the temporary workspace.

        Example:
            >>> resource_manager = ResourceManager()
            >>> with resource_manager.temp_workspace(prefix="job_") as ws:
            ...     # Perform temporary operations within this workspace
            ...     (ws / "temp_file.txt").write_text("some data")
            ...     print(f"Workspace created at: {ws}")
            # The workspace directory is automatically removed here.
        """
        workspace_path = None
        try:
            # Create the temp directory inside the app's main temp_dir
            workspace_path_str = tempfile.mkdtemp(
                prefix=prefix,
                suffix=suffix,
                dir=self.temp_dir,
            )
            workspace_path = Path(workspace_path_str)
            logger.info(f"Created temporary workspace: {workspace_path}")
            yield workspace_path
        finally:
            if workspace_path and workspace_path.exists():
                try:
                    shutil.rmtree(workspace_path)
                    logger.info(f"Cleaned up temporary workspace: {workspace_path}")
                except Exception as e:
                    # Log the error but do not raise it to avoid masking other exceptions
                    logger.error(
                        f"Failed to clean up workspace {workspace_path}: {e}",
                    )

    def clean_old_files(
        self,
        days: int = 5,
        include_cache: bool = True,
    ) -> dict[str, int]:
        """Cleans up old files from download and temporary directories.

        Args:
            days (int): The age in days for a file to be considered old.
            include_cache (bool): If True, the cache directory is also cleaned.

        Returns:
            dict[str, int]: A dictionary containing statistics of the cleanup.

        Raises:
            ValueError: If `days` is a negative number.
        """
        if days < 0:
            raise ValueError("Days must be a non-negative number.")

        cleanup_stats = {"downloads_cleaned": 0, "temp_cleaned": 0, "cache_cleaned": 0}
        cutoff_time = time.time() - (days * 86400)

        cleanup_stats["downloads_cleaned"] = self._clean_directory(
            self.downloads_dir,
            cutoff_time,
        )
        cleanup_stats["temp_cleaned"] = self._clean_directory(
            self.temp_dir,
            cutoff_time,
        )
        if include_cache:
            cleanup_stats["cache_cleaned"] = self._clean_directory(
                self.user_cache_dir,
                cutoff_time,
            )

        logger.info(f"Cleanup completed: {cleanup_stats}")
        return cleanup_stats

    def create_checksum(self, file_path: str | Path, algorithm: str = "md5") -> str:
        """Creates a checksum for a given file.

        Args:
            file_path (str | Path): The path to the file.
            algorithm (str): The hashing algorithm to use (e.g., 'md5', 'sha256').

        Returns:
            str: The hexadecimal checksum string.

        Raises:
            FileNotFoundError: If the specified file does not exist.
            OSError: If there is an error reading the file.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            msg = f"File not found: {file_path}"
            logger.error(msg)
            raise FileNotFoundError(msg)

        try:
            hash_obj = hashlib.new(algorithm)
            with open(file_path, "rb") as f:
                # Read the file in chunks to handle large files efficiently
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_obj.update(chunk)
            return hash_obj.hexdigest()
        except Exception as e:
            msg = f"Failed to create checksum for {file_path}: {e}"
            logger.error(msg)
            raise OSError(msg) from e

    def get_all_directories(self) -> dict[str, Path]:
        """Retrieves all managed directory paths.

        Returns:
            dict[str, Path]: A dictionary mapping directory names to their paths.
        """
        return {
            "user_data_dir": self.user_data_dir,
            "user_cache_dir": self.user_cache_dir,
            "temp_dir": self.temp_dir,
            "model_dir": self.model_dir,
            "dataset_dir": self.dataset_dir,
            "downloads_dir": self.downloads_dir,
            "logs_dir": self.logs_dir,
            "config_dir": self.config_dir,
        }

    def get_dataset_path(
        self,
        dataset_name: str,
        create_if_missing: bool = True,
    ) -> Path:
        """Constructs a standardized path for a dataset.

        Args:
            dataset_name (str): The name of the dataset.
            create_if_missing (bool): If True, creates the directory if it
                does not exist.

        Returns:
            Path: The absolute path to the dataset directory.

        Raises:
            ValueError: If `dataset_name` is empty or contains only whitespace.
        """
        if not dataset_name or not dataset_name.strip():
            raise ValueError("Dataset name cannot be empty.")

        safe_dataset_name = create_safe_path(dataset_name)
        dataset_path = self.dataset_dir / safe_dataset_name
        if create_if_missing:
            self._create_directory(dataset_path, "dataset")
        return dataset_path

    def get_disk_usage(self) -> dict[str, dict[str, int | str]]:
        """Calculates disk usage for all managed directories.

        Returns:
            dict: A dictionary with disk usage details for each directory,
                  including size in bytes, human-readable size, and file count.
        """
        directories = {
            "user_data": self.user_data_dir,
            "cache": self.user_cache_dir,
            "models": self.model_dir,
            "datasets": self.dataset_dir,
            "downloads": self.downloads_dir,
            "temp": self.temp_dir,
        }
        return {name: self._get_directory_size(path) for name, path in directories.items()}

    def verify_checksum(
        self,
        file_path: str | Path,
        expected_checksum: str,
        algorithm: str = "md5",
    ) -> bool:
        """Verifies the checksum of a file against an expected value.

        Args:
            file_path (str | Path): The path to the file.
            expected_checksum (str): The expected checksum.
            algorithm (str): The hashing algorithm used for the checksum.

        Returns:
            bool: True if the checksums match, False otherwise.
        """
        try:
            actual_checksum = self.create_checksum(file_path, algorithm)
            return actual_checksum.lower() == expected_checksum.lower()
        except (FileNotFoundError, OSError) as e:
            logger.error(f"Checksum verification failed for {file_path}: {e}")
            return False

    def _clean_directory(self, directory: Path, cutoff_time: float) -> int:
        """Removes files in a directory older than a specified time."""
        cleaned_count = 0
        if not directory.exists():
            return cleaned_count

        try:
            for item in directory.iterdir():
                try:
                    # Check if the item's modification time is older than the cutoff
                    if item.stat().st_mtime < cutoff_time:
                        if item.is_dir():
                            shutil.rmtree(item)
                        else:
                            item.unlink()
                        cleaned_count += 1
                        logger.debug(f"Removed old item: {item}")
                except Exception as e:
                    logger.warning(f"Could not remove {item}: {e}")
        except Exception as e:
            logger.error(f"Error cleaning directory {directory}: {e}")
        return cleaned_count

    def _create_directory(self, path: Path, dir_type: str) -> None:
        """Creates a directory if it doesn't exist."""
        try:
            path.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            msg = f"Failed to create {dir_type} directory at {path}: {e}"
            logger.error(msg)
            raise OSError(msg) from e

    def _determine_app_name(self, app_name: str | None = None) -> str:
        """Determines the application name."""
        if app_name:
            return app_name
        try:
            # Attempt to get the project name from pyproject.toml
            pyproject_name = self._get_project_name_from_pyproject()
            if pyproject_name:
                return pyproject_name
        except Exception as e:
            logger.warning(
                f"Could not determine app name from pyproject.toml: {e}. " "Falling back to default 'culicidaelab'.",
            )
        return "culicidaelab"

    def _get_project_name_from_pyproject(self) -> str | None:
        """Looks up ``project.name`` in the nearest pyproject.toml.

        The search starts in this module's directory and walks up through its
        ancestors until a pyproject.toml is found or the top is reached.

        Returns:
            str | None: The project name, or None when no pyproject.toml is
                found, the key is absent, or reading/parsing fails (the
                failure is logged).
        """
        try:
            start_dir = Path(__file__).parent
            project_root = None
            # Check the starting directory first, then each ancestor in turn.
            for candidate in (start_dir, *start_dir.parents):
                if (candidate / "pyproject.toml").exists():
                    project_root = candidate
                    break
            if project_root is None:
                return None

            with open(project_root / "pyproject.toml", encoding="utf-8") as f:
                pyproject_data = toml.load(f)

            project_table = pyproject_data.get("project", {})
            return project_table.get("name")
        except Exception as e:
            logger.error(f"Failed to read project name from pyproject.toml: {e}")
            return None

    def _format_bytes(self, bytes_count: int | float) -> str:
        """Formats a byte count into a human-readable string."""
        import math

        if bytes_count is None:
            raise ValueError("bytes_count cannot be None.")
        if bytes_count == 0:
            return "0 B"
        units = ["B", "KB", "MB", "GB", "TB", "PB"]
        # Determine the appropriate unit using logarithm
        power = int(math.log(bytes_count, 1024)) if bytes_count > 0 else 0
        unit_index = min(power, len(units) - 1)
        value = bytes_count / (1024**unit_index)
        return f"{value:.1f} {units[unit_index]}"

    def _get_directory_size(self, path: Path) -> dict[str, int | str]:
        """Calculates the total size and file count of a directory."""
        if not path.exists():
            return {"size_bytes": 0, "size_human": "0 B", "file_count": 0}

        total_size = 0
        file_count = 0
        try:
            for item in path.rglob("*"):
                if item.is_file():
                    total_size += item.stat().st_size
                    file_count += 1
        except Exception as e:
            logger.warning(f"Error calculating size for {path}: {e}")

        return {
            "size_bytes": total_size,
            "size_human": self._format_bytes(total_size),
            "file_count": file_count,
        }

    def _initialize_directories(self) -> None:
        """Creates all necessary application directories."""
        directories = self.get_all_directories().values()
        for directory in directories:
            try:
                directory.mkdir(parents=True, exist_ok=True)
                logger.debug(f"Ensured directory exists: {directory}")
            except Exception as e:
                msg = f"Failed to create directory {directory}: {e}"
                logger.error(msg)
                raise OSError(msg) from e

        # Set secure permissions on non-Windows systems
        if platform.system() != "Windows":
            self._set_directory_permissions(list(directories))

    def _initialize_paths(self, custom_base_dir: str | Path | None = None) -> None:
        """Initializes all resource paths based on the environment."""
        if custom_base_dir:
            base_dir = Path(custom_base_dir).resolve()
            self.user_data_dir = base_dir / "data"
            self.user_cache_dir = base_dir / "cache"
        else:
            # Use system-appropriate directories
            self.user_data_dir = Path(appdirs.user_data_dir(self.app_name))
            self.user_cache_dir = Path(appdirs.user_cache_dir(self.app_name))

        self.temp_dir = Path(tempfile.gettempdir()) / self.app_name
        self.model_dir = self.user_data_dir / "models"
        self.dataset_dir = self.user_data_dir / "datasets"
        self.downloads_dir = self.user_data_dir / "downloads"
        self.logs_dir = self.user_data_dir / "logs"
        self.config_dir = self.user_data_dir / "config"

    def _is_safe_to_delete(self, path: Path) -> bool:
        """Checks if a path is within a managed directory and safe to delete."""
        safe_parents = [self.temp_dir, self.user_cache_dir]
        try:
            resolved_path = path.resolve()
            # Ensure the path is a child of one of the safe parent directories
            return any(resolved_path.is_relative_to(p.resolve()) for p in safe_parents)
        except Exception:
            return False

    def _set_directory_permissions(self, directories: list[Path]) -> None:
        """Sets directory permissions to 0o700 on Unix-like systems."""
        try:
            for directory in directories:
                os.chmod(directory, 0o700)
        except Exception as e:
            logger.warning(f"Could not set directory permissions: {e}")
app_name = self._determine_app_name(app_name) instance-attribute
__init__(app_name: str | None = None, custom_base_dir: str | Path | None = None)

Initializes the ResourceManager with cross-platform compatibility.

Sets up the necessary directory structure for the application's resources.

Source code in culicidaelab\core\resource_manager.py
def __init__(
    self,
    app_name: str | None = None,
    custom_base_dir: str | Path | None = None,
):
    """Initializes the ResourceManager with cross-platform compatibility.

    Sets up the necessary directory structure for the application's resources.
    """
    self._lock = Lock()
    self.app_name = self._determine_app_name(app_name)
    self._initialize_paths(custom_base_dir)
    self._initialize_directories()
    logger.info(f"ResourceManager initialized for app: {self.app_name}")
    logger.debug(f"Resource directories: {self.get_all_directories()}")
__repr__() -> str

Returns a string representation of the ResourceManager instance.

Returns:

Name Type Description
str str

A string representation of the object.

Source code in culicidaelab\core\resource_manager.py
def __repr__(self) -> str:
    """Describes the manager by its app name and user-data root.

    Returns:
        str: Text of the form
            ``ResourceManager(app_name='...', user_data_dir='...')``.
    """
    name, data_root = self.app_name, self.user_data_dir
    return f"ResourceManager(app_name='{name}', user_data_dir='{data_root}')"
temp_workspace(prefix: str = 'workspace', suffix: str = '')

Provides a temporary workspace that is automatically cleaned up.

This context manager creates a temporary directory and yields its path, ensuring the directory and its contents are removed upon exiting the context, even if errors occur.

Parameters:

Name Type Description Default
prefix str

A prefix for the temporary directory's name.

'workspace'
suffix str

A suffix for the temporary directory's name.

''

Yields:

Name Type Description
Path

The path to the temporary workspace.

Example

>>> resource_manager = ResourceManager()
>>> with resource_manager.temp_workspace(prefix="job_") as ws:
...     # Perform temporary operations within this workspace
...     (ws / "temp_file.txt").write_text("some data")
...     print(f"Workspace created at: {ws}")
# The workspace directory is automatically removed here.
Source code in culicidaelab\core\resource_manager.py
@contextmanager
def temp_workspace(self, prefix: str = "workspace", suffix: str = ""):
    """Provides a temporary workspace that is automatically cleaned up.

    This context manager creates a uniquely named directory inside
    `self.temp_dir` and yields its path. The directory and its contents are
    removed when the context exits, even if the body raised.

    Args:
        prefix (str): A prefix for the temporary directory's name.
        suffix (str): A suffix for the temporary directory's name.

    Yields:
        Path: The path to the temporary workspace.

    Raises:
        OSError: If the parent temp directory or the workspace itself cannot
            be created.

    Example:
        >>> resource_manager = ResourceManager()
        >>> with resource_manager.temp_workspace(prefix="job_") as ws:
        ...     # Perform temporary operations within this workspace
        ...     (ws / "temp_file.txt").write_text("some data")
        ...     print(f"Workspace created at: {ws}")
        # The workspace directory is automatically removed here.
    """
    workspace_path = None
    try:
        # The parent temp directory may have been purged since it was first
        # created; recreate it so mkdtemp does not fail with FileNotFoundError.
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # Create the workspace inside the app's main temp_dir
        workspace_path_str = tempfile.mkdtemp(
            prefix=prefix,
            suffix=suffix,
            dir=self.temp_dir,
        )
        workspace_path = Path(workspace_path_str)
        logger.info(f"Created temporary workspace: {workspace_path}")
        yield workspace_path
    finally:
        if workspace_path and workspace_path.exists():
            try:
                shutil.rmtree(workspace_path)
                logger.info(f"Cleaned up temporary workspace: {workspace_path}")
            except Exception as e:
                # Log the error but do not raise it to avoid masking other exceptions
                logger.error(
                    f"Failed to clean up workspace {workspace_path}: {e}",
                )
clean_old_files(days: int = 5, include_cache: bool = True) -> dict[str, int]

Cleans up old files from download and temporary directories.

Parameters:

Name Type Description Default
days int

The age in days for a file to be considered old.

5
include_cache bool

If True, the cache directory is also cleaned.

True

Returns:

Type Description
dict[str, int]

A dictionary mapping `downloads_cleaned`, `temp_cleaned`, and `cache_cleaned` to the number of items removed from each directory.

Raises:

Type Description
ValueError

If days is a negative number.

Source code in culicidaelab\core\resource_manager.py
def clean_old_files(
    self,
    days: int = 5,
    include_cache: bool = True,
) -> dict[str, int]:
    """Removes stale entries from the downloads, temp and (optionally) cache dirs.

    Args:
        days (int): Minimum age, in days, for an entry to be removed.
        include_cache (bool): When True the user cache directory is cleaned
            as well; otherwise its counter stays at 0.

    Returns:
        dict[str, int]: Number of removed entries under the keys
            ``downloads_cleaned``, ``temp_cleaned`` and ``cache_cleaned``.

    Raises:
        ValueError: If `days` is a negative number.
    """
    if days < 0:
        raise ValueError("Days must be a non-negative number.")

    seconds_per_day = 86400
    cutoff_time = time.time() - (days * seconds_per_day)

    # Ordered (stat key, directory) pairs; the cache is appended on demand.
    targets = [
        ("downloads_cleaned", self.downloads_dir),
        ("temp_cleaned", self.temp_dir),
    ]
    if include_cache:
        targets.append(("cache_cleaned", self.user_cache_dir))

    cleanup_stats = {"downloads_cleaned": 0, "temp_cleaned": 0, "cache_cleaned": 0}
    for stat_key, directory in targets:
        cleanup_stats[stat_key] = self._clean_directory(directory, cutoff_time)

    logger.info(f"Cleanup completed: {cleanup_stats}")
    return cleanup_stats
create_checksum(file_path: str | Path, algorithm: str = 'md5') -> str

Creates a checksum for a given file.

Parameters:

Name Type Description Default
file_path str | Path

The path to the file.

required
algorithm str

The hashing algorithm to use (e.g., 'md5', 'sha256').

'md5'

Returns:

Name Type Description
str str

The hexadecimal checksum string.

Raises:

Type Description
FileNotFoundError

If the specified file does not exist.

OSError

If there is an error reading the file.

Source code in culicidaelab\core\resource_manager.py
def create_checksum(self, file_path: str | Path, algorithm: str = "md5") -> str:
    """Creates a checksum for a given file.

    Args:
        file_path (str | Path): The path to the file.
        algorithm (str): The hashing algorithm to use (e.g., 'md5', 'sha256').

    Returns:
        str: The hexadecimal checksum string.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        OSError: If there is an error reading the file.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        msg = f"File not found: {file_path}"
        logger.error(msg)
        raise FileNotFoundError(msg)

    try:
        hash_obj = hashlib.new(algorithm)
        with open(file_path, "rb") as f:
            # Read the file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(4096), b""):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
    except Exception as e:
        msg = f"Failed to create checksum for {file_path}: {e}"
        logger.error(msg)
        raise OSError(msg) from e
get_all_directories() -> dict[str, Path]

Retrieves all managed directory paths.

Returns:

Type Description
dict[str, Path]

dict[str, Path]: A dictionary mapping directory names to their paths.

Source code in culicidaelab\core\resource_manager.py
def get_all_directories(self) -> dict[str, Path]:
    """Collects every managed directory path into one mapping.

    Returns:
        dict[str, Path]: Attribute name -> path, in declaration order (user
            data, cache, temp, models, datasets, downloads, logs, config).
    """
    attribute_names = (
        "user_data_dir",
        "user_cache_dir",
        "temp_dir",
        "model_dir",
        "dataset_dir",
        "downloads_dir",
        "logs_dir",
        "config_dir",
    )
    # Keys are exactly the attribute names, so read them by name.
    return {name: getattr(self, name) for name in attribute_names}
get_dataset_path(dataset_name: str, create_if_missing: bool = True) -> Path

Constructs a standardized path for a dataset.

Parameters:

Name Type Description Default
dataset_name str

The name of the dataset.

required
create_if_missing bool

If True, creates the directory if it does not exist.

True

Returns:

Name Type Description
Path Path

The absolute path to the dataset directory.

Raises:

Type Description
ValueError

If dataset_name is empty or contains only whitespace.

Source code in culicidaelab\core\resource_manager.py
def get_dataset_path(
    self,
    dataset_name: str,
    create_if_missing: bool = True,
) -> Path:
    """Builds the directory path for a named dataset under `dataset_dir`.

    The name is passed through `create_safe_path` before being joined to the
    dataset root.

    Args:
        dataset_name (str): The name of the dataset.
        create_if_missing (bool): If True, the directory is created when it
            does not exist yet.

    Returns:
        Path: The dataset directory path.

    Raises:
        ValueError: If `dataset_name` is empty or contains only whitespace.
    """
    if not (dataset_name and dataset_name.strip()):
        raise ValueError("Dataset name cannot be empty.")

    safe_dataset_name = create_safe_path(dataset_name)
    dataset_path = self.dataset_dir / safe_dataset_name

    if not create_if_missing:
        return dataset_path

    self._create_directory(dataset_path, "dataset")
    return dataset_path
get_disk_usage() -> dict[str, dict[str, int | str]]

Calculates disk usage for all managed directories.

Returns:

Name Type Description
dict dict[str, dict[str, int | str]]

A dictionary with disk usage details for each directory, including size in bytes, human-readable size, and file count.

Source code in culicidaelab\core\resource_manager.py
def get_disk_usage(self) -> dict[str, dict[str, int | str]]:
    """Calculates disk usage for all managed directories.

    Returns:
        dict: A dictionary with disk usage details for each directory,
              including size in bytes, human-readable size, and file count.
    """
    directories = {
        "user_data": self.user_data_dir,
        "cache": self.user_cache_dir,
        "models": self.model_dir,
        "datasets": self.dataset_dir,
        "downloads": self.downloads_dir,
        "temp": self.temp_dir,
    }
    return {name: self._get_directory_size(path) for name, path in directories.items()}
verify_checksum(file_path: str | Path, expected_checksum: str, algorithm: str = 'md5') -> bool

Verifies the checksum of a file against an expected value.

Parameters:

Name Type Description Default
file_path str | Path

The path to the file.

required
expected_checksum str

The expected checksum.

required
algorithm str

The hashing algorithm used for the checksum.

'md5'

Returns:

Name Type Description
bool bool

True if the checksums match, False otherwise.

Source code in culicidaelab\core\resource_manager.py
def verify_checksum(
    self,
    file_path: str | Path,
    expected_checksum: str,
    algorithm: str = "md5",
) -> bool:
    """Verifies the checksum of a file against an expected value.

    Args:
        file_path (str | Path): The path to the file.
        expected_checksum (str): The expected checksum.
        algorithm (str): The hashing algorithm used for the checksum.

    Returns:
        bool: True if the checksums match, False otherwise.
    """
    try:
        actual_checksum = self.create_checksum(file_path, algorithm)
        return actual_checksum.lower() == expected_checksum.lower()
    except (FileNotFoundError, OSError) as e:
        logger.error(f"Checksum verification failed for {file_path}: {e}")
        return False
Settings

User-friendly facade for CulicidaeLab configuration management.

This class provides a simple, stable interface to access configuration values, resource directories, and application settings. All actual operations are delegated to a validated configuration object managed by ConfigManager and a ResourceManager.

The Settings class is implemented as a singleton to ensure consistent configuration state across the application. It manages: - Configuration values through get_config() and set_config() - Resource directories for models, datasets, and cache - Dataset paths and splits - Model weights paths and types - API keys for external services - Temporary workspaces for processing

Attributes:

Name Type Description
config CulicidaeLabConfig

The current configuration object

model_dir Path

Directory for model weights

dataset_dir Path

Directory for datasets

cache_dir Path

Directory for cached data

config_dir Path

Active user configuration directory

species_config SpeciesConfig

Configuration for species detection

Source code in culicidaelab\core\settings.py
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
class Settings:
    """
    User-friendly facade for CulicidaeLab configuration management.

    This class provides a simple, stable interface to access configuration values,
    resource directories, and application settings. All actual operations
    are delegated to a validated configuration object managed by ConfigManager
    and a ResourceManager.

    The Settings class is implemented as a singleton to ensure consistent
    configuration state across the application. It manages:
    - Configuration values through get_config() and set_config()
    - Resource directories for models, datasets, and cache
    - Dataset paths and splits
    - Model weights paths and types
    - API keys for external services
    - Temporary workspaces for processing

    Attributes:
        config (CulicidaeLabConfig): The current configuration object
        model_dir (Path): Directory for model weights
        dataset_dir (Path): Directory for datasets
        cache_dir (Path): Directory for cached data
        config_dir (Path): Active user configuration directory
        species_config (SpeciesConfig): Configuration for species detection
    """

    _instance: Optional["Settings"] = None
    _lock = threading.Lock()
    _initialized = False

    def __init__(self, config_dir: str | Path | None = None) -> None:
        """Initializes the Settings facade.

        This loads the configuration using a ConfigManager and sets up a
        ResourceManager for file paths.

        Args:
            config_dir: Optional path to a user-provided configuration directory.
        """
        if self._initialized:
            return

        self._config_manager = ConfigManager(user_config_dir=config_dir)
        self.config: CulicidaeLabConfig = self._config_manager.get_config()
        self._resource_manager = ResourceManager()

        # Cache for species config (lazy loaded)
        self._species_config: SpeciesConfig | None = None

        # Store for singleton check
        self._current_config_dir = self._config_manager.user_config_dir

        self._initialized = True

    # Configuration Access
    def get_config(self, path: str | None = None, default: Any = None) -> Any:
        """Gets a configuration value using a dot-separated path.

        Example:
            >>> settings.get_config("predictors.classifier.confidence")

        Args:
            path: A dot-separated string path to the configuration value.
                If None, returns the entire configuration object.
            default: A default value to return if the path is not found.

        Returns:
            The configuration value, or the default value if not found.
        """
        if not path:
            return self.config

        obj = self.config
        try:
            for key in path.split("."):
                if isinstance(obj, dict):
                    obj = obj.get(key)
                else:
                    obj = getattr(obj, key)
            return obj if obj is not None else default
        except (AttributeError, KeyError):
            return default

    def set_config(self, path: str, value: Any) -> None:
        """
        Sets a configuration value at a specified dot-separated path.
        This method can traverse both objects (Pydantic models) and dictionaries.

        Note: This modifies the configuration in memory. To make it persistent,
        call `save_config()`.

        Args:
            path: A dot-separated string path to the configuration value.
            value: The new value to set.
        """
        keys = path.split(".")
        obj = self.config

        for key in keys[:-1]:
            if isinstance(obj, dict):
                obj = obj.get(key)
            else:
                obj = getattr(obj, key)

            if obj is None:
                raise KeyError(f"The path part '{key}' in '{path}' was not found.")

        last_key = keys[-1]
        if isinstance(obj, dict):
            obj[last_key] = value
        else:
            setattr(obj, last_key, value)

    def save_config(self, file_path: str | Path | None = None) -> None:
        """Save current configuration to a user config file.
        Args:
            file_path: Optional path to save the configuration file.
                If None, defaults to "culicidaelab_saved.yaml" in the user config directory.
        """
        if file_path is None:
            if not self._config_manager.user_config_dir:
                raise ValueError("Cannot save config without a specified user config directory.")
            file_path = self._config_manager.user_config_dir / "culicidaelab_saved.yaml"
        self._config_manager.save_config(file_path)

    # Resource Directory Access
    @property
    def model_dir(self) -> Path:
        """Model weights directory."""
        return self._resource_manager.model_dir

    @property
    def weights_dir(self) -> Path:
        """Alias for model_dir."""
        return self.model_dir

    @property
    def dataset_dir(self) -> Path:
        """Datasets directory."""
        return self._resource_manager.dataset_dir

    @property
    def cache_dir(self) -> Path:
        """Cache directory."""
        return self._resource_manager.user_cache_dir

    @property
    def config_dir(self) -> Path:
        """The active user configuration directory."""
        return self._config_manager.user_config_dir or self._config_manager.default_config_path

    @property
    def species_config(self) -> SpeciesConfig:
        """Species configuration (lazily loaded)."""
        if self._species_config is None:
            self._species_config = SpeciesConfig(self.config.species)
        return self._species_config

    # Dataset Management
    def get_cache_key_for_split(self, split: str | list[str] | None) -> str:
        """
        Generates a unique, deterministic hash for any valid split configuration.
        This hash is used to create unique directory names for dataset splits.

        Args:
            split (str | list[str] | None): The split configuration to hash.
                Can be a single split name (e.g., 'train'), a list of splits
                (e.g., ['train', 'val']), or None.

        Returns:
            str: A 16-character hexadecimal hash that uniquely identifies the
                split configuration. This hash is deterministic for the same
                input.

        Example:
            >>> settings.get_cache_key_for_split('train')
            'a1b2c3d4e5f6g7h8'
            >>> settings.get_cache_key_for_split(['train', 'val'])
            'h8g7f6e5d4c3b2a1'
        """
        if isinstance(split, list):
            split.sort()

        # json.dumps correctly handles None, converting it to the string "null"
        split_str = json.dumps(split, sort_keys=True)

        hasher = hashlib.sha256(split_str.encode("utf-8"))
        return hasher.hexdigest()[:16]

    def construct_split_path(
        self,
        dataset_base_path: Path,
        split: str | list[str] | None = None,
    ) -> Path:
        """
        Gets the standardized, absolute path for a dataset's directory.

        This is the single source of truth for dataset path construction.

        Args:
            name (str): The name of the dataset (e.g., 'classification').
            split (str | list[str] | None, optional): If provided, returns the specific
                cache path for this split configuration. Otherwise, returns the base
                directory for the dataset.
            ensure_exists (bool): If True, ensures the directory is created on disk.

        Returns:
            Path: The absolute path to the dataset directory.
        """
        # Determine the final path (either base or split-specific)
        final_path = dataset_base_path
        if split is not None:
            split_key = self.get_cache_key_for_split(split)
            final_path = dataset_base_path / split_key

        return final_path

    def get_dataset_path(self, dataset_type: str, split: str | list[str] | None = None) -> Path:
        """Gets the standardized path for a specific dataset directory.

        Args:
            dataset_type: The name of the dataset type (e.g., 'classification').

        Returns:
            An absolute path to the dataset directory.
        """
        if dataset_type not in self.config.datasets:
            raise ValueError(f"Dataset type '{dataset_type}' not configured.")

        dataset_path_str = self.config.datasets[dataset_type].path
        filal_path = Path(dataset_path_str)
        if not filal_path.is_absolute():
            filal_path = self.dataset_dir / filal_path

        filal_path.mkdir(parents=True, exist_ok=True)

        if split is not None:
            filal_path = self.construct_split_path(
                dataset_base_path=filal_path,
                split=split,
            )
        return filal_path

    def list_datasets(self) -> list[str]:
        """Get list of configured dataset types in the application.

        Returns:
            list[str]: A list of dataset type identifiers that are configured
                in the application settings. These correspond to the different
                dataset categories available for training and inference.

        Example:
            >>> settings.list_datasets()
            ['classification', 'detection', 'segmentation']
        """
        return list(self.config.datasets.keys())

    # Model Management
    def construct_weights_path(
        self,
        predictor_type: str,
        backend: str | None = None,
    ) -> Path:
        """
        A pure, static function to construct a fully qualified model weights path.

        This is the single source of truth for model path construction, creating a
        structured path like: .../models/<predictor_type>/<backend>/<filename>.

        Args:
            model_dir (Path): The base directory for all models (e.g., '.../culicidaelab/models').
            predictor_type (str): The type of the predictor (e.g., 'classifier'). Used as a subdirectory.
            predictor_config (PredictorConfig): The Pydantic model for the predictor's configuration.
            backend (str | None, optional): The target backend (e.g., 'torch', 'onnx').
                                            If None, uses the default from the config.

        Returns:
            Path: The absolute, structured path to the model weights file.

        Raises:
            ValueError: If a valid backend or weights filename cannot be determined.
        """
        predictor_config = self.get_config(f"predictors.{predictor_type}")
        final_backend = backend if backend is not None else predictor_config.backend
        if not final_backend:
            raise ValueError(f"No backend specified for model '{predictor_type}'.")

        if not predictor_config.weights or final_backend not in predictor_config.weights:
            raise ValueError(f"Backend '{final_backend}' not defined in weights config for '{predictor_type}'.")

        filename = predictor_config.weights[final_backend].filename
        if not filename:
            raise ValueError(f"Filename for backend '{final_backend}' is missing in config for '{predictor_type}'.")

        # Sanitize the components that will become directories
        predictor_dir = create_safe_path(predictor_type)
        backend_dir = create_safe_path(final_backend)

        # Assemble the final, structured path
        return self.model_dir / predictor_dir / backend_dir / filename

    def get_model_weights_path(
        self,
        model_type: str,
        backend: str | None = None,
    ) -> Path:
        """Gets the configured path to a model's weights file.

        Args:
            model_type: The name of the model type (e.g., 'classifier').

        Returns:
            The path to the model weights file.
        """
        if model_type not in self.config.predictors:
            raise ValueError(f"Model type '{model_type}' not configured in 'predictors'.")

        local_path = self.construct_weights_path(
            predictor_type=model_type,
            backend=backend,
        )
        return local_path

    def list_model_types(self) -> list[str]:
        """Get list of available model types configured in the application.

        Returns:
            list[str]: A list of model type identifiers (e.g., ['classifier',
                'detector', 'segmenter']) that are configured in the application.
                These types correspond to the different predictors available
                in the CulicidaeLab system.

        Example:
            >>> settings.list_model_types()
            ['classifier', 'detector', 'segmenter']
        """
        return list(self.config.predictors.keys())

    # API Key Management
    def get_api_key(self, provider: str) -> str | None:
        """Get API key for external provider from environment variables.

        The method looks for environment variables in the following format:
        - KAGGLE_API_KEY for 'kaggle' provider
        - HUGGINGFACE_API_KEY for 'huggingface' provider
        - ROBOFLOW_API_KEY for 'roboflow' provider

        Args:
            provider (str): The name of the provider. Must be one of:
                'kaggle', 'huggingface', or 'roboflow'.

        Returns:
            str | None: The API key if found in environment variables,
                None if the provider is not supported or the key is not set.

        Example:
            >>> api_key = settings.get_api_key('huggingface')
            >>> if api_key:
            ...     # Use the API key
            ... else:
            ...     # Handle missing key
        """
        api_keys = {
            "kaggle": "KAGGLE_API_KEY",
            "huggingface": "HUGGINGFACE_API_KEY",
            "roboflow": "ROBOFLOW_API_KEY",
        }
        if provider in api_keys:
            return os.getenv(api_keys[provider])
        return None

    # Utility Methods (delegated to ResourceManager)
    @contextmanager
    def temp_workspace(self, prefix: str = "workspace"):
        """Creates a temporary workspace directory that is automatically cleaned up.

        This context manager creates a temporary directory for processing operations
        and automatically cleans it up when the context is exited.

        Args:
            prefix (str, optional): Prefix for the temporary directory name.
                Defaults to "workspace".

        Yields:
            Path: Path to the temporary workspace directory.

        Example:
            >>> with settings.temp_workspace(prefix='processing') as workspace:
            ...     # Do some work in the temporary directory
            ...     (workspace / 'output.txt').write_text('results')
            # Directory is automatically cleaned up after the with block
        """
        with self._resource_manager.temp_workspace(prefix) as workspace:
            yield workspace

    # Instantiation
    def instantiate_from_config(self, config_path: str, **kwargs: Any) -> Any:
        """Instantiates an object from a configuration path.

        This is a convenience method that finds a config object by its path
        and uses the underlying ConfigManager to instantiate it.

        Args:
            config_path: A dot-separated path to the configuration object
                (e.g., "predictors.classifier").
            **kwargs: Additional keyword arguments to pass to the constructor.

        Returns:
            The instantiated object.
        """

        config_obj = self.get_config(config_path)
        if not config_obj:
            raise ValueError(f"No configuration object found at path: {config_path}")

        extra_deps = {"settings": self}

        return self._config_manager.instantiate_from_config(
            config_obj,
            extra_params=extra_deps,
            **kwargs,
        )
config: CulicidaeLabConfig = self._config_manager.get_config() instance-attribute
model_dir: Path property

Model weights directory.

weights_dir: Path property

Alias for model_dir.

dataset_dir: Path property

Datasets directory.

cache_dir: Path property

Cache directory.

config_dir: Path property

The active user configuration directory.

species_config: SpeciesConfig property

Species configuration (lazily loaded).

__init__(config_dir: str | Path | None = None) -> None

Initializes the Settings facade.

This loads the configuration using a ConfigManager and sets up a ResourceManager for file paths.

Parameters:

Name Type Description Default
config_dir str | Path | None

Optional path to a user-provided configuration directory.

None
Source code in culicidaelab\core\settings.py
def __init__(self, config_dir: str | Path | None = None) -> None:
    """Set up the Settings facade on first use.

    Builds a ConfigManager for the given user directory, pulls the loaded
    configuration from it, and creates a ResourceManager. When the instance
    is already flagged as initialized, nothing is done.

    Args:
        config_dir: Optional path to a user-provided configuration directory.
    """
    if not self._initialized:
        manager = ConfigManager(user_config_dir=config_dir)
        self._config_manager = manager
        self.config: CulicidaeLabConfig = manager.get_config()
        self._resource_manager = ResourceManager()

        # Species configuration is built lazily on first access.
        self._species_config: SpeciesConfig | None = None

        # Remembered so a later singleton check can compare directories.
        self._current_config_dir = manager.user_config_dir

        self._initialized = True
get_config(path: str | None = None, default: Any = None) -> Any

Gets a configuration value using a dot-separated path.

Example

settings.get_config("predictors.classifier.confidence")

Parameters:

Name Type Description Default
path str | None

A dot-separated string path to the configuration value. If None, returns the entire configuration object.

None
default Any

A default value to return if the path is not found.

None

Returns:

Type Description
Any

The configuration value, or the default value if not found.

Source code in culicidaelab\core\settings.py
def get_config(self, path: str | None = None, default: Any = None) -> Any:
    """Gets a configuration value using a dot-separated path.

    Example:
        >>> settings.get_config("predictors.classifier.confidence")

    Args:
        path: A dot-separated string path to the configuration value.
            If None, returns the entire configuration object.
        default: A default value to return if the path is not found.

    Returns:
        The configuration value, or the default value if not found.
    """
    if not path:
        return self.config

    obj = self.config
    try:
        for key in path.split("."):
            if isinstance(obj, dict):
                obj = obj.get(key)
            else:
                obj = getattr(obj, key)
        return obj if obj is not None else default
    except (AttributeError, KeyError):
        return default
set_config(path: str, value: Any) -> None

Sets a configuration value at a specified dot-separated path. This method can traverse both objects (Pydantic models) and dictionaries.

Note: This modifies the configuration in memory. To make it persistent, call save_config().

Parameters:

Name Type Description Default
path str

A dot-separated string path to the configuration value.

required
value Any

The new value to set.

required
Source code in culicidaelab\core\settings.py
def set_config(self, path: str, value: Any) -> None:
    """Assign a configuration value at a dot-separated path.

    Dictionaries are traversed by key and other objects (e.g. Pydantic
    models) by attribute. The change is in memory only; call
    `save_config()` to persist it.

    Args:
        path: Dot-separated path to the value to set.
        value: The new value.

    Raises:
        KeyError: If an intermediate path segment resolves to None.
    """
    *parent_keys, leaf = path.split(".")
    target = self.config

    # Descend to the container that holds the final key.
    for key in parent_keys:
        target = target.get(key) if isinstance(target, dict) else getattr(target, key)
        if target is None:
            raise KeyError(f"The path part '{key}' in '{path}' was not found.")

    if isinstance(target, dict):
        target[leaf] = value
        return
    setattr(target, leaf, value)
save_config(file_path: str | Path | None = None) -> None

Saves the current configuration to a user config file.

Parameters: file_path (str | Path | None) is an optional path to save the configuration file. If None, it defaults to "culicidaelab_saved.yaml" in the user config directory.

Source code in culicidaelab\core\settings.py
def save_config(self, file_path: str | Path | None = None) -> None:
    """Save current configuration to a user config file.
    Args:
        file_path: Optional path to save the configuration file.
            If None, defaults to "culicidaelab_saved.yaml" in the user config directory.
    """
    if file_path is None:
        if not self._config_manager.user_config_dir:
            raise ValueError("Cannot save config without a specified user config directory.")
        file_path = self._config_manager.user_config_dir / "culicidaelab_saved.yaml"
    self._config_manager.save_config(file_path)
get_cache_key_for_split(split: str | list[str] | None) -> str

Generates a unique, deterministic hash for any valid split configuration. This hash is used to create unique directory names for dataset splits.

Parameters:

Name Type Description Default
split str | list[str] | None

The split configuration to hash. Can be a single split name (e.g., 'train'), a list of splits (e.g., ['train', 'val']), or None.

required

Returns:

Name Type Description
str str

A 16-character hexadecimal hash that uniquely identifies the split configuration. This hash is deterministic for the same input.

Example

>>> settings.get_cache_key_for_split('train')
'a1b2c3d4e5f6a7b8'  # illustrative value: a 16-character hexadecimal string
>>> settings.get_cache_key_for_split(['train', 'val'])
'b8a7f6e5d4c3b2a1'  # illustrative value

Source code in culicidaelab\core\settings.py
def get_cache_key_for_split(self, split: str | list[str] | None) -> str:
    """
    Generates a unique, deterministic hash for any valid split configuration.
    This hash is used to create unique directory names for dataset splits.

    Args:
        split (str | list[str] | None): The split configuration to hash.
            Can be a single split name (e.g., 'train'), a list of splits
            (e.g., ['train', 'val']), or None.

    Returns:
        str: A 16-character hexadecimal hash that uniquely identifies the
            split configuration. This hash is deterministic for the same
            input.

    Example:
        >>> settings.get_cache_key_for_split('train')
        'a1b2c3d4e5f6g7h8'
        >>> settings.get_cache_key_for_split(['train', 'val'])
        'h8g7f6e5d4c3b2a1'
    """
    if isinstance(split, list):
        split.sort()

    # json.dumps correctly handles None, converting it to the string "null"
    split_str = json.dumps(split, sort_keys=True)

    hasher = hashlib.sha256(split_str.encode("utf-8"))
    return hasher.hexdigest()[:16]
construct_split_path(dataset_base_path: Path, split: str | list[str] | None = None) -> Path

Gets the standardized, absolute path for a dataset's directory.

This is the single source of truth for dataset path construction.

Parameters:

Name Type Description Default
name str

The name of the dataset (e.g., 'classification').

required
split str | list[str] | None

If provided, returns the specific cache path for this split configuration. Otherwise, returns the base directory for the dataset.

None
ensure_exists bool

If True, ensures the directory is created on disk.

required

Returns:

Name Type Description
Path Path

The absolute path to the dataset directory.

Source code in culicidaelab\core\settings.py
def construct_split_path(
    self,
    dataset_base_path: Path,
    split: str | list[str] | None = None,
) -> Path:
    """
    Gets the standardized, absolute path for a dataset's directory.

    This is the single source of truth for dataset path construction.

    Args:
        name (str): The name of the dataset (e.g., 'classification').
        split (str | list[str] | None, optional): If provided, returns the specific
            cache path for this split configuration. Otherwise, returns the base
            directory for the dataset.
        ensure_exists (bool): If True, ensures the directory is created on disk.

    Returns:
        Path: The absolute path to the dataset directory.
    """
    # Determine the final path (either base or split-specific)
    final_path = dataset_base_path
    if split is not None:
        split_key = self.get_cache_key_for_split(split)
        final_path = dataset_base_path / split_key

    return final_path
get_dataset_path(dataset_type: str, split: str | list[str] | None = None) -> Path

Gets the standardized path for a specific dataset directory.

Parameters:

Name Type Description Default
dataset_type str

The name of the dataset type (e.g., 'classification').

required

Returns:

Type Description
Path

An absolute path to the dataset directory.

Source code in culicidaelab\core\settings.py
def get_dataset_path(self, dataset_type: str, split: str | list[str] | None = None) -> Path:
    """Gets the standardized path for a specific dataset directory.

    Args:
        dataset_type: The name of the dataset type (e.g., 'classification').

    Returns:
        An absolute path to the dataset directory.
    """
    if dataset_type not in self.config.datasets:
        raise ValueError(f"Dataset type '{dataset_type}' not configured.")

    dataset_path_str = self.config.datasets[dataset_type].path
    filal_path = Path(dataset_path_str)
    if not filal_path.is_absolute():
        filal_path = self.dataset_dir / filal_path

    filal_path.mkdir(parents=True, exist_ok=True)

    if split is not None:
        filal_path = self.construct_split_path(
            dataset_base_path=filal_path,
            split=split,
        )
    return filal_path
list_datasets() -> list[str]

Get list of configured dataset types in the application.

Returns:

Type Description
list[str]

list[str]: A list of dataset type identifiers that are configured in the application settings. These correspond to the different dataset categories available for training and inference.

Example

settings.list_datasets() ['classification', 'detection', 'segmentation']

Source code in culicidaelab\core\settings.py
def list_datasets(self) -> list[str]:
    """Return the dataset types present in the configuration.

    Returns:
        list[str]: The keys of the `datasets` section of the configuration.

    Example:
        >>> settings.list_datasets()
        ['classification', 'detection', 'segmentation']
    """
    configured = self.config.datasets
    return list(configured.keys())
construct_weights_path(predictor_type: str, backend: str | None = None) -> Path

Constructs a fully qualified model weights path from the predictor's configuration.

This is the single source of truth for model path construction, creating a structured path like: .../models/<predictor_type>/<backend>/<filename>.

Parameters:

Name Type Description Default
model_dir Path

The base directory for all models (e.g., '.../culicidaelab/models').

required
predictor_type str

The type of the predictor (e.g., 'classifier'). Used as a subdirectory.

required
predictor_config PredictorConfig

The Pydantic model for the predictor's configuration.

required
backend str | None

The target backend (e.g., 'torch', 'onnx'). If None, uses the default from the config.

None

Returns:

Name Type Description
Path Path

The absolute, structured path to the model weights file.

Raises:

Type Description
ValueError

If a valid backend or weights filename cannot be determined.

Source code in culicidaelab\core\settings.py
def construct_weights_path(
    self,
    predictor_type: str,
    backend: str | None = None,
) -> Path:
    """
    A pure, static function to construct a fully qualified model weights path.

    This is the single source of truth for model path construction, creating a
    structured path like: .../models/<predictor_type>/<backend>/<filename>.

    Args:
        model_dir (Path): The base directory for all models (e.g., '.../culicidaelab/models').
        predictor_type (str): The type of the predictor (e.g., 'classifier'). Used as a subdirectory.
        predictor_config (PredictorConfig): The Pydantic model for the predictor's configuration.
        backend (str | None, optional): The target backend (e.g., 'torch', 'onnx').
                                        If None, uses the default from the config.

    Returns:
        Path: The absolute, structured path to the model weights file.

    Raises:
        ValueError: If a valid backend or weights filename cannot be determined.
    """
    predictor_config = self.get_config(f"predictors.{predictor_type}")
    final_backend = backend if backend is not None else predictor_config.backend
    if not final_backend:
        raise ValueError(f"No backend specified for model '{predictor_type}'.")

    if not predictor_config.weights or final_backend not in predictor_config.weights:
        raise ValueError(f"Backend '{final_backend}' not defined in weights config for '{predictor_type}'.")

    filename = predictor_config.weights[final_backend].filename
    if not filename:
        raise ValueError(f"Filename for backend '{final_backend}' is missing in config for '{predictor_type}'.")

    # Sanitize the components that will become directories
    predictor_dir = create_safe_path(predictor_type)
    backend_dir = create_safe_path(final_backend)

    # Assemble the final, structured path
    return self.model_dir / predictor_dir / backend_dir / filename
get_model_weights_path(model_type: str, backend: str | None = None) -> Path

Gets the configured path to a model's weights file.

Parameters:

Name Type Description Default
model_type str

The name of the model type (e.g., 'classifier').

required

Returns:

Type Description
Path

The path to the model weights file.

Source code in culicidaelab\core\settings.py
def get_model_weights_path(
    self,
    model_type: str,
    backend: str | None = None,
) -> Path:
    """Gets the configured path to a model's weights file.

    Args:
        model_type: The name of the model type (e.g., 'classifier').

    Returns:
        The path to the model weights file.
    """
    if model_type not in self.config.predictors:
        raise ValueError(f"Model type '{model_type}' not configured in 'predictors'.")

    local_path = self.construct_weights_path(
        predictor_type=model_type,
        backend=backend,
    )
    return local_path
list_model_types() -> list[str]

Get list of available model types configured in the application.

Returns:

Type Description
list[str]

list[str]: A list of model type identifiers (e.g., ['classifier', 'detector', 'segmenter']) that are configured in the application. These types correspond to the different predictors available in the CulicidaeLab system.

Example

settings.list_model_types() ['classifier', 'detector', 'segmenter']

Source code in culicidaelab\core\settings.py
def list_model_types(self) -> list[str]:
    """Return the identifiers of every predictor present in the configuration.

    Returns:
        list[str]: The keys of the ``predictors`` configuration section, in
            the order that section yields them (e.g., ['classifier',
            'detector', 'segmenter']).

    Example:
        >>> settings.list_model_types()
        ['classifier', 'detector', 'segmenter']
    """
    predictors = self.config.predictors
    return [*predictors.keys()]
get_api_key(provider: str) -> str | None

Get API key for external provider from environment variables.

The method looks for environment variables in the following format:

- KAGGLE_API_KEY for the 'kaggle' provider
- HUGGINGFACE_API_KEY for the 'huggingface' provider
- ROBOFLOW_API_KEY for the 'roboflow' provider

Parameters:

Name Type Description Default
provider str

The name of the provider. Must be one of: 'kaggle', 'huggingface', or 'roboflow'.

required

Returns:

Type Description
str | None

str | None: The API key if found in environment variables, None if the provider is not supported or the key is not set.

Example

>>> api_key = settings.get_api_key('huggingface')
>>> if api_key:
...     # Use the API key
... else:
...     # Handle missing key

Source code in culicidaelab\core\settings.py
def get_api_key(self, provider: str) -> str | None:
    """Get API key for external provider from environment variables.

    The method looks for environment variables in the following format:
    - KAGGLE_API_KEY for 'kaggle' provider
    - HUGGINGFACE_API_KEY for 'huggingface' provider
    - ROBOFLOW_API_KEY for 'roboflow' provider

    Args:
        provider (str): The name of the provider. Must be one of:
            'kaggle', 'huggingface', or 'roboflow'.

    Returns:
        str | None: The API key if found in environment variables,
            None if the provider is not supported or the key is not set.

    Example:
        >>> api_key = settings.get_api_key('huggingface')
        >>> if api_key:
        ...     # Use the API key
        ... else:
        ...     # Handle missing key
    """
    api_keys = {
        "kaggle": "KAGGLE_API_KEY",
        "huggingface": "HUGGINGFACE_API_KEY",
        "roboflow": "ROBOFLOW_API_KEY",
    }
    if provider in api_keys:
        return os.getenv(api_keys[provider])
    return None
temp_workspace(prefix: str = 'workspace')

Creates a temporary workspace directory that is automatically cleaned up.

This context manager creates a temporary directory for processing operations and automatically cleans it up when the context is exited.

Parameters:

Name Type Description Default
prefix str

Prefix for the temporary directory name. Defaults to "workspace".

'workspace'

Yields:

Name Type Description
Path

Path to the temporary workspace directory.

Example

>>> with settings.temp_workspace(prefix='processing') as workspace:
...     # Do some work in the temporary directory
...     (workspace / 'output.txt').write_text('results')
>>> # Directory is automatically cleaned up after the with block
Source code in culicidaelab\core\settings.py
@contextmanager
def temp_workspace(self, prefix: str = "workspace"):
    """Provide a scratch directory for the duration of a ``with`` block.

    This is a thin wrapper: creation and cleanup of the directory are
    delegated to the resource manager's own ``temp_workspace`` context
    manager, which is entered and exited together with this one.

    Args:
        prefix (str, optional): Prefix for the temporary directory name,
            passed through to the resource manager. Defaults to "workspace".

    Yields:
        Path: Whatever the resource manager's context yields, i.e. the path
            to the temporary workspace directory.

    Example:
        >>> with settings.temp_workspace(prefix='processing') as workspace:
        ...     # Do some work in the temporary directory
        ...     (workspace / 'output.txt').write_text('results')
        >>> # The workspace is released once the with block exits
    """
    managed_workspace = self._resource_manager.temp_workspace(prefix)
    with managed_workspace as scratch_dir:
        yield scratch_dir
instantiate_from_config(config_path: str, **kwargs: Any) -> Any

Instantiates an object from a configuration path.

This is a convenience method that finds a config object by its path and uses the underlying ConfigManager to instantiate it.

Parameters:

Name Type Description Default
config_path str

A dot-separated path to the configuration object (e.g., "predictors.classifier").

required
**kwargs Any

Additional keyword arguments to pass to the constructor.

{}

Returns:

Type Description
Any

The instantiated object.

Source code in culicidaelab\core\settings.py
def instantiate_from_config(self, config_path: str, **kwargs: Any) -> Any:
    """Build an object from the configuration entry found at ``config_path``.

    The entry is looked up with ``get_config`` and handed to the underlying
    ConfigManager, which performs the actual instantiation. This Settings
    instance is supplied to the ConfigManager as the ``settings`` entry of
    ``extra_params``.

    Args:
        config_path: A dot-separated path to the configuration object
            (e.g., "predictors.classifier").
        **kwargs: Additional keyword arguments forwarded to the
            ConfigManager's ``instantiate_from_config``.

    Returns:
        The object returned by the ConfigManager.

    Raises:
        ValueError: If ``get_config`` returns a falsy value for
            ``config_path``.
    """
    target_config = self.get_config(config_path)
    if target_config:
        return self._config_manager.instantiate_from_config(
            target_config,
            extra_params={"settings": self},
            **kwargs,
        )

    raise ValueError(f"No configuration object found at path: {config_path}")
get_settings(config_dir: str | Path | None = None) -> Settings

Get the Settings singleton instance.

This is the primary way to access Settings throughout the application. If a config_dir is provided that differs from the existing instance, a new instance will be created and returned.

Parameters:

Name Type Description Default
config_dir str | Path | None

Optional path to a user-provided configuration directory.

None

Returns:

Type Description
Settings

The Settings instance.

Source code in culicidaelab\core\settings.py
def get_settings(config_dir: str | Path | None = None) -> Settings:
    """
    Return the shared Settings instance, creating or replacing it on demand.

    This is the primary way to access Settings throughout the application.
    The whole check-and-create sequence runs while holding ``_SETTINGS_LOCK``.
    A new instance is built when none exists yet, or when the resolved
    `config_dir` differs from the ``_current_config_dir`` of the existing
    instance; the new instance then replaces the stored one.

    Args:
        config_dir: Optional path to a user-provided configuration directory.
            A falsy value (None or an empty string) is treated as "no
            directory".

    Returns:
        The Settings instance.
    """
    global _SETTINGS_INSTANCE
    with _SETTINGS_LOCK:
        requested_dir = None
        if config_dir:
            requested_dir = Path(config_dir).resolve()

        existing = _SETTINGS_INSTANCE
        needs_new_instance = existing is None or existing._current_config_dir != requested_dir
        if needs_new_instance:
            # The original (unresolved) value is handed to Settings; only the
            # comparison above uses the resolved form.
            _SETTINGS_INSTANCE = Settings(config_dir=config_dir)

        return _SETTINGS_INSTANCE
download_file(url: str, destination: str | Path | None = None, downloads_dir: str | Path | None = None, progress_callback: Callable | None = None, chunk_size: int = 8192, timeout: int = 30, desc: str | None = None) -> Path

Downloads a file from the specified URL showing a progress bar and optionally calling a progress callback function. Supports both direct destination paths and default download directories.

Parameters:

Name Type Description Default
url str

The URL of the file to download. Must start with 'http://' or 'https://'.

required
destination Union[str, Path, None]

The complete file path where the downloaded file should be saved. If None, the file will be saved in downloads_dir with its original filename. Defaults to None.

None
downloads_dir Union[str, Path, None]

The directory to save the file in when no specific destination is provided. If None, uses current working directory. Defaults to None.

None
progress_callback Optional[Callable[[int, int], None]]

A function to call with progress updates. Takes two parameters: bytes downloaded and total bytes. Defaults to None.

None
chunk_size int

Size of chunks to download in bytes. Larger chunks use more memory but may download faster. Defaults to 8192.

8192
timeout int

Number of seconds to wait for server response before timing out. Defaults to 30.

30
desc Optional[str]

Custom description for the progress bar. If None, uses the filename. Defaults to None.

None

Returns:

Name Type Description
Path Path

Path object pointing to the downloaded file.

Raises:

Type Description
ValueError

If the URL is invalid or doesn't start with http(s).

RuntimeError

If the download fails due to network issues or if writing the file fails due to permission or disk space issues.

Example

>>> from pathlib import Path
>>> # Basic download to current directory
>>> path = download_file('https://example.com/data.csv')
>>> path.name
'data.csv'

>>> # Download with custom progress tracking
>>> def progress(current, total):
...     print(f'Downloaded {current}/{total} bytes')
>>> path = download_file(
...     'https://example.com/large_file.zip',
...     destination='downloads/myfile.zip',
...     progress_callback=progress
... )

Source code in culicidaelab\core\utils.py
def download_file(
    url: str,
    destination: str | Path | None = None,
    downloads_dir: str | Path | None = None,
    progress_callback: Callable | None = None,
    chunk_size: int = 8192,
    timeout: int = 30,
    desc: str | None = None,
) -> Path:
    """
    Downloads a file from the specified URL showing a progress bar and optionally calling
    a progress callback function. Supports both direct destination paths and default
    download directories.

    Args:
        url (str): The URL of the file to download. Must start with 'http://' or 'https://'.
        destination (Union[str, Path, None], optional): The complete file path where the
            downloaded file should be saved. If None, the file will be saved in downloads_dir
            with its original filename. Defaults to None.
        downloads_dir (Union[str, Path, None], optional): The directory to save the file in
            when no specific destination is provided. If None, uses current working directory.
            Defaults to None.
        progress_callback (Optional[Callable[[int, int], None]], optional): A function to call
            with progress updates. Takes two parameters: bytes downloaded and total bytes.
            Defaults to None.
        chunk_size (int, optional): Size of chunks to download in bytes. Larger chunks use
            more memory but may download faster. Defaults to 8192.
        timeout (int, optional): Number of seconds to wait for server response before timing
            out. Defaults to 30.
        desc (Optional[str], optional): Custom description for the progress bar. If None,
            uses the filename. Defaults to None.

    Returns:
        Path: Path object pointing to the downloaded file.

    Raises:
        ValueError: If the URL is invalid or doesn't start with http(s).
        RuntimeError: If the download fails due to network issues or if writing the file
            fails due to permission or disk space issues.

    Example:
        >>> from pathlib import Path
        >>> # Basic download to current directory
        >>> path = download_file('https://example.com/data.csv')
        >>> print(path)
        PosixPath('data.csv')

        >>> # Download with custom progress tracking
        >>> def progress(current, total):
        ...     print(f'Downloaded {current}/{total} bytes')
        >>> path = download_file(
        ...     'https://example.com/large_file.zip',
        ...     destination='downloads/myfile.zip',
        ...     progress_callback=progress
        ... )
    """
    if not url or not url.startswith(("http://", "https://")):
        raise ValueError(f"Invalid URL: {url}")

    dest_path = Path(destination) if destination else None
    if dest_path is None:
        base_dir = Path(downloads_dir) if downloads_dir else Path.cwd()
        base_dir.mkdir(parents=True, exist_ok=True)
        filename = url.split("/")[-1]
        dest_path = base_dir / filename

    dest_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))
            progress_desc = desc or f"Downloading {dest_path.name}"

            with tqdm.tqdm(
                total=total_size,
                unit="iB",
                unit_scale=True,
                desc=progress_desc,
            ) as pbar:
                with open(dest_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        written_size = file.write(chunk)
                        pbar.update(written_size)
                        if progress_callback:
                            try:
                                progress_callback(pbar.n, total_size)
                            except Exception as cb_err:
                                logging.warning(f"Progress callback error: {cb_err}")
        return dest_path
    except requests.RequestException as e:
        logging.error(f"Download failed for {url}: {e}")
        raise RuntimeError(f"Failed to download file from {url}: {e}") from e
    except OSError as e:
        logging.error(f"File write error for {dest_path}: {e}")
        raise RuntimeError(f"Failed to write file to {dest_path}: {e}") from e