Source code for deepaudiox.loops.evaluator

from dataclasses import dataclass, field

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from deepaudiox.callbacks.reporter import Reporter
from deepaudiox.datasets.audio_classification_dataset import AudioClassificationDataset
from deepaudiox.modules.baseclasses import BaseAudioClassifier
from deepaudiox.schemas.types import DeviceName
from deepaudiox.utils.training_utils import get_device, get_logger, pad_collate_fn


@dataclass
class State:
    """Container for the results gathered over one testing run.

    Each field defaults to a fresh, empty one-dimensional array, so separate
    instances never share the same underlying buffer.

    Attributes:
        y_true (np.ndarray): True labels; empty integer array by default.
        y_pred (np.ndarray): Predicted labels; empty integer array by default.
        posteriors (np.ndarray): Posterior probabilities; empty float array by default.
    """

    y_true: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=int))
    y_pred: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=int))
    posteriors: np.ndarray = field(default_factory=lambda: np.empty(0, dtype=float))


class Evaluator:
    """The core SDK module for testing a model.

    The Evaluator assembles all modules required for testing and performs
    the testing process.

    Attributes:
        state (State): Stores testing variables.
        verbose (bool): Whether to run the reporting callbacks after testing.
        device: The device object returned by ``get_device``. Its ``.type``
            attribute is read to decide on pinned memory (presumably a
            ``torch.device`` -- confirm against ``get_device``).
        class_mapping (dict): A mapping between class names and IDs.
        logger (logging.Logger): A module used for logging messages.
        test_dloader (torch.DataLoader): The DataLoader of the testing set.
        model (BaseAudioClassifier): An AudioClassifier module inheriting
            from BaseAudioClassifier.
        callbacks (list): A list of callbacks used throughout the testing
            lifecycle.
    """

    def __init__(
        self,
        test_dset: AudioClassificationDataset,
        model: BaseAudioClassifier,
        class_mapping: dict,
        batch_size: int = 16,
        num_workers: int = 4,
        device: DeviceName = "cpu",
        device_index: int | None = None,
        verbose: bool = True,
    ):
        """Initialize the Evaluator.

        Args:
            test_dset (AudioClassificationDataset): The testing dataset.
            model (BaseAudioClassifier): An AudioClassifier module inheriting
                from BaseAudioClassifier.
            class_mapping (dict): A mapping between class names and IDs.
            batch_size (int, optional): The batch size for the DataLoader.
                Defaults to 16.
            num_workers (int, optional): The number of DataLoader workers.
                Defaults to 4.
            device (DeviceName): The device to use for evaluation. One of
                ``"cuda"``, ``"mps"``, or ``"cpu"``. Defaults to ``"cpu"``.
            device_index (int | None): The GPU device index. Only applicable
                when ``device="cuda"``. If ``None``, uses the default CUDA
                device.
            verbose (bool): If True, runs the reporting callbacks
                (classification report, confusion matrix, and average
                posteriors) after evaluation. "Evaluation has finished." is
                always logged. Defaults to True.

        Example:
            >>> import torch
            >>> from deepaudiox import AudioClassifier, Evaluator
            >>> from deepaudiox import audio_classification_dataset_from_dir, get_class_mapping_from_dir
            >>> class_mapping = get_class_mapping_from_dir(root_dir="path/to/data")
            >>> test_dataset = audio_classification_dataset_from_dir(
            ...     root_dir="path/to/data",
            ...     sample_rate=16_000,
            ...     class_mapping=class_mapping,
            ... )
            >>> model = AudioClassifier(num_classes=len(class_mapping), backbone="beats", sample_rate=16_000)
            >>> model.load_state_dict(torch.load("checkpoint.pt"))
            >>> evaluator = Evaluator(test_dset=test_dataset, model=model, class_mapping=class_mapping)
            >>> evaluator.evaluate()
        """
        self.state = State()
        self.verbose = verbose
        self.device = get_device(device=device, device_index=device_index)
        self.class_mapping = class_mapping

        # Configure logger
        self.logger = get_logger()

        # Load dataset. Order is kept deterministic (shuffle=False); pinned
        # memory is only enabled on CUDA and persistent workers only when
        # worker processes are actually used.
        self.test_dloader = DataLoader(
            test_dset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=self.device.type == "cuda",
            persistent_workers=num_workers > 0,
            collate_fn=pad_collate_fn,
        )

        # Load model: move it to the target device and switch to eval mode
        self.model = model
        self.model.to(self.device)
        self.model.eval()

        # Configure callbacks
        self.callbacks = [Reporter(logger=self.logger)]

    @torch.inference_mode()
    def evaluate(self) -> None:
        """Run the full evaluation loop over the test set.

        Iterates over all batches in ``test_dloader``, accumulates true
        labels, predicted labels, and posterior probabilities into
        ``self.state``, then triggers the registered callbacks via
        ``on_testing_end``.

        Always logs "Evaluation has finished." regardless of ``verbose``.
        The registered callbacks (the ``Reporter``: classification report,
        confusion matrix, average posteriors) are only executed when
        ``verbose=True``.

        After this method returns, ``self.state`` holds:
            - ``y_true`` (np.ndarray): Ground-truth class indices, shape (N,).
            - ``y_pred`` (np.ndarray): Predicted class indices, shape (N,).
            - ``posteriors`` (np.ndarray): Max posterior probability per
              sample, shape (N,).

        Raises:
            ValueError: If ``test_dloader`` yields no batches, i.e. there is
                nothing to evaluate.

        Note:
            The model is expected to already be in eval mode (set in
            ``__init__``). Runs under ``torch.inference_mode()`` -- gradients
            are fully disabled.
        """
        # Per-batch results; concatenated once after the loop
        y_true_batches, y_pred_batches, posterior_batches = [], [], []

        with tqdm(self.test_dloader, unit="batch", leave=False, desc="Evaluation phase") as tbar:
            for batch in tbar:
                # Move inputs to the evaluation device; labels stay on CPU as NumPy
                x = batch["feature"].to(self.device)
                y_true = batch["y_true"].cpu().numpy()

                # Run model prediction
                inference = self.model.predict(x)
                y_pred = np.array(inference["y_preds"], dtype=int)
                post = np.array(inference["posteriors"], dtype=float)

                # Update lists with new batch results
                y_true_batches.append(y_true)
                y_pred_batches.append(y_pred)
                posterior_batches.append(post)

        # np.concatenate raises an opaque ValueError on an empty sequence, so
        # fail early with a message that points at the actual cause.
        if not y_true_batches:
            raise ValueError("The test DataLoader yielded no batches; cannot evaluate an empty test set.")

        # Concatenate all results outside the loop
        self.state.y_true = np.concatenate(y_true_batches)
        self.state.y_pred = np.concatenate(y_pred_batches)
        self.state.posteriors = np.concatenate(posterior_batches)

        self.logger.info("Evaluation has finished.")

        if self.verbose:
            for cb in self.callbacks:
                cb.on_testing_end(self)