Added docstrings to audio module

2026-01-11 09:29:33 +01:00 · 2025-04-16 19:44:30 +01:00 · 2025-04-16 19:44:30 +01:00 · 23620c2233
commit 23620c2233
parent a9f91322d4
2 changed files with 513 additions and 2 deletions
--- a/batdetect2/preprocess/init.py
+++ b/batdetect2/preprocess/init.py
@ -1,6 +1,7 @@
 """Module containing functions for preprocessing audio clips."""
-from typing import Optional
+from functools import partial
 from typing import Callable, Optional, Protocol
 import xarray as xr
 from soundevent import data
@ -47,6 +48,20 @@ __all__ = [
 ]
 class AudioPreprocessor(Protocol):
    def __call__(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray: ...
 def build_preprocessor_from_config(
    config: PreprocessingConfig,
 ) -> AudioPreprocessor:
    return partial(preprocess_audio_clip, config=config)
 def preprocess_audio_clip(
    clip: data.Clip,
    config: Optional[PreprocessingConfig] = None,
--- a/batdetect2/preprocess/audio.py
+++ b/batdetect2/preprocess/audio.py
@ -1,4 +1,26 @@
-from typing import Optional
+"""Handles loading and initial preprocessing of audio waveforms.
 This module provides components for loading audio data associated with
 `soundevent` objects (Clips, Recordings, or raw files) and applying
 fundamental waveform processing steps. These steps typically include:
 1.  Loading the raw audio data.
 2.  Adjusting the audio clip to a fixed duration (optional).
 3.  Resampling the audio to a target sample rate (optional).
 4.  Centering the waveform (DC offset removal) (optional).
 5.  Scaling the waveform amplitude (optional).
 The processing pipeline is configurable via the `AudioConfig` data structure,
 allowing for reproducible preprocessing consistent between model training and
 inference. It uses the `soundevent` library for audio loading and basic array
 operations, and `scipy` for resampling implementations.
 The primary interface is the `AudioLoader` protocol, with
 `ConfigurableAudioLoader` providing a concrete implementation driven by the
 `AudioConfig`.
 """
 from typing import Optional, Protocol
 import numpy as np
 import xarray as xr
@ -10,29 +32,334 @@ from soundevent.arrays import operations as ops
 from batdetect2.configs import BaseConfig
 __all__ = [
    "AudioLoader",
    "ResampleConfig",
    "AudioConfig",
    "ConfigurableAudioLoader",
    "build_audio_loader",
    "load_file_audio",
    "load_recording_audio",
    "load_clip_audio",
    "adjust_audio_duration",
    "resample_audio",
    "TARGET_SAMPLERATE_HZ",
    "SCALE_RAW_AUDIO",
    "DEFAULT_DURATION",
 ]
 TARGET_SAMPLERATE_HZ = 256_000
 """Default target sample rate in Hz used if resampling is enabled."""
 SCALE_RAW_AUDIO = False
 """Default setting for whether to perform peak normalization."""
 DEFAULT_DURATION = None
 """Default setting for target audio duration in seconds."""
 class AudioLoader(Protocol):
    """Defines the interface for an audio loading and processing component.
    An AudioLoader is responsible for retrieving audio data corresponding to
    different soundevent objects (files, Recordings, Clips) and applying a
    configured set of initial preprocessing steps. Adhering to this protocol
    allows for different loading strategies or implementations.
    """
    def load_file(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess audio directly from a file path.
        Parameters
        ----------
        path : PathLike
            Path to the audio file.
        audio_dir : PathLike, optional
            A directory prefix to prepend to the path if `path` is relative.
        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform as an xarray DataArray
            with time coordinates. Typically loads only the first channel.
        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...
    def load_recording(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the entire audio for a Recording object.
        Parameters
        ----------
        recording : data.Recording
            The Recording object containing metadata about the audio file.
        audio_dir : PathLike, optional
            A directory where the audio file associated with the recording
            can be found, especially if the path in the recording is relative.
        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform. Typically loads only
            the first channel.
        Raises
        ------
        FileNotFoundError
            If the audio file associated with the recording cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...
    def load_clip(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the audio segment defined by a Clip object.
        Parameters
        ----------
        clip : data.Clip
            The Clip object specifying the recording and the start/end times
            of the segment to load.
        audio_dir : PathLike, optional
            A directory where the audio file associated with the clip's
            recording can be found.
        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform for the specified clip
            duration. Typically loads only the first channel.
        Raises
        ------
        FileNotFoundError
            If the audio file associated with the clip cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...
 class ResampleConfig(BaseConfig):
    """Configuration for audio resampling.
    Attributes
    ----------
    samplerate : int, default=256000
        The target sample rate in Hz to resample the audio to. Must be > 0.
    mode : str, default="poly"
        The resampling algorithm to use. Options:
        - "poly": Polyphase resampling using `scipy.signal.resample_poly`.
                  Generally fast.
        - "fourier": Resampling via Fourier method using
                     `scipy.signal.resample`. May handle non-integer
                     resampling factors differently.
    """
    samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0)
    mode: str = "poly"
 class AudioConfig(BaseConfig):
    """Configuration for loading and initial audio preprocessing.
    Defines the sequence of operations applied to raw audio waveforms after
    loading, controlling steps like resampling, scaling, centering, and
    duration adjustment.
    Attributes
    ----------
    resample : ResampleConfig, optional
        Configuration for resampling. If provided (or defaulted), audio will
        be resampled to the specified `samplerate` using the specified `mode`.
        If set to `None` in the config file, resampling is skipped.
        Defaults to a ResampleConfig instance with standard settings.
    scale : bool, default=False
        If True, scales the audio waveform using peak normalization so that
        its maximum absolute amplitude is approximately 1.0. If False
        (default), no amplitude scaling is applied.
    center : bool, default=True
        If True (default), centers the waveform by subtracting its mean
        (DC offset removal). If False, the waveform is not centered.
    duration : float, optional
        If set to a float value (seconds), the loaded audio clip will be
        adjusted (cropped or padded with zeros) to exactly this duration.
        If None (default), the original duration is kept.
    """
    resample: Optional[ResampleConfig] = Field(default_factory=ResampleConfig)
    scale: bool = SCALE_RAW_AUDIO
    center: bool = True
    duration: Optional[float] = DEFAULT_DURATION
 class ConfigurableAudioLoader:
    """Concrete implementation of the `AudioLoader` driven by `AudioConfig`.
    This class loads audio and applies preprocessing steps (resampling,
    scaling, centering, duration adjustment) based on the settings provided
    in an `AudioConfig` object during initialization. It delegates the actual
    work to module-level functions.
    """
    def __init__(
        self,
        config: AudioConfig,
    ):
        """Initialize the ConfigurableAudioLoader.
        Parameters
        ----------
        config : AudioConfig
            The configuration object specifying the desired preprocessing steps
            and parameters.
        """
        self.config = config
    def load_file(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess audio directly from a file path.
        Implements the `AudioLoader.load_file` method by delegating to the
        `load_file_audio` function, passing the stored configuration.
        Parameters
        ----------
        path : PathLike
            Path to the audio file.
        audio_dir : PathLike, optional
            A directory prefix if `path` is relative.
        Returns
        -------
        xr.DataArray
            Loaded and preprocessed waveform (first channel).
        """
        return load_file_audio(path, config=self.config, audio_dir=audio_dir)
    def load_recording(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the entire audio for a Recording object.
        Implements the `AudioLoader.load_recording` method by delegating to the
        `load_recording_audio` function, passing the stored configuration.
        Parameters
        ----------
        recording : data.Recording
            The Recording object.
        audio_dir : PathLike, optional
            Directory containing the audio file.
        Returns
        -------
        xr.DataArray
            Loaded and preprocessed waveform (first channel).
        """
        return load_recording_audio(
            recording, config=self.config, audio_dir=audio_dir
        )
    def load_clip(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the audio segment defined by a Clip object.
        Implements the `AudioLoader.load_clip` method by delegating to the
        `load_clip_audio` function, passing the stored configuration.
        Parameters
        ----------
        clip : data.Clip
            The Clip object specifying the segment.
        audio_dir : PathLike, optional
            Directory containing the audio file.
        Returns
        -------
        xr.DataArray
            Loaded and preprocessed waveform segment (first channel).
        """
        return load_clip_audio(clip, config=self.config, audio_dir=audio_dir)
 def build_audio_loader(
    config: AudioConfig,
 ) -> AudioLoader:
    """Factory function to create an AudioLoader based on configuration.
    Instantiates and returns a `ConfigurableAudioLoader` initialized with
    the provided `AudioConfig`. The return type is `AudioLoader`, adhering
    to the protocol.
    Parameters
    ----------
    config : AudioConfig
        The configuration object specifying preprocessing steps.
    Returns
    -------
    AudioLoader
        An instance of `ConfigurableAudioLoader` ready to load and process audio
        according to the configuration.
    """
    return ConfigurableAudioLoader(config=config)
 def load_file_audio(
    path: data.PathLike,
    config: Optional[AudioConfig] = None,
    audio_dir: Optional[data.PathLike] = None,
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
    """Load and preprocess audio from a file path using specified config.
    Creates a `soundevent.data.Recording` object from the file path and then
    delegates the loading and processing to `load_recording_audio`.
    Parameters
    ----------
    path : PathLike
        Path to the audio file.
    config : AudioConfig, optional
        Audio processing configuration. If None, default settings defined
        in `AudioConfig` are used.
    audio_dir : PathLike, optional
        Directory prefix if `path` is relative.
    dtype : DTypeLike, default=np.float32
        Target NumPy data type for the loaded audio array.
    Returns
    -------
    xr.DataArray
        Loaded and preprocessed waveform (first channel only).
    """
    recording = data.Recording.from_file(path)
    return load_recording_audio(
        recording,
@ -48,6 +375,28 @@ def load_recording_audio(
    audio_dir: Optional[data.PathLike] = None,
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
    """Load and preprocess the entire audio content of a recording using config.
    Creates a `soundevent.data.Clip` spanning the full duration of the
    recording and then delegates the loading and processing to `load_clip_audio`.
    Parameters
    ----------
    recording : data.Recording
        The Recording object containing metadata and file path.
    config : AudioConfig, optional
        Audio processing configuration. If None, default settings are used.
    audio_dir : PathLike, optional
        Directory containing the audio file, used if the path in `recording`
        is relative.
    dtype : DTypeLike, default=np.float32
        Target NumPy data type for the loaded audio array.
    Returns
    -------
    xr.DataArray
        Loaded and preprocessed waveform (first channel only).
    """
    clip = data.Clip(
        recording=recording,
        start_time=0,
@ -67,6 +416,49 @@ def load_clip_audio(
    audio_dir: Optional[data.PathLike] = None,
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
    """Load and preprocess a specific audio clip segment based on config.
    This is the core function performing the configured processing pipeline:
    1. Loads the specified clip segment using `soundevent.audio.load_clip`.
    2. Selects the first audio channel.
    3. Adjusts duration (crop/pad) if `config.duration` is set.
    4. Resamples if `config.resample` is configured.
    5. Centers (DC offset removal) if `config.center` is True.
    6. Scales (peak normalization) if `config.scale` is True.
    Parameters
    ----------
    clip : data.Clip
        The Clip object defining the audio segment and source recording.
    config : AudioConfig, optional
        Audio processing configuration. If None, a default `AudioConfig` is
        used.
    audio_dir : PathLike, optional
        Directory containing the source audio file specified in the clip's
        recording.
    dtype : DTypeLike, default=np.float32
        Target NumPy data type for the processed audio array.
    Returns
    -------
    xr.DataArray
        The loaded and preprocessed waveform segment as an xarray DataArray
        with time coordinates.
    Raises
    ------
    FileNotFoundError
        If the underlying audio file cannot be found.
    Exception
        If audio loading or processing fails for other reasons (e.g., invalid
        format, resampling error).
    Notes
    -----
    - **Mono Processing:** This function currently loads and processes only the
      **first channel** (channel 0) of the audio file. Any other channels
      are ignored.
    """
    config = config or AudioConfig()
    wav = (
@ -96,6 +488,30 @@ def adjust_audio_duration(
    wave: xr.DataArray,
    duration: float,
 ) -> xr.DataArray:
    """Adjust the duration of an audio waveform array via cropping or padding.
    If the current duration is longer than the target, it crops the array
    from the beginning. If shorter, it pads the array with zeros at the end
    using `soundevent.arrays.extend_dim`.
    Parameters
    ----------
    wave : xr.DataArray
        The input audio waveform with a 'time' dimension and coordinates.
    duration : float
        The target duration in seconds.
    Returns
    -------
    xr.DataArray
        The waveform adjusted to the target duration. Returns the input
        unmodified if duration already matches or if the wave is empty.
    Raises
    ------
    ValueError
        If `duration` is negative.
    """
    start_time, end_time = arrays.get_dim_range(wave, dim="time")
    current_duration = end_time - start_time
@ -124,6 +540,36 @@ def resample_audio(
    mode: str = "poly",
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
    """Resample an audio waveform DataArray to a target sample rate.
    Updates the 'time' coordinate axis according to the new sample rate and
    number of samples. Uses either polyphase (`scipy.signal.resample_poly`)
    or Fourier method (`scipy.signal.resample`) based on the `mode`.
    Parameters
    ----------
    wav : xr.DataArray
        Input audio waveform with 'time' dimension and coordinates.
    samplerate : int, default=TARGET_SAMPLERATE_HZ
        Target sample rate in Hz.
    mode : str, default="poly"
        Resampling algorithm: "poly" or "fourier".
    dtype : DTypeLike, default=np.float32
        Target data type for the resampled array.
    Returns
    -------
    xr.DataArray
        Resampled waveform with updated time coordinates. Returns the input
        unmodified (but dtype cast) if the sample rate is already correct or
        if the input array is empty.
    Raises
    ------
    ValueError
        If `wav` lacks a 'time' dimension, the original sample rate cannot
        be determined, `samplerate` is non-positive, or `mode` is invalid.
    """
    if "time" not in wav.dims:
        raise ValueError("Audio must have a time dimension")
@ -180,6 +626,33 @@ def resample_audio_poly(
    sr_new: int,
    axis: int = -1,
 ) -> np.ndarray:
    """Resample a numpy array using `scipy.signal.resample_poly`.
    This method is often preferred for signals when the ratio of new
    to old sample rates can be expressed as a rational number. It uses
    polyphase filtering.
    Parameters
    ----------
    array : np.ndarray
        The input array to resample.
    sr_orig : int
        The original sample rate in Hz.
    sr_new : int
        The target sample rate in Hz.
    axis : int, default=-1
        The axis of `array` along which to resample.
    Returns
    -------
    np.ndarray
        The array resampled to the target sample rate.
    Raises
    ------
    ValueError
        If sample rates are not positive.
    """
    gcd = np.gcd(sr_orig, sr_new)
    return resample_poly(
        array.values,
@ -195,5 +668,28 @@ def resample_audio_fourier(
    sr_new: int,
    axis: int = -1,
 ) -> np.ndarray:
    """Resample a numpy array using `scipy.signal.resample`.
    This method uses FFTs to resample the signal.
    Parameters
    ----------
    array : np.ndarray
        The input array to resample.
    num : int
        The desired number of samples in the output array along `axis`.
    axis : int, default=-1
        The axis of `array` along which to resample.
    Returns
    -------
    np.ndarray
        The array resampled to have `num` samples along `axis`.
    Raises
    ------
    ValueError
        If `num` is negative.
    """
    ratio = sr_new / sr_orig
    return resample(array, int(array.shape[axis] * ratio), axis=axis)  # type: ignore