From 23620c22334fa091ac7c0ac0758a3e8e9d663fe3 Mon Sep 17 00:00:00 2001 From: mbsantiago Date: Wed, 16 Apr 2025 19:44:30 +0100 Subject: [PATCH] Added docstrings to audio module --- batdetect2/preprocess/__init__.py | 17 +- batdetect2/preprocess/audio.py | 498 +++++++++++++++++++++++++++++- 2 files changed, 513 insertions(+), 2 deletions(-) diff --git a/batdetect2/preprocess/__init__.py b/batdetect2/preprocess/__init__.py index ea4b0a4..62f9c82 100644 --- a/batdetect2/preprocess/__init__.py +++ b/batdetect2/preprocess/__init__.py @@ -1,6 +1,7 @@ """Module containing functions for preprocessing audio clips.""" -from typing import Optional +from functools import partial +from typing import Callable, Optional, Protocol import xarray as xr from soundevent import data @@ -47,6 +48,20 @@ __all__ = [ ] +class AudioPreprocessor(Protocol): + def __call__( + self, + clip: data.Clip, + audio_dir: Optional[data.PathLike] = None, + ) -> xr.DataArray: ... + + +def build_preprocessor_from_config( + config: PreprocessingConfig, +) -> AudioPreprocessor: + return partial(preprocess_audio_clip, config=config) + + def preprocess_audio_clip( clip: data.Clip, config: Optional[PreprocessingConfig] = None, diff --git a/batdetect2/preprocess/audio.py b/batdetect2/preprocess/audio.py index 66820d2..ff1e751 100644 --- a/batdetect2/preprocess/audio.py +++ b/batdetect2/preprocess/audio.py @@ -1,4 +1,26 @@ -from typing import Optional +"""Handles loading and initial preprocessing of audio waveforms. + +This module provides components for loading audio data associated with +`soundevent` objects (Clips, Recordings, or raw files) and applying +fundamental waveform processing steps. These steps typically include: + +1. Loading the raw audio data. +2. Adjusting the audio clip to a fixed duration (optional). +3. Resampling the audio to a target sample rate (optional). +4. Centering the waveform (DC offset removal) (optional). +5. Scaling the waveform amplitude (optional). + +The processing pipeline is configurable via the `AudioConfig` data structure, +allowing for reproducible preprocessing consistent between model training and +inference. It uses the `soundevent` library for audio loading and basic array +operations, and `scipy` for resampling implementations. + +The primary interface is the `AudioLoader` protocol, with +`ConfigurableAudioLoader` providing a concrete implementation driven by the +`AudioConfig`. +""" + +from typing import Optional, Protocol import numpy as np import xarray as xr @@ -10,29 +32,334 @@ from soundevent.arrays import operations as ops from batdetect2.configs import BaseConfig +__all__ = [ + "AudioLoader", + "ResampleConfig", + "AudioConfig", + "ConfigurableAudioLoader", + "build_audio_loader", + "load_file_audio", + "load_recording_audio", + "load_clip_audio", + "adjust_audio_duration", + "resample_audio", + "TARGET_SAMPLERATE_HZ", + "SCALE_RAW_AUDIO", + "DEFAULT_DURATION", +] + TARGET_SAMPLERATE_HZ = 256_000 +"""Default target sample rate in Hz used if resampling is enabled.""" + SCALE_RAW_AUDIO = False +"""Default setting for whether to perform peak normalization.""" + DEFAULT_DURATION = None +"""Default setting for target audio duration in seconds.""" + + +class AudioLoader(Protocol): + """Defines the interface for an audio loading and processing component. + + An AudioLoader is responsible for retrieving audio data corresponding to + different soundevent objects (files, Recordings, Clips) and applying a + configured set of initial preprocessing steps. Adhering to this protocol + allows for different loading strategies or implementations. + """ + + def load_file( + self, + path: data.PathLike, + audio_dir: Optional[data.PathLike] = None, + ) -> xr.DataArray: + """Load and preprocess audio directly from a file path. + + Parameters + ---------- + path : PathLike + Path to the audio file. + audio_dir : PathLike, optional + A directory prefix to prepend to the path if `path` is relative. + + Returns + ------- + xr.DataArray + The loaded and preprocessed audio waveform as an xarray DataArray + with time coordinates. Typically loads only the first channel. + + Raises + ------ + FileNotFoundError + If the audio file cannot be found. + Exception + If the audio file cannot be loaded or processed. + """ + ... + + def load_recording( + self, + recording: data.Recording, + audio_dir: Optional[data.PathLike] = None, + ) -> xr.DataArray: + """Load and preprocess the entire audio for a Recording object. + + Parameters + ---------- + recording : data.Recording + The Recording object containing metadata about the audio file. + audio_dir : PathLike, optional + A directory where the audio file associated with the recording + can be found, especially if the path in the recording is relative. + + Returns + ------- + xr.DataArray + The loaded and preprocessed audio waveform. Typically loads only + the first channel. + + Raises + ------ + FileNotFoundError + If the audio file associated with the recording cannot be found. + Exception + If the audio file cannot be loaded or processed. + """ + ... + + def load_clip( + self, + clip: data.Clip, + audio_dir: Optional[data.PathLike] = None, + ) -> xr.DataArray: + """Load and preprocess the audio segment defined by a Clip object. + + Parameters + ---------- + clip : data.Clip + The Clip object specifying the recording and the start/end times + of the segment to load. + audio_dir : PathLike, optional + A directory where the audio file associated with the clip's + recording can be found. + + Returns + ------- + xr.DataArray + The loaded and preprocessed audio waveform for the specified clip + duration. Typically loads only the first channel. + + Raises + ------ + FileNotFoundError + If the audio file associated with the clip cannot be found. + Exception + If the audio file cannot be loaded or processed. + """ + ... class ResampleConfig(BaseConfig): + """Configuration for audio resampling. + + Attributes + ---------- + samplerate : int, default=256000 + The target sample rate in Hz to resample the audio to. Must be > 0. + mode : str, default="poly" + The resampling algorithm to use. Options: + - "poly": Polyphase resampling using `scipy.signal.resample_poly`. + Generally fast. + - "fourier": Resampling via Fourier method using + `scipy.signal.resample`. May handle non-integer + resampling factors differently. + """ + samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0) mode: str = "poly" class AudioConfig(BaseConfig): + """Configuration for loading and initial audio preprocessing. + + Defines the sequence of operations applied to raw audio waveforms after + loading, controlling steps like resampling, scaling, centering, and + duration adjustment. + + Attributes + ---------- + resample : ResampleConfig, optional + Configuration for resampling. If provided (or defaulted), audio will + be resampled to the specified `samplerate` using the specified `mode`. + If set to `None` in the config file, resampling is skipped. + Defaults to a ResampleConfig instance with standard settings. + scale : bool, default=False + If True, scales the audio waveform using peak normalization so that + its maximum absolute amplitude is approximately 1.0. If False + (default), no amplitude scaling is applied. + center : bool, default=True + If True (default), centers the waveform by subtracting its mean + (DC offset removal). If False, the waveform is not centered. + duration : float, optional + If set to a float value (seconds), the loaded audio clip will be + adjusted (cropped or padded with zeros) to exactly this duration. + If None (default), the original duration is kept. + """ + resample: Optional[ResampleConfig] = Field(default_factory=ResampleConfig) scale: bool = SCALE_RAW_AUDIO center: bool = True duration: Optional[float] = DEFAULT_DURATION +class ConfigurableAudioLoader: + """Concrete implementation of the `AudioLoader` driven by `AudioConfig`. + + This class loads audio and applies preprocessing steps (resampling, + scaling, centering, duration adjustment) based on the settings provided + in an `AudioConfig` object during initialization. It delegates the actual + work to module-level functions. + """ + + def __init__( + self, + config: AudioConfig, + ): + """Initialize the ConfigurableAudioLoader. + + Parameters + ---------- + config : AudioConfig + The configuration object specifying the desired preprocessing steps + and parameters. + """ + self.config = config + + def load_file( + self, + path: data.PathLike, + audio_dir: Optional[data.PathLike] = None, + ) -> xr.DataArray: + """Load and preprocess audio directly from a file path. + + Implements the `AudioLoader.load_file` method by delegating to the + `load_file_audio` function, passing the stored configuration. + + Parameters + ---------- + path : PathLike + Path to the audio file. + audio_dir : PathLike, optional + A directory prefix if `path` is relative. + + Returns + ------- + xr.DataArray + Loaded and preprocessed waveform (first channel). + """ + return load_file_audio(path, config=self.config, audio_dir=audio_dir) + + def load_recording( + self, + recording: data.Recording, + audio_dir: Optional[data.PathLike] = None, + ) -> xr.DataArray: + """Load and preprocess the entire audio for a Recording object. + + Implements the `AudioLoader.load_recording` method by delegating to the + `load_recording_audio` function, passing the stored configuration. + + Parameters + ---------- + recording : data.Recording + The Recording object. + audio_dir : PathLike, optional + Directory containing the audio file. + + Returns + ------- + xr.DataArray + Loaded and preprocessed waveform (first channel). + """ + return load_recording_audio( + recording, config=self.config, audio_dir=audio_dir + ) + + def load_clip( + self, + clip: data.Clip, + audio_dir: Optional[data.PathLike] = None, + ) -> xr.DataArray: + """Load and preprocess the audio segment defined by a Clip object. + + Implements the `AudioLoader.load_clip` method by delegating to the + `load_clip_audio` function, passing the stored configuration. + + Parameters + ---------- + clip : data.Clip + The Clip object specifying the segment. + audio_dir : PathLike, optional + Directory containing the audio file. + + Returns + ------- + xr.DataArray + Loaded and preprocessed waveform segment (first channel). + """ + return load_clip_audio(clip, config=self.config, audio_dir=audio_dir) + + +def build_audio_loader( + config: AudioConfig, +) -> AudioLoader: + """Factory function to create an AudioLoader based on configuration. + + Instantiates and returns a `ConfigurableAudioLoader` initialized with + the provided `AudioConfig`. The return type is `AudioLoader`, adhering + to the protocol. + + Parameters + ---------- + config : AudioConfig + The configuration object specifying preprocessing steps. + + Returns + ------- + AudioLoader + An instance of `ConfigurableAudioLoader` ready to load and process audio + according to the configuration. + """ + return ConfigurableAudioLoader(config=config) + + def load_file_audio( path: data.PathLike, config: Optional[AudioConfig] = None, audio_dir: Optional[data.PathLike] = None, dtype: DTypeLike = np.float32, # type: ignore ) -> xr.DataArray: + """Load and preprocess audio from a file path using specified config. + + Creates a `soundevent.data.Recording` object from the file path and then + delegates the loading and processing to `load_recording_audio`. + + Parameters + ---------- + path : PathLike + Path to the audio file. + config : AudioConfig, optional + Audio processing configuration. If None, default settings defined + in `AudioConfig` are used. + audio_dir : PathLike, optional + Directory prefix if `path` is relative. + dtype : DTypeLike, default=np.float32 + Target NumPy data type for the loaded audio array. + + Returns + ------- + xr.DataArray + Loaded and preprocessed waveform (first channel only). + """ recording = data.Recording.from_file(path) return load_recording_audio( recording, @@ -48,6 +375,28 @@ def load_recording_audio( audio_dir: Optional[data.PathLike] = None, dtype: DTypeLike = np.float32, # type: ignore ) -> xr.DataArray: + """Load and preprocess the entire audio content of a recording using config. + + Creates a `soundevent.data.Clip` spanning the full duration of the + recording and then delegates the loading and processing to `load_clip_audio`. + + Parameters + ---------- + recording : data.Recording + The Recording object containing metadata and file path. + config : AudioConfig, optional + Audio processing configuration. If None, default settings are used. + audio_dir : PathLike, optional + Directory containing the audio file, used if the path in `recording` + is relative. + dtype : DTypeLike, default=np.float32 + Target NumPy data type for the loaded audio array. + + Returns + ------- + xr.DataArray + Loaded and preprocessed waveform (first channel only). + """ clip = data.Clip( recording=recording, start_time=0, @@ -67,6 +416,49 @@ def load_clip_audio( audio_dir: Optional[data.PathLike] = None, dtype: DTypeLike = np.float32, # type: ignore ) -> xr.DataArray: + """Load and preprocess a specific audio clip segment based on config. + + This is the core function performing the configured processing pipeline: + 1. Loads the specified clip segment using `soundevent.audio.load_clip`. + 2. Selects the first audio channel. + 3. Adjusts duration (crop/pad) if `config.duration` is set. + 4. Resamples if `config.resample` is configured. + 5. Centers (DC offset removal) if `config.center` is True. + 6. Scales (peak normalization) if `config.scale` is True. + + Parameters + ---------- + clip : data.Clip + The Clip object defining the audio segment and source recording. + config : AudioConfig, optional + Audio processing configuration. If None, a default `AudioConfig` is + used. + audio_dir : PathLike, optional + Directory containing the source audio file specified in the clip's + recording. + dtype : DTypeLike, default=np.float32 + Target NumPy data type for the processed audio array. + + Returns + ------- + xr.DataArray + The loaded and preprocessed waveform segment as an xarray DataArray + with time coordinates. + + Raises + ------ + FileNotFoundError + If the underlying audio file cannot be found. + Exception + If audio loading or processing fails for other reasons (e.g., invalid + format, resampling error). + + Notes + ----- + - **Mono Processing:** This function currently loads and processes only the + **first channel** (channel 0) of the audio file. Any other channels + are ignored. + """ config = config or AudioConfig() wav = ( @@ -96,6 +488,30 @@ def adjust_audio_duration( wave: xr.DataArray, duration: float, ) -> xr.DataArray: + """Adjust the duration of an audio waveform array via cropping or padding. + + If the current duration is longer than the target, it crops the array + from the beginning. If shorter, it pads the array with zeros at the end + using `soundevent.arrays.extend_dim`. + + Parameters + ---------- + wave : xr.DataArray + The input audio waveform with a 'time' dimension and coordinates. + duration : float + The target duration in seconds. + + Returns + ------- + xr.DataArray + The waveform adjusted to the target duration. Returns the input + unmodified if duration already matches or if the wave is empty. + + Raises + ------ + ValueError + If `duration` is negative. + """ start_time, end_time = arrays.get_dim_range(wave, dim="time") current_duration = end_time - start_time @@ -124,6 +540,36 @@ def resample_audio( mode: str = "poly", dtype: DTypeLike = np.float32, # type: ignore ) -> xr.DataArray: + """Resample an audio waveform DataArray to a target sample rate. + + Updates the 'time' coordinate axis according to the new sample rate and + number of samples. Uses either polyphase (`scipy.signal.resample_poly`) + or Fourier method (`scipy.signal.resample`) based on the `mode`. + + Parameters + ---------- + wav : xr.DataArray + Input audio waveform with 'time' dimension and coordinates. + samplerate : int, default=TARGET_SAMPLERATE_HZ + Target sample rate in Hz. + mode : str, default="poly" + Resampling algorithm: "poly" or "fourier". + dtype : DTypeLike, default=np.float32 + Target data type for the resampled array. + + Returns + ------- + xr.DataArray + Resampled waveform with updated time coordinates. Returns the input + unmodified (but dtype cast) if the sample rate is already correct or + if the input array is empty. + + Raises + ------ + ValueError + If `wav` lacks a 'time' dimension, the original sample rate cannot + be determined, `samplerate` is non-positive, or `mode` is invalid. + """ if "time" not in wav.dims: raise ValueError("Audio must have a time dimension") @@ -180,6 +626,33 @@ def resample_audio_poly( sr_new: int, axis: int = -1, ) -> np.ndarray: + """Resample a numpy array using `scipy.signal.resample_poly`. + + This method is often preferred for signals when the ratio of new + to old sample rates can be expressed as a rational number. It uses + polyphase filtering. + + Parameters + ---------- + array : np.ndarray + The input array to resample. + sr_orig : int + The original sample rate in Hz. + sr_new : int + The target sample rate in Hz. + axis : int, default=-1 + The axis of `array` along which to resample. + + Returns + ------- + np.ndarray + The array resampled to the target sample rate. + + Raises + ------ + ValueError + If sample rates are not positive. + """ gcd = np.gcd(sr_orig, sr_new) return resample_poly( array.values, @@ -195,5 +668,28 @@ def resample_audio_fourier( sr_new: int, axis: int = -1, ) -> np.ndarray: + """Resample a numpy array using `scipy.signal.resample`. + + This method uses FFTs to resample the signal. + + Parameters + ---------- + array : np.ndarray + The input array to resample. + num : int + The desired number of samples in the output array along `axis`. + axis : int, default=-1 + The axis of `array` along which to resample. + + Returns + ------- + np.ndarray + The array resampled to have `num` samples along `axis`. + + Raises + ------ + ValueError + If `num` is negative. + """ ratio = sr_new / sr_orig return resample(array, int(array.shape[axis] * ratio), axis=axis) # type: ignore