Added docstrings to audio module

This commit is contained in:
mbsantiago 2025-04-16 19:44:30 +01:00
parent a9f91322d4
commit 23620c2233
2 changed files with 513 additions and 2 deletions

View File

@ -1,6 +1,7 @@
"""Module containing functions for preprocessing audio clips."""
from typing import Optional
from functools import partial
from typing import Callable, Optional, Protocol
import xarray as xr
from soundevent import data
@ -47,6 +48,20 @@ __all__ = [
]
class AudioPreprocessor(Protocol):
def __call__(
self,
clip: data.Clip,
audio_dir: Optional[data.PathLike] = None,
) -> xr.DataArray: ...
def build_preprocessor_from_config(
config: PreprocessingConfig,
) -> AudioPreprocessor:
return partial(preprocess_audio_clip, config=config)
def preprocess_audio_clip(
clip: data.Clip,
config: Optional[PreprocessingConfig] = None,

View File

@ -1,4 +1,26 @@
from typing import Optional
"""Handles loading and initial preprocessing of audio waveforms.
This module provides components for loading audio data associated with
`soundevent` objects (Clips, Recordings, or raw files) and applying
fundamental waveform processing steps. These steps typically include:
1. Loading the raw audio data.
2. Adjusting the audio clip to a fixed duration (optional).
3. Resampling the audio to a target sample rate (optional).
4. Centering the waveform (DC offset removal) (optional).
5. Scaling the waveform amplitude (optional).
The processing pipeline is configurable via the `AudioConfig` data structure,
allowing for reproducible preprocessing consistent between model training and
inference. It uses the `soundevent` library for audio loading and basic array
operations, and `scipy` for resampling implementations.
The primary interface is the `AudioLoader` protocol, with
`ConfigurableAudioLoader` providing a concrete implementation driven by the
`AudioConfig`.
"""
from typing import Optional, Protocol
import numpy as np
import xarray as xr
@ -10,29 +32,334 @@ from soundevent.arrays import operations as ops
from batdetect2.configs import BaseConfig
__all__ = [
"AudioLoader",
"ResampleConfig",
"AudioConfig",
"ConfigurableAudioLoader",
"build_audio_loader",
"load_file_audio",
"load_recording_audio",
"load_clip_audio",
"adjust_audio_duration",
"resample_audio",
"TARGET_SAMPLERATE_HZ",
"SCALE_RAW_AUDIO",
"DEFAULT_DURATION",
]
TARGET_SAMPLERATE_HZ = 256_000
"""Default target sample rate in Hz used if resampling is enabled."""
SCALE_RAW_AUDIO = False
"""Default setting for whether to perform peak normalization."""
DEFAULT_DURATION = None
"""Default setting for target audio duration in seconds."""
class AudioLoader(Protocol):
"""Defines the interface for an audio loading and processing component.
An AudioLoader is responsible for retrieving audio data corresponding to
different soundevent objects (files, Recordings, Clips) and applying a
configured set of initial preprocessing steps. Adhering to this protocol
allows for different loading strategies or implementations.
"""
def load_file(
self,
path: data.PathLike,
audio_dir: Optional[data.PathLike] = None,
) -> xr.DataArray:
"""Load and preprocess audio directly from a file path.
Parameters
----------
path : PathLike
Path to the audio file.
audio_dir : PathLike, optional
A directory prefix to prepend to the path if `path` is relative.
Returns
-------
xr.DataArray
The loaded and preprocessed audio waveform as an xarray DataArray
with time coordinates. Typically loads only the first channel.
Raises
------
FileNotFoundError
If the audio file cannot be found.
Exception
If the audio file cannot be loaded or processed.
"""
...
def load_recording(
self,
recording: data.Recording,
audio_dir: Optional[data.PathLike] = None,
) -> xr.DataArray:
"""Load and preprocess the entire audio for a Recording object.
Parameters
----------
recording : data.Recording
The Recording object containing metadata about the audio file.
audio_dir : PathLike, optional
A directory where the audio file associated with the recording
can be found, especially if the path in the recording is relative.
Returns
-------
xr.DataArray
The loaded and preprocessed audio waveform. Typically loads only
the first channel.
Raises
------
FileNotFoundError
If the audio file associated with the recording cannot be found.
Exception
If the audio file cannot be loaded or processed.
"""
...
def load_clip(
self,
clip: data.Clip,
audio_dir: Optional[data.PathLike] = None,
) -> xr.DataArray:
"""Load and preprocess the audio segment defined by a Clip object.
Parameters
----------
clip : data.Clip
The Clip object specifying the recording and the start/end times
of the segment to load.
audio_dir : PathLike, optional
A directory where the audio file associated with the clip's
recording can be found.
Returns
-------
xr.DataArray
The loaded and preprocessed audio waveform for the specified clip
duration. Typically loads only the first channel.
Raises
------
FileNotFoundError
If the audio file associated with the clip cannot be found.
Exception
If the audio file cannot be loaded or processed.
"""
...
class ResampleConfig(BaseConfig):
"""Configuration for audio resampling.
Attributes
----------
samplerate : int, default=256000
The target sample rate in Hz to resample the audio to. Must be > 0.
mode : str, default="poly"
The resampling algorithm to use. Options:
- "poly": Polyphase resampling using `scipy.signal.resample_poly`.
Generally fast.
- "fourier": Resampling via Fourier method using
`scipy.signal.resample`. May handle non-integer
resampling factors differently.
"""
samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0)
mode: str = "poly"
class AudioConfig(BaseConfig):
"""Configuration for loading and initial audio preprocessing.
Defines the sequence of operations applied to raw audio waveforms after
loading, controlling steps like resampling, scaling, centering, and
duration adjustment.
Attributes
----------
resample : ResampleConfig, optional
Configuration for resampling. If provided (or defaulted), audio will
be resampled to the specified `samplerate` using the specified `mode`.
If set to `None` in the config file, resampling is skipped.
Defaults to a ResampleConfig instance with standard settings.
scale : bool, default=False
If True, scales the audio waveform using peak normalization so that
its maximum absolute amplitude is approximately 1.0. If False
(default), no amplitude scaling is applied.
center : bool, default=True
If True (default), centers the waveform by subtracting its mean
(DC offset removal). If False, the waveform is not centered.
duration : float, optional
If set to a float value (seconds), the loaded audio clip will be
adjusted (cropped or padded with zeros) to exactly this duration.
If None (default), the original duration is kept.
"""
resample: Optional[ResampleConfig] = Field(default_factory=ResampleConfig)
scale: bool = SCALE_RAW_AUDIO
center: bool = True
duration: Optional[float] = DEFAULT_DURATION
class ConfigurableAudioLoader:
"""Concrete implementation of the `AudioLoader` driven by `AudioConfig`.
This class loads audio and applies preprocessing steps (resampling,
scaling, centering, duration adjustment) based on the settings provided
in an `AudioConfig` object during initialization. It delegates the actual
work to module-level functions.
"""
def __init__(
self,
config: AudioConfig,
):
"""Initialize the ConfigurableAudioLoader.
Parameters
----------
config : AudioConfig
The configuration object specifying the desired preprocessing steps
and parameters.
"""
self.config = config
def load_file(
self,
path: data.PathLike,
audio_dir: Optional[data.PathLike] = None,
) -> xr.DataArray:
"""Load and preprocess audio directly from a file path.
Implements the `AudioLoader.load_file` method by delegating to the
`load_file_audio` function, passing the stored configuration.
Parameters
----------
path : PathLike
Path to the audio file.
audio_dir : PathLike, optional
A directory prefix if `path` is relative.
Returns
-------
xr.DataArray
Loaded and preprocessed waveform (first channel).
"""
return load_file_audio(path, config=self.config, audio_dir=audio_dir)
def load_recording(
self,
recording: data.Recording,
audio_dir: Optional[data.PathLike] = None,
) -> xr.DataArray:
"""Load and preprocess the entire audio for a Recording object.
Implements the `AudioLoader.load_recording` method by delegating to the
`load_recording_audio` function, passing the stored configuration.
Parameters
----------
recording : data.Recording
The Recording object.
audio_dir : PathLike, optional
Directory containing the audio file.
Returns
-------
xr.DataArray
Loaded and preprocessed waveform (first channel).
"""
return load_recording_audio(
recording, config=self.config, audio_dir=audio_dir
)
def load_clip(
self,
clip: data.Clip,
audio_dir: Optional[data.PathLike] = None,
) -> xr.DataArray:
"""Load and preprocess the audio segment defined by a Clip object.
Implements the `AudioLoader.load_clip` method by delegating to the
`load_clip_audio` function, passing the stored configuration.
Parameters
----------
clip : data.Clip
The Clip object specifying the segment.
audio_dir : PathLike, optional
Directory containing the audio file.
Returns
-------
xr.DataArray
Loaded and preprocessed waveform segment (first channel).
"""
return load_clip_audio(clip, config=self.config, audio_dir=audio_dir)
def build_audio_loader(
config: AudioConfig,
) -> AudioLoader:
"""Factory function to create an AudioLoader based on configuration.
Instantiates and returns a `ConfigurableAudioLoader` initialized with
the provided `AudioConfig`. The return type is `AudioLoader`, adhering
to the protocol.
Parameters
----------
config : AudioConfig
The configuration object specifying preprocessing steps.
Returns
-------
AudioLoader
An instance of `ConfigurableAudioLoader` ready to load and process audio
according to the configuration.
"""
return ConfigurableAudioLoader(config=config)
def load_file_audio(
path: data.PathLike,
config: Optional[AudioConfig] = None,
audio_dir: Optional[data.PathLike] = None,
dtype: DTypeLike = np.float32, # type: ignore
) -> xr.DataArray:
"""Load and preprocess audio from a file path using specified config.
Creates a `soundevent.data.Recording` object from the file path and then
delegates the loading and processing to `load_recording_audio`.
Parameters
----------
path : PathLike
Path to the audio file.
config : AudioConfig, optional
Audio processing configuration. If None, default settings defined
in `AudioConfig` are used.
audio_dir : PathLike, optional
Directory prefix if `path` is relative.
dtype : DTypeLike, default=np.float32
Target NumPy data type for the loaded audio array.
Returns
-------
xr.DataArray
Loaded and preprocessed waveform (first channel only).
"""
recording = data.Recording.from_file(path)
return load_recording_audio(
recording,
@ -48,6 +375,28 @@ def load_recording_audio(
audio_dir: Optional[data.PathLike] = None,
dtype: DTypeLike = np.float32, # type: ignore
) -> xr.DataArray:
"""Load and preprocess the entire audio content of a recording using config.
Creates a `soundevent.data.Clip` spanning the full duration of the
recording and then delegates the loading and processing to `load_clip_audio`.
Parameters
----------
recording : data.Recording
The Recording object containing metadata and file path.
config : AudioConfig, optional
Audio processing configuration. If None, default settings are used.
audio_dir : PathLike, optional
Directory containing the audio file, used if the path in `recording`
is relative.
dtype : DTypeLike, default=np.float32
Target NumPy data type for the loaded audio array.
Returns
-------
xr.DataArray
Loaded and preprocessed waveform (first channel only).
"""
clip = data.Clip(
recording=recording,
start_time=0,
@ -67,6 +416,49 @@ def load_clip_audio(
audio_dir: Optional[data.PathLike] = None,
dtype: DTypeLike = np.float32, # type: ignore
) -> xr.DataArray:
"""Load and preprocess a specific audio clip segment based on config.
This is the core function performing the configured processing pipeline:
1. Loads the specified clip segment using `soundevent.audio.load_clip`.
2. Selects the first audio channel.
3. Adjusts duration (crop/pad) if `config.duration` is set.
4. Resamples if `config.resample` is configured.
5. Centers (DC offset removal) if `config.center` is True.
6. Scales (peak normalization) if `config.scale` is True.
Parameters
----------
clip : data.Clip
The Clip object defining the audio segment and source recording.
config : AudioConfig, optional
Audio processing configuration. If None, a default `AudioConfig` is
used.
audio_dir : PathLike, optional
Directory containing the source audio file specified in the clip's
recording.
dtype : DTypeLike, default=np.float32
Target NumPy data type for the processed audio array.
Returns
-------
xr.DataArray
The loaded and preprocessed waveform segment as an xarray DataArray
with time coordinates.
Raises
------
FileNotFoundError
If the underlying audio file cannot be found.
Exception
If audio loading or processing fails for other reasons (e.g., invalid
format, resampling error).
Notes
-----
- **Mono Processing:** This function currently loads and processes only the
**first channel** (channel 0) of the audio file. Any other channels
are ignored.
"""
config = config or AudioConfig()
wav = (
@ -96,6 +488,30 @@ def adjust_audio_duration(
wave: xr.DataArray,
duration: float,
) -> xr.DataArray:
"""Adjust the duration of an audio waveform array via cropping or padding.
If the current duration is longer than the target, it crops the array
from the beginning. If shorter, it pads the array with zeros at the end
using `soundevent.arrays.extend_dim`.
Parameters
----------
wave : xr.DataArray
The input audio waveform with a 'time' dimension and coordinates.
duration : float
The target duration in seconds.
Returns
-------
xr.DataArray
The waveform adjusted to the target duration. Returns the input
unmodified if duration already matches or if the wave is empty.
Raises
------
ValueError
If `duration` is negative.
"""
start_time, end_time = arrays.get_dim_range(wave, dim="time")
current_duration = end_time - start_time
@ -124,6 +540,36 @@ def resample_audio(
mode: str = "poly",
dtype: DTypeLike = np.float32, # type: ignore
) -> xr.DataArray:
"""Resample an audio waveform DataArray to a target sample rate.
Updates the 'time' coordinate axis according to the new sample rate and
number of samples. Uses either polyphase (`scipy.signal.resample_poly`)
or Fourier method (`scipy.signal.resample`) based on the `mode`.
Parameters
----------
wav : xr.DataArray
Input audio waveform with 'time' dimension and coordinates.
samplerate : int, default=TARGET_SAMPLERATE_HZ
Target sample rate in Hz.
mode : str, default="poly"
Resampling algorithm: "poly" or "fourier".
dtype : DTypeLike, default=np.float32
Target data type for the resampled array.
Returns
-------
xr.DataArray
Resampled waveform with updated time coordinates. Returns the input
unmodified (but dtype cast) if the sample rate is already correct or
if the input array is empty.
Raises
------
ValueError
If `wav` lacks a 'time' dimension, the original sample rate cannot
be determined, `samplerate` is non-positive, or `mode` is invalid.
"""
if "time" not in wav.dims:
raise ValueError("Audio must have a time dimension")
@ -180,6 +626,33 @@ def resample_audio_poly(
sr_new: int,
axis: int = -1,
) -> np.ndarray:
"""Resample a numpy array using `scipy.signal.resample_poly`.
This method is often preferred for signals when the ratio of new
to old sample rates can be expressed as a rational number. It uses
polyphase filtering.
Parameters
----------
array : np.ndarray
The input array to resample.
sr_orig : int
The original sample rate in Hz.
sr_new : int
The target sample rate in Hz.
axis : int, default=-1
The axis of `array` along which to resample.
Returns
-------
np.ndarray
The array resampled to the target sample rate.
Raises
------
ValueError
If sample rates are not positive.
"""
gcd = np.gcd(sr_orig, sr_new)
return resample_poly(
array.values,
@ -195,5 +668,28 @@ def resample_audio_fourier(
sr_new: int,
axis: int = -1,
) -> np.ndarray:
"""Resample a numpy array using `scipy.signal.resample`.
This method uses FFTs to resample the signal.
Parameters
----------
array : np.ndarray
The input array to resample.
num : int
The desired number of samples in the output array along `axis`.
axis : int, default=-1
The axis of `array` along which to resample.
Returns
-------
np.ndarray
The array resampled to have `num` samples along `axis`.
Raises
------
ValueError
If `num` is negative.
"""
ratio = sr_new / sr_orig
return resample(array, int(array.shape[axis] * ratio), axis=axis) # type: ignore