From ac4bb8f02319adecaf42a92cbe4cfc1ed4effe45 Mon Sep 17 00:00:00 2001 From: mbsantiago Date: Wed, 23 Apr 2025 23:14:31 +0100 Subject: [PATCH] Added min and max freq attributes to preprocessor protocol --- batdetect2/preprocess/__init__.py | 12 +++++ batdetect2/preprocess/audio.py | 16 ++++-- batdetect2/preprocess/spectrogram.py | 79 ++++++---------------------- batdetect2/preprocess/types.py | 4 ++ 4 files changed, 43 insertions(+), 68 deletions(-) diff --git a/batdetect2/preprocess/__init__.py b/batdetect2/preprocess/__init__.py index f875fc8..f27b591 100644 --- a/batdetect2/preprocess/__init__.py +++ b/batdetect2/preprocess/__init__.py @@ -144,12 +144,16 @@ class StandardPreprocessor(PreprocessorProtocol): audio_loader: AudioLoader spectrogram_builder: SpectrogramBuilder default_samplerate: int + max_freq: float + min_freq: float def __init__( self, audio_loader: AudioLoader, spectrogram_builder: SpectrogramBuilder, default_samplerate: int, + max_freq: float, + min_freq: float, ) -> None: """Initialize the StandardPreprocessor. @@ -167,6 +171,8 @@ class StandardPreprocessor(PreprocessorProtocol): self.audio_loader = audio_loader self.spectrogram_builder = spectrogram_builder self.default_samplerate = default_samplerate + self.max_freq = max_freq + self.min_freq = min_freq def load_file_audio( self, @@ -429,8 +435,14 @@ def build_preprocessor( if config.audio.resample else TARGET_SAMPLERATE_HZ ) + + min_freq = config.spectrogram.frequencies.min_freq + max_freq = config.spectrogram.frequencies.max_freq + return StandardPreprocessor( audio_loader=build_audio_loader(config.audio), spectrogram_builder=build_spectrogram_builder(config.spectrogram), default_samplerate=default_samplerate, + min_freq=min_freq, + max_freq=max_freq, ) diff --git a/batdetect2/preprocess/audio.py b/batdetect2/preprocess/audio.py index 481e1de..c474645 100644 --- a/batdetect2/preprocess/audio.py +++ b/batdetect2/preprocess/audio.py @@ -286,7 +286,8 @@ def load_recording_audio( """Load and preprocess the entire audio content of a recording using config. Creates a `soundevent.data.Clip` spanning the full duration of the - recording and then delegates the loading and processing to `load_clip_audio`. + recording and then delegates the loading and processing to + `load_clip_audio`. Parameters ---------- @@ -636,7 +637,11 @@ def resample_audio_fourier( If `num` is negative. """ ratio = sr_new / sr_orig - return resample(array, int(array.shape[axis] * ratio), axis=axis) # type: ignore + return resample( # type: ignore + array, + int(array.shape[axis] * ratio), + axis=axis, + ) def convert_to_xr( @@ -649,8 +654,8 @@ def convert_to_xr( Parameters ---------- wav : np.ndarray - The input waveform array. Expected to be 1D or 2D (with the first axis as - the channel dimension). + The input waveform array. Expected to be 1D or 2D (with the first + axis as the channel dimension). samplerate : int The sample rate in Hz. dtype : DTypeLike, default=np.float32 @@ -673,7 +678,8 @@ def convert_to_xr( if wav.ndim != 1: raise ValueError( - "Audio must be 1D array or 2D channel where the first axis is the channel dimension" + "Audio must be 1D array or 2D channel where the first " + "axis is the channel dimension" ) if wav.size == 0: diff --git a/batdetect2/preprocess/spectrogram.py b/batdetect2/preprocess/spectrogram.py index bb82b41..89b85ac 100644 --- a/batdetect2/preprocess/spectrogram.py +++ b/batdetect2/preprocess/spectrogram.py @@ -21,8 +21,6 @@ The core computation is performed by `compute_spectrogram`. from typing import Literal, Optional, Union -import librosa -import librosa.core.spectrum import numpy as np import xarray as xr from numpy.typing import DTypeLike @@ -147,7 +145,8 @@ class SpectrogramConfig(BaseConfig): """Unified configuration for spectrogram generation pipeline. Aggregates settings for all steps involved in converting a preprocessed - audio waveform into a final spectrogram representation suitable for model input. + audio waveform into a final spectrogram representation suitable for model + input. Attributes ---------- @@ -298,7 +297,8 @@ def compute_spectrogram( 1. Compute STFT magnitude (`stft`). 2. Crop frequency axis (`crop_spectrogram_frequencies`). 3. Apply PCEN if configured (`apply_pcen`). - 4. Apply final amplitude scaling (dB, power, amplitude) (`scale_spectrogram`). + 4. Apply final amplitude scaling (dB, power, amplitude) + (`scale_spectrogram`). 5. Apply spectral mean subtraction denoising if enabled. 6. Resize dimensions if specified (`resize_spectrogram`). 7. Apply final peak normalization if enabled. @@ -324,9 +324,6 @@ def compute_spectrogram( ------ ValueError If `wav` lacks necessary 'time' coordinates or dimensions. - Exception - Can re-raise exceptions from underlying libraries (e.g., librosa, numpy) - if invalid parameters or data are encountered. """ config = config or SpectrogramConfig() @@ -335,7 +332,6 @@ def compute_spectrogram( window_duration=config.stft.window_duration, window_overlap=config.stft.window_overlap, window_fn=config.stft.window_fn, - dtype=dtype, ) spec = crop_spectrogram_frequencies( @@ -410,7 +406,6 @@ def stft( window_duration: float, window_overlap: float, window_fn: str = "hann", - dtype: DTypeLike = np.float32, # type: ignore ) -> xr.DataArray: """Compute the Short-Time Fourier Transform (STFT) magnitude spectrogram. @@ -425,11 +420,9 @@ def stft( window_duration : float Duration of the STFT window in seconds. window_overlap : float - Fractional overlap between consecutive windows [0, 1). + Fractional overlap between consecutive windows. window_fn : str, default="hann" Name of the window function (e.g., "hann", "hamming"). - dtype : DTypeLike, default=np.float32 - Target data type for the spectrogram array. Returns ------- @@ -442,55 +435,13 @@ def stft( ValueError If sample rate cannot be determined from `wave` coordinates. """ - start_time, end_time = arrays.get_dim_range(wave, dim="time") - step = arrays.get_dim_step(wave, dim="time") - sampling_rate = 1 / step - - nfft = int(window_duration * sampling_rate) - noverlap = int(window_overlap * nfft) - hop_len = nfft - noverlap - hop_duration = hop_len / sampling_rate - - spec, _ = librosa.core.spectrum._spectrogram( - y=wave.data.astype(dtype), - power=1, - n_fft=nfft, - hop_length=nfft - noverlap, - center=False, - window=window_fn, - ) - - return xr.DataArray( - data=spec.astype(dtype), - dims=["frequency", "time"], - coords={ - "frequency": arrays.create_frequency_dim_from_array( - np.linspace( - 0, - sampling_rate / 2, - spec.shape[0], - endpoint=False, - dtype=dtype, - ), - step=sampling_rate / nfft, - ), - "time": arrays.create_time_dim_from_array( - np.linspace( - start_time, - end_time - (window_duration - hop_duration), - spec.shape[1], - endpoint=False, - dtype=dtype, - ), - step=hop_duration, - ), - }, - attrs={ - **wave.attrs, - "original_samplerate": sampling_rate, - "nfft": nfft, - "noverlap": noverlap, - }, + return audio.compute_spectrogram( + wave, + window_size=window_duration, + hop_size=(1 - window_overlap) * window_duration, + window_type=window_fn, + scale="amplitude", + sort_dims=False, ) @@ -592,8 +543,10 @@ def apply_pcen( verified against the specific `soundevent.audio.pcen` implementation details. """ - samplerate = spec.attrs["original_samplerate"] - hop_length = spec.attrs["nfft"] - spec.attrs["noverlap"] + samplerate = spec.attrs["samplerate"] + hop_size = spec.attrs["hop_size"] + + hop_length = int(hop_size * samplerate) t_frames = time_constant * samplerate / (float(hop_length) * 10) smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2) return audio.pcen( diff --git a/batdetect2/preprocess/types.py b/batdetect2/preprocess/types.py index 301fb36..b8a0650 100644 --- a/batdetect2/preprocess/types.py +++ b/batdetect2/preprocess/types.py @@ -168,6 +168,10 @@ class PreprocessorProtocol(Protocol): loading or spectrogram computation from a waveform. """ + max_freq: float + + min_freq: float + def preprocess_file( self, path: data.PathLike,