Added `min_freq` and `max_freq` attributes to the preprocessor protocol (`PreprocessorProtocol`) and to `StandardPreprocessor`

2025-06-29 22:51:58 +02:00 · 2025-04-23 23:14:31 +01:00 · 2025-04-23 23:14:31 +01:00 · ac4bb8f023
commit ac4bb8f023
parent 6498b6ca37
4 changed files with 43 additions and 68 deletions
--- a/batdetect2/preprocess/init.py
+++ b/batdetect2/preprocess/init.py
@ -144,12 +144,16 @@ class StandardPreprocessor(PreprocessorProtocol):
    audio_loader: AudioLoader
    spectrogram_builder: SpectrogramBuilder
    default_samplerate: int
    max_freq: float
    min_freq: float
    def __init__(
        self,
        audio_loader: AudioLoader,
        spectrogram_builder: SpectrogramBuilder,
        default_samplerate: int,
        max_freq: float,
        min_freq: float,
    ) -> None:
        """Initialize the StandardPreprocessor.
@ -167,6 +171,8 @@ class StandardPreprocessor(PreprocessorProtocol):
        self.audio_loader = audio_loader
        self.spectrogram_builder = spectrogram_builder
        self.default_samplerate = default_samplerate
        self.max_freq = max_freq
        self.min_freq = min_freq
    def load_file_audio(
        self,
@ -429,8 +435,14 @@ def build_preprocessor(
        if config.audio.resample
        else TARGET_SAMPLERATE_HZ
    )
    min_freq = config.spectrogram.frequencies.min_freq
    max_freq = config.spectrogram.frequencies.max_freq
    return StandardPreprocessor(
        audio_loader=build_audio_loader(config.audio),
        spectrogram_builder=build_spectrogram_builder(config.spectrogram),
        default_samplerate=default_samplerate,
        min_freq=min_freq,
        max_freq=max_freq,
    )
--- a/batdetect2/preprocess/audio.py
+++ b/batdetect2/preprocess/audio.py
@ -286,7 +286,8 @@ def load_recording_audio(
    """Load and preprocess the entire audio content of a recording using config.
    Creates a `soundevent.data.Clip` spanning the full duration of the
-    recording and then delegates the loading and processing to `load_clip_audio`.
+    recording and then delegates the loading and processing to
    `load_clip_audio`.
    Parameters
    ----------
@ -636,7 +637,11 @@ def resample_audio_fourier(
        If `num` is negative.
    """
    ratio = sr_new / sr_orig
-    return resample(array, int(array.shape[axis] * ratio), axis=axis)  # type: ignore
+    return resample(  # type: ignore
        array,
        int(array.shape[axis] * ratio),
        axis=axis,
    )
 def convert_to_xr(
@ -649,8 +654,8 @@ def convert_to_xr(
    Parameters
    ----------
    wav : np.ndarray
-        The input waveform array. Expected to be 1D or 2D (with the first axis as
+        The input waveform array. Expected to be 1D or 2D (with the first
-        the channel dimension).
+        axis as the channel dimension).
    samplerate : int
        The sample rate in Hz.
    dtype : DTypeLike, default=np.float32
@ -673,7 +678,8 @@ def convert_to_xr(
    if wav.ndim != 1:
        raise ValueError(
-            "Audio must be 1D array or 2D channel where the first axis is the channel dimension"
+            "Audio must be 1D array or 2D channel where the first "
            "axis is the channel dimension"
        )
    if wav.size == 0:
--- a/batdetect2/preprocess/spectrogram.py
+++ b/batdetect2/preprocess/spectrogram.py
@ -21,8 +21,6 @@ The core computation is performed by `compute_spectrogram`.
 from typing import Literal, Optional, Union
 import librosa
 import librosa.core.spectrum
 import numpy as np
 import xarray as xr
 from numpy.typing import DTypeLike
@ -147,7 +145,8 @@ class SpectrogramConfig(BaseConfig):
    """Unified configuration for spectrogram generation pipeline.
    Aggregates settings for all steps involved in converting a preprocessed
-    audio waveform into a final spectrogram representation suitable for model input.
+    audio waveform into a final spectrogram representation suitable for model
    input.
    Attributes
    ----------
@ -298,7 +297,8 @@ def compute_spectrogram(
    1. Compute STFT magnitude (`stft`).
    2. Crop frequency axis (`crop_spectrogram_frequencies`).
    3. Apply PCEN if configured (`apply_pcen`).
-    4. Apply final amplitude scaling (dB, power, amplitude) (`scale_spectrogram`).
+    4. Apply final amplitude scaling (dB, power, amplitude)
       (`scale_spectrogram`).
    5. Apply spectral mean subtraction denoising if enabled.
    6. Resize dimensions if specified (`resize_spectrogram`).
    7. Apply final peak normalization if enabled.
@ -324,9 +324,6 @@ def compute_spectrogram(
    ------
    ValueError
        If `wav` lacks necessary 'time' coordinates or dimensions.
    Exception
        Can re-raise exceptions from underlying libraries (e.g., librosa, numpy)
        if invalid parameters or data are encountered.
    """
    config = config or SpectrogramConfig()
@ -335,7 +332,6 @@ def compute_spectrogram(
        window_duration=config.stft.window_duration,
        window_overlap=config.stft.window_overlap,
        window_fn=config.stft.window_fn,
        dtype=dtype,
    )
    spec = crop_spectrogram_frequencies(
@ -410,7 +406,6 @@ def stft(
    window_duration: float,
    window_overlap: float,
    window_fn: str = "hann",
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
    """Compute the Short-Time Fourier Transform (STFT) magnitude spectrogram.
@ -425,11 +420,9 @@ def stft(
    window_duration : float
        Duration of the STFT window in seconds.
    window_overlap : float
-        Fractional overlap between consecutive windows [0, 1).
+        Fractional overlap between consecutive windows.
    window_fn : str, default="hann"
        Name of the window function (e.g., "hann", "hamming").
    dtype : DTypeLike, default=np.float32
        Target data type for the spectrogram array.
    Returns
    -------
@ -442,55 +435,13 @@ def stft(
    ValueError
        If sample rate cannot be determined from `wave` coordinates.
    """
-    start_time, end_time = arrays.get_dim_range(wave, dim="time")
+    return audio.compute_spectrogram(
-    step = arrays.get_dim_step(wave, dim="time")
+        wave,
-    sampling_rate = 1 / step
+        window_size=window_duration,
-
+        hop_size=(1 - window_overlap) * window_duration,
-    nfft = int(window_duration * sampling_rate)
+        window_type=window_fn,
-    noverlap = int(window_overlap * nfft)
+        scale="amplitude",
-    hop_len = nfft - noverlap
+        sort_dims=False,
    hop_duration = hop_len / sampling_rate
    spec, _ = librosa.core.spectrum._spectrogram(
        y=wave.data.astype(dtype),
        power=1,
        n_fft=nfft,
        hop_length=nfft - noverlap,
        center=False,
        window=window_fn,
    )
    return xr.DataArray(
        data=spec.astype(dtype),
        dims=["frequency", "time"],
        coords={
            "frequency": arrays.create_frequency_dim_from_array(
                np.linspace(
                    0,
                    sampling_rate / 2,
                    spec.shape[0],
                    endpoint=False,
                    dtype=dtype,
                ),
                step=sampling_rate / nfft,
            ),
            "time": arrays.create_time_dim_from_array(
                np.linspace(
                    start_time,
                    end_time - (window_duration - hop_duration),
                    spec.shape[1],
                    endpoint=False,
                    dtype=dtype,
                ),
                step=hop_duration,
            ),
        },
        attrs={
            **wave.attrs,
            "original_samplerate": sampling_rate,
            "nfft": nfft,
            "noverlap": noverlap,
        },
    )
@ -592,8 +543,10 @@ def apply_pcen(
      verified against the specific `soundevent.audio.pcen` implementation
      details.
    """
-    samplerate = spec.attrs["original_samplerate"]
+    samplerate = spec.attrs["samplerate"]
-    hop_length = spec.attrs["nfft"] - spec.attrs["noverlap"]
+    hop_size = spec.attrs["hop_size"]
    hop_length = int(hop_size * samplerate)
    t_frames = time_constant * samplerate / (float(hop_length) * 10)
    smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
    return audio.pcen(
--- a/batdetect2/preprocess/types.py
+++ b/batdetect2/preprocess/types.py
@ -168,6 +168,10 @@ class PreprocessorProtocol(Protocol):
    loading or spectrogram computation from a waveform.
    """
    max_freq: float
    min_freq: float
    def preprocess_file(
        self,
        path: data.PathLike,