Moved the protocols into a separate types module

2026-01-11 09:29:33 +01:00 · 2025-04-17 15:36:21 +01:00 · 2025-04-17 15:36:21 +01:00 · 19febf2216
commit 19febf2216
parent 3417c496db
5 changed files with 627 additions and 338 deletions
--- a/batdetect2/preprocess/init.py
+++ b/batdetect2/preprocess/init.py
@ -1,87 +1,193 @@
 """Module containing functions for preprocessing audio clips."""
-from functools import partial
+from typing import Optional, Union
 from typing import Callable, Optional, Protocol
 import numpy as np
 import xarray as xr
 from pydantic import Field
 from soundevent import data
 from batdetect2.configs import BaseConfig, load_config
 from batdetect2.preprocess.audio import (
    DEFAULT_DURATION,
    SCALE_RAW_AUDIO,
    TARGET_SAMPLERATE_HZ,
    AudioConfig,
    ResampleConfig,
    adjust_audio_duration,
    build_audio_loader,
    convert_to_xr,
    load_clip_audio,
-)
+    load_file_audio,
-from batdetect2.preprocess.config import (
+    load_recording_audio,
-    PreprocessingConfig,
+    resample_audio,
    load_preprocessing_config,
 )
 from batdetect2.preprocess.spectrogram import (
    MAX_FREQ,
    MIN_FREQ,
-    AmplitudeScaleConfig,
+    ConfigurableSpectrogramBuilder,
    FrequencyConfig,
-    LogScaleConfig,
+    PcenConfig,
    PcenScaleConfig,
    Scales,
    SpecSizeConfig,
    SpectrogramConfig,
    STFTConfig,
    build_spectrogram_builder,
    compute_spectrogram,
    get_spectrogram_resolution,
 )
 from batdetect2.preprocess.types import (
    AudioLoader,
    Preprocessor,
    SpectrogramBuilder,
 )
 __all__ = [
    "AmplitudeScaleConfig",
    "AudioConfig",
    "AudioLoader",
    "ConfigurableSpectrogramBuilder",
    "DEFAULT_DURATION",
    "FrequencyConfig",
    "FrequencyConfig",
    "LogScaleConfig",
    "MAX_FREQ",
    "MIN_FREQ",
-    "PcenScaleConfig",
+    "PcenConfig",
    "PcenConfig",
    "PreprocessingConfig",
    "ResampleConfig",
    "SCALE_RAW_AUDIO",
    "STFTConfig",
    "STFTConfig",
    "Scales",
    "SpecSizeConfig",
    "SpecSizeConfig",
    "SpectrogramBuilder",
    "SpectrogramConfig",
    "SpectrogramConfig",
    "TARGET_SAMPLERATE_HZ",
    "adjust_audio_duration",
    "build_audio_loader",
    "build_spectrogram_builder",
    "compute_spectrogram",
    "convert_to_xr",
    "get_spectrogram_resolution",
    "load_clip_audio",
    "load_file_audio",
    "load_preprocessing_config",
-    "preprocess_audio_clip",
+    "load_recording_audio",
    "resample_audio",
 ]
-class AudioPreprocessor(Protocol):
+class PreprocessingConfig(BaseConfig):
-    def __call__(
+    """Configuration for preprocessing data."""
    audio: AudioConfig = Field(default_factory=AudioConfig)
    spectrogram: SpectrogramConfig = Field(default_factory=SpectrogramConfig)
 class StandardPreprocessor(Preprocessor):
    """Preprocessor that pairs an audio loader with a spectrogram builder.
    Audio is obtained by delegating to `audio_loader`, and spectrograms are
    produced by calling `spectrogram_builder` on the loaded waveform with
    `samplerate=default_samplerate`.
    Attributes
    ----------
    audio_loader : AudioLoader
        Component used to load audio from files, recordings and clips.
    spectrogram_builder : SpectrogramBuilder
        Callable used to turn a waveform into a spectrogram.
    default_samplerate : int
        Sample rate passed as `samplerate` to `spectrogram_builder` on every
        call made by this class.
    """
    audio_loader: AudioLoader
    spectrogram_builder: SpectrogramBuilder
    default_samplerate: int
    def __init__(
        self,
        audio_loader: AudioLoader,
        spectrogram_builder: SpectrogramBuilder,
        default_samplerate: int,
    ) -> None:
        """Store the loader, the builder and the default sample rate.
        Parameters
        ----------
        audio_loader : AudioLoader
            Component used by the `load_*_audio` methods.
        spectrogram_builder : SpectrogramBuilder
            Callable used by the `preprocess_*` methods and
            `compute_spectrogram`.
        default_samplerate : int
            Value forwarded as `samplerate` to `spectrogram_builder`.
        """
        self.audio_loader = audio_loader
        self.spectrogram_builder = spectrogram_builder
        self.default_samplerate = default_samplerate
    def load_file_audio(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load audio from a file path via `audio_loader.load_file`.
        Parameters
        ----------
        path : PathLike
            Path to the audio file.
        audio_dir : PathLike, optional
            Forwarded unchanged to the audio loader.
        Returns
        -------
        xr.DataArray
            Whatever the audio loader returns for this file.
        """
        return self.audio_loader.load_file(
            path,
            audio_dir=audio_dir,
        )
    def load_recording_audio(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load audio for a recording via `audio_loader.load_recording`.
        Parameters
        ----------
        recording : data.Recording
            The recording whose audio should be loaded.
        audio_dir : PathLike, optional
            Forwarded unchanged to the audio loader.
        Returns
        -------
        xr.DataArray
            Whatever the audio loader returns for this recording.
        """
        return self.audio_loader.load_recording(
            recording,
            audio_dir=audio_dir,
        )
    def load_clip_audio(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
-    ) -> xr.DataArray: ...
+    ) -> xr.DataArray:
        """Load audio for a clip via `audio_loader.load_clip`.
        Parameters
        ----------
        clip : data.Clip
            The clip whose audio should be loaded.
        audio_dir : PathLike, optional
            Forwarded unchanged to the audio loader.
        Returns
        -------
        xr.DataArray
            Whatever the audio loader returns for this clip.
        """
        return self.audio_loader.load_clip(
            clip,
            audio_dir=audio_dir,
        )
    def preprocess_file(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load the audio at `path` and compute its spectrogram.
        Combines `load_file_audio` with a call to `spectrogram_builder`
        using `samplerate=self.default_samplerate`.
        Parameters
        ----------
        path : PathLike
            Path to the audio file.
        audio_dir : PathLike, optional
            Forwarded unchanged to `load_file_audio`.
        Returns
        -------
        xr.DataArray
            The output of the spectrogram builder.
        """
        wav = self.load_file_audio(path, audio_dir=audio_dir)
        return self.spectrogram_builder(
            wav,
            samplerate=self.default_samplerate,
        )
    def preprocess_recording(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load the audio of `recording` and compute its spectrogram.
        Combines `load_recording_audio` with a call to `spectrogram_builder`
        using `samplerate=self.default_samplerate`.
        Parameters
        ----------
        recording : data.Recording
            The recording to preprocess.
        audio_dir : PathLike, optional
            Forwarded unchanged to `load_recording_audio`.
        Returns
        -------
        xr.DataArray
            The output of the spectrogram builder.
        """
        wav = self.load_recording_audio(recording, audio_dir=audio_dir)
        return self.spectrogram_builder(
            wav,
            samplerate=self.default_samplerate,
        )
    def preprocess_clip(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load the audio of `clip` and compute its spectrogram.
        Combines `load_clip_audio` with a call to `spectrogram_builder`
        using `samplerate=self.default_samplerate`.
        Parameters
        ----------
        clip : data.Clip
            The clip to preprocess.
        audio_dir : PathLike, optional
            Forwarded unchanged to `load_clip_audio`.
        Returns
        -------
        xr.DataArray
            The output of the spectrogram builder.
        """
        wav = self.load_clip_audio(clip, audio_dir=audio_dir)
        return self.spectrogram_builder(
            wav,
            samplerate=self.default_samplerate,
        )
    def compute_spectrogram(
        self, wav: Union[xr.DataArray, np.ndarray]
    ) -> xr.DataArray:
        """Compute a spectrogram from an already-loaded waveform.
        Parameters
        ----------
        wav : Union[xr.DataArray, np.ndarray]
            The waveform handed to `spectrogram_builder`.
        Returns
        -------
        xr.DataArray
            The output of the spectrogram builder, called with
            `samplerate=self.default_samplerate`.
        """
        return self.spectrogram_builder(
            wav,
            samplerate=self.default_samplerate,
        )
 def load_preprocessing_config(
    path: data.PathLike,
    field: Optional[str] = None,
 ) -> PreprocessingConfig:
    """Load a `PreprocessingConfig` from a configuration file.
    Thin wrapper around `load_config` that fixes the schema to
    `PreprocessingConfig`.
    Parameters
    ----------
    path : PathLike
        Location of the configuration file.
    field : str, optional
        Forwarded unchanged to `load_config`.
    Returns
    -------
    PreprocessingConfig
        The configuration object produced by `load_config`.
    """
    loaded = load_config(
        path,
        schema=PreprocessingConfig,
        field=field,
    )
    return loaded
 def build_preprocessor_from_config(
    config: PreprocessingConfig,
-) -> AudioPreprocessor:
+) -> Preprocessor:
-    return partial(preprocess_audio_clip, config=config)
+    default_samplerate = (
-
+        config.audio.resample.samplerate
-
+        if config.audio.resample
-def preprocess_audio_clip(
+        else TARGET_SAMPLERATE_HZ
-    clip: data.Clip,
+    )
-    config: Optional[PreprocessingConfig] = None,
+    return StandardPreprocessor(
-    audio_dir: Optional[data.PathLike] = None,
+        audio_loader=build_audio_loader(config.audio),
-) -> xr.DataArray:
+        spectrogram_builder=build_spectrogram_builder(config.spectrogram),
-    """Preprocesses audio clip to generate spectrogram.
+        default_samplerate=default_samplerate,
-
+    )
    Parameters
    ----------
    clip
        The audio clip to preprocess.
    config
        Configuration for preprocessing.
    Returns
    -------
    xr.DataArray
        Preprocessed spectrogram.
    """
    config = config or PreprocessingConfig()
    wav = load_clip_audio(clip, config=config.audio, audio_dir=audio_dir)
    return compute_spectrogram(wav, config=config.spectrogram)
--- a/batdetect2/preprocess/audio.py
+++ b/batdetect2/preprocess/audio.py
@ -20,7 +20,7 @@ The primary interface is the `AudioLoader` protocol, with
 `AudioConfig`.
 """
-from typing import Optional, Protocol
+from typing import Optional
 import numpy as np
 import xarray as xr
@ -32,9 +32,9 @@ from soundevent.arrays import operations as ops
 from soundfile import LibsndfileError
 from batdetect2.configs import BaseConfig
 from batdetect2.preprocess.types import AudioLoader
 __all__ = [
    "AudioLoader",
    "ResampleConfig",
    "AudioConfig",
    "ConfigurableAudioLoader",
@ -60,106 +60,6 @@ DEFAULT_DURATION = None
 """Default setting for target audio duration in seconds."""
 class AudioLoader(Protocol):
    """Defines the interface for an audio loading and processing component.
    An AudioLoader is responsible for retrieving audio data corresponding to
    different soundevent objects (files, Recordings, Clips) and applying a
    configured set of initial preprocessing steps. Adhering to this protocol
    allows for different loading strategies or implementations.
    The protocol consists of three methods, each accepting an optional
    `audio_dir` and returning an `xr.DataArray`: `load_file` (from a path),
    `load_recording` (from a `data.Recording`) and `load_clip` (from a
    `data.Clip`).
    """
    def load_file(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess audio directly from a file path.
        Parameters
        ----------
        path : PathLike
            Path to the audio file.
        audio_dir : PathLike, optional
            A directory prefix to prepend to the path if `path` is relative.
        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform as an xarray DataArray
            with time coordinates. Typically loads only the first channel.
        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...
    def load_recording(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the entire audio for a Recording object.
        Parameters
        ----------
        recording : data.Recording
            The Recording object containing metadata about the audio file.
        audio_dir : PathLike, optional
            A directory where the audio file associated with the recording
            can be found, especially if the path in the recording is relative.
        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform. Typically loads only
            the first channel.
        Raises
        ------
        FileNotFoundError
            If the audio file associated with the recording cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...
    def load_clip(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the audio segment defined by a Clip object.
        Parameters
        ----------
        clip : data.Clip
            The Clip object specifying the recording and the start/end times
            of the segment to load.
        audio_dir : PathLike, optional
            A directory where the audio file associated with the clip's
            recording can be found.
        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform for the specified clip
            duration. Typically loads only the first channel.
        Raises
        ------
        FileNotFoundError
            If the audio file associated with the clip cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...
 class ResampleConfig(BaseConfig):
    """Configuration for audio resampling.
@ -167,7 +67,7 @@ class ResampleConfig(BaseConfig):
    ----------
    samplerate : int, default=256000
        The target sample rate in Hz to resample the audio to. Must be > 0.
-    mode : str, default="poly"
+    method : str, default="poly"
        The resampling algorithm to use. Options:
        - "poly": Polyphase resampling using `scipy.signal.resample_poly`.
                  Generally fast.
@ -177,7 +77,7 @@ class ResampleConfig(BaseConfig):
    """
    samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0)
-    mode: str = "poly"
+    method: str = "poly"
 class AudioConfig(BaseConfig):
@ -191,8 +91,8 @@ class AudioConfig(BaseConfig):
    ----------
    resample : ResampleConfig, optional
        Configuration for resampling. If provided (or defaulted), audio will
-        be resampled to the specified `samplerate` using the specified `mode`.
+        be resampled to the specified `samplerate` using the specified
-        If set to `None` in the config file, resampling is skipped.
+        `method`. If set to `None` in the config file, resampling is skipped.
        Defaults to a ResampleConfig instance with standard settings.
    scale : bool, default=False
        If True, scales the audio waveform using peak normalization so that
@ -579,14 +479,14 @@ def adjust_audio_duration(
 def resample_audio(
    wav: xr.DataArray,
    samplerate: int = TARGET_SAMPLERATE_HZ,
-    mode: str = "poly",
+    method: str = "poly",
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
    """Resample an audio waveform DataArray to a target sample rate.
    Updates the 'time' coordinate axis according to the new sample rate and
    number of samples. Uses either polyphase (`scipy.signal.resample_poly`)
-    or Fourier method (`scipy.signal.resample`) based on the `mode`.
+    or Fourier method (`scipy.signal.resample`) based on the `method`.
    Parameters
    ----------
@ -594,7 +494,7 @@ def resample_audio(
        Input audio waveform with 'time' dimension and coordinates.
    samplerate : int, default=TARGET_SAMPLERATE_HZ
        Target sample rate in Hz.
-    mode : str, default="poly"
+    method : str, default="poly"
        Resampling algorithm: "poly" or "fourier".
    dtype : DTypeLike, default=np.float32
        Target data type for the resampled array.
@ -610,7 +510,7 @@ def resample_audio(
    ------
    ValueError
        If `wav` lacks a 'time' dimension, the original sample rate cannot
-        be determined, `samplerate` is non-positive, or `mode` is invalid.
+        be determined, `samplerate` is non-positive, or `method` is invalid.
    """
    if "time" not in wav.dims:
        raise ValueError("Audio must have a time dimension")
@ -622,14 +522,14 @@ def resample_audio(
    if original_samplerate == samplerate:
        return wav.astype(dtype)
-    if mode == "poly":
+    if method == "poly":
        resampled = resample_audio_poly(
            wav,
            sr_orig=original_samplerate,
            sr_new=samplerate,
            axis=time_axis,
        )
-    elif mode == "fourier":
+    elif method == "fourier":
        resampled = resample_audio_fourier(
            wav,
            sr_orig=original_samplerate,
@ -637,7 +537,9 @@ def resample_audio(
            axis=time_axis,
        )
    else:
-        raise NotImplementedError(f"Resampling mode '{mode}' not implemented")
+        raise NotImplementedError(
            f"Resampling method '{method}' not implemented"
        )
    start, stop = arrays.get_dim_range(wav, dim="time")
    times = np.linspace(
--- a/batdetect2/preprocess/config.py
+++ b/batdetect2/preprocess/config.py
@ -1,31 +0,0 @@
 from typing import Optional
 from pydantic import Field
 from soundevent.data import PathLike
 from batdetect2.configs import BaseConfig, load_config
 from batdetect2.preprocess.audio import (
    AudioConfig,
 )
 from batdetect2.preprocess.spectrogram import (
    SpectrogramConfig,
 )
 __all__ = [
    "PreprocessingConfig",
    "load_preprocessing_config",
 ]
 class PreprocessingConfig(BaseConfig):
    """Configuration for preprocessing data."""
    audio: AudioConfig = Field(default_factory=AudioConfig)
    spectrogram: SpectrogramConfig = Field(default_factory=SpectrogramConfig)
 def load_preprocessing_config(
    path: PathLike,
    field: Optional[str] = None,
 ) -> PreprocessingConfig:
    return load_config(path, schema=PreprocessingConfig, field=field)
--- a/batdetect2/preprocess/spectrogram.py
+++ b/batdetect2/preprocess/spectrogram.py
@ -6,20 +6,20 @@ spectrogram representations suitable for input into deep learning models like
 BatDetect2.
 It offers a configurable pipeline including:
-1.  Short-Time Fourier Transform (STFT) calculation.
+1.  Short-Time Fourier Transform (STFT) calculation to get magnitude.
 2.  Frequency axis cropping to a relevant range.
-3.  Amplitude scaling (e.g., Logarithmic, Per-Channel Energy Normalization -
+3.  Per-Channel Energy Normalization (PCEN) (optional).
-    PCEN).
+4.  Amplitude scaling/representation (dB, power, or linear amplitude).
-4.  Simple denoising (optional).
+5.  Simple spectral mean subtraction denoising (optional).
-5.  Resizing to target dimensions (optional).
+6.  Resizing to target dimensions (optional).
-6.  Final peak normalization (optional).
+7.  Final peak normalization (optional).
 Configuration is managed via the `SpectrogramConfig` class, allowing for
 reproducible spectrogram generation consistent between training and inference.
 The core computation is performed by `compute_spectrogram`.
 """
-from typing import Literal, Optional, Protocol, Union
+from typing import Literal, Optional, Union
 import librosa
 import librosa.core.spectrum
@ -32,65 +32,23 @@ from soundevent.arrays import operations as ops
 from batdetect2.configs import BaseConfig
 from batdetect2.preprocess.audio import convert_to_xr
 from batdetect2.preprocess.types import SpectrogramBuilder
 __all__ = [
    "SpectrogramBuilder",
    "STFTConfig",
    "FrequencyConfig",
    "SpecSizeConfig",
-    "LogScaleConfig",
+    "PcenConfig",
    "PcenScaleConfig",
    "AmplitudeScaleConfig",
    "Scales",
    "SpectrogramConfig",
    "ConfigurableSpectrogramBuilder",
    "build_spectrogram_builder",
    "compute_spectrogram",
    "get_spectrogram_resolution",
    "MIN_FREQ",
    "MAX_FREQ",
 ]
 class SpectrogramBuilder(Protocol):
    """Defines the interface for a spectrogram generation component.
    A SpectrogramBuilder takes a waveform (as numpy array or xarray DataArray)
    and produces a spectrogram (as an xarray DataArray) based on its internal
    configuration or implementation.
    The only required member is `__call__`, so implementations are used as
    plain callables: `builder(wav, samplerate=...)`.
    """
    def __call__(
        self,
        wav: Union[np.ndarray, xr.DataArray],
        samplerate: Optional[int] = None,
    ) -> xr.DataArray:
        """Generate a spectrogram from an audio waveform.
        Parameters
        ----------
        wav : Union[np.ndarray, xr.DataArray]
            The input audio waveform. If a numpy array, `samplerate` must
            also be provided. If an xarray DataArray, it must have a 'time'
            coordinate from which the sample rate can be inferred.
        samplerate : int, optional
            The sample rate of the audio in Hz. Required if `wav` is a
            numpy array. If `wav` is an xarray DataArray, this parameter is
            ignored as the sample rate is derived from the coordinates.
        Returns
        -------
        xr.DataArray
            The computed spectrogram as an xarray DataArray with 'time' and
            'frequency' coordinates.
        Raises
        ------
        ValueError
            If `wav` is a numpy array and `samplerate` is not provided, or
            if `wav` is an xarray DataArray without a valid 'time' coordinate.
        """
        ...
 MIN_FREQ = 10_000
 """Default minimum frequency (Hz) for spectrogram frequency cropping."""
@ -151,109 +109,84 @@ class SpecSizeConfig(BaseConfig):
    resize_factor : float, optional
        Factor by which to resize the spectrogram along the time axis *after*
        STFT calculation. A value of 0.5 halves the number of time bins,
-        2.0 doubles it. If None (default), no resizing along the time axis
+        2.0 doubles it. If None (default), no resizing along the time axis is
-        is performed relative to the STFT output width. Must be > 0 if provided.
+        performed relative to the STFT output width. Must be > 0 if provided.
    """
    height: int = 128
    resize_factor: Optional[float] = 0.5
-class LogScaleConfig(BaseConfig):
+class PcenConfig(BaseConfig):
-    """Configuration marker for using Logarithmic Amplitude Scaling."""
+    """Configuration for Per-Channel Energy Normalization (PCEN).
-    name: Literal["log"] = "log"
+    PCEN is an adaptive gain control method that can help emphasize transients
-
+    and suppress stationary noise. Applied after STFT and frequency cropping,
-
+    but before final amplitude scaling (dB, power, amplitude).
 class PcenScaleConfig(BaseConfig):
    """Configuration for Per-Channel Energy Normalization (PCEN) scaling.
    PCEN is an adaptive gain control method often used for audio event
    detection.
    Attributes
    ----------
    name : Literal["pcen"]
        Discriminator field identifying this scaling type.
    time_constant : float, default=0.4
-        Time constant (in seconds) for the PCEN smoothing filter. Controls how
+        Time constant (in seconds) for the PCEN smoothing filter. Controls
-        quickly the normalization adapts to energy changes.
+        how quickly the normalization adapts to energy changes.
    gain : float, default=0.98
-        Gain factor (alpha in some formulations). Controls the AGC behavior.
+        Gain factor (alpha). Controls the adaptive gain component.
    bias : float, default=2.0
-        Bias factor (delta in some formulations). Added before the
+        Bias factor (delta). Added before the exponentiation.
        exponentiation.
    power : float, default=0.5
-        Exponent (r in some formulations). Controls the compression
+        Exponent (r). Controls the compression characteristic.
        characteristic.
    """
    name: Literal["pcen"] = "pcen"
    time_constant: float = 0.4
    gain: float = 0.98
    bias: float = 2
    power: float = 0.5
 class AmplitudeScaleConfig(BaseConfig):
    """Configuration marker for using Linear Amplitude (no scaling applied).
    Note: The actual output is typically magnitude from STFT, not raw amplitude.
    This option essentially skips log or PCEN scaling.
    """
    # Fixed tag identifying this option; the class carries no other settings.
    # NOTE(review): presumably serves as the discriminator value, as the
    # sibling PCEN scale config documents for its own `name` field - confirm.
    name: Literal["amplitude"] = "amplitude"
 Scales = Union[LogScaleConfig, PcenScaleConfig, AmplitudeScaleConfig]
 """Type alias for the different amplitude scaling configuration options."""
 class SpectrogramConfig(BaseConfig):
-    """Unified configuration for spectrogram generation.
+    """Unified configuration for spectrogram generation pipeline.
-    Aggregates settings for STFT, frequency selection, amplitude scaling,
+    Aggregates settings for all steps involved in converting a preprocessed
-    resizing, and optional post-processing steps like denoising and final
+    audio waveform into a final spectrogram representation suitable for model input.
    normalization.
    Attributes
    ----------
    stft : STFTConfig
-        Configuration for the Short-Time Fourier Transform. Defaults to standard
+        Configuration for the initial Short-Time Fourier Transform.
-        settings via `STFTConfig`.
+        Defaults to standard settings via `STFTConfig`.
    frequencies : FrequencyConfig
-        Configuration for cropping the frequency range. Defaults to standard
+        Configuration for cropping the frequency range after STFT.
-        settings via `FrequencyConfig`.
+        Defaults to standard settings via `FrequencyConfig`.
-    scale : Scales
+    pcen : PcenConfig, optional
-        Configuration for amplitude scaling. Determines whether to apply
+        Configuration for applying Per-Channel Energy Normalization (PCEN). If
-        log scaling, PCEN, or leave as linear magnitude. Defaults to PCEN
+        provided, PCEN is applied after frequency cropping. If None or omitted
-        via `PcenScaleConfig`. Use the `name` field ("log", "pcen", "amplitude")
+        (default), PCEN is skipped.
-        in config files to select the type and provide relevant parameters.
+    scale : Literal["dB", "amplitude", "power"], default="amplitude"
        Determines the final amplitude representation *after* optional PCEN.
        - "amplitude": Use linear magnitude values (output of STFT or PCEN).
        - "power": Use power values (magnitude squared).
        - "dB": Use logarithmic (decibel-like) scaling applied to the magnitude
                (or PCEN output if enabled). Calculated as `log1p(C * S)`.
    size : SpecSizeConfig, optional
-        Configuration for resizing the final spectrogram dimensions (height in
+        Configuration for resizing the spectrogram dimensions
-        frequency bins, optional time resizing factor). If None or omitted,
+        (frequency height, optional time width factor). Applied after PCEN and
-        no resizing is performed after STFT and frequency cropping. Defaults
+        scaling. If None (default), no resizing is performed.
-        to standard settings via `SpecSizeConfig`.
+    spectral_mean_substraction : bool, default=True
-    denoise : bool, default=True
+        If True (default), applies simple spectral mean subtraction denoising
-        If True (default), applies a simple spectral mean subtraction denoising
+        *after* PCEN and amplitude scaling, but *before* resizing.
-        step after amplitude scaling.
+    peak_normalize : bool, default=False
    max_scale : bool, default=False
        If True, applies a final peak normalization to the spectrogram *after*
-        all other steps (including log/PCEN scaling and resizing), scaling the
+        all other steps (including resizing), scaling the overall maximum value
-        maximum value across the entire spectrogram to 1.0. If False (default),
+        to 1.0. If False (default), this final normalization is skipped.
        this final scaling is skipped. **Note:** Applying this after log or PCEN
        scaling will alter the characteristics of those scales.
    """
    stft: STFTConfig = Field(default_factory=STFTConfig)
    frequencies: FrequencyConfig = Field(default_factory=FrequencyConfig)
-    scale: Scales = Field(
+    pcen: Optional[PcenConfig] = Field(default_factory=PcenConfig)
-        default_factory=PcenScaleConfig,
+    scale: Literal["dB", "amplitude", "power"] = "amplitude"
        discriminator="name",
    )
    size: Optional[SpecSizeConfig] = Field(default_factory=SpecSizeConfig)
-    denoise: bool = True
+    spectral_mean_substraction: bool = True
-    max_scale: bool = False
+    peak_normalize: bool = False
 class ConfigurableSpectrogramBuilder(SpectrogramBuilder):
@ -362,13 +295,13 @@ def compute_spectrogram(
    """Compute a spectrogram from a waveform using specified configurations.
    Applies a sequence of operations based on the `config`:
    1. Compute STFT magnitude (`stft`).
    2. Crop frequency axis (`crop_spectrogram_frequencies`).
-    3. Apply amplitude scaling (log, PCEN, or none) (`scale_spectrogram`).
+    3. Apply PCEN if configured (`apply_pcen`).
-    4. Apply denoising if enabled (`denoise_spectrogram`).
+    4. Apply final amplitude scaling (dB, power, amplitude) (`scale_spectrogram`).
-    5. Resize dimensions if specified (`resize_spectrogram`).
+    5. Apply spectral mean subtraction denoising if enabled.
-    6. Apply final peak normalization if enabled (`max_scale`).
+    6. Resize dimensions if specified (`resize_spectrogram`).
    7. Apply final peak normalization if enabled.
    Parameters
    ----------
@ -411,10 +344,19 @@ def compute_spectrogram(
        max_freq=config.frequencies.max_freq,
    )
    if config.pcen:
        spec = apply_pcen(
            spec,
            time_constant=config.pcen.time_constant,
            gain=config.pcen.gain,
            power=config.pcen.power,
            bias=config.pcen.bias,
        )
    spec = scale_spectrogram(spec, scale=config.scale)
-    if config.denoise:
+    if config.spectral_mean_substraction:
-        spec = denoise_spectrogram(spec)
+        spec = remove_spectral_mean(spec)
    if config.size:
        spec = resize_spectrogram(
@ -423,7 +365,7 @@ def compute_spectrogram(
            resize_factor=config.size.resize_factor,
        )
-    if config.max_scale:
+    if config.peak_normalize:
        spec = ops.scale(spec, 1 / (10e-6 + np.max(spec)))
    return spec.astype(dtype)
@ -550,7 +492,7 @@ def stft(
    )
-def denoise_spectrogram(spec: xr.DataArray) -> xr.DataArray:
+def remove_spectral_mean(spec: xr.DataArray) -> xr.DataArray:
    """Apply simple spectral mean subtraction for denoising.
    Subtracts the mean value of each frequency bin (calculated across time)
@ -576,23 +518,22 @@ def denoise_spectrogram(spec: xr.DataArray) -> xr.DataArray:
 def scale_spectrogram(
    spec: xr.DataArray,
-    scale: Scales,
+    scale: Literal["dB", "power", "amplitude"],
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
-    """Apply configured amplitude scaling to the spectrogram.
+    """Apply final amplitude scaling/representation to the spectrogram.
-    Dispatches to the appropriate scaling function (log, PCEN) based on the
+    Converts the input magnitude spectrogram based on the `scale` type:
-    `scale` configuration object's `name` field. If `scale.name` is
+    - "dB": Applies logarithmic scaling `log1p(C * S)`.
-    "amplitude", the spectrogram is returned unchanged (as it's already
+    - "power": Squares the magnitude values `S^2`.
-    magnitude/amplitude).
+    - "amplitude": Returns the input magnitude values `S` unchanged.
    Parameters
    ----------
    spec : xr.DataArray
-        Input magnitude spectrogram.
+        Input magnitude spectrogram (potentially after PCEN).
-    scale : Scales
+    scale : Literal["dB", "power", "amplitude"]
-        The configuration object specifying the scaling method and parameters
+        The target amplitude representation.
        (instance of LogScaleConfig, PcenScaleConfig, or AmplitudeScaleConfig).
    dtype : DTypeLike, default=np.float32
        Target data type for the output scaled spectrogram.
@ -601,22 +542,16 @@ def scale_spectrogram(
    xr.DataArray
        Spectrogram with the specified amplitude scaling applied.
    """
-    if scale.name == "log":
+    if scale == "dB":
        return scale_log(spec, dtype=dtype)
-    if scale.name == "pcen":
+    if scale == "power":
-        return scale_pcen(
+        return spec**2
            spec,
            time_constant=scale.time_constant,
            gain=scale.gain,
            power=scale.power,
            bias=scale.bias,
        )
    return spec
-def scale_pcen(
+def apply_pcen(
    spec: xr.DataArray,
    time_constant: float = 0.4,
    gain: float = 0.98,
--- a/batdetect2/preprocess/types.py
+++ b/batdetect2/preprocess/types.py
@ -0,0 +1,377 @@
 """Defines common interfaces (Protocols) for preprocessing components.
 This module centralizes the Protocol definitions used throughout the
 `batdetect2.preprocess` package. Protocols define expected methods and
 signatures, allowing for flexible and interchangeable implementations of
 components like audio loaders and spectrogram builders.
 Using these protocols ensures that different parts of the preprocessing
 pipeline can interact consistently, regardless of the specific underlying
 implementation (e.g., different libraries or custom configurations).
 """
 from typing import Optional, Protocol, Union
 import numpy as np
 import xarray as xr
 from soundevent import data
 class AudioLoader(Protocol):
    """Defines the interface for an audio loading and processing component.

    An AudioLoader is responsible for retrieving audio data corresponding to
    different soundevent objects (files, Recordings, Clips) and applying a
    configured set of initial preprocessing steps. Adhering to this protocol
    allows for different loading strategies or implementations.

    Notes
    -----
    This is a `typing.Protocol`: conformance is structural, so an
    implementation does not need to inherit from this class. Any object that
    provides `load_file`, `load_recording` and `load_clip` with compatible
    signatures satisfies the interface. The method bodies here are
    placeholders (`...`) and carry no behaviour of their own.
    """

    def load_file(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess audio directly from a file path.

        Parameters
        ----------
        path : data.PathLike
            Path to the audio file.
        audio_dir : data.PathLike, optional
            A directory prefix to prepend to the path if `path` is relative.
            Defaults to None.

        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform as an xarray DataArray
            with time coordinates. Typically loads only the first channel.

        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...

    def load_recording(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the entire audio for a Recording object.

        Parameters
        ----------
        recording : data.Recording
            The Recording object containing metadata about the audio file.
        audio_dir : data.PathLike, optional
            A directory where the audio file associated with the recording
            can be found, especially if the path in the recording is relative.
            Defaults to None.

        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform. Typically loads only
            the first channel.

        Raises
        ------
        FileNotFoundError
            If the audio file associated with the recording cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...

    def load_clip(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess the audio segment defined by a Clip object.

        Parameters
        ----------
        clip : data.Clip
            The Clip object specifying the recording and the start/end times
            of the segment to load.
        audio_dir : data.PathLike, optional
            A directory where the audio file associated with the clip's
            recording can be found. Defaults to None.

        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform for the specified clip
            duration. Typically loads only the first channel.

        Raises
        ------
        FileNotFoundError
            If the audio file associated with the clip cannot be found.
        Exception
            If the audio file cannot be loaded or processed.
        """
        ...
 class SpectrogramBuilder(Protocol):
    """Defines the interface for a spectrogram generation component.

    A SpectrogramBuilder takes a waveform (as numpy array or xarray DataArray)
    and produces a spectrogram (as an xarray DataArray) based on its internal
    configuration or implementation.

    Notes
    -----
    This is a `typing.Protocol`: any callable object whose `__call__` has a
    compatible signature satisfies it, without needing to inherit from this
    class.
    """

    def __call__(
        self,
        wav: Union[np.ndarray, xr.DataArray],
        samplerate: Optional[int] = None,
    ) -> xr.DataArray:
        """Generate a spectrogram from an audio waveform.

        Parameters
        ----------
        wav : Union[np.ndarray, xr.DataArray]
            The input audio waveform. If a numpy array, `samplerate` must
            also be provided. If an xarray DataArray, it must have a 'time'
            coordinate from which the sample rate can be inferred.
        samplerate : int, optional
            The sample rate of the audio in Hz. Required if `wav` is a
            numpy array. If `wav` is an xarray DataArray, this parameter is
            ignored as the sample rate is derived from the coordinates.
            Defaults to None.

        Returns
        -------
        xr.DataArray
            The computed spectrogram as an xarray DataArray with 'time' and
            'frequency' coordinates.

        Raises
        ------
        ValueError
            If `wav` is a numpy array and `samplerate` is not provided, or
            if `wav` is an xarray DataArray without a valid 'time' coordinate.
        """
        ...
 class Preprocessor(Protocol):
    """Defines a high-level interface for the complete preprocessing pipeline.

    A Preprocessor combines audio loading and spectrogram generation steps.
    It provides methods to go directly from source descriptions (file paths,
    Recording objects, Clip objects) to the final spectrogram representation
    needed by the model. It may also expose intermediate steps like audio
    loading or spectrogram computation from a waveform.

    Notes
    -----
    The methods fall into three groups:

    - `preprocess_file`, `preprocess_recording`, `preprocess_clip`: go from
      a source description to a spectrogram.
    - `load_file_audio`, `load_recording_audio`, `load_clip_audio`: return
      only the waveform, stopping before spectrogram generation.
    - `compute_spectrogram`: turn an already loaded waveform into a
      spectrogram.

    This is a `typing.Protocol`: conformance is structural, so an
    implementation does not need to inherit from this class.
    """

    def preprocess_file(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load audio from a file and compute the final processed spectrogram.

        Performs the full pipeline:

            Load -> Preprocess Audio -> Compute Spectrogram.

        Parameters
        ----------
        path : data.PathLike
            Path to the audio file.
        audio_dir : data.PathLike, optional
            A directory prefix if `path` is relative. Defaults to None.

        Returns
        -------
        xr.DataArray
            The final processed spectrogram.

        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If any step in the loading or preprocessing fails.
        """
        ...

    def preprocess_recording(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load audio for a Recording and compute the processed spectrogram.

        Performs the full pipeline for the entire duration of the recording.

        Parameters
        ----------
        recording : data.Recording
            The Recording object.
        audio_dir : data.PathLike, optional
            Directory containing the audio file. Defaults to None.

        Returns
        -------
        xr.DataArray
            The final processed spectrogram.

        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If any step in the loading or preprocessing fails.
        """
        ...

    def preprocess_clip(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load audio for a Clip and compute the final processed spectrogram.

        Performs the full pipeline for the specified clip segment.

        Parameters
        ----------
        clip : data.Clip
            The Clip object defining the audio segment.
        audio_dir : data.PathLike, optional
            Directory containing the audio file. Defaults to None.

        Returns
        -------
        xr.DataArray
            The final processed spectrogram.

        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If any step in the loading or preprocessing fails.
        """
        ...

    def load_file_audio(
        self,
        path: data.PathLike,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess *only* the audio waveform from a file path.

        Performs the initial audio loading and waveform processing steps
        (like resampling, scaling), but stops *before* spectrogram generation.

        Parameters
        ----------
        path : data.PathLike
            Path to the audio file.
        audio_dir : data.PathLike, optional
            A directory prefix if `path` is relative. Defaults to None.

        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform.

        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If audio loading/preprocessing fails.
        """
        ...

    def load_recording_audio(
        self,
        recording: data.Recording,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess *only* the audio waveform for a Recording.

        Performs the initial audio loading and waveform processing steps
        for the entire recording duration.

        Parameters
        ----------
        recording : data.Recording
            The Recording object.
        audio_dir : data.PathLike, optional
            Directory containing the audio file. Defaults to None.

        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform.

        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If audio loading/preprocessing fails.
        """
        ...

    def load_clip_audio(
        self,
        clip: data.Clip,
        audio_dir: Optional[data.PathLike] = None,
    ) -> xr.DataArray:
        """Load and preprocess *only* the audio waveform for a Clip.

        Performs the initial audio loading and waveform processing steps
        for the specified clip segment.

        Parameters
        ----------
        clip : data.Clip
            The Clip object defining the segment.
        audio_dir : data.PathLike, optional
            Directory containing the audio file. Defaults to None.

        Returns
        -------
        xr.DataArray
            The loaded and preprocessed audio waveform segment.

        Raises
        ------
        FileNotFoundError
            If the audio file cannot be found.
        Exception
            If audio loading/preprocessing fails.
        """
        ...

    def compute_spectrogram(
        self,
        wav: Union[xr.DataArray, np.ndarray],
    ) -> xr.DataArray:
        """Compute the spectrogram from a pre-loaded audio waveform.

        Applies the spectrogram generation steps (STFT, scaling, etc.) defined
        by the `SpectrogramBuilder` component of the preprocessor to an
        already loaded (and potentially preprocessed) waveform.

        Parameters
        ----------
        wav : Union[xr.DataArray, np.ndarray]
            The input audio waveform.

        Returns
        -------
        xr.DataArray
            The computed spectrogram.

        Raises
        ------
        ValueError
            If the waveform input is invalid.
        Exception
            If spectrogram computation fails.

        Notes
        -----
        Unlike `SpectrogramBuilder.__call__`, this method takes no
        `samplerate` argument. How the sample rate is determined when `wav`
        is a plain numpy array is not specified by this protocol and is left
        to the implementation.
        NOTE(review): confirm the intended behaviour for numpy input against
        the concrete preprocessor implementation.
        """
        ...