Add documentation to spectrogram

2026-01-10 17:19:34 +01:00 · 2025-04-17 14:40:20 +01:00 · 2025-04-17 14:40:20 +01:00 · 2212246b11
commit 2212246b11
parent aca0b58443
2 changed files with 580 additions and 61 deletions
--- a/batdetect2/preprocess/audio.py
+++ b/batdetect2/preprocess/audio.py
@ -47,6 +47,7 @@ __all__ = [
    "TARGET_SAMPLERATE_HZ",
    "SCALE_RAW_AUDIO",
    "DEFAULT_DURATION",
+    "convert_to_xr",
 ]

 TARGET_SAMPLERATE_HZ = 256_000
@ -734,3 +735,67 @@ def resample_audio_fourier(
    """
    ratio = sr_new / sr_orig
    return resample(array, int(array.shape[axis] * ratio), axis=axis)  # type: ignore
+
+
+def convert_to_xr(
+    wav: np.ndarray,
+    samplerate: int,
+    dtype: DTypeLike = np.float32,  # type: ignore
+) -> xr.DataArray:
+    """Convert a NumPy array to an xarray DataArray with time coordinates.
+
+    Parameters
+    ----------
+    wav : np.ndarray
+        The input waveform array, 1D or 2D (channels on the first axis). For
+        2D input only the first channel (`wav[0, :]`) is used.
+    samplerate : int
+        The sample rate in Hz.
+    dtype : DTypeLike, default=np.float32
+        Target data type for the xarray DataArray.
+
+    Returns
+    -------
+    xr.DataArray
+        The waveform as an xarray DataArray with time coordinates.
+
+    Raises
+    ------
+    ValueError
+        If the input array is not 1D or 2D, if it is empty, or if the sample
+        rate is non-positive.
+    """
+
+    if wav.ndim == 2:
+        wav = wav[0, :]
+
+    if wav.ndim != 1:
+        raise ValueError(
+            "Audio must be 1D array or 2D channel where the first axis is the channel dimension"
+        )
+
+    if wav.size == 0:
+        raise ValueError("Audio array is empty")
+
+    if samplerate <= 0:
+        raise ValueError("Sample rate must be positive")
+
+    times = np.linspace(
+        0,
+        wav.shape[0] / samplerate,
+        wav.shape[0],
+        endpoint=False,
+        dtype=dtype,
+    )
+
+    return xr.DataArray(
+        data=wav.astype(dtype),
+        dims=["time"],
+        coords={
+            "time": arrays.create_time_dim_from_array(
+                times,
+                samplerate=samplerate,
+            ),
+        },
+        attrs={"samplerate": samplerate},
+    )
--- a/batdetect2/preprocess/spectrogram.py
+++ b/batdetect2/preprocess/spectrogram.py
@ -1,4 +1,25 @@
-from typing import Literal, Optional, Union
+"""Computes spectrograms from audio waveforms with configurable parameters.
+
+This module provides the functionality to convert preprocessed audio waveforms
+(typically output from the `batdetect2.preprocess.audio` module) into
+spectrogram representations suitable for input into deep learning models like
+BatDetect2.
+
+It offers a configurable pipeline including:
+1.  Short-Time Fourier Transform (STFT) calculation.
+2.  Frequency axis cropping to a relevant range.
+3.  Amplitude scaling (e.g., Logarithmic, Per-Channel Energy Normalization -
+    PCEN).
+4.  Simple denoising (optional).
+5.  Resizing to target dimensions (optional).
+6.  Final peak normalization (optional).
+
+Configuration is managed via the `SpectrogramConfig` class, allowing for
+reproducible spectrogram generation consistent between training and inference.
+The core computation is performed by `compute_spectrogram`.
+"""
+
+from typing import Literal, Optional, Protocol, Union

 import librosa
 import librosa.core.spectrum
@ -10,66 +31,220 @@ from soundevent import arrays, audio
 from soundevent.arrays import operations as ops

 from batdetect2.configs import BaseConfig
+from batdetect2.preprocess.audio import convert_to_xr

 __all__ = [
+    "SpectrogramBuilder",
    "STFTConfig",
    "FrequencyConfig",
+    "SpecSizeConfig",
    "LogScaleConfig",
    "PcenScaleConfig",
    "AmplitudeScaleConfig",
    "Scales",
    "SpectrogramConfig",
+    "ConfigurableSpectrogramBuilder",
+    "build_spectrogram_builder",
    "compute_spectrogram",
+    "get_spectrogram_resolution",
 ]

+
+class SpectrogramBuilder(Protocol):
+    """Defines the interface for a spectrogram generation component.
+
+    A SpectrogramBuilder takes a waveform (as numpy array or xarray DataArray)
+    and produces a spectrogram (as an xarray DataArray) based on its internal
+    configuration or implementation.
+    """
+
+    def __call__(
+        self,
+        wav: Union[np.ndarray, xr.DataArray],
+        samplerate: Optional[int] = None,
+    ) -> xr.DataArray:
+        """Generate a spectrogram from an audio waveform.
+
+        Parameters
+        ----------
+        wav : Union[np.ndarray, xr.DataArray]
+            The input audio waveform. If a numpy array, `samplerate` must
+            also be provided. If an xarray DataArray, it must have a 'time'
+            coordinate from which the sample rate can be inferred.
+        samplerate : int, optional
+            The sample rate of the audio in Hz. Required if `wav` is a
+            numpy array. If `wav` is an xarray DataArray, this parameter is
+            ignored as the sample rate is derived from the coordinates.
+
+        Returns
+        -------
+        xr.DataArray
+            The computed spectrogram as an xarray DataArray with 'time' and
+            'frequency' coordinates.
+
+        Raises
+        ------
+        ValueError
+            If `wav` is a numpy array and `samplerate` is not provided, or
+            if `wav` is an xarray DataArray without a valid 'time' coordinate.
+        """
+        ...
+
+
 MIN_FREQ = 10_000
+"""Default minimum frequency (Hz) for spectrogram frequency cropping."""
+
 MAX_FREQ = 120_000
+"""Default maximum frequency (Hz) for spectrogram frequency cropping."""


 class STFTConfig(BaseConfig):
+    """Configuration for the Short-Time Fourier Transform (STFT).
+
+    Attributes
+    ----------
+    window_duration : float, default=0.002
+        Duration of the STFT window in seconds (e.g., 0.002 for 2ms). Must be
+        > 0. Determines frequency resolution (longer window = finer frequency
+        resolution).
+    window_overlap : float, default=0.75
+        Fraction of overlap between consecutive STFT windows (e.g., 0.75
+        for 75%). Must be >= 0 and < 1. Determines time resolution
+        (higher overlap = finer time resolution).
+    window_fn : str, default="hann"
+        Name of the window function to apply before FFT calculation. Common
+        options include "hann", "hamming", "blackman". See
+        `scipy.signal.get_window`.
+    """
+
    window_duration: float = Field(default=0.002, gt=0)
    window_overlap: float = Field(default=0.75, ge=0, lt=1)
    window_fn: str = "hann"


 class FrequencyConfig(BaseConfig):
+    """Configuration for frequency axis parameters.
+
+    Attributes
+    ----------
+    max_freq : int, default=120000
+        Maximum frequency in Hz to retain in the spectrogram after STFT.
+        Frequencies above this value will be cropped. Must be > 0.
+    min_freq : int, default=10000
+        Minimum frequency in Hz to retain in the spectrogram after STFT.
+        Frequencies below this value will be cropped. Must be > 0.
+    """
+
    max_freq: int = Field(default=120_000, gt=0)
    min_freq: int = Field(default=10_000, gt=0)


 class SpecSizeConfig(BaseConfig):
-    height: int = 128
-    """Height of the spectrogram in pixels. This value determines the 
-    number of frequency bands and corresponds to the vertical dimension 
-    of the spectrogram."""
+    """Configuration for the final size and shape of the spectrogram.

+    Attributes
+    ----------
+    height : int, default=128
+        Target height of the spectrogram in pixels (frequency bins). The
+        frequency axis will be resized (e.g., via interpolation) to match this
+        height after frequency cropping and amplitude scaling. Must be > 0.
+    resize_factor : float, optional, default=0.5
+        Factor by which to resize the spectrogram along the time axis *after*
+        STFT calculation. A value of 0.5 (the default) halves the number of
+        time bins, 2.0 doubles it. If None, no resizing along the time axis
+        is performed relative to the STFT output width. Must be > 0 if provided.
+    """
+
+    height: int = 128
    resize_factor: Optional[float] = 0.5
-    """Factor by which to resize the spectrogram along the time axis. 
-    A value of 0.5 reduces the temporal dimension by half, while a 
-    value of 2.0 doubles it. If None, no resizing is performed."""


 class LogScaleConfig(BaseConfig):
+    """Configuration marker for using Logarithmic Amplitude Scaling."""
+
    name: Literal["log"] = "log"


 class PcenScaleConfig(BaseConfig):
+    """Configuration for Per-Channel Energy Normalization (PCEN) scaling.
+
+    PCEN is an adaptive gain control method often used for audio event
+    detection.
+
+    Attributes
+    ----------
+    name : Literal["pcen"]
+        Discriminator field identifying this scaling type.
+    time_constant : float, default=0.4
+        Time constant (in seconds) for the PCEN smoothing filter. Controls how
+        quickly the normalization adapts to energy changes.
+    gain : float, default=0.98
+        Gain factor (alpha in some formulations). Controls the AGC behavior.
+    bias : float, default=2.0
+        Bias factor (delta in some formulations). Added before the
+        exponentiation.
+    power : float, default=0.5
+        Exponent (r in some formulations). Controls the compression
+        characteristic.
+    """
+
    name: Literal["pcen"] = "pcen"
    time_constant: float = 0.4
-    hop_length: int = 512
    gain: float = 0.98
    bias: float = 2
    power: float = 0.5


 class AmplitudeScaleConfig(BaseConfig):
+    """Configuration marker for using Linear Amplitude (no scaling applied).
+
+    Note: The actual output is typically magnitude from STFT, not raw amplitude.
+    This option essentially skips log or PCEN scaling.
+    """
+
    name: Literal["amplitude"] = "amplitude"


 Scales = Union[LogScaleConfig, PcenScaleConfig, AmplitudeScaleConfig]
+"""Type alias for the different amplitude scaling configuration options."""


 class SpectrogramConfig(BaseConfig):
+    """Unified configuration for spectrogram generation.
+
+    Aggregates settings for STFT, frequency selection, amplitude scaling,
+    resizing, and optional post-processing steps like denoising and final
+    normalization.
+
+    Attributes
+    ----------
+    stft : STFTConfig
+        Configuration for the Short-Time Fourier Transform. Defaults to standard
+        settings via `STFTConfig`.
+    frequencies : FrequencyConfig
+        Configuration for cropping the frequency range. Defaults to standard
+        settings via `FrequencyConfig`.
+    scale : Scales
+        Configuration for amplitude scaling. Determines whether to apply
+        log scaling, PCEN, or leave as linear magnitude. Defaults to PCEN
+        via `PcenScaleConfig`. Use the `name` field ("log", "pcen", "amplitude")
+        in config files to select the type and provide relevant parameters.
+    size : SpecSizeConfig, optional
+        Configuration for resizing the final spectrogram dimensions (height in
+        frequency bins, optional time resizing factor). Defaults to standard
+        settings via `SpecSizeConfig`. If explicitly set to None, no resizing
+        is performed after STFT and frequency cropping.
+    denoise : bool, default=True
+        If True (default), applies a simple spectral mean subtraction denoising
+        step after amplitude scaling.
+    max_scale : bool, default=False
+        If True, applies a final peak normalization to the spectrogram *after*
+        all other steps (including log/PCEN scaling and resizing), scaling the
+        maximum value across the entire spectrogram to 1.0. If False (default),
+        this final scaling is skipped. **Note:** Applying this after log or PCEN
+        scaling will alter the characteristics of those scales.
+    """
+
    stft: STFTConfig = Field(default_factory=STFTConfig)
    frequencies: FrequencyConfig = Field(default_factory=FrequencyConfig)
    scale: Scales = Field(
@ -81,11 +256,145 @@ class SpectrogramConfig(BaseConfig):
    max_scale: bool = False


+class ConfigurableSpectrogramBuilder(SpectrogramBuilder):
+    """Implementation of `SpectrogramBuilder` driven by `SpectrogramConfig`.
+
+    This class computes spectrograms according to the parameters specified in a
+    `SpectrogramConfig` object provided during initialization. It handles both
+    numpy array and xarray DataArray inputs for the waveform.
+    """
+
+    def __init__(
+        self,
+        config: SpectrogramConfig,
+        dtype: DTypeLike = np.float32,  # type: ignore
+    ) -> None:
+        """Initialize the ConfigurableSpectrogramBuilder.
+
+        Parameters
+        ----------
+        config : SpectrogramConfig
+            The configuration object specifying all spectrogram parameters.
+        dtype : DTypeLike, default=np.float32
+            The target NumPy data type for the computed spectrogram array.
+        """
+        self.config = config
+        self.dtype = dtype
+
+    def __call__(
+        self,
+        wav: Union[np.ndarray, xr.DataArray],
+        samplerate: Optional[int] = None,
+    ) -> xr.DataArray:
+        """Generate a spectrogram from an audio waveform using the config.
+
+        Implements the `SpectrogramBuilder` protocol. If the input `wav` is
+        a numpy array, `samplerate` must be provided; the array will be
+        converted to an xarray DataArray internally. If `wav` is already an
+        xarray DataArray with time coordinates, `samplerate` is ignored.
+        Delegates the main computation to `compute_spectrogram`.
+
+        Parameters
+        ----------
+        wav : Union[np.ndarray, xr.DataArray]
+            The input audio waveform.
+        samplerate : int, optional
+            The sample rate in Hz (required only if `wav` is np.ndarray).
+
+        Returns
+        -------
+        xr.DataArray
+            The computed spectrogram.
+
+        Raises
+        ------
+        ValueError
+            If `wav` is np.ndarray and `samplerate` is None.
+        """
+        if isinstance(wav, np.ndarray):
+            if samplerate is None:
+                raise ValueError(
+                    "Samplerate must be provided when passing a numpy array."
+                )
+            wav = convert_to_xr(
+                wav,
+                samplerate=samplerate,
+                dtype=self.dtype,
+            )
+
+        return compute_spectrogram(
+            wav,
+            config=self.config,
+            dtype=self.dtype,
+        )
+
+
+def build_spectrogram_builder(
+    config: SpectrogramConfig,
+    dtype: DTypeLike = np.float32,  # type: ignore
+) -> SpectrogramBuilder:
+    """Factory function to create a SpectrogramBuilder based on configuration.
+
+    Instantiates and returns a `ConfigurableSpectrogramBuilder` initialized
+    with the provided `SpectrogramConfig`.
+
+    Parameters
+    ----------
+    config : SpectrogramConfig
+        The configuration object specifying spectrogram parameters.
+    dtype : DTypeLike, default=np.float32
+        The target NumPy data type for the computed spectrogram array.
+
+    Returns
+    -------
+    SpectrogramBuilder
+        An instance of `ConfigurableSpectrogramBuilder` ready to compute
+        spectrograms according to the configuration.
+    """
+    return ConfigurableSpectrogramBuilder(config=config, dtype=dtype)
+
+
 def compute_spectrogram(
    wav: xr.DataArray,
    config: Optional[SpectrogramConfig] = None,
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
+    """Compute a spectrogram from a waveform using specified configurations.
+
+    Applies a sequence of operations based on the `config`:
+
+    1. Compute STFT magnitude (`stft`).
+    2. Crop frequency axis (`crop_spectrogram_frequencies`).
+    3. Apply amplitude scaling (log, PCEN, or none) (`scale_spectrogram`).
+    4. Apply denoising if enabled (`denoise_spectrogram`).
+    5. Resize dimensions if specified (`resize_spectrogram`).
+    6. Apply final peak normalization if enabled (`max_scale`).
+
+    Parameters
+    ----------
+    wav : xr.DataArray
+        Input audio waveform with a 'time' dimension and coordinates from
+        which the sample rate can be inferred.
+    config : SpectrogramConfig, optional
+        Configuration object specifying spectrogram parameters. If None,
+        default settings from `SpectrogramConfig` are used.
+    dtype : DTypeLike, default=np.float32
+        Target NumPy data type for the final spectrogram array.
+
+    Returns
+    -------
+    xr.DataArray
+        The computed and processed spectrogram with 'time' and 'frequency'
+        coordinates.
+
+    Raises
+    ------
+    ValueError
+        If `wav` lacks necessary 'time' coordinates or dimensions.
+    Exception
+        Can re-raise exceptions from underlying libraries (e.g., librosa, numpy)
+        if invalid parameters or data are encountered.
+    """
    config = config or SpectrogramConfig()

    spec = stft(
@ -125,6 +434,25 @@ def crop_spectrogram_frequencies(
    min_freq: int = 10_000,
    max_freq: int = 120_000,
 ) -> xr.DataArray:
+    """Crop the frequency axis of a spectrogram to a specified range.
+
+    Uses `soundevent.arrays.crop_dim` to select the frequency bins
+    corresponding to the range [`min_freq`, `max_freq`].
+
+    Parameters
+    ----------
+    spec : xr.DataArray
+        Input spectrogram with 'frequency' dimension and coordinates.
+    min_freq : int, default=MIN_FREQ
+        Minimum frequency (Hz) to keep.
+    max_freq : int, default=MAX_FREQ
+        Maximum frequency (Hz) to keep.
+
+    Returns
+    -------
+    xr.DataArray
+        Spectrogram cropped along the frequency axis. Preserves dtype.
+    """
    return arrays.crop_dim(
        spec,
        dim="frequency",
@ -140,6 +468,36 @@ def stft(
    window_fn: str = "hann",
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
+    """Compute the Short-Time Fourier Transform (STFT) magnitude spectrogram.
+
+    Calculates STFT parameters (N-FFT, hop length) based on the window
+    duration, overlap, and waveform sample rate. Returns an xarray DataArray
+    with correctly calculated 'time' and 'frequency' coordinates.
+
+    Parameters
+    ----------
+    wave : xr.DataArray
+        Input audio waveform with 'time' coordinates.
+    window_duration : float
+        Duration of the STFT window in seconds.
+    window_overlap : float
+        Fractional overlap between consecutive windows [0, 1).
+    window_fn : str, default="hann"
+        Name of the window function (e.g., "hann", "hamming").
+    dtype : DTypeLike, default=np.float32
+        Target data type for the spectrogram array.
+
+    Returns
+    -------
+    xr.DataArray
+        Magnitude spectrogram with 'time' and 'frequency' dimensions and
+        coordinates. STFT parameters are stored in the `attrs`.
+
+    Raises
+    ------
+    ValueError
+        If sample rate cannot be determined from `wave` coordinates.
+    """
    start_time, end_time = arrays.get_dim_range(wave, dim="time")
    step = arrays.get_dim_step(wave, dim="time")
    sampling_rate = 1 / step
@ -193,6 +551,21 @@ def stft(


 def denoise_spectrogram(spec: xr.DataArray) -> xr.DataArray:
+    """Apply simple spectral mean subtraction for denoising.
+
+    Subtracts the mean value of each frequency bin (calculated across time)
+    from that bin, then clips negative values to zero.
+
+    Parameters
+    ----------
+    spec : xr.DataArray
+        Input spectrogram with 'time' and 'frequency' dimensions.
+
+    Returns
+    -------
+    xr.DataArray
+        Denoised spectrogram with the same dimensions, coordinates, and dtype.
+    """
    return xr.DataArray(
        data=(spec - spec.mean("time")).clip(0),
        dims=spec.dims,
@ -206,6 +579,28 @@ def scale_spectrogram(
    scale: Scales,
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
+    """Apply configured amplitude scaling to the spectrogram.
+
+    Dispatches to the appropriate scaling function (log, PCEN) based on the
+    `scale` configuration object's `name` field. If `scale.name` is
+    "amplitude", the spectrogram is returned unchanged (as it's already
+    magnitude/amplitude).
+
+    Parameters
+    ----------
+    spec : xr.DataArray
+        Input magnitude spectrogram.
+    scale : Scales
+        The configuration object specifying the scaling method and parameters
+        (instance of LogScaleConfig, PcenScaleConfig, or AmplitudeScaleConfig).
+    dtype : DTypeLike, default=np.float32
+        Target data type for the output scaled spectrogram.
+
+    Returns
+    -------
+    xr.DataArray
+        Spectrogram with the specified amplitude scaling applied.
+    """
    if scale.name == "log":
        return scale_log(spec, dtype=dtype)

@ -213,7 +608,6 @@ def scale_spectrogram(
        return scale_pcen(
            spec,
            time_constant=scale.time_constant,
-            hop_length=scale.hop_length,
            gain=scale.gain,
            power=scale.power,
            bias=scale.bias,
@ -225,12 +619,44 @@ def scale_spectrogram(
 def scale_pcen(
    spec: xr.DataArray,
    time_constant: float = 0.4,
-    hop_length: int = 512,
    gain: float = 0.98,
    bias: float = 2,
    power: float = 0.5,
 ) -> xr.DataArray:
+    """Apply Per-Channel Energy Normalization (PCEN) to a spectrogram.
+
+    Parameters
+    ----------
+    spec : xr.DataArray
+        Input magnitude spectrogram. Its `attrs` must contain
+        'original_samplerate', 'nfft' and 'noverlap'.
+    time_constant : float, default=0.4
+        PCEN time constant in seconds.
+    gain : float, default=0.98
+        Gain factor (alpha).
+    bias : float, default=2.0
+        Bias factor (delta).
+    power : float, default=0.5
+        Exponent (r).
+
+    Returns
+    -------
+    xr.DataArray
+        PCEN-scaled spectrogram.
+
+    Notes
+    -----
+    - The input spectrogram magnitude `spec` is multiplied by `2**31` before
+      being passed to `audio.pcen`. This suggests the underlying implementation
+      might expect values in a range typical of 16-bit or 32-bit signed integers,
+      even though the input here might be float. This scaling factor should be
+      verified against the specific `soundevent.audio.pcen` implementation
+      details.
+    """
    samplerate = spec.attrs["original_samplerate"]
+    hop_length = spec.attrs["nfft"] - spec.attrs["noverlap"]
    t_frames = time_constant * samplerate / (float(hop_length) * 10)
    smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
    return audio.pcen(
@ -246,6 +672,32 @@ def scale_log(
    spec: xr.DataArray,
    dtype: DTypeLike = np.float32,  # type: ignore
 ) -> xr.DataArray:
+    """Apply logarithmic scaling to a magnitude spectrogram.
+
+    Calculates `log(1 + C * S)`, where S is the input magnitude spectrogram
+    and C is a scaling factor derived from the sample rate and N-FFT stored
+    in `spec.attrs` and the energy of a Hann window of N-FFT samples.
+
+    Parameters
+    ----------
+    spec : xr.DataArray
+        Input magnitude spectrogram. Its `attrs` must contain
+        'original_samplerate' and 'nfft'.
+    dtype : DTypeLike, default=np.float32
+        Target data type for the output spectrogram.
+
+    Returns
+    -------
+    xr.DataArray
+        Log-scaled spectrogram.
+
+    Raises
+    ------
+    KeyError
+        If required attributes are missing from `spec.attrs`.
+    ValueError
+        If attributes are non-numeric or window function is invalid.
+    """
    samplerate = spec.attrs["original_samplerate"]
    nfft = spec.attrs["nfft"]
    log_scaling = 2 / (samplerate * (np.abs(np.hanning(nfft)) ** 2).sum())
@ -262,6 +714,28 @@ def resize_spectrogram(
    height: int = 128,
    resize_factor: Optional[float] = 0.5,
 ) -> xr.DataArray:
+    """Resize a spectrogram to target dimensions using interpolation.
+
+    Resizes the frequency axis to `height` bins and optionally resizes the
+    time axis by `resize_factor`.
+
+    Parameters
+    ----------
+    spec : xr.DataArray
+        Input spectrogram with 'time' and 'frequency' dimensions.
+    height : int, default=128
+        Target number of frequency bins (vertical dimension).
+    resize_factor : float, optional
+        Factor to resize the time dimension. If 1.0 or None, time dimension
+        is unchanged. If 0.5, time dimension is halved, etc.
+
+    Returns
+    -------
+    xr.DataArray
+        Resized spectrogram. Coordinates are typically adjusted by the
+        underlying resize operation if implemented in `ops.resize`.
+        The dtype is currently hardcoded to float32 by ops.resize call.
+    """
    resize_factor = resize_factor or 1
    current_width = spec.sizes["time"]
    return ops.resize(
@ -272,61 +746,41 @@ def resize_spectrogram(
    )


-def adjust_spectrogram_width(
-    spec: xr.DataArray,
-    divide_factor: int = 32,
-    time_period: float = 0.001,
-) -> xr.DataArray:
-    time_width = spec.sizes["time"]
-
-    if time_width % divide_factor == 0:
-        return spec
-
-    target_size = int(
-        np.ceil(spec.sizes["time"] / divide_factor) * divide_factor
-    )
-    extra_duration = (target_size - time_width) * time_period
-    _, stop = arrays.get_dim_range(spec, dim="time")
-    resized = ops.extend_dim(
-        spec,
-        dim="time",
-        stop=stop + extra_duration,
-    )
-    return resized
-
-
-def duration_to_spec_width(
-    duration: float,
-    samplerate: int,
-    window_duration: float,
-    window_overlap: float,
-) -> int:
-    samples = int(duration * samplerate)
-    fft_len = int(window_duration * samplerate)
-    fft_overlap = int(window_overlap * fft_len)
-    hop_len = fft_len - fft_overlap
-    width = (samples - fft_len + hop_len) / hop_len
-    return int(np.floor(width))
-
-
-def spec_width_to_samples(
-    width: int,
-    samplerate: int,
-    window_duration: float,
-    window_overlap: float,
-) -> int:
-    fft_len = int(window_duration * samplerate)
-    fft_overlap = int(window_overlap * fft_len)
-    hop_len = fft_len - fft_overlap
-    return width * hop_len + fft_len - hop_len
-
-
 def get_spectrogram_resolution(
    config: SpectrogramConfig,
 ) -> tuple[float, float]:
+    """Calculate the approximate resolution of the final spectrogram.
+
+    Computes the width of each frequency bin (Hz/bin) and the duration
+    of each time bin (seconds/bin) based on the configuration parameters.
+
+    Parameters
+    ----------
+    config : SpectrogramConfig
+        The spectrogram configuration object.
+
+    Returns
+    -------
+    Tuple[float, float]
+        A tuple containing:
+        - frequency_resolution (float): Approximate Hz per frequency bin.
+        - time_resolution (float): Approximate seconds per time bin.
+
+    Raises
+    ------
+    ValueError
+        If required configuration fields (like `config.size`) are missing
+        or invalid.
+    """
    max_freq = config.frequencies.max_freq
    min_freq = config.frequencies.min_freq
-    assert config.size is not None
+
+    if config.size is None:
+        raise ValueError("Spectrogram size configuration is required.")

    spec_height = config.size.height
    resize_factor = config.size.resize_factor or 1