mirror of
https://github.com/macaodha/batdetect2.git
synced 2025-06-29 22:51:58 +02:00
Add documentation to spectrogram
This commit is contained in:
parent
aca0b58443
commit
2212246b11
@ -47,6 +47,7 @@ __all__ = [
|
|||||||
"TARGET_SAMPLERATE_HZ",
|
"TARGET_SAMPLERATE_HZ",
|
||||||
"SCALE_RAW_AUDIO",
|
"SCALE_RAW_AUDIO",
|
||||||
"DEFAULT_DURATION",
|
"DEFAULT_DURATION",
|
||||||
|
"convert_to_xr",
|
||||||
]
|
]
|
||||||
|
|
||||||
TARGET_SAMPLERATE_HZ = 256_000
|
TARGET_SAMPLERATE_HZ = 256_000
|
||||||
@ -734,3 +735,67 @@ def resample_audio_fourier(
|
|||||||
"""
|
"""
|
||||||
ratio = sr_new / sr_orig
|
ratio = sr_new / sr_orig
|
||||||
return resample(array, int(array.shape[axis] * ratio), axis=axis) # type: ignore
|
return resample(array, int(array.shape[axis] * ratio), axis=axis) # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_xr(
|
||||||
|
wav: np.ndarray,
|
||||||
|
samplerate: int,
|
||||||
|
dtype: DTypeLike = np.float32, # type: ignore
|
||||||
|
) -> xr.DataArray:
|
||||||
|
"""Convert a NumPy array to an xarray DataArray with time coordinates.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
wav : np.ndarray
|
||||||
|
The input waveform array. Expected to be 1D or 2D (with the first axis as
|
||||||
|
the channel dimension).
|
||||||
|
samplerate : int
|
||||||
|
The sample rate in Hz.
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
Target data type for the xarray DataArray.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
The waveform as an xarray DataArray with time coordinates.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If the input array is not 1D or 2D, or if the sample rate is
|
||||||
|
non-positive. If the input array is empty.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if wav.ndim == 2:
|
||||||
|
wav = wav[0, :]
|
||||||
|
|
||||||
|
if wav.ndim != 1:
|
||||||
|
raise ValueError(
|
||||||
|
"Audio must be 1D array or 2D channel where the first axis is the channel dimension"
|
||||||
|
)
|
||||||
|
|
||||||
|
if wav.size == 0:
|
||||||
|
raise ValueError("Audio array is empty")
|
||||||
|
|
||||||
|
if samplerate <= 0:
|
||||||
|
raise ValueError("Sample rate must be positive")
|
||||||
|
|
||||||
|
times = np.linspace(
|
||||||
|
0,
|
||||||
|
wav.shape[0] / samplerate,
|
||||||
|
wav.shape[0],
|
||||||
|
endpoint=False,
|
||||||
|
dtype=dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
return xr.DataArray(
|
||||||
|
data=wav.astype(dtype),
|
||||||
|
dims=["time"],
|
||||||
|
coords={
|
||||||
|
"time": arrays.create_time_dim_from_array(
|
||||||
|
times,
|
||||||
|
samplerate=samplerate,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
attrs={"samplerate": samplerate},
|
||||||
|
)
|
||||||
|
@ -1,4 +1,25 @@
|
|||||||
from typing import Literal, Optional, Union
|
"""Computes spectrograms from audio waveforms with configurable parameters.
|
||||||
|
|
||||||
|
This module provides the functionality to convert preprocessed audio waveforms
|
||||||
|
(typically output from the `batdetect2.preprocessing.audio` module) into
|
||||||
|
spectrogram representations suitable for input into deep learning models like
|
||||||
|
BatDetect2.
|
||||||
|
|
||||||
|
It offers a configurable pipeline including:
|
||||||
|
1. Short-Time Fourier Transform (STFT) calculation.
|
||||||
|
2. Frequency axis cropping to a relevant range.
|
||||||
|
3. Amplitude scaling (e.g., Logarithmic, Per-Channel Energy Normalization -
|
||||||
|
PCEN).
|
||||||
|
4. Simple denoising (optional).
|
||||||
|
5. Resizing to target dimensions (optional).
|
||||||
|
6. Final peak normalization (optional).
|
||||||
|
|
||||||
|
Configuration is managed via the `SpectrogramConfig` class, allowing for
|
||||||
|
reproducible spectrogram generation consistent between training and inference.
|
||||||
|
The core computation is performed by `compute_spectrogram`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Literal, Optional, Protocol, Union
|
||||||
|
|
||||||
import librosa
|
import librosa
|
||||||
import librosa.core.spectrum
|
import librosa.core.spectrum
|
||||||
@ -10,66 +31,220 @@ from soundevent import arrays, audio
|
|||||||
from soundevent.arrays import operations as ops
|
from soundevent.arrays import operations as ops
|
||||||
|
|
||||||
from batdetect2.configs import BaseConfig
|
from batdetect2.configs import BaseConfig
|
||||||
|
from batdetect2.preprocess.audio import convert_to_xr
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"SpectrogramBuilder",
|
||||||
"STFTConfig",
|
"STFTConfig",
|
||||||
"FrequencyConfig",
|
"FrequencyConfig",
|
||||||
|
"SpecSizeConfig",
|
||||||
"LogScaleConfig",
|
"LogScaleConfig",
|
||||||
"PcenScaleConfig",
|
"PcenScaleConfig",
|
||||||
"AmplitudeScaleConfig",
|
"AmplitudeScaleConfig",
|
||||||
"Scales",
|
"Scales",
|
||||||
"SpectrogramConfig",
|
"SpectrogramConfig",
|
||||||
|
"ConfigurableSpectrogramBuilder",
|
||||||
|
"build_spectrogram_builder",
|
||||||
"compute_spectrogram",
|
"compute_spectrogram",
|
||||||
|
"get_spectrogram_resolution",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class SpectrogramBuilder(Protocol):
|
||||||
|
"""Defines the interface for a spectrogram generation component.
|
||||||
|
|
||||||
|
A SpectrogramBuilder takes a waveform (as numpy array or xarray DataArray)
|
||||||
|
and produces a spectrogram (as an xarray DataArray) based on its internal
|
||||||
|
configuration or implementation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __call__(
|
||||||
|
self,
|
||||||
|
wav: Union[np.ndarray, xr.DataArray],
|
||||||
|
samplerate: Optional[int] = None,
|
||||||
|
) -> xr.DataArray:
|
||||||
|
"""Generate a spectrogram from an audio waveform.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
wav : Union[np.ndarray, xr.DataArray]
|
||||||
|
The input audio waveform. If a numpy array, `samplerate` must
|
||||||
|
also be provided. If an xarray DataArray, it must have a 'time'
|
||||||
|
coordinate from which the sample rate can be inferred.
|
||||||
|
samplerate : int, optional
|
||||||
|
The sample rate of the audio in Hz. Required if `wav` is a
|
||||||
|
numpy array. If `wav` is an xarray DataArray, this parameter is
|
||||||
|
ignored as the sample rate is derived from the coordinates.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
The computed spectrogram as an xarray DataArray with 'time' and
|
||||||
|
'frequency' coordinates.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If `wav` is a numpy array and `samplerate` is not provided, or
|
||||||
|
if `wav` is an xarray DataArray without a valid 'time' coordinate.
|
||||||
|
"""
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
MIN_FREQ = 10_000
|
MIN_FREQ = 10_000
|
||||||
|
"""Default minimum frequency (Hz) for spectrogram frequency cropping."""
|
||||||
|
|
||||||
MAX_FREQ = 120_000
|
MAX_FREQ = 120_000
|
||||||
|
"""Default maximum frequency (Hz) for spectrogram frequency cropping."""
|
||||||
|
|
||||||
|
|
||||||
class STFTConfig(BaseConfig):
|
class STFTConfig(BaseConfig):
|
||||||
|
"""Configuration for the Short-Time Fourier Transform (STFT).
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
window_duration : float, default=0.002
|
||||||
|
Duration of the STFT window in seconds (e.g., 0.002 for 2ms). Must be
|
||||||
|
> 0. Determines frequency resolution (longer window = finer frequency
|
||||||
|
resolution).
|
||||||
|
window_overlap : float, default=0.75
|
||||||
|
Fraction of overlap between consecutive STFT windows (e.g., 0.75
|
||||||
|
for 75%). Must be >= 0 and < 1. Determines time resolution
|
||||||
|
(higher overlap = finer time resolution).
|
||||||
|
window_fn : str, default="hann"
|
||||||
|
Name of the window function to apply before FFT calculation. Common
|
||||||
|
options include "hann", "hamming", "blackman". See
|
||||||
|
`scipy.signal.get_window`.
|
||||||
|
"""
|
||||||
|
|
||||||
window_duration: float = Field(default=0.002, gt=0)
|
window_duration: float = Field(default=0.002, gt=0)
|
||||||
window_overlap: float = Field(default=0.75, ge=0, lt=1)
|
window_overlap: float = Field(default=0.75, ge=0, lt=1)
|
||||||
window_fn: str = "hann"
|
window_fn: str = "hann"
|
||||||
|
|
||||||
|
|
||||||
class FrequencyConfig(BaseConfig):
|
class FrequencyConfig(BaseConfig):
|
||||||
|
"""Configuration for frequency axis parameters.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
max_freq : int, default=120000
|
||||||
|
Maximum frequency in Hz to retain in the spectrogram after STFT.
|
||||||
|
Frequencies above this value will be cropped. Must be > 0.
|
||||||
|
min_freq : int, default=10000
|
||||||
|
Minimum frequency in Hz to retain in the spectrogram after STFT.
|
||||||
|
Frequencies below this value will be cropped. Must be > 0.
|
||||||
|
"""
|
||||||
|
|
||||||
max_freq: int = Field(default=120_000, gt=0)
|
max_freq: int = Field(default=120_000, gt=0)
|
||||||
min_freq: int = Field(default=10_000, gt=0)
|
min_freq: int = Field(default=10_000, gt=0)
|
||||||
|
|
||||||
|
|
||||||
class SpecSizeConfig(BaseConfig):
|
class SpecSizeConfig(BaseConfig):
|
||||||
height: int = 128
|
"""Configuration for the final size and shape of the spectrogram.
|
||||||
"""Height of the spectrogram in pixels. This value determines the
|
|
||||||
number of frequency bands and corresponds to the vertical dimension
|
|
||||||
of the spectrogram."""
|
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
height : int, default=128
|
||||||
|
Target height of the spectrogram in pixels (frequency bins). The
|
||||||
|
frequency axis will be resized (e.g., via interpolation) to match this
|
||||||
|
height after frequency cropping and amplitude scaling. Must be > 0.
|
||||||
|
resize_factor : float, optional
|
||||||
|
Factor by which to resize the spectrogram along the time axis *after*
|
||||||
|
STFT calculation. A value of 0.5 halves the number of time bins,
|
||||||
|
2.0 doubles it. If None (default), no resizing along the time axis
|
||||||
|
is performed relative to the STFT output width. Must be > 0 if provided.
|
||||||
|
"""
|
||||||
|
|
||||||
|
height: int = 128
|
||||||
resize_factor: Optional[float] = 0.5
|
resize_factor: Optional[float] = 0.5
|
||||||
"""Factor by which to resize the spectrogram along the time axis.
|
|
||||||
A value of 0.5 reduces the temporal dimension by half, while a
|
|
||||||
value of 2.0 doubles it. If None, no resizing is performed."""
|
|
||||||
|
|
||||||
|
|
||||||
class LogScaleConfig(BaseConfig):
|
class LogScaleConfig(BaseConfig):
|
||||||
|
"""Configuration marker for using Logarithmic Amplitude Scaling."""
|
||||||
|
|
||||||
name: Literal["log"] = "log"
|
name: Literal["log"] = "log"
|
||||||
|
|
||||||
|
|
||||||
class PcenScaleConfig(BaseConfig):
|
class PcenScaleConfig(BaseConfig):
|
||||||
|
"""Configuration for Per-Channel Energy Normalization (PCEN) scaling.
|
||||||
|
|
||||||
|
PCEN is an adaptive gain control method often used for audio event
|
||||||
|
detection.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
name : Literal["pcen"]
|
||||||
|
Discriminator field identifying this scaling type.
|
||||||
|
time_constant : float, default=0.4
|
||||||
|
Time constant (in seconds) for the PCEN smoothing filter. Controls how
|
||||||
|
quickly the normalization adapts to energy changes.
|
||||||
|
gain : float, default=0.98
|
||||||
|
Gain factor (alpha in some formulations). Controls the AGC behavior.
|
||||||
|
bias : float, default=2.0
|
||||||
|
Bias factor (delta in some formulations). Added before the
|
||||||
|
exponentiation.
|
||||||
|
power : float, default=0.5
|
||||||
|
Exponent (r in some formulations). Controls the compression
|
||||||
|
characteristic.
|
||||||
|
"""
|
||||||
|
|
||||||
name: Literal["pcen"] = "pcen"
|
name: Literal["pcen"] = "pcen"
|
||||||
time_constant: float = 0.4
|
time_constant: float = 0.4
|
||||||
hop_length: int = 512
|
|
||||||
gain: float = 0.98
|
gain: float = 0.98
|
||||||
bias: float = 2
|
bias: float = 2
|
||||||
power: float = 0.5
|
power: float = 0.5
|
||||||
|
|
||||||
|
|
||||||
class AmplitudeScaleConfig(BaseConfig):
|
class AmplitudeScaleConfig(BaseConfig):
|
||||||
|
"""Configuration marker for using Linear Amplitude (no scaling applied).
|
||||||
|
|
||||||
|
Note: The actual output is typically magnitude from STFT, not raw amplitude.
|
||||||
|
This option essentially skips log or PCEN scaling.
|
||||||
|
"""
|
||||||
|
|
||||||
name: Literal["amplitude"] = "amplitude"
|
name: Literal["amplitude"] = "amplitude"
|
||||||
|
|
||||||
|
|
||||||
Scales = Union[LogScaleConfig, PcenScaleConfig, AmplitudeScaleConfig]
|
Scales = Union[LogScaleConfig, PcenScaleConfig, AmplitudeScaleConfig]
|
||||||
|
"""Type alias for the different amplitude scaling configuration options."""
|
||||||
|
|
||||||
|
|
||||||
class SpectrogramConfig(BaseConfig):
|
class SpectrogramConfig(BaseConfig):
|
||||||
|
"""Unified configuration for spectrogram generation.
|
||||||
|
|
||||||
|
Aggregates settings for STFT, frequency selection, amplitude scaling,
|
||||||
|
resizing, and optional post-processing steps like denoising and final
|
||||||
|
normalization.
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
stft : STFTConfig
|
||||||
|
Configuration for the Short-Time Fourier Transform. Defaults to standard
|
||||||
|
settings via `STFTConfig`.
|
||||||
|
frequencies : FrequencyConfig
|
||||||
|
Configuration for cropping the frequency range. Defaults to standard
|
||||||
|
settings via `FrequencyConfig`.
|
||||||
|
scale : Scales
|
||||||
|
Configuration for amplitude scaling. Determines whether to apply
|
||||||
|
log scaling, PCEN, or leave as linear magnitude. Defaults to PCEN
|
||||||
|
via `PcenScaleConfig`. Use the `name` field ("log", "pcen", "amplitude")
|
||||||
|
in config files to select the type and provide relevant parameters.
|
||||||
|
size : SpecSizeConfig, optional
|
||||||
|
Configuration for resizing the final spectrogram dimensions (height in
|
||||||
|
frequency bins, optional time resizing factor). If None or omitted,
|
||||||
|
no resizing is performed after STFT and frequency cropping. Defaults
|
||||||
|
to standard settings via `SpecSizeConfig`.
|
||||||
|
denoise : bool, default=True
|
||||||
|
If True (default), applies a simple spectral mean subtraction denoising
|
||||||
|
step after amplitude scaling.
|
||||||
|
max_scale : bool, default=False
|
||||||
|
If True, applies a final peak normalization to the spectrogram *after*
|
||||||
|
all other steps (including log/PCEN scaling and resizing), scaling the
|
||||||
|
maximum value across the entire spectrogram to 1.0. If False (default),
|
||||||
|
this final scaling is skipped. **Note:** Applying this after log or PCEN
|
||||||
|
scaling will alter the characteristics of those scales.
|
||||||
|
"""
|
||||||
|
|
||||||
stft: STFTConfig = Field(default_factory=STFTConfig)
|
stft: STFTConfig = Field(default_factory=STFTConfig)
|
||||||
frequencies: FrequencyConfig = Field(default_factory=FrequencyConfig)
|
frequencies: FrequencyConfig = Field(default_factory=FrequencyConfig)
|
||||||
scale: Scales = Field(
|
scale: Scales = Field(
|
||||||
@ -81,11 +256,145 @@ class SpectrogramConfig(BaseConfig):
|
|||||||
max_scale: bool = False
|
max_scale: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigurableSpectrogramBuilder(SpectrogramBuilder):
|
||||||
|
"""Implementation of `SpectrogramBuilder` driven by `SpectrogramConfig`.
|
||||||
|
|
||||||
|
This class computes spectrograms according to the parameters specified in a
|
||||||
|
`SpectrogramConfig` object provided during initialization. It handles both
|
||||||
|
numpy array and xarray DataArray inputs for the waveform.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
config: SpectrogramConfig,
|
||||||
|
dtype: DTypeLike = np.float32, # type: ignore
|
||||||
|
) -> None:
|
||||||
|
"""Initialize the ConfigurableSpectrogramBuilder.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
config : SpectrogramConfig
|
||||||
|
The configuration object specifying all spectrogram parameters.
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
The target NumPy data type for the computed spectrogram array.
|
||||||
|
"""
|
||||||
|
self.config = config
|
||||||
|
self.dtype = dtype
|
||||||
|
|
||||||
|
def __call__(
|
||||||
|
self,
|
||||||
|
wav: Union[np.ndarray, xr.DataArray],
|
||||||
|
samplerate: Optional[int] = None,
|
||||||
|
) -> xr.DataArray:
|
||||||
|
"""Generate a spectrogram from an audio waveform using the config.
|
||||||
|
|
||||||
|
Implements the `SpectrogramBuilder` protocol. If the input `wav` is
|
||||||
|
a numpy array, `samplerate` must be provided; the array will be
|
||||||
|
converted to an xarray DataArray internally. If `wav` is already an
|
||||||
|
xarray DataArray with time coordinates, `samplerate` is ignored.
|
||||||
|
Delegates the main computation to `compute_spectrogram`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
wav : Union[np.ndarray, xr.DataArray]
|
||||||
|
The input audio waveform.
|
||||||
|
samplerate : int, optional
|
||||||
|
The sample rate in Hz (required only if `wav` is np.ndarray).
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
The computed spectrogram.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If `wav` is np.ndarray and `samplerate` is None.
|
||||||
|
"""
|
||||||
|
if isinstance(wav, np.ndarray):
|
||||||
|
if samplerate is None:
|
||||||
|
raise ValueError(
|
||||||
|
"Samplerate must be provided when passing a numpy array."
|
||||||
|
)
|
||||||
|
wav = convert_to_xr(
|
||||||
|
wav,
|
||||||
|
samplerate=samplerate,
|
||||||
|
dtype=self.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
return compute_spectrogram(
|
||||||
|
wav,
|
||||||
|
config=self.config,
|
||||||
|
dtype=self.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_spectrogram_builder(
|
||||||
|
config: SpectrogramConfig,
|
||||||
|
dtype: DTypeLike = np.float32, # type: ignore
|
||||||
|
) -> SpectrogramBuilder:
|
||||||
|
"""Factory function to create a SpectrogramBuilder based on configuration.
|
||||||
|
|
||||||
|
Instantiates and returns a `ConfigurableSpectrogramBuilder` initialized
|
||||||
|
with the provided `SpectrogramConfig`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
config : SpectrogramConfig
|
||||||
|
The configuration object specifying spectrogram parameters.
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
The target NumPy data type for the computed spectrogram array.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
SpectrogramBuilder
|
||||||
|
An instance of `ConfigurableSpectrogramBuilder` ready to compute
|
||||||
|
spectrograms according to the configuration.
|
||||||
|
"""
|
||||||
|
return ConfigurableSpectrogramBuilder(config=config, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
def compute_spectrogram(
|
def compute_spectrogram(
|
||||||
wav: xr.DataArray,
|
wav: xr.DataArray,
|
||||||
config: Optional[SpectrogramConfig] = None,
|
config: Optional[SpectrogramConfig] = None,
|
||||||
dtype: DTypeLike = np.float32, # type: ignore
|
dtype: DTypeLike = np.float32, # type: ignore
|
||||||
) -> xr.DataArray:
|
) -> xr.DataArray:
|
||||||
|
"""Compute a spectrogram from a waveform using specified configurations.
|
||||||
|
|
||||||
|
Applies a sequence of operations based on the `config`:
|
||||||
|
|
||||||
|
1. Compute STFT magnitude (`stft`).
|
||||||
|
2. Crop frequency axis (`crop_spectrogram_frequencies`).
|
||||||
|
3. Apply amplitude scaling (log, PCEN, or none) (`scale_spectrogram`).
|
||||||
|
4. Apply denoising if enabled (`denoise_spectrogram`).
|
||||||
|
5. Resize dimensions if specified (`resize_spectrogram`).
|
||||||
|
6. Apply final peak normalization if enabled (`max_scale`).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
wav : xr.DataArray
|
||||||
|
Input audio waveform with a 'time' dimension and coordinates from
|
||||||
|
which the sample rate can be inferred.
|
||||||
|
config : SpectrogramConfig, optional
|
||||||
|
Configuration object specifying spectrogram parameters. If None,
|
||||||
|
default settings from `SpectrogramConfig` are used.
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
Target NumPy data type for the final spectrogram array.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
The computed and processed spectrogram with 'time' and 'frequency'
|
||||||
|
coordinates.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If `wav` lacks necessary 'time' coordinates or dimensions.
|
||||||
|
Exception
|
||||||
|
Can re-raise exceptions from underlying libraries (e.g., librosa, numpy)
|
||||||
|
if invalid parameters or data are encountered.
|
||||||
|
"""
|
||||||
config = config or SpectrogramConfig()
|
config = config or SpectrogramConfig()
|
||||||
|
|
||||||
spec = stft(
|
spec = stft(
|
||||||
@ -125,6 +434,25 @@ def crop_spectrogram_frequencies(
|
|||||||
min_freq: int = 10_000,
|
min_freq: int = 10_000,
|
||||||
max_freq: int = 120_000,
|
max_freq: int = 120_000,
|
||||||
) -> xr.DataArray:
|
) -> xr.DataArray:
|
||||||
|
"""Crop the frequency axis of a spectrogram to a specified range.
|
||||||
|
|
||||||
|
Uses `soundevent.arrays.crop_dim` to select the frequency bins
|
||||||
|
corresponding to the range [`min_freq`, `max_freq`].
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
spec : xr.DataArray
|
||||||
|
Input spectrogram with 'frequency' dimension and coordinates.
|
||||||
|
min_freq : int, default=MIN_FREQ
|
||||||
|
Minimum frequency (Hz) to keep.
|
||||||
|
max_freq : int, default=MAX_FREQ
|
||||||
|
Maximum frequency (Hz) to keep.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
Spectrogram cropped along the frequency axis. Preserves dtype.
|
||||||
|
"""
|
||||||
return arrays.crop_dim(
|
return arrays.crop_dim(
|
||||||
spec,
|
spec,
|
||||||
dim="frequency",
|
dim="frequency",
|
||||||
@ -140,6 +468,36 @@ def stft(
|
|||||||
window_fn: str = "hann",
|
window_fn: str = "hann",
|
||||||
dtype: DTypeLike = np.float32, # type: ignore
|
dtype: DTypeLike = np.float32, # type: ignore
|
||||||
) -> xr.DataArray:
|
) -> xr.DataArray:
|
||||||
|
"""Compute the Short-Time Fourier Transform (STFT) magnitude spectrogram.
|
||||||
|
|
||||||
|
Calculates STFT parameters (N-FFT, hop length) based on the window
|
||||||
|
duration, overlap, and waveform sample rate. Returns an xarray DataArray
|
||||||
|
with correctly calculated 'time' and 'frequency' coordinates.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
wave : xr.DataArray
|
||||||
|
Input audio waveform with 'time' coordinates.
|
||||||
|
window_duration : float
|
||||||
|
Duration of the STFT window in seconds.
|
||||||
|
window_overlap : float
|
||||||
|
Fractional overlap between consecutive windows [0, 1).
|
||||||
|
window_fn : str, default="hann"
|
||||||
|
Name of the window function (e.g., "hann", "hamming").
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
Target data type for the spectrogram array.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
Magnitude spectrogram with 'time' and 'frequency' dimensions and
|
||||||
|
coordinates. STFT parameters are stored in the `attrs`.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If sample rate cannot be determined from `wave` coordinates.
|
||||||
|
"""
|
||||||
start_time, end_time = arrays.get_dim_range(wave, dim="time")
|
start_time, end_time = arrays.get_dim_range(wave, dim="time")
|
||||||
step = arrays.get_dim_step(wave, dim="time")
|
step = arrays.get_dim_step(wave, dim="time")
|
||||||
sampling_rate = 1 / step
|
sampling_rate = 1 / step
|
||||||
@ -193,6 +551,21 @@ def stft(
|
|||||||
|
|
||||||
|
|
||||||
def denoise_spectrogram(spec: xr.DataArray) -> xr.DataArray:
|
def denoise_spectrogram(spec: xr.DataArray) -> xr.DataArray:
|
||||||
|
"""Apply simple spectral mean subtraction for denoising.
|
||||||
|
|
||||||
|
Subtracts the mean value of each frequency bin (calculated across time)
|
||||||
|
from that bin, then clips negative values to zero.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
spec : xr.DataArray
|
||||||
|
Input spectrogram with 'time' and 'frequency' dimensions.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
Denoised spectrogram with the same dimensions, coordinates, and dtype.
|
||||||
|
"""
|
||||||
return xr.DataArray(
|
return xr.DataArray(
|
||||||
data=(spec - spec.mean("time")).clip(0),
|
data=(spec - spec.mean("time")).clip(0),
|
||||||
dims=spec.dims,
|
dims=spec.dims,
|
||||||
@ -206,6 +579,28 @@ def scale_spectrogram(
|
|||||||
scale: Scales,
|
scale: Scales,
|
||||||
dtype: DTypeLike = np.float32, # type: ignore
|
dtype: DTypeLike = np.float32, # type: ignore
|
||||||
) -> xr.DataArray:
|
) -> xr.DataArray:
|
||||||
|
"""Apply configured amplitude scaling to the spectrogram.
|
||||||
|
|
||||||
|
Dispatches to the appropriate scaling function (log, PCEN) based on the
|
||||||
|
`scale` configuration object's `name` field. If `scale.name` is
|
||||||
|
"amplitude", the spectrogram is returned unchanged (as it's already
|
||||||
|
magnitude/amplitude).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
spec : xr.DataArray
|
||||||
|
Input magnitude spectrogram.
|
||||||
|
scale : Scales
|
||||||
|
The configuration object specifying the scaling method and parameters
|
||||||
|
(instance of LogScaleConfig, PcenScaleConfig, or AmplitudeScaleConfig).
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
Target data type for the output scaled spectrogram.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
Spectrogram with the specified amplitude scaling applied.
|
||||||
|
"""
|
||||||
if scale.name == "log":
|
if scale.name == "log":
|
||||||
return scale_log(spec, dtype=dtype)
|
return scale_log(spec, dtype=dtype)
|
||||||
|
|
||||||
@ -213,7 +608,6 @@ def scale_spectrogram(
|
|||||||
return scale_pcen(
|
return scale_pcen(
|
||||||
spec,
|
spec,
|
||||||
time_constant=scale.time_constant,
|
time_constant=scale.time_constant,
|
||||||
hop_length=scale.hop_length,
|
|
||||||
gain=scale.gain,
|
gain=scale.gain,
|
||||||
power=scale.power,
|
power=scale.power,
|
||||||
bias=scale.bias,
|
bias=scale.bias,
|
||||||
@ -225,12 +619,44 @@ def scale_spectrogram(
|
|||||||
def scale_pcen(
|
def scale_pcen(
|
||||||
spec: xr.DataArray,
|
spec: xr.DataArray,
|
||||||
time_constant: float = 0.4,
|
time_constant: float = 0.4,
|
||||||
hop_length: int = 512,
|
|
||||||
gain: float = 0.98,
|
gain: float = 0.98,
|
||||||
bias: float = 2,
|
bias: float = 2,
|
||||||
power: float = 0.5,
|
power: float = 0.5,
|
||||||
) -> xr.DataArray:
|
) -> xr.DataArray:
|
||||||
|
"""Apply Per-Channel Energy Normalization (PCEN) to a spectrogram.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
spec : xr.DataArray
|
||||||
|
Input magnitude spectrogram with required attributes like
|
||||||
|
'processing_original_samplerate'.
|
||||||
|
time_constant : float, default=0.4
|
||||||
|
PCEN time constant in seconds.
|
||||||
|
gain : float, default=0.98
|
||||||
|
Gain factor (alpha).
|
||||||
|
bias : float, default=2.0
|
||||||
|
Bias factor (delta).
|
||||||
|
power : float, default=0.5
|
||||||
|
Exponent (r).
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
Target data type for the output spectrogram.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
PCEN-scaled spectrogram.
|
||||||
|
|
||||||
|
Notes
|
||||||
|
-----
|
||||||
|
- The input spectrogram magnitude `spec` is multiplied by `2**31` before
|
||||||
|
being passed to `audio.pcen`. This suggests the underlying implementation
|
||||||
|
might expect values in a range typical of 16-bit or 32-bit signed integers,
|
||||||
|
even though the input here might be float. This scaling factor should be
|
||||||
|
verified against the specific `soundevent.audio.pcen` implementation
|
||||||
|
details.
|
||||||
|
"""
|
||||||
samplerate = spec.attrs["original_samplerate"]
|
samplerate = spec.attrs["original_samplerate"]
|
||||||
|
hop_length = spec.attrs["nfft"] - spec.attrs["noverlap"]
|
||||||
t_frames = time_constant * samplerate / (float(hop_length) * 10)
|
t_frames = time_constant * samplerate / (float(hop_length) * 10)
|
||||||
smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
|
smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
|
||||||
return audio.pcen(
|
return audio.pcen(
|
||||||
@ -246,6 +672,32 @@ def scale_log(
|
|||||||
spec: xr.DataArray,
|
spec: xr.DataArray,
|
||||||
dtype: DTypeLike = np.float32, # type: ignore
|
dtype: DTypeLike = np.float32, # type: ignore
|
||||||
) -> xr.DataArray:
|
) -> xr.DataArray:
|
||||||
|
"""Apply logarithmic scaling to a magnitude spectrogram.
|
||||||
|
|
||||||
|
Calculates `log(1 + C * S)`, where S is the input magnitude spectrogram
|
||||||
|
and C is a scaling factor derived from the original STFT parameters
|
||||||
|
(sample rate, N-FFT, window function) stored in `spec.attrs`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
spec : xr.DataArray
|
||||||
|
Input magnitude spectrogram with required attributes like
|
||||||
|
'processing_original_samplerate', 'processing_nfft'.
|
||||||
|
dtype : DTypeLike, default=np.float32
|
||||||
|
Target data type for the output spectrogram.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
Log-scaled spectrogram.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
KeyError
|
||||||
|
If required attributes are missing from `spec.attrs`.
|
||||||
|
ValueError
|
||||||
|
If attributes are non-numeric or window function is invalid.
|
||||||
|
"""
|
||||||
samplerate = spec.attrs["original_samplerate"]
|
samplerate = spec.attrs["original_samplerate"]
|
||||||
nfft = spec.attrs["nfft"]
|
nfft = spec.attrs["nfft"]
|
||||||
log_scaling = 2 / (samplerate * (np.abs(np.hanning(nfft)) ** 2).sum())
|
log_scaling = 2 / (samplerate * (np.abs(np.hanning(nfft)) ** 2).sum())
|
||||||
@ -262,6 +714,28 @@ def resize_spectrogram(
|
|||||||
height: int = 128,
|
height: int = 128,
|
||||||
resize_factor: Optional[float] = 0.5,
|
resize_factor: Optional[float] = 0.5,
|
||||||
) -> xr.DataArray:
|
) -> xr.DataArray:
|
||||||
|
"""Resize a spectrogram to target dimensions using interpolation.
|
||||||
|
|
||||||
|
Resizes the frequency axis to `height` bins and optionally resizes the
|
||||||
|
time axis by `resize_factor`.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
spec : xr.DataArray
|
||||||
|
Input spectrogram with 'time' and 'frequency' dimensions.
|
||||||
|
height : int, default=128
|
||||||
|
Target number of frequency bins (vertical dimension).
|
||||||
|
resize_factor : float, optional
|
||||||
|
Factor to resize the time dimension. If 1.0 or None, time dimension
|
||||||
|
is unchanged. If 0.5, time dimension is halved, etc.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
xr.DataArray
|
||||||
|
Resized spectrogram. Coordinates are typically adjusted by the
|
||||||
|
underlying resize operation if implemented in `ops.resize`.
|
||||||
|
The dtype is currently hardcoded to float32 by ops.resize call.
|
||||||
|
"""
|
||||||
resize_factor = resize_factor or 1
|
resize_factor = resize_factor or 1
|
||||||
current_width = spec.sizes["time"]
|
current_width = spec.sizes["time"]
|
||||||
return ops.resize(
|
return ops.resize(
|
||||||
@ -272,61 +746,41 @@ def resize_spectrogram(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def adjust_spectrogram_width(
|
|
||||||
spec: xr.DataArray,
|
|
||||||
divide_factor: int = 32,
|
|
||||||
time_period: float = 0.001,
|
|
||||||
) -> xr.DataArray:
|
|
||||||
time_width = spec.sizes["time"]
|
|
||||||
|
|
||||||
if time_width % divide_factor == 0:
|
|
||||||
return spec
|
|
||||||
|
|
||||||
target_size = int(
|
|
||||||
np.ceil(spec.sizes["time"] / divide_factor) * divide_factor
|
|
||||||
)
|
|
||||||
extra_duration = (target_size - time_width) * time_period
|
|
||||||
_, stop = arrays.get_dim_range(spec, dim="time")
|
|
||||||
resized = ops.extend_dim(
|
|
||||||
spec,
|
|
||||||
dim="time",
|
|
||||||
stop=stop + extra_duration,
|
|
||||||
)
|
|
||||||
return resized
|
|
||||||
|
|
||||||
|
|
||||||
def duration_to_spec_width(
|
|
||||||
duration: float,
|
|
||||||
samplerate: int,
|
|
||||||
window_duration: float,
|
|
||||||
window_overlap: float,
|
|
||||||
) -> int:
|
|
||||||
samples = int(duration * samplerate)
|
|
||||||
fft_len = int(window_duration * samplerate)
|
|
||||||
fft_overlap = int(window_overlap * fft_len)
|
|
||||||
hop_len = fft_len - fft_overlap
|
|
||||||
width = (samples - fft_len + hop_len) / hop_len
|
|
||||||
return int(np.floor(width))
|
|
||||||
|
|
||||||
|
|
||||||
def spec_width_to_samples(
|
|
||||||
width: int,
|
|
||||||
samplerate: int,
|
|
||||||
window_duration: float,
|
|
||||||
window_overlap: float,
|
|
||||||
) -> int:
|
|
||||||
fft_len = int(window_duration * samplerate)
|
|
||||||
fft_overlap = int(window_overlap * fft_len)
|
|
||||||
hop_len = fft_len - fft_overlap
|
|
||||||
return width * hop_len + fft_len - hop_len
|
|
||||||
|
|
||||||
|
|
||||||
def get_spectrogram_resolution(
|
def get_spectrogram_resolution(
|
||||||
config: SpectrogramConfig,
|
config: SpectrogramConfig,
|
||||||
) -> tuple[float, float]:
|
) -> tuple[float, float]:
|
||||||
|
"""Calculate the approximate resolution of the final spectrogram.
|
||||||
|
|
||||||
|
Computes the width of each frequency bin (Hz/bin) and the duration
|
||||||
|
of each time bin (seconds/bin) based on the configuration parameters.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
config : SpectrogramConfig
|
||||||
|
The spectrogram configuration object.
|
||||||
|
samplerate : int, optional
|
||||||
|
The sample rate of the audio *before* STFT. Required if needed to
|
||||||
|
calculate hop duration accurately from STFT config, but the current
|
||||||
|
implementation calculates hop_duration directly from STFT config times.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Tuple[float, float]
|
||||||
|
A tuple containing:
|
||||||
|
- frequency_resolution (float): Approximate Hz per frequency bin.
|
||||||
|
- time_resolution (float): Approximate seconds per time bin.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If required configuration fields (like `config.size`) are missing
|
||||||
|
or invalid.
|
||||||
|
"""
|
||||||
max_freq = config.frequencies.max_freq
|
max_freq = config.frequencies.max_freq
|
||||||
min_freq = config.frequencies.min_freq
|
min_freq = config.frequencies.min_freq
|
||||||
assert config.size is not None
|
|
||||||
|
if config.size is None:
|
||||||
|
raise ValueError("Spectrogram size configuration is required.")
|
||||||
|
|
||||||
spec_height = config.size.height
|
spec_height = config.size.height
|
||||||
resize_factor = config.size.resize_factor or 1
|
resize_factor = config.size.resize_factor or 1
|
||||||
|
Loading…
Reference in New Issue
Block a user