Copy librosa PCEN implementation

This commit is contained in:
mbsantiago 2025-08-17 18:02:40 +01:00
parent c1945ebdb7
commit 4aea3fb2b0
2 changed files with 37 additions and 10 deletions

View File

@@ -109,7 +109,7 @@ class AudioConfig(BaseConfig):
resample: Optional[ResampleConfig] = Field(default_factory=ResampleConfig) resample: Optional[ResampleConfig] = Field(default_factory=ResampleConfig)
scale: bool = SCALE_RAW_AUDIO scale: bool = SCALE_RAW_AUDIO
center: bool = True center: bool = False
duration: Optional[float] = DEFAULT_DURATION duration: Optional[float] = DEFAULT_DURATION

View File

@@ -25,6 +25,7 @@ import numpy as np
import xarray as xr import xarray as xr
from numpy.typing import DTypeLike from numpy.typing import DTypeLike
from pydantic import Field from pydantic import Field
from scipy import signal
from soundevent import arrays, audio from soundevent import arrays, audio
from soundevent.arrays import operations as ops from soundevent.arrays import operations as ops
@@ -135,7 +136,7 @@ class PcenConfig(BaseConfig):
Exponent (r). Controls the compression characteristic. Exponent (r). Controls the compression characteristic.
""" """
time_constant: float = 0.4 time_constant: float = 0.01
gain: float = 0.98 gain: float = 0.98
bias: float = 2 bias: float = 2
power: float = 0.5 power: float = 0.5
@@ -513,6 +514,7 @@ def apply_pcen(
time_constant: float = 0.4, time_constant: float = 0.4,
gain: float = 0.98, gain: float = 0.98,
bias: float = 2, bias: float = 2,
eps: float = 1e-6,
power: float = 0.5, power: float = 0.5,
) -> xr.DataArray: ) -> xr.DataArray:
"""Apply Per-Channel Energy Normalization (PCEN) to a spectrogram. """Apply Per-Channel Energy Normalization (PCEN) to a spectrogram.
@@ -538,20 +540,45 @@ def apply_pcen(
xr.DataArray xr.DataArray
PCEN-scaled spectrogram. PCEN-scaled spectrogram.
""" """
spec = spec * (2**31)
samplerate = 1 / spec.time.attrs["step"] samplerate = 1 / spec.time.attrs["step"]
hop_size = spec.attrs["hop_size"] hop_size = spec.attrs["hop_size"]
hop_length = int(hop_size * samplerate) hop_length = int(hop_size * samplerate)
t_frames = time_constant * samplerate / (float(hop_length) * 10)
t_frames = time_constant * samplerate / hop_length
smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2) smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
return audio.pcen(
spec * (2**31), axis = spec.get_axis_num("time")
smooth=smoothing_constant,
gain=gain, shape = tuple([1] * spec.ndim)
bias=bias, zi = np.empty(shape)
power=power, zi[:] = signal.lfilter_zi(
).astype(spec.dtype) [smoothing_constant],
[1, smoothing_constant - 1],
)[:]
# Smooth the input array along the given axis
smoothed, _ = signal.lfilter(
[smoothing_constant],
[1, smoothing_constant - 1],
spec.data,
zi=zi,
axis=axis, # type: ignore
)
smooth = np.exp(-gain * (np.log(eps) + np.log1p(smoothed / eps)))
data = (bias**power) * np.expm1(
power * np.log1p(spec.data * smooth / bias)
)
return xr.DataArray(
data,
dims=spec.dims,
coords=spec.coords,
attrs=spec.attrs,
)
def scale_log( def scale_log(