Make sure labels are working in the notebook

2026-01-11 09:29:33 +01:00 · 2024-11-16 18:23:43 +00:00 · 2024-11-16 18:23:43 +00:00 · ee884da8b0
commit ee884da8b0
parent 6a9e33c729
14 changed files with 428 additions and 209 deletions
--- a/batdetect2/data/compat.py
+++ b/batdetect2/data/compat.py
@ -195,18 +195,29 @@ def annotation_to_sound_event(
        uuid=uuid.uuid5(NAMESPACE, f"{sound_event.uuid}_annotation"),
        sound_event=sound_event,
        tags=[
-            data.Tag(key=label_key, value=annotation.label),
-            data.Tag(key=event_key, value=annotation.event),
-            data.Tag(key=individual_key, value=str(annotation.individual)),
+            data.Tag(
+                term=data.term_from_key(label_key),
+                value=annotation.label,
+            ),
+            data.Tag(
+                term=data.term_from_key(event_key),
+                value=annotation.event,
+            ),
+            data.Tag(
+                term=data.term_from_key(individual_key),
+                value=str(annotation.individual),
+            ),
        ],
    )


 def file_annotation_to_clip(
    file_annotation: FileAnnotation,
-    audio_dir: PathLike = Path.cwd(),
+    audio_dir: Optional[PathLike] = None,
 ) -> data.Clip:
    """Convert file annotation to recording."""
+    audio_dir = audio_dir or Path.cwd()
+
    full_path = Path(audio_dir) / file_annotation.id

    if not full_path.exists():
@ -241,7 +252,11 @@ def file_annotation_to_clip_annotation(
        uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip_annotation"),
        clip=clip,
        notes=notes,
-        tags=[data.Tag(key=label_key, value=file_annotation.label)],
+        tags=[
+            data.Tag(
+                term=data.term_from_key(label_key), value=file_annotation.label
+            )
+        ],
        sound_events=[
            annotation_to_sound_event(
                annotation,
@ -286,9 +301,11 @@ def list_file_annotations(path: PathLike) -> List[Path]:
 def load_annotation_project(
    path: PathLike,
    name: Optional[str] = None,
-    audio_dir: PathLike = Path.cwd(),
+    audio_dir: Optional[PathLike] = None,
 ) -> data.AnnotationProject:
    """Convert annotations to annotation project."""
+    audio_dir = audio_dir or Path.cwd()
+
    paths = list_file_annotations(path)

    if name is None:
--- a/batdetect2/data/labels.py
+++ b/batdetect2/data/labels.py
@ -3,7 +3,7 @@ from typing import Tuple
 import numpy as np
 import xarray as xr
 from scipy.ndimage import gaussian_filter
-from soundevent import data, geometry, arrays
+from soundevent import arrays, data, geometry
 from soundevent.geometry.operations import Positions
 from soundevent.types import ClassMapper

@ -22,6 +22,8 @@ def generate_heatmaps(
    class_mapper: ClassMapper,
    target_sigma: float = TARGET_SIGMA,
    position: Positions = "bottom-left",
+    time_scale: float = 1.0,
+    frequency_scale: float = 1.0,
    dtype=np.float32,
 ) -> Tuple[xr.DataArray, xr.DataArray, xr.DataArray]:
    shape = dict(zip(spec.dims, spec.shape))
@ -31,13 +33,6 @@ def generate_heatmaps(
            "Spectrogram must have time and frequency dimensions."
        )

-    time_duration = arrays.get_dim_width(spec, dim="time")
-    freq_bandwidth = arrays.get_dim_width(spec, dim="frequency")
-
-    # Compute the size factors
-    time_scale = 1 / time_duration
-    frequency_scale = 1 / freq_bandwidth
-
    # Initialize heatmaps
    detection_heatmap = xr.zeros_like(spec, dtype=dtype)
    class_heatmap = xr.DataArray(
@ -92,7 +87,7 @@ def generate_heatmaps(
        )

        # Get the class name of the sound event
-        class_name = class_mapper.transform(sound_event_annotation)
+        class_name = class_mapper.encode(sound_event_annotation)

        if class_name is None:
            # If the label is None skip the sound event
--- a/batdetect2/data/preprocessing.py
+++ b/batdetect2/data/preprocessing.py
@ -1,7 +1,7 @@
 """Module containing functions for preprocessing audio clips."""

-from typing import Optional, Union
 from pathlib import Path
+from typing import Literal, Optional, Union

 import librosa
 import librosa.core.spectrum
@ -10,7 +10,7 @@ import xarray as xr
 from numpy.typing import DTypeLike
 from pydantic import BaseModel, Field
 from scipy.signal import resample_poly
-from soundevent import audio, data, arrays
+from soundevent import arrays, audio, data
 from soundevent.arrays import operations as ops

 __all__ = [
@ -34,32 +34,56 @@ DENOISE_SPEC_AVG = True
 MAX_SCALE_SPEC = False


+class ResampleConfig(BaseModel):
+    samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0)
+    mode: str = "poly"
+
+
+class AudioConfig(BaseModel):
+    resample: Optional[ResampleConfig] = Field(default_factory=ResampleConfig)
+    scale: bool = Field(default=SCALE_RAW_AUDIO)
+    center: bool = True
+    duration: Optional[float] = DEFAULT_DURATION
+
+
+class FFTConfig(BaseModel):
+    window_duration: float = Field(default=FFT_WIN_LENGTH_S, gt=0)
+    window_overlap: float = Field(default=FFT_OVERLAP, ge=0, lt=1)
+    window_fn: str = "hann"
+
+
+class FrequencyConfig(BaseModel):
+    max_freq: int = Field(default=MAX_FREQ_HZ, gt=0)
+    min_freq: int = Field(default=MIN_FREQ_HZ, gt=0)
+
+
+class PcenConfig(BaseModel):
+    time_constant: float = 0.4
+    hop_length: int = 512
+    gain: float = 0.98
+    bias: float = 2
+    power: float = 0.5
+
+
+class SpecSizeConfig(BaseModel):
+    height: int = SPEC_HEIGHT
+    time_period: float = SPEC_TIME_PERIOD
+
+
+class SpectrogramConfig(BaseModel):
+    fft: FFTConfig = Field(default_factory=FFTConfig)
+    frequencies: FrequencyConfig = Field(default_factory=FrequencyConfig)
+    scale: Union[Literal["log"], None, PcenConfig] = "log"
+    denoise: bool = True
+    resize: Optional[SpecSizeConfig] = Field(default_factory=SpecSizeConfig)
+    max_scale: bool = MAX_SCALE_SPEC
+
+
 class PreprocessingConfig(BaseModel):
    """Configuration for preprocessing data."""

-    target_samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0)
-
-    scale_audio: bool = Field(default=SCALE_RAW_AUDIO)
-
-    fft_win_length: float = Field(default=FFT_WIN_LENGTH_S, gt=0)
-
-    fft_overlap: float = Field(default=FFT_OVERLAP, ge=0, lt=1)
-
-    max_freq: int = Field(default=MAX_FREQ_HZ, gt=0)
-
-    min_freq: int = Field(default=MIN_FREQ_HZ, gt=0)
-
-    spec_scale: str = Field(default=SPEC_SCALE)
-
-    denoise_spec_avg: bool = DENOISE_SPEC_AVG
-
-    max_scale_spec: bool = MAX_SCALE_SPEC
-
-    duration: Optional[float] = DEFAULT_DURATION
-
-    spec_height: int = SPEC_HEIGHT
-
-    spec_time_period: float = SPEC_TIME_PERIOD
+    audio: AudioConfig = Field(default_factory=AudioConfig)
+    spectrogram: SpectrogramConfig = Field(default_factory=SpectrogramConfig)

    @classmethod
    def from_file(
@ -104,7 +128,7 @@ class PreprocessingConfig(BaseModel):

 def preprocess_audio_clip(
    clip: data.Clip,
-    config: PreprocessingConfig = PreprocessingConfig(),
+    config: Optional[PreprocessingConfig] = None,
 ) -> xr.DataArray:
    """Preprocesses audio clip to generate spectrogram.

@ -121,81 +145,117 @@ def preprocess_audio_clip(
        Preprocessed spectrogram.

    """
-    wav = load_clip_audio(
-        clip,
-        target_sampling_rate=config.target_samplerate,
-        scale=config.scale_audio,
-    )
-
-    spec = compute_spectrogram(
-        wav,
-        fft_win_length=config.fft_win_length,
-        fft_overlap=config.fft_overlap,
-        max_freq=config.max_freq,
-        min_freq=config.min_freq,
-        spec_scale=config.spec_scale,
-        denoise_spec_avg=config.denoise_spec_avg,
-        max_scale_spec=config.max_scale_spec,
-    )
-
-    if config.duration is not None:
-        spec = adjust_spec_duration(clip, spec, config.duration)
-
-    duration = arrays.get_dim_width(spec, dim="time")
-    return ops.resize(
-        spec,
-        time=int(np.ceil(duration / config.spec_time_period)),
-        frequency=config.spec_height,
-        dtype=np.float32,
-    )
-
-
-def adjust_spec_duration(
-    clip: data.Clip,
-    spec: xr.DataArray,
-    duration: float,
-) -> xr.DataArray:
-    current_duration = clip.end_time - clip.start_time
-
-    if current_duration == duration:
-        return spec
-
-    if current_duration > duration:
-        return arrays.crop_dim(
-            spec,
-            dim="time",
-            start=clip.start_time,
-            stop=clip.start_time + duration,
-        )
-
-    return arrays.extend_dim(
-        spec,
-        dim="time",
-        start=clip.start_time,
-        stop=clip.start_time + duration,
-    )
+    config = config or PreprocessingConfig()
+    wav = load_clip_audio(clip, config=config.audio)
+    spec = compute_spectrogram(wav, config=config.spectrogram)
+    return spec


 def load_clip_audio(
    clip: data.Clip,
-    target_sampling_rate: int = TARGET_SAMPLERATE_HZ,
-    scale: bool = SCALE_RAW_AUDIO,
+    config: Optional[AudioConfig] = None,
    dtype: DTypeLike = np.float32,
 ) -> xr.DataArray:
+    config = config or AudioConfig()
+
    wav = audio.load_clip(clip).sel(channel=0).astype(dtype)

-    wav = resample_audio(wav, target_sampling_rate, dtype=dtype)
+    if config.duration is not None:
+        wav = adjust_audio_duration(wav, duration=config.duration)

-    if scale:
+    if config.resample:
+        wav = resample_audio(
+            wav,
+            samplerate=config.resample.samplerate,
+            dtype=dtype,
+        )
+
+    if config.center:
        wav = ops.center(wav)
+
+    if config.scale:
        wav = ops.scale(wav, 1 / (10e-6 + np.max(np.abs(wav))))

    return wav.astype(dtype)


+def compute_spectrogram(
+    wav: xr.DataArray,
+    config: Optional[SpectrogramConfig] = None,
+    dtype: DTypeLike = np.float32,
+) -> xr.DataArray:
+    config = config or SpectrogramConfig()
+
+    spec = stft(
+        wav,
+        window_duration=config.fft.window_duration,
+        window_overlap=config.fft.window_overlap,
+        window_fn=config.fft.window_fn,
+        dtype=dtype,
+    )
+
+    spec = crop_spectrogram_frequencies(
+        spec,
+        min_freq=config.frequencies.min_freq,
+        max_freq=config.frequencies.max_freq,
+    )
+
+    spec = scale_spectrogram(spec, scale=config.scale)
+
+    if config.denoise:
+        spec = denoise_spectrogram(spec)
+
+    if config.resize:
+        spec = resize_spectrogram(spec, config=config.resize)
+
+    if config.max_scale:
+        spec = ops.scale(spec, 1 / (10e-6 + np.max(spec)))
+
+    return spec.astype(dtype)
+
+
+def crop_spectrogram_frequencies(
+    spec: xr.DataArray,
+    min_freq: int = MIN_FREQ_HZ,
+    max_freq: int = MAX_FREQ_HZ,
+) -> xr.DataArray:
+    return arrays.crop_dim(
+        spec,
+        dim="frequency",
+        start=min_freq,
+        stop=max_freq,
+    ).astype(spec.dtype)
+
+
+def adjust_audio_duration(
+    wave: xr.DataArray,
+    duration: float,
+) -> xr.DataArray:
+    start_time, end_time = arrays.get_dim_range(wave, dim="time")
+    current_duration = end_time - start_time
+
+    if current_duration == duration:
+        return wave
+
+    if current_duration > duration:
+        return arrays.crop_dim(
+            wave,
+            dim="time",
+            start=start_time,
+            stop=start_time + duration,
+        )
+
+    return arrays.extend_dim(
+        wave,
+        dim="time",
+        start=start_time,
+        stop=start_time + duration,
+    )
+
+
 def resample_audio(
    wav: xr.DataArray,
-    target_samplerate: int = TARGET_SAMPLERATE_HZ,
+    samplerate: int = TARGET_SAMPLERATE_HZ,
    dtype: DTypeLike = np.float32,
 ) -> xr.DataArray:
    if "time" not in wav.dims:
@ -207,13 +267,13 @@ def resample_audio(
    step = arrays.get_dim_step(wav, dim="time")
    original_samplerate = int(1 / step)

-    if original_samplerate == target_samplerate:
+    if original_samplerate == samplerate:
        return wav.astype(dtype)

-    gcd = np.gcd(original_samplerate, target_samplerate)
+    gcd = np.gcd(original_samplerate, samplerate)
    resampled = resample_poly(
        wav.values,
-        target_samplerate // gcd,
+        samplerate // gcd,
        original_samplerate // gcd,
        axis=time_axis,
    )
@ -225,7 +285,6 @@ def resample_audio(
        endpoint=False,
        dtype=dtype,
    )
-
    return xr.DataArray(
        data=resampled.astype(dtype),
        dims=wav.dims,
@ -233,70 +292,35 @@ def resample_audio(
            **wav.coords,
            "time": arrays.create_time_dim_from_array(
                resampled_times,
-                samplerate=target_samplerate,
+                samplerate=samplerate,
            ),
        },
        attrs=wav.attrs,
    )


-def compute_spectrogram(
-    wav: xr.DataArray,
-    fft_win_length: float = FFT_WIN_LENGTH_S,
-    fft_overlap: float = FFT_OVERLAP,
-    max_freq: int = MAX_FREQ_HZ,
-    min_freq: int = MIN_FREQ_HZ,
-    spec_scale: str = SPEC_SCALE,
-    denoise_spec_avg: bool = True,
-    max_scale_spec: bool = False,
-    dtype: DTypeLike = np.float32,
-) -> xr.DataArray:
-    spec = gen_mag_spectrogram(
-        wav,
-        window_len=fft_win_length,
-        overlap_perc=fft_overlap,
-        dtype=dtype,
-    )
-
-    spec = arrays.crop_dim(
-        spec,
-        dim="frequency",
-        start=min_freq,
-        stop=max_freq,
-    ).astype(dtype)
-
-    spec = scale_spectrogram(spec, scale=spec_scale)
-
-    if denoise_spec_avg:
-        spec = denoise_spectrogram(spec)
-
-    if max_scale_spec:
-        spec = ops.scale(spec, 1 / (10e-6 + np.max(spec)))
-
-    return spec.astype(dtype)
-
-
-def gen_mag_spectrogram(
+def stft(
    wave: xr.DataArray,
-    window_len: float,
-    overlap_perc: float,
+    window_duration: float,
+    window_overlap: float,
+    window_fn: str = "hann",
    dtype: DTypeLike = np.float32,
 ) -> xr.DataArray:
    start_time, end_time = arrays.get_dim_range(wave, dim="time")
    step = arrays.get_dim_step(wave, dim="time")
    sampling_rate = 1 / step

-    hop_len = window_len * (1 - overlap_perc)
-    nfft = int(window_len * sampling_rate)
-    noverlap = int(overlap_perc * nfft)
+    hop_len = window_duration * (1 - window_overlap)
+    nfft = int(window_duration * sampling_rate)
+    noverlap = int(window_overlap * nfft)

-    # compute spec
    spec, _ = librosa.core.spectrum._spectrogram(
-        y=wave.data,
+        y=wave.data.astype(dtype),
        power=1,
        n_fft=nfft,
        hop_length=nfft - noverlap,
        center=False,
+        window=window_fn,
    )

    return xr.DataArray(
@ -316,7 +340,7 @@ def gen_mag_spectrogram(
            "time": arrays.create_time_dim_from_array(
                np.linspace(
                    start_time,
-                    end_time - (window_len - hop_len),
+                    end_time - (window_duration - hop_len),
                    spec.shape[1],
                    endpoint=False,
                    dtype=dtype,
@ -333,9 +357,7 @@ def gen_mag_spectrogram(
    )


-def denoise_spectrogram(
-    spec: xr.DataArray,
-) -> xr.DataArray:
+def denoise_spectrogram(spec: xr.DataArray) -> xr.DataArray:
    return xr.DataArray(
        data=(spec - spec.mean("time")).clip(0),
        dims=spec.dims,
@ -346,35 +368,53 @@ def denoise_spectrogram(

 def scale_spectrogram(
    spec: xr.DataArray,
-    scale: str = SPEC_SCALE,
+    scale: Union[Literal["log"], None, PcenConfig],
    dtype: DTypeLike = np.float32,
 ) -> xr.DataArray:
-    samplerate = spec.attrs["original_samplerate"]
-
-    if scale == "pcen":
-        smoothing_constant = get_pcen_smoothing_constant(samplerate / 10)
-        return audio.pcen(
-            spec * (2**31),
-            smooth=smoothing_constant,
-        ).astype(dtype)
-
    if scale == "log":
-        return log_scale(spec, dtype=dtype)
+        return scale_log(spec, dtype=dtype)
+
+    if isinstance(scale, PcenConfig):
+        return scale_pcen(
+            spec,
+            time_constant=scale.time_constant,
+            hop_length=scale.hop_length,
+            gain=scale.gain,
+            power=scale.power,
+            bias=scale.bias,
+        )

    return spec


-def log_scale(
+def scale_pcen(
+    spec: xr.DataArray,
+    time_constant: float = 0.4,
+    hop_length: int = 512,
+    gain: float = 0.98,
+    bias: float = 2,
+    power: float = 0.5,
+) -> xr.DataArray:
+    samplerate = spec.attrs["original_samplerate"]
+    # NOTE: Not sure why the 10 is there
+    t_frames = time_constant * samplerate / (float(hop_length) * 10)
+    smoothing_constant = (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
+    return audio.pcen(
+        spec * (2**31),
+        smooth=smoothing_constant,
+        gain=gain,
+        bias=bias,
+        power=power,
+    ).astype(spec.dtype)
+
+
+def scale_log(
    spec: xr.DataArray,
    dtype: DTypeLike = np.float32,
 ) -> xr.DataArray:
    samplerate = spec.attrs["original_samplerate"]
    nfft = spec.attrs["nfft"]
-    log_scaling = (
-        2.0
-        * (1.0 / samplerate)
-        * (1.0 / (np.abs(np.hanning(nfft)) ** 2).sum())
-    )
+    log_scaling = 2 / (samplerate * (np.abs(np.hanning(nfft)) ** 2).sum())
    return xr.DataArray(
        data=np.log1p(log_scaling * spec).astype(dtype),
        dims=spec.dims,
@ -383,10 +423,14 @@ def log_scale(
    )


-def get_pcen_smoothing_constant(
-    sr: int,
-    time_constant: float = 0.4,
-    hop_length: int = 512,
-) -> float:
-    t_frames = time_constant * sr / float(hop_length)
-    return (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2)
+def resize_spectrogram(
+    spec: xr.DataArray,
+    config: SpecSizeConfig,
+) -> xr.DataArray:
+    duration = arrays.get_dim_width(spec, dim="time")
+    return ops.resize(
+        spec,
+        time=int(np.ceil(duration / config.time_period)),
+        frequency=config.height,
+        dtype=np.float32,
+    )
--- a/batdetect2/models/checkpoints/readme.md
+++ b/batdetect2/models/checkpoints/readme.md
--- a/batdetect2/models/detectors.py
+++ b/batdetect2/models/detectors.py
@ -1,4 +1,4 @@
-from typing import Type
+from typing import Optional, Type

 import pytorch_lightning as L
 import torch
@ -6,11 +6,11 @@ import xarray as xr
 from soundevent import data
 from torch import nn, optim

-from batdetect2.data.preprocessing import (
-    preprocess_audio_clip,
-    PreprocessingConfig,
-)
 from batdetect2.data.labels import ClassMapper
+from batdetect2.data.preprocessing import (
+    PreprocessingConfig,
+    preprocess_audio_clip,
+)
 from batdetect2.models.feature_extractors import Net2DFast
 from batdetect2.models.post_process import (
    PostprocessConfig,
@ -29,11 +29,14 @@ class DetectorModel(L.LightningModule):
        learning_rate: float = 1e-3,
        input_height: int = 128,
        num_features: int = 32,
-        preprocessing_config: PreprocessingConfig = PreprocessingConfig(),
-        postprocessing_config: PostprocessConfig = PostprocessConfig(),
+        preprocessing_config: Optional[PreprocessingConfig] = None,
+        postprocessing_config: Optional[PostprocessConfig] = None,
    ):
        super().__init__()

+        preprocessing_config = preprocessing_config or PreprocessingConfig()
+        postprocessing_config = postprocessing_config or PostprocessConfig()
+
        self.save_hyperparameters()

        self.preprocessing_config = preprocessing_config
--- a/batdetect2/models/post_process.py
+++ b/batdetect2/models/post_process.py
@ -1,10 +1,10 @@
 """Module for postprocessing model outputs."""

 from typing import Callable, List, Tuple, Union
-from pydantic import BaseModel, Field

 import numpy as np
 import torch
+from pydantic import BaseModel, Field
 from soundevent import data
 from torch import nn

@ -207,7 +207,7 @@ def compute_sound_events_from_outputs(
            ),
            features=[
                data.Feature(
-                    name=f"batdetect2_{i}",
+                    term=data.term_from_key(f"batdetect2_{i}"),
                    value=value.item(),
                )
                for i, value in enumerate(feature)
--- a/batdetect2/train/preprocess.py
+++ b/batdetect2/train/preprocess.py
@ -3,18 +3,18 @@
 import os
 import warnings
 from functools import partial
+from multiprocessing import Pool
 from pathlib import Path
 from typing import Callable, Optional, Sequence, Union
-from tqdm.auto import tqdm
-from multiprocessing import Pool

 import xarray as xr
 from soundevent import data
+from tqdm.auto import tqdm

 from batdetect2.data.labels import TARGET_SIGMA, ClassMapper, generate_heatmaps
 from batdetect2.data.preprocessing import (
-    preprocess_audio_clip,
    PreprocessingConfig,
+    preprocess_audio_clip,
 )

 PathLike = Union[Path, str, os.PathLike]
@ -25,14 +25,15 @@ __all__ = [
 ]


-
 def generate_train_example(
    clip_annotation: data.ClipAnnotation,
    class_mapper: ClassMapper,
-    preprocessing_config: PreprocessingConfig = PreprocessingConfig(),
+    preprocessing_config: Optional[PreprocessingConfig] = None,
    target_sigma: float = TARGET_SIGMA,
 ) -> xr.Dataset:
    """Generate a training example."""
+    preprocessing_config = preprocessing_config or PreprocessingConfig()
+
    spectrogram = preprocess_audio_clip(
        clip_annotation.clip,
        config=preprocessing_config,
@ -83,14 +84,18 @@ def load_config(path: PathLike, **kwargs) -> PreprocessingConfig:
    path = Path(path)

    if not path.is_file():
-        warnings.warn(f"Config file not found: {path}. Using default config.")
+        warnings.warn(
+            f"Config file not found: {path}. Using default config.",
+            stacklevel=1,
+        )
        return PreprocessingConfig(**kwargs)

    try:
        return PreprocessingConfig.model_validate_json(path.read_text())
    except ValueError as e:
        warnings.warn(
-            f"Failed to load config file: {e}. Using default config."
+            f"Failed to load config file: {e}. Using default config.",
+            stacklevel=1,
        )
        return PreprocessingConfig(**kwargs)

--- a/batdetect2/utils/audio_utils.py
+++ b/batdetect2/utils/audio_utils.py
@ -90,7 +90,7 @@ def generate_spectrogram(
                    np.abs(
                        np.hanning(
                            int(params["fft_win_length"] * sampling_rate)
-                        )
+                        ).astype(np.float32)
                    )
                    ** 2
                ).sum()
--- a/batdetect2/utils/detector_utils.py
+++ b/batdetect2/utils/detector_utils.py
@ -409,7 +409,7 @@ def save_results_to_file(results, op_path: str) -> None:

 def compute_spectrogram(
    audio: np.ndarray,
-    sampling_rate: float,
+    sampling_rate: int,
    params: SpectrogramParameters,
    device: torch.device,
 ) -> Tuple[float, torch.Tensor]:
@ -627,7 +627,7 @@ def process_spectrogram(

 def _process_audio_array(
    audio: np.ndarray,
-    sampling_rate: float,
+    sampling_rate: int,
    model: DetectionModel,
    config: ProcessingConfiguration,
    device: torch.device,
--- a/pyproject.toml
+++ b/pyproject.toml
@ -17,7 +17,7 @@ dependencies = [
  "torch>=1.13.1,<2.5.0",
  "torchaudio>=1.13.1,<2.5.0",
  "torchvision>=0.14.0",
-  "soundevent[audio,geometry,plot]>=2.0.1",
+  "soundevent[audio,geometry,plot]>=2.2",
  "click>=8.1.7",
  "netcdf4>=1.6.5",
  "tqdm>=4.66.2",
--- a/tests/test_audio_utils.py
+++ b/tests/test_audio_utils.py
@ -94,7 +94,7 @@ def test_computed_spectrograms_are_actually_divisible_by_the_spec_divide_factor(
    params = parameters.DEFAULT_SPECTROGRAM_PARAMETERS
    length = int(duration * samplerate)
    audio = np.random.rand(length)
-    _, spectrogram, _ = detector_utils.compute_spectrogram(
+    _, spectrogram = detector_utils.compute_spectrogram(
        audio,
        samplerate,
        params,
--- a/tests/test_data/test_labels.py
+++ b/tests/test_data/test_labels.py
@ -0,0 +1,120 @@
+from pathlib import Path
+
+import numpy as np
+import xarray as xr
+from soundevent import data
+from soundevent.types import ClassMapper
+
+from batdetect2.data.labels import generate_heatmaps
+
+recording = data.Recording(
+    samplerate=256_000,
+    duration=1,
+    channels=1,
+    time_expansion=1,
+    hash="asdf98sdf",
+    path=Path("/path/to/audio.wav"),
+)
+
+clip = data.Clip(
+    recording=recording,
+    start_time=0,
+    end_time=1,
+)
+
+
+class Mapper(ClassMapper):
+    class_labels = ["bat", "cat"]
+
+    def encode(self, sound_event_annotation: data.SoundEventAnnotation) -> str:
+        return "bat"
+
+    def decode(self, label: str) -> list:
+        return [data.Tag(term=data.term_from_key("species"), value="bat")]
+
+
+def test_generated_heatmaps_have_correct_dimensions():
+    spec = xr.DataArray(
+        data=np.random.rand(100, 100),
+        dims=["time", "frequency"],
+        coords={
+            "time": np.linspace(0, 100, 100, endpoint=False),
+            "frequency": np.linspace(0, 100, 100, endpoint=False),
+        },
+    )
+
+    clip_annotation = data.ClipAnnotation(
+        clip=clip,
+        sound_events=[
+            data.SoundEventAnnotation(
+                sound_event=data.SoundEvent(
+                    recording=recording,
+                    geometry=data.BoundingBox(
+                        coordinates=[10, 10, 20, 20],
+                    ),
+                ),
+            )
+        ],
+    )
+
+    class_mapper = Mapper()
+
+    detection_heatmap, class_heatmap, size_heatmap = generate_heatmaps(
+        clip_annotation,
+        spec,
+        class_mapper,
+    )
+
+    assert isinstance(detection_heatmap, xr.DataArray)
+    assert detection_heatmap.shape == (100, 100)
+    assert detection_heatmap.dims == ("time", "frequency")
+
+    assert isinstance(class_heatmap, xr.DataArray)
+    assert class_heatmap.shape == (2, 100, 100)
+    assert class_heatmap.dims == ("category", "time", "frequency")
+    assert class_heatmap.coords["category"].values.tolist() == ["bat", "cat"]
+
+    assert isinstance(size_heatmap, xr.DataArray)
+    assert size_heatmap.shape == (2, 100, 100)
+    assert size_heatmap.dims == ("dimension", "time", "frequency")
+    assert size_heatmap.coords["dimension"].values.tolist() == [
+        "width",
+        "height",
+    ]
+
+
+def test_generated_heatmap_are_non_zero_at_correct_positions():
+    spec = xr.DataArray(
+        data=np.random.rand(100, 100),
+        dims=["time", "frequency"],
+        coords={
+            "time": np.linspace(0, 100, 100, endpoint=False),
+            "frequency": np.linspace(0, 100, 100, endpoint=False),
+        },
+    )
+
+    clip_annotation = data.ClipAnnotation(
+        clip=clip,
+        sound_events=[
+            data.SoundEventAnnotation(
+                sound_event=data.SoundEvent(
+                    recording=recording,
+                    geometry=data.BoundingBox(
+                        coordinates=[10, 10, 20, 20],
+                    ),
+                ),
+            )
+        ],
+    )
+
+    class_mapper = Mapper()
+    detection_heatmap, class_heatmap, size_heatmap = generate_heatmaps(
+        clip_annotation,
+        spec,
+        class_mapper,
+    )
+    assert size_heatmap.sel(time=10, frequency=10, dimension="width") == 10
+    assert size_heatmap.sel(time=10, frequency=10, dimension="height") == 10
+    assert class_heatmap.sel(time=10, frequency=10, category="bat") == 1.0
+    assert class_heatmap.sel(time=10, frequency=10, category="cat") == 0.0
+    assert detection_heatmap.sel(time=10, frequency=10) == 1.0
--- a/tests/test_migration/test_preprocessing.py
+++ b/tests/test_migration/test_preprocessing.py
@ -46,8 +46,14 @@ def test_audio_loading_hasnt_changed(
    )
    audio_new = preprocessing.load_clip_audio(
        clip,
-        target_sampling_rate=target_sampling_rate,
-        scale=scale,
+        config=preprocessing.AudioConfig(
+            resample=preprocessing.ResampleConfig(
+                samplerate=target_sampling_rate,
+            ),
+            center=scale,
+            scale=scale,
+            duration=None,
+        ),
        dtype=np.float32,
    )

@ -73,18 +79,46 @@ def test_spectrogram_generation_hasnt_changed(
    min_freq = 10_000
    max_freq = 120_000
    fft_overlap = 0.75
+
+    scale = None
+    if spec_scale == "log":
+        scale = "log"
+    elif spec_scale == "pcen":
+        scale = preprocessing.PcenConfig()
+
+    config = preprocessing.SpectrogramConfig(
+        fft=preprocessing.FFTConfig(
+            window_overlap=fft_overlap,
+            window_duration=fft_win_length,
+        ),
+        frequencies=preprocessing.FrequencyConfig(
+            min_freq=min_freq,
+            max_freq=max_freq,
+        ),
+        scale=scale,
+        denoise=denoise_spec_avg,
+        resize=None,
+        max_scale=max_scale_spec,
+    )
+
    recording = data.Recording.from_file(
        audio_file,
        time_expansion=time_expansion,
    )
+
    clip = data.Clip(
        recording=recording,
        start_time=0,
        end_time=recording.duration,
    )
+
    audio = preprocessing.load_clip_audio(
        clip,
-        target_sampling_rate=target_sampling_rate,
+        config=preprocessing.AudioConfig(
+            resample=preprocessing.ResampleConfig(
+                samplerate=target_sampling_rate,
+            )
+        ),
    )

    spec_original, _ = audio_utils.generate_spectrogram(
@ -103,18 +137,19 @@ def test_spectrogram_generation_hasnt_changed(

    new_spec = preprocessing.compute_spectrogram(
        audio,
-        fft_win_length=fft_win_length,
-        fft_overlap=fft_overlap,
-        max_freq=max_freq,
-        min_freq=min_freq,
-        spec_scale=spec_scale,
-        denoise_spec_avg=denoise_spec_avg,
-        max_scale_spec=max_scale_spec,
+        config=config,
        dtype=np.float32,
    )

    assert spec_original.shape == new_spec.shape
    assert spec_original.dtype == new_spec.dtype

+    # Check that the spectrogram content is the same within a tolerance of 1e-5
+    # for each element of the spectrogram at least 99.5% of the time.
+    # NOTE: The pcen function is not the same as the one in the original code
+    # thus the need for a tolerance, but the values are still very similar.
    # NOTE: The original spectrogram is flipped vertically
-    assert np.isclose(spec_original, np.flipud(new_spec.data)).all()
+    assert (
+        np.isclose(spec_original, np.flipud(new_spec.data), atol=1e-5).mean()
+        > 0.995
+    )
--- a/uv.lock
+++ b/uv.lock
@ -236,7 +236,7 @@ requires-dist = [
    { name = "pytorch-lightning", specifier = ">=2.2.2" },
    { name = "scikit-learn", specifier = ">=1.2.2" },
    { name = "scipy", specifier = ">=1.10.1" },
-    { name = "soundevent", extras = ["audio", "geometry", "plot"], specifier = ">=2.0.1" },
+    { name = "soundevent", extras = ["audio", "geometry", "plot"], specifier = ">=2.2" },
    { name = "tensorboard", specifier = ">=2.16.2" },
    { name = "torch", specifier = ">=1.13.1,<2.5.0" },
    { name = "torchaudio", specifier = ">=1.13.1,<2.5.0" },