Mirror of https://github.com/macaodha/batdetect2.git, synced 2025-06-29 14:41:58 +02:00
Improve the pad_audio function
This function was the culprit of the error. It has been broken up into helper functions to make the flow easier to follow.
parent 25e0a53ad1
commit a4b22d6590
@@ -6,6 +6,8 @@ import librosa.core.spectrum
 import numpy as np
 import torch
 
+from batdetect2.detector import parameters
+
 from . import wavfile
 
 __all__ = [
@@ -15,20 +17,44 @@ __all__ = [
 ]
 
 
-def time_to_x_coords(time_in_file, sampling_rate, fft_win_length, fft_overlap):
-    nfft = np.floor(fft_win_length * sampling_rate)  # int() uses floor
-    noverlap = np.floor(fft_overlap * nfft)
-    return (time_in_file * sampling_rate - noverlap) / (nfft - noverlap)
+def time_to_x_coords(
+    time_in_file: float,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+) -> float:
+    nfft = np.floor(window_duration * samplerate)  # int() uses floor
+    noverlap = np.floor(window_overlap * nfft)
+    return (time_in_file * samplerate - noverlap) / (nfft - noverlap)
 
 
-# NOTE this is also defined in post_process
-def x_coords_to_time(x_pos, sampling_rate, fft_win_length, fft_overlap):
-    nfft = np.floor(fft_win_length * sampling_rate)
-    noverlap = np.floor(fft_overlap * nfft)
-    return ((x_pos * (nfft - noverlap)) + noverlap) / sampling_rate
+def x_coords_to_time(
+    x_pos: int,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+) -> float:
+    n_fft = np.floor(window_duration * samplerate)
+    n_overlap = np.floor(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    return ((x_pos * n_step) + n_overlap) / samplerate
     # return (1.0 - fft_overlap) * fft_win_length * (x_pos + 0.5)  # 0.5 is for center of temporal window
 
 
+def x_coord_to_sample(
+    x_pos: int,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+) -> int:
+    n_fft = np.floor(window_duration * samplerate)
+    n_overlap = np.floor(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    x_pos = int(x_pos / resize_factor)
+    return int((x_pos * n_step) + n_overlap)
+
+
 def generate_spectrogram(
     audio,
     sampling_rate,
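
To see what these coordinate helpers compute, here is a minimal standalone sketch that mirrors the formulas in this hunk. The constants (256 kHz sample rate, 2 ms window, 0.75 overlap, 0.5 resize factor) are illustrative assumptions, not values read from batdetect2.detector.parameters.

import numpy as np

# Assumed example values; the real defaults live in batdetect2.detector.parameters.
SAMPLERATE = 256_000      # Hz
WINDOW_DURATION = 0.002   # s -> a 512-sample FFT window
WINDOW_OVERLAP = 0.75     # fraction of the window shared by adjacent frames
RESIZE_FACTOR = 0.5       # spectrogram is shrunk by half after the STFT


def time_to_x_coords(time_in_file, samplerate, window_duration, window_overlap):
    # Same formula as above: frames advance by (nfft - noverlap) samples.
    nfft = np.floor(window_duration * samplerate)
    noverlap = np.floor(window_overlap * nfft)
    return (time_in_file * samplerate - noverlap) / (nfft - noverlap)


def x_coord_to_sample(x_pos, samplerate, window_duration, window_overlap, resize_factor):
    # Inverse mapping from a (resized) spectrogram column back to a sample index.
    n_fft = np.floor(window_duration * samplerate)
    n_overlap = np.floor(window_overlap * n_fft)
    n_step = n_fft - n_overlap
    x_pos = int(x_pos / resize_factor)
    return int((x_pos * n_step) + n_overlap)


# 0.5 s into the file lands at column 997 of the un-resized spectrogram ...
x = time_to_x_coords(0.5, SAMPLERATE, WINDOW_DURATION, WINDOW_OVERLAP)
# ... and resized column 100 maps back to a sample offset in the waveform.
sample = x_coord_to_sample(100, SAMPLERATE, WINDOW_DURATION, WINDOW_OVERLAP, RESIZE_FACTOR)
print(round(x), sample)  # 997 25984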
@@ -184,55 +210,118 @@ def load_audio(
     return sampling_rate, audio_raw
 
 
+def compute_spectrogram_width(
+    length: int,
+    samplerate: int = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+) -> int:
+    n_fft = int(window_duration * samplerate)
+    n_overlap = int(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    width = (length - n_overlap) // n_step
+    return int(width * resize_factor)
+
+
 def pad_audio(
-    audio_raw,
-    fs,
-    ms,
-    overlap_perc,
-    resize_factor,
-    divide_factor,
-    fixed_width=None,
+    audio: np.ndarray,
+    samplerate: int = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+    divide_factor: int = parameters.SPEC_DIVIDE_FACTOR,
+    fixed_width: Optional[int] = None,
 ):
-    # Adds zeros to the end of the raw data so that the generated sepctrogram
-    # will be evenly divisible by `divide_factor`
-    # Also deals with very short audio clips and fixed_width during training
-
-    # This code could be clearer, clean up
-    nfft = int(ms * fs)
-    noverlap = int(overlap_perc * nfft)
-    step = nfft - noverlap
-    min_size = int(divide_factor * (1.0 / resize_factor))
-    spec_width = (audio_raw.shape[0] - noverlap) // step
-    spec_width_rs = spec_width * resize_factor
-
-    if fixed_width is not None and spec_width < fixed_width:
-        # too small
-        # used during training to ensure all the batches are the same size
-        diff = fixed_width * step + noverlap - audio_raw.shape[0]
-        audio_raw = np.hstack(
-            (audio_raw, np.zeros(diff, dtype=audio_raw.dtype))
-        )
-
-    elif fixed_width is not None and spec_width > fixed_width:
-        # too big
-        # used during training to ensure all the batches are the same size
-        diff = fixed_width * step + noverlap - audio_raw.shape[0]
-        audio_raw = audio_raw[:diff]
-
-    elif (
-        spec_width_rs < min_size
-        or (np.floor(spec_width_rs) % divide_factor) != 0
-    ):
-        # need to be at least min_size
-        div_amt = np.ceil(spec_width_rs / float(divide_factor))
-        div_amt = np.maximum(1, div_amt)
-        target_size = int(div_amt * divide_factor * (1.0 / resize_factor))
-        diff = target_size * step + noverlap - audio_raw.shape[0]
-        audio_raw = np.hstack(
-            (audio_raw, np.zeros(diff, dtype=audio_raw.dtype))
-        )
-
-    return audio_raw
+    """Pad audio to be evenly divisible by `divide_factor`.
+
+    This function pads the audio signal with zeros to ensure that the
+    generated spectrogram length will be evenly divisible by `divide_factor`.
+    This is important for the model to work correctly.
+
+    This `divide_factor` comes from the model architecture as it downscales
+    the spectrogram by this factor, so the input must be divisible by this
+    integer number.
+
+    Parameters
+    ----------
+    audio : np.ndarray
+        The audio signal.
+    samplerate : int
+        The sampling rate of the audio signal.
+    window_duration : float
+        The window duration in seconds used for the spectrogram computation.
+    window_overlap : float
+        The overlap between windows in the spectrogram computation.
+    resize_factor : float
+        This factor is used to resize the spectrogram after the STFT
+        computation. Default is 0.5, which means that the spectrogram will be
+        reduced by half. Important to take into account for the final size of
+        the spectrogram.
+    divide_factor : int
+        The factor by which the spectrogram will be divided.
+    fixed_width : int, optional
+        If provided, the audio will be padded or cut so that the resulting
+        spectrogram width will be equal to this value.
+
+    Returns
+    -------
+    np.ndarray
+        The padded audio signal.
+    """
+    spec_width = compute_spectrogram_width(
+        audio.shape[0],
+        samplerate=samplerate,
+        window_duration=window_duration,
+        window_overlap=window_overlap,
+        resize_factor=resize_factor,
+    )
+
+    if fixed_width:
+        target_samples = x_coord_to_sample(
+            fixed_width,
+            samplerate=samplerate,
+            window_duration=window_duration,
+            window_overlap=window_overlap,
+            resize_factor=resize_factor,
+        )
+
+        if spec_width < fixed_width:
+            # pad to the fixed width, used during training to keep batches the same size
+            diff = target_samples - audio.shape[0]
+            return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
+
+        if spec_width > fixed_width:
+            return audio[:target_samples]
+
+        return audio
+
+    min_width = int(divide_factor / resize_factor)
+
+    if spec_width < min_width:
+        target_samples = x_coord_to_sample(
+            min_width,
+            samplerate=samplerate,
+            window_duration=window_duration,
+            window_overlap=window_overlap,
+            resize_factor=resize_factor,
+        )
+        diff = target_samples - audio.shape[0]
+        return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
+
+    if (spec_width % divide_factor) == 0:
+        return audio
+
+    target_width = int(np.ceil(spec_width / divide_factor)) * divide_factor
+    target_samples = x_coord_to_sample(
+        target_width,
+        samplerate=samplerate,
+        window_duration=window_duration,
+        window_overlap=window_overlap,
+        resize_factor=resize_factor,
+    )
+    diff = target_samples - audio.shape[0]
+    return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
 
 
 def gen_mag_spectrogram(x, fs, ms, overlap_perc):
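
As a rough sanity check on the padding logic above, the sketch below recomputes the spectrogram width of a clip and the number of zero samples needed to round that width up to the next multiple of the divide factor. The constants are illustrative assumptions; the real defaults come from batdetect2.detector.parameters.

import numpy as np

# Illustrative assumptions, not the library's actual parameter values.
SAMPLERATE = 256_000
WINDOW_DURATION = 0.002   # -> n_fft = 512 samples
WINDOW_OVERLAP = 0.75     # -> hop of 128 samples
RESIZE_FACTOR = 0.5
DIVIDE_FACTOR = 32        # the model downscales the spectrogram by this factor


def spectrogram_width(length):
    # Mirrors compute_spectrogram_width above.
    n_fft = int(WINDOW_DURATION * SAMPLERATE)
    n_overlap = int(WINDOW_OVERLAP * n_fft)
    n_step = n_fft - n_overlap
    return int(((length - n_overlap) // n_step) * RESIZE_FACTOR)


def samples_for_width(width):
    # Mirrors x_coord_to_sample: samples needed so the resized width reaches `width`.
    n_fft = int(WINDOW_DURATION * SAMPLERATE)
    n_overlap = int(WINDOW_OVERLAP * n_fft)
    n_step = n_fft - n_overlap
    return int(width / RESIZE_FACTOR) * n_step + n_overlap


length = 100_000                     # samples in some clip
width = spectrogram_width(length)    # 389 columns with these assumptions
target = int(np.ceil(width / DIVIDE_FACTOR)) * DIVIDE_FACTOR  # next multiple of 32 -> 416
pad = samples_for_width(target) - length                      # zeros to append -> 6880
print(width, target, pad)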
@@ -247,7 +336,11 @@ def gen_mag_spectrogram(x, fs, ms, overlap_perc):
 
     # compute spec
     spec, _ = librosa.core.spectrum._spectrogram(
-        y=x, power=1, n_fft=nfft, hop_length=step, center=False
+        y=x,
+        power=1,
+        n_fft=nfft,
+        hop_length=step,
+        center=False,
     )
 
     # remove DC component and flip vertical orientation
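
librosa.core.spectrum._spectrogram is a private librosa helper, so as a hedge here is a minimal sketch of an equivalent magnitude spectrogram (power=1, no centering) built with the public librosa.stft API. The n_fft and hop_length values are placeholders, not the ones gen_mag_spectrogram derives from fs, ms and overlap_perc.

import numpy as np
import librosa

# Placeholder analysis settings, for illustration only.
n_fft = 512
hop_length = 128

y = np.random.randn(10_000).astype(np.float32)  # stand-in audio signal

# Magnitude spectrogram without centering, roughly the first element of what
# librosa.core.spectrum._spectrogram(y=y, power=1, n_fft=n_fft, hop_length=hop_length, center=False) returns.
spec = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, center=False))
print(spec.shape)  # (n_fft // 2 + 1, number of frames)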