Mirror of https://github.com/macaodha/batdetect2.git, synced 2025-06-29 14:41:58 +02:00
Improve the pad_audio function
This function was the culprit of the error. It has been broken up into helper functions to make the flow easier to follow.
parent 25e0a53ad1
commit a4b22d6590
@@ -6,6 +6,8 @@ import librosa.core.spectrum
 import numpy as np
 import torch
 
+from batdetect2.detector import parameters
+
 from . import wavfile
 
 __all__ = [
@@ -15,20 +17,44 @@ __all__ = [
 ]
 
 
-def time_to_x_coords(time_in_file, sampling_rate, fft_win_length, fft_overlap):
-    nfft = np.floor(fft_win_length * sampling_rate)  # int() uses floor
-    noverlap = np.floor(fft_overlap * nfft)
-    return (time_in_file * sampling_rate - noverlap) / (nfft - noverlap)
+def time_to_x_coords(
+    time_in_file: float,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+) -> float:
+    nfft = np.floor(window_duration * samplerate)  # int() uses floor
+    noverlap = np.floor(window_overlap * nfft)
+    return (time_in_file * samplerate - noverlap) / (nfft - noverlap)
 
 
-# NOTE this is also defined in post_process
-def x_coords_to_time(x_pos, sampling_rate, fft_win_length, fft_overlap):
-    nfft = np.floor(fft_win_length * sampling_rate)
-    noverlap = np.floor(fft_overlap * nfft)
-    return ((x_pos * (nfft - noverlap)) + noverlap) / sampling_rate
+def x_coords_to_time(
+    x_pos: int,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+) -> float:
+    n_fft = np.floor(window_duration * samplerate)
+    n_overlap = np.floor(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    return ((x_pos * n_step) + n_overlap) / samplerate
     # return (1.0 - fft_overlap) * fft_win_length * (x_pos + 0.5)  # 0.5 is for center of temporal window
 
 
+def x_coord_to_sample(
+    x_pos: int,
+    samplerate: float = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+) -> int:
+    n_fft = np.floor(window_duration * samplerate)
+    n_overlap = np.floor(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    x_pos = int(x_pos / resize_factor)
+    return int((x_pos * n_step) + n_overlap)
+
+
 def generate_spectrogram(
     audio,
     sampling_rate,
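
To see what these coordinate helpers compute, here is a minimal standalone sketch that mirrors the formulas in this hunk. The constants (256 kHz sample rate, 2 ms window, 0.75 overlap, 0.5 resize factor) are illustrative assumptions, not values read from batdetect2.detector.parameters.

import numpy as np

# Assumed example values; the real defaults live in batdetect2.detector.parameters.
SAMPLERATE = 256_000      # Hz
WINDOW_DURATION = 0.002   # s -> a 512-sample FFT window
WINDOW_OVERLAP = 0.75     # fraction of the window shared by adjacent frames
RESIZE_FACTOR = 0.5       # spectrogram is shrunk by half after the STFT


def time_to_x_coords(time_in_file, samplerate, window_duration, window_overlap):
    # Same formula as above: frames advance by (nfft - noverlap) samples.
    nfft = np.floor(window_duration * samplerate)
    noverlap = np.floor(window_overlap * nfft)
    return (time_in_file * samplerate - noverlap) / (nfft - noverlap)


def x_coord_to_sample(x_pos, samplerate, window_duration, window_overlap, resize_factor):
    # Inverse mapping from a (resized) spectrogram column back to a sample index.
    n_fft = np.floor(window_duration * samplerate)
    n_overlap = np.floor(window_overlap * n_fft)
    n_step = n_fft - n_overlap
    x_pos = int(x_pos / resize_factor)
    return int((x_pos * n_step) + n_overlap)


# 0.5 s into the file lands at column 997 of the un-resized spectrogram ...
x = time_to_x_coords(0.5, SAMPLERATE, WINDOW_DURATION, WINDOW_OVERLAP)
# ... and resized column 100 maps back to a sample offset in the waveform.
sample = x_coord_to_sample(100, SAMPLERATE, WINDOW_DURATION, WINDOW_OVERLAP, RESIZE_FACTOR)
print(round(x), sample)  # 997 25984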
@@ -184,55 +210,118 @@ def load_audio(
     return sampling_rate, audio_raw
 
 
+def compute_spectrogram_width(
+    length: int,
+    samplerate: int = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+) -> int:
+    n_fft = int(window_duration * samplerate)
+    n_overlap = int(window_overlap * n_fft)
+    n_step = n_fft - n_overlap
+    width = (length - n_overlap) // n_step
+    return int(width * resize_factor)
+
+
 def pad_audio(
-    audio_raw,
-    fs,
-    ms,
-    overlap_perc,
-    resize_factor,
-    divide_factor,
-    fixed_width=None,
+    audio: np.ndarray,
+    samplerate: int = parameters.TARGET_SAMPLERATE_HZ,
+    window_duration: float = parameters.FFT_WIN_LENGTH_S,
+    window_overlap: float = parameters.FFT_OVERLAP,
+    resize_factor: float = parameters.RESIZE_FACTOR,
+    divide_factor: int = parameters.SPEC_DIVIDE_FACTOR,
+    fixed_width: Optional[int] = None,
 ):
-    # Adds zeros to the end of the raw data so that the generated sepctrogram
-    # will be evenly divisible by `divide_factor`
-    # Also deals with very short audio clips and fixed_width during training
-
-    # This code could be clearer, clean up
-    nfft = int(ms * fs)
-    noverlap = int(overlap_perc * nfft)
-    step = nfft - noverlap
-    min_size = int(divide_factor * (1.0 / resize_factor))
-    spec_width = (audio_raw.shape[0] - noverlap) // step
-    spec_width_rs = spec_width * resize_factor
-
-    if fixed_width is not None and spec_width < fixed_width:
-        # too small
-        # used during training to ensure all the batches are the same size
-        diff = fixed_width * step + noverlap - audio_raw.shape[0]
-        audio_raw = np.hstack(
-            (audio_raw, np.zeros(diff, dtype=audio_raw.dtype))
-        )
-
-    elif fixed_width is not None and spec_width > fixed_width:
-        # too big
-        # used during training to ensure all the batches are the same size
-        diff = fixed_width * step + noverlap - audio_raw.shape[0]
-        audio_raw = audio_raw[:diff]
-
-    elif (
-        spec_width_rs < min_size
-        or (np.floor(spec_width_rs) % divide_factor) != 0
-    ):
-        # need to be at least min_size
-        div_amt = np.ceil(spec_width_rs / float(divide_factor))
-        div_amt = np.maximum(1, div_amt)
-        target_size = int(div_amt * divide_factor * (1.0 / resize_factor))
-        diff = target_size * step + noverlap - audio_raw.shape[0]
-        audio_raw = np.hstack(
-            (audio_raw, np.zeros(diff, dtype=audio_raw.dtype))
-        )
-
-    return audio_raw
+    """Pad audio to be evenly divisible by `divide_factor`.
+
+    This function pads the audio signal with zeros to ensure that the
+    generated spectrogram length will be evenly divisible by `divide_factor`.
+    This is important for the model to work correctly.
+
+    This `divide_factor` comes from the model architecture as it downscales
+    the spectrogram by this factor, so the input must be divisible by this
+    integer number.
+
+    Parameters
+    ----------
+    audio : np.ndarray
+        The audio signal.
+    samplerate : int
+        The sampling rate of the audio signal.
+    window_duration : float
+        The window duration in seconds used for the spectrogram computation.
+    window_overlap : float
+        The overlap between windows in the spectrogram computation.
+    resize_factor : float
+        This factor is used to resize the spectrogram after the STFT
+        computation. Default is 0.5, which means that the spectrogram will be
+        reduced by half. Important to take into account for the final size of
+        the spectrogram.
+    divide_factor : int
+        The factor by which the spectrogram will be divided.
+    fixed_width : int, optional
+        If provided, the audio will be padded or cut so that the resulting
+        spectrogram width will be equal to this value.
+
+    Returns
+    -------
+    np.ndarray
+        The padded audio signal.
+    """
+    spec_width = compute_spectrogram_width(
+        audio.shape[0],
+        samplerate=samplerate,
+        window_duration=window_duration,
+        window_overlap=window_overlap,
+        resize_factor=resize_factor,
+    )
+
+    if fixed_width:
+        target_samples = x_coord_to_sample(
+            fixed_width,
+            samplerate=samplerate,
+            window_duration=window_duration,
+            window_overlap=window_overlap,
+            resize_factor=resize_factor,
+        )
+
+        if spec_width < fixed_width:
+            # pad to the fixed width, used during training to keep batches the same size
+            diff = target_samples - audio.shape[0]
+            return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
+
+        if spec_width > fixed_width:
+            return audio[:target_samples]
+
+        return audio
+
+    min_width = int(divide_factor / resize_factor)
+
+    if spec_width < min_width:
+        target_samples = x_coord_to_sample(
+            min_width,
+            samplerate=samplerate,
+            window_duration=window_duration,
+            window_overlap=window_overlap,
+            resize_factor=resize_factor,
+        )
+        diff = target_samples - audio.shape[0]
+        return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
+
+    if (spec_width % divide_factor) == 0:
+        return audio
+
+    target_width = int(np.ceil(spec_width / divide_factor)) * divide_factor
+    target_samples = x_coord_to_sample(
+        target_width,
+        samplerate=samplerate,
+        window_duration=window_duration,
+        window_overlap=window_overlap,
+        resize_factor=resize_factor,
+    )
+    diff = target_samples - audio.shape[0]
+    return np.hstack((audio, np.zeros(diff, dtype=audio.dtype)))
 
 
 def gen_mag_spectrogram(x, fs, ms, overlap_perc):
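
As a rough sanity check on the padding logic above, the sketch below recomputes the spectrogram width of a clip and the number of zero samples needed to round that width up to the next multiple of the divide factor. The constants are illustrative assumptions; the real defaults come from batdetect2.detector.parameters.

import numpy as np

# Illustrative assumptions, not the library's actual parameter values.
SAMPLERATE = 256_000
WINDOW_DURATION = 0.002   # -> n_fft = 512 samples
WINDOW_OVERLAP = 0.75     # -> hop of 128 samples
RESIZE_FACTOR = 0.5
DIVIDE_FACTOR = 32        # the model downscales the spectrogram by this factor


def spectrogram_width(length):
    # Mirrors compute_spectrogram_width above.
    n_fft = int(WINDOW_DURATION * SAMPLERATE)
    n_overlap = int(WINDOW_OVERLAP * n_fft)
    n_step = n_fft - n_overlap
    return int(((length - n_overlap) // n_step) * RESIZE_FACTOR)


def samples_for_width(width):
    # Mirrors x_coord_to_sample: samples needed so the resized width reaches `width`.
    n_fft = int(WINDOW_DURATION * SAMPLERATE)
    n_overlap = int(WINDOW_OVERLAP * n_fft)
    n_step = n_fft - n_overlap
    return int(width / RESIZE_FACTOR) * n_step + n_overlap


length = 100_000                     # samples in some clip
width = spectrogram_width(length)    # 389 columns with these assumptions
target = int(np.ceil(width / DIVIDE_FACTOR)) * DIVIDE_FACTOR  # next multiple of 32 -> 416
pad = samples_for_width(target) - length                      # zeros to append -> 6880
print(width, target, pad)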
@@ -247,7 +336,11 @@ def gen_mag_spectrogram(x, fs, ms, overlap_perc):
 
     # compute spec
     spec, _ = librosa.core.spectrum._spectrogram(
-        y=x, power=1, n_fft=nfft, hop_length=step, center=False
+        y=x,
+        power=1,
+        n_fft=nfft,
+        hop_length=step,
+        center=False,
     )
 
     # remove DC component and flip vertical orientation
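
librosa.core.spectrum._spectrogram is a private librosa helper, so as a hedge here is a minimal sketch of an equivalent magnitude spectrogram (power=1, no centering) built with the public librosa.stft API. The n_fft and hop_length values are placeholders, not the ones gen_mag_spectrogram derives from fs, ms and overlap_perc.

import numpy as np
import librosa

# Placeholder analysis settings, for illustration only.
n_fft = 512
hop_length = 128

y = np.random.randn(10_000).astype(np.float32)  # stand-in audio signal

# Magnitude spectrogram without centering, roughly the first element of what
# librosa.core.spectrum._spectrogram(y=y, power=1, n_fft=n_fft, hop_length=hop_length, center=False) returns.
spec = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, center=False))
print(spec.shape)  # (n_fft // 2 + 1, number of frames)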