diff --git a/.gitignore b/.gitignore index ad423c1..4cb93c0 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,4 @@ experiments/* !batdetect2_notebook.ipynb !batdetect2/models/*.pth.tar !tests/data/*.wav +notebooks/lightning_logs diff --git a/batdetect2/data/augmentations.py b/batdetect2/data/augmentations.py deleted file mode 100644 index 11d82f2..0000000 --- a/batdetect2/data/augmentations.py +++ /dev/null @@ -1,304 +0,0 @@ -from functools import wraps -from typing import Callable, List, Optional, Tuple - -import numpy as np -import xarray as xr -from soundevent import data -from soundevent.geometry import compute_bounds - -ClipAugmentation = Callable[[data.ClipAnnotation], data.ClipAnnotation] -AudioAugmentation = Callable[ - [xr.DataArray, data.ClipAnnotation], - Tuple[xr.DataArray, data.ClipAnnotation], -] -SpecAugmentation = Callable[ - [xr.DataArray, data.ClipAnnotation], - Tuple[xr.DataArray, data.ClipAnnotation], -] - -ClipProvider = Callable[ - [data.ClipAnnotation], Tuple[xr.DataArray, data.ClipAnnotation] -] -"""A function that provides some clip and its annotation. - -Usually this function loads a random clip from a dataset. Takes -as input a clip annotation that can be used to filter the clips -to load (in case you want to avoid loading the same clip multiple times). -""" - - -AUGMENTATION_PROBABILITY = 0.2 -MAX_DELAY = 0.005 -STRETCH_SQUEEZE_DELTA = 0.04 -MASK_MAX_TIME_PERC: float = 0.05 -MASK_MAX_FREQ_PERC: float = 0.10 - - -def maybe_apply( - augmentation: Callable, - prob: float = AUGMENTATION_PROBABILITY, -) -> Callable: - """Apply an augmentation with a given probability.""" - - @wraps(augmentation) - def _augmentation(x): - if np.random.rand() > prob: - return x - return augmentation(x) - - return _augmentation - - -def select_random_subclip( - clip_annotation: data.ClipAnnotation, - duration: Optional[float] = None, - proportion: float = 0.9, -) -> data.ClipAnnotation: - """Select a random subclip from a clip.""" - clip = clip_annotation.clip - - if duration is None: - clip_duration = clip.end_time - clip.start_time - duration = clip_duration * proportion - - start_time = np.random.uniform(clip.start_time, clip.end_time - duration) - return clip_annotation.model_copy( - update=dict( - clip=clip.model_copy( - update=dict( - start_time=start_time, - end_time=start_time + duration, - ) - ) - ) - ) - - -def combine_audio( - audio1: xr.DataArray, - audio2: xr.DataArray, - alpha: Optional[float] = None, - min_alpha: float = 0.3, - max_alpha: float = 0.7, -) -> xr.DataArray: - """Combine two audio clips.""" - - if alpha is None: - alpha = np.random.uniform(min_alpha, max_alpha) - - return alpha * audio1 + (1 - alpha) * audio2.data - - -def random_mix( - audio: xr.DataArray, - clip: data.ClipAnnotation, - provider: Optional[ClipProvider] = None, - alpha: Optional[float] = None, - min_alpha: float = 0.3, - max_alpha: float = 0.7, - join_annotations: bool = True, -) -> Tuple[xr.DataArray, data.ClipAnnotation]: - """Mix two audio clips.""" - if provider is None: - raise ValueError("No audio provider given.") - - try: - other_audio, other_clip = provider(clip) - except (StopIteration, ValueError): - raise ValueError("No more audio sources available.") - - new_audio = combine_audio( - audio, - other_audio, - alpha=alpha, - min_alpha=min_alpha, - max_alpha=max_alpha, - ) - - if join_annotations: - clip = clip.model_copy( - update=dict( - sound_events=clip.sound_events + other_clip.sound_events, - ) - ) - - return new_audio, clip - - -def add_echo( - audio: xr.DataArray, - 
clip: data.ClipAnnotation, - delay: Optional[float] = None, - alpha: Optional[float] = None, - min_alpha: float = 0.0, - max_alpha: float = 1.0, - max_delay: float = MAX_DELAY, -) -> Tuple[xr.DataArray, data.ClipAnnotation]: - """Add a delay to the audio.""" - if delay is None: - delay = np.random.uniform(0, max_delay) - - if alpha is None: - alpha = np.random.uniform(min_alpha, max_alpha) - - samplerate = audio.attrs["samplerate"] - offset = int(delay * samplerate) - - # NOTE: We use the copy method to avoid modifying the original audio - # data. - new_audio = audio.copy() - new_audio[offset:] += alpha * audio.data[:-offset] - return new_audio, clip - - -def scale_volume( - spec: xr.DataArray, - clip: data.ClipAnnotation, - factor: Optional[float] = None, - max_scaling: float = 2, - min_scaling: float = 0, -) -> Tuple[xr.DataArray, data.ClipAnnotation]: - """Scale the volume of a spectrogram.""" - if factor is None: - factor = np.random.uniform(min_scaling, max_scaling) - - return spec * factor, clip - - -def scale_sound_event_annotation( - sound_event_annotation: data.SoundEventAnnotation, - time_factor: float = 1, - frequency_factor: float = 1, -) -> data.SoundEventAnnotation: - sound_event = sound_event_annotation.sound_event - geometry = sound_event.geometry - - if geometry is None: - return sound_event_annotation - - start_time, low_freq, end_time, high_freq = compute_bounds(geometry) - new_geometry = data.BoundingBox( - coordinates=[ - start_time * time_factor, - low_freq * frequency_factor, - end_time * time_factor, - high_freq * frequency_factor, - ] - ) - - return sound_event_annotation.model_copy( - update=dict( - sound_event=sound_event.model_copy( - update=dict( - geometry=new_geometry, - ) - ) - ) - ) - - -def warp_spectrogram( - spec: xr.DataArray, - clip: data.ClipAnnotation, - factor: Optional[float] = None, - delta: float = STRETCH_SQUEEZE_DELTA, -) -> Tuple[xr.DataArray, data.ClipAnnotation]: - """Warp a spectrogram.""" - if factor is None: - factor = np.random.uniform(1 - delta, 1 + delta) - - start_time = clip.clip.start_time - end_time = clip.clip.end_time - duration = end_time - start_time - new_time = np.linspace( - start_time, - start_time + duration * factor, - spec.time.size, - ) - - scaled_spec = spec.interp( - time=new_time, - method="linear", - kwargs={"fill_value": 0}, - ) - scaled_spec.coords["time"] = spec.time - - scaled_clip = clip.model_copy( - update=dict( - sound_events=[ - scale_sound_event_annotation( - sound_event_annotation, - time_factor=1 / factor, - ) - for sound_event_annotation in clip.sound_events - ] - ) - ) - return scaled_spec, scaled_clip - - -def mask_axis( - array: xr.DataArray, - axis: str, - start: float, - end: float, - mask_value: float = 0, -) -> xr.DataArray: - if axis not in array.dims: - raise ValueError(f"Axis {axis} not found in array") - - coord = array[axis] - return array.where((coord < start) | (coord > end), mask_value) - - -def mask_time( - spec: xr.DataArray, - clip: data.ClipAnnotation, - max_time_mask: float = MASK_MAX_TIME_PERC, - max_num_masks: int = 3, -) -> Tuple[xr.DataArray, data.ClipAnnotation]: - """Mask a random section of the time axis.""" - - num_masks = np.random.randint(1, max_num_masks + 1) - for _ in range(num_masks): - mask_size = np.random.uniform(0, max_time_mask) - start = np.random.uniform(0, spec.time[-1] - mask_size) - end = start + mask_size - spec = mask_axis(spec, "time", start, end) - - return spec, clip - - -def mask_frequency( - spec: xr.DataArray, - clip: data.ClipAnnotation, - 
max_freq_mask: float = MASK_MAX_FREQ_PERC, - max_num_masks: int = 3, -) -> Tuple[xr.DataArray, data.ClipAnnotation]: - """Mask a random section of the frequency axis.""" - - num_masks = np.random.randint(1, max_num_masks + 1) - for _ in range(num_masks): - mask_size = np.random.uniform(0, max_freq_mask) - start = np.random.uniform(0, spec.frequency[-1] - mask_size) - end = start + mask_size - spec = mask_axis(spec, "frequency", start, end) - - return spec, clip - - -CLIP_AUGMENTATIONS: List[ClipAugmentation] = [ - select_random_subclip, -] - -AUDIO_AUGMENTATIONS: List[AudioAugmentation] = [ - add_echo, - random_mix, -] - -SPEC_AUGMENTATIONS: List[SpecAugmentation] = [ - scale_volume, - warp_spectrogram, - mask_time, - mask_frequency, -] diff --git a/batdetect2/data/compat.py b/batdetect2/data/compat.py index f2398f0..0704c97 100644 --- a/batdetect2/data/compat.py +++ b/batdetect2/data/compat.py @@ -11,7 +11,7 @@ from soundevent import data from soundevent.geometry import compute_bounds from batdetect2 import types -from batdetect2.data.labels import LabelFn +from batdetect2.data.labels import ClassMapper PathLike = Union[Path, str, os.PathLike] @@ -54,7 +54,7 @@ def get_annotation_notes(annotation: data.ClipAnnotation) -> str: def convert_to_annotation_group( annotation: data.ClipAnnotation, - label_fn: LabelFn = lambda _: None, + class_mapper: ClassMapper, event_fn: EventFn = lambda _: ECHOLOCATION_EVENT, class_fn: ClassFn = lambda _: 0, individual_fn: IndividualFn = lambda _: 0, @@ -80,8 +80,8 @@ def convert_to_annotation_group( continue start_time, low_freq, end_time, high_freq = compute_bounds(geometry) - class_id = label_fn(sound_event) or -1 - event = event_fn(sound_event) + class_id = class_mapper.transform(sound_event) or -1 + event = event_fn(sound_event) or "" individual_id = individual_fn(sound_event) or -1 start_times.append(start_time) diff --git a/batdetect2/data/datasets.py b/batdetect2/data/datasets.py index b215973..b3747d3 100644 --- a/batdetect2/data/datasets.py +++ b/batdetect2/data/datasets.py @@ -4,7 +4,6 @@ from soundevent import data from torch.utils.data import Dataset __all__ = [ - "ClipAnnotationDataset", "ClipDataset", ] @@ -12,31 +11,7 @@ __all__ = [ E = TypeVar("E") -class ClipAnnotationDataset(Dataset, Generic[E]): - - clip_annotations: List[data.ClipAnnotation] - - transform: Callable[[data.ClipAnnotation], E] - - def __init__( - self, - clip_annotations: Iterable[data.ClipAnnotation], - transform: Callable[[data.ClipAnnotation], E], - name: str = "ClipAnnotationDataset", - ): - self.clip_annotations = list(clip_annotations) - self.transform = transform - self.name = name - - def __len__(self) -> int: - return len(self.clip_annotations) - - def __getitem__(self, idx: int) -> E: - return self.transform(self.clip_annotations[idx]) - - class ClipDataset(Dataset, Generic[E]): - clips: List[data.Clip] transform: Callable[[data.Clip], E] diff --git a/batdetect2/data/labels.py b/batdetect2/data/labels.py index 4fd41c3..b1576e0 100644 --- a/batdetect2/data/labels.py +++ b/batdetect2/data/labels.py @@ -1,113 +1,29 @@ -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Tuple import numpy as np import xarray as xr from scipy.ndimage import gaussian_filter -from soundevent import data, geometry +from soundevent import data, geometry, arrays +from soundevent.geometry.operations import Positions +from soundevent.types import ClassMapper __all__ = [ + "ClassMapper", "generate_heatmaps", ] -PositionFn = Callable[[data.SoundEvent], 
Tuple[float, float]] -"""Convert a sound event to a single position in time-frequency space.""" - -SizeFn = Callable[[data.SoundEvent, float, float], np.ndarray] -"""Compute the size of a sound event in time-frequency space. - -The time and frequency scales are provided as arguments to allow -modifying the size of the sound event based on the spectrogram -parameters. -""" - -LabelFn = Callable[[data.SoundEventAnnotation], Optional[str]] -"""Convert a sound event annotation to a label. - -When the label is None, this indicates that the sound event does not -belong to any of the classes of interest. -""" TARGET_SIGMA = 3.0 -GENERIC_LABEL = "__UNKNOWN__" - - -def get_lower_left_position( - sound_event: data.SoundEvent, -) -> Tuple[float, float]: - if sound_event.geometry is None: - raise ValueError("Sound event has no geometry.") - - start_time, low_freq, _, _ = geometry.compute_bounds(sound_event.geometry) - return start_time, low_freq - - -def get_bbox_size( - sound_event: data.SoundEvent, - time_scale: float = 1.0, - frequency_scale: float = 1.0, -) -> np.ndarray: - if sound_event.geometry is None: - raise ValueError("Sound event has no geometry.") - - start_time, low_freq, end_time, high_freq = geometry.compute_bounds( - sound_event.geometry - ) - - return np.array( - [ - time_scale * (end_time - start_time), - frequency_scale * (high_freq - low_freq), - ] - ) - - -def _tag_key(tag: data.Tag) -> Tuple[str, str]: - return (tag.key, tag.value) - - -def set_value_at_position( - array: xr.DataArray, - value: Any, - **query, -) -> xr.DataArray: - dims = {dim: n for n, dim in enumerate(array.dims)} - indexer: List[Union[slice, int]] = [slice(None) for _ in range(array.ndim)] - - for key, coord in query.items(): - if key not in dims: - raise ValueError(f"Dimension {key} not found in array.") - - coordinates = array.indexes[key] - indexer[dims[key]] = coordinates.get_loc(coordinates.asof(coord)) - - if isinstance(value, (tuple, list)): - value = np.array(value) - - array.data[tuple(indexer)] = value - return array - - def generate_heatmaps( clip_annotation: data.ClipAnnotation, spec: xr.DataArray, - num_classes: int = 1, - label_fn: LabelFn = lambda _: None, + class_mapper: ClassMapper, target_sigma: float = TARGET_SIGMA, - size_fn: SizeFn = get_bbox_size, - position_fn: PositionFn = get_lower_left_position, - class_labels: Optional[List[str]] = None, + position: Positions = "bottom-left", dtype=np.float32, ) -> Tuple[xr.DataArray, xr.DataArray, xr.DataArray]: - if class_labels is None: - class_labels = [str(i) for i in range(num_classes)] - - if len(class_labels) != num_classes: - raise ValueError( - "Number of class labels must match the number of classes." - ) - shape = dict(zip(spec.dims, spec.shape)) if "time" not in shape or "frequency" not in shape: @@ -115,8 +31,8 @@ def generate_heatmaps( "Spectrogram must have time and frequency dimensions." 
) - time_duration = spec.time.attrs["max"] - spec.time.attrs["min"] - freq_bandwidth = spec.frequency.attrs["max"] - spec.frequency.attrs["min"] + time_duration = arrays.get_dim_width(spec, dim="time") + freq_bandwidth = arrays.get_dim_width(spec, dim="frequency") # Compute the size factors time_scale = 1 / time_duration @@ -125,10 +41,10 @@ # Initialize heatmaps detection_heatmap = xr.zeros_like(spec, dtype=dtype) class_heatmap = xr.DataArray( - data=np.zeros((num_classes, *spec.shape), dtype=dtype), + data=np.zeros((class_mapper.num_classes, *spec.shape), dtype=dtype), dims=["category", *spec.dims], coords={ - "category": class_labels, + "category": class_mapper.class_labels, **spec.coords, }, ) @@ -142,11 +58,16 @@ ) for sound_event_annotation in clip_annotation.sound_events: + geom = sound_event_annotation.sound_event.geometry + + if geom is None: + continue + # Get the position of the sound event - time, frequency = position_fn(sound_event_annotation.sound_event) + time, frequency = geometry.get_geometry_point(geom, position=position) # Set 1.0 at the position of the sound event in the detection heatmap - detection_heatmap = set_value_at_position( + detection_heatmap = arrays.set_value_at_pos( detection_heatmap, 1.0, time=time, @@ -154,35 +75,37 @@ ) # Set the size of the sound event at the position in the size heatmap - size = size_fn( - sound_event_annotation.sound_event, - time_scale, - frequency_scale, - + start_time, low_freq, end_time, high_freq = geometry.compute_bounds( + geom ) - size_heatmap = set_value_at_position( + size = np.array( + [ + (end_time - start_time) * time_scale, + (high_freq - low_freq) * frequency_scale, + ] + ) + size_heatmap = arrays.set_value_at_pos( + size_heatmap, size, time=time, frequency=frequency, ) - # Get the label id for the sound event - label = label_fn(sound_event_annotation) + # Get the class name of the sound event + class_name = class_mapper.transform(sound_event_annotation) - if label is None or label not in class_labels: - # If the label is None or not in the class labels, we skip the - # sound event + if class_name is None: + # If the label is None skip the sound event continue # Set 1.0 at the position and category of the sound event in the class # heatmap - class_heatmap = set_value_at_position( + class_heatmap = arrays.set_value_at_pos( class_heatmap, 1.0, time=time, frequency=frequency, - category=label, + category=class_name, ) # Apply gaussian filters @@ -207,25 +130,3 @@ ).fillna(0.0) return detection_heatmap, class_heatmap, size_heatmap - - -class Labeler: - def __init__(self, tags: List[data.Tag]): - """Create a labeler from a list of tags. - - Each tag is assigned a unique label. The labeler can then be used - to convert sound event annotations to labels. 
- """ - self.tags = tags - self._label_map = {_tag_key(tag): i for i, tag in enumerate(tags)} - self._inverse_label_map = {v: k for k, v in self._label_map.items()} - - def __call__( - self, sound_event_annotation: data.SoundEventAnnotation - ) -> Optional[int]: - for tag in sound_event_annotation.tags: - key = _tag_key(tag) - if key in self._label_map: - return self._label_map[key] - - return None diff --git a/batdetect2/data/preprocessing.py b/batdetect2/data/preprocessing.py index 8800a80..211b191 100644 --- a/batdetect2/data/preprocessing.py +++ b/batdetect2/data/preprocessing.py @@ -1,15 +1,22 @@ """Module containing functions for preprocessing audio clips.""" -import random -from typing import List, Optional, Tuple +from typing import Optional import librosa import librosa.core.spectrum import numpy as np import xarray as xr from numpy.typing import DTypeLike +from pydantic import BaseModel, Field from scipy.signal import resample_poly -from soundevent import audio, data +from soundevent import audio, data, arrays +from soundevent.arrays import operations as ops + +__all__ = [ + "PreprocessingConfig", + "preprocess_audio_clip", +] + TARGET_SAMPLERATE_HZ = 256000 SCALE_RAW_AUDIO = False @@ -26,20 +33,37 @@ DENOISE_SPEC_AVG = True MAX_SCALE_SPEC = False +class PreprocessingConfig(BaseModel): + """Configuration for preprocessing data.""" + + target_samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0) + + scale_audio: bool = Field(default=SCALE_RAW_AUDIO) + + fft_win_length: float = Field(default=FFT_WIN_LENGTH_S, gt=0) + + fft_overlap: float = Field(default=FFT_OVERLAP, ge=0, lt=1) + + max_freq: int = Field(default=MAX_FREQ_HZ, gt=0) + + min_freq: int = Field(default=MIN_FREQ_HZ, gt=0) + + spec_scale: str = Field(default=SPEC_SCALE) + + denoise_spec_avg: bool = DENOISE_SPEC_AVG + + max_scale_spec: bool = MAX_SCALE_SPEC + + duration: Optional[float] = DEFAULT_DURATION + + spec_height: int = SPEC_HEIGHT + + spec_time_period: float = SPEC_TIME_PERIOD + + def preprocess_audio_clip( clip: data.Clip, - target_sampling_rate: int = TARGET_SAMPLERATE_HZ, - scale_audio: bool = SCALE_RAW_AUDIO, - fft_win_length: float = FFT_WIN_LENGTH_S, - fft_overlap: float = FFT_OVERLAP, - max_freq: int = MAX_FREQ_HZ, - min_freq: int = MIN_FREQ_HZ, - spec_scale: str = SPEC_SCALE, - denoise_spec_avg: bool = True, - max_scale_spec: bool = False, - duration: Optional[float] = DEFAULT_DURATION, - spec_height: int = SPEC_HEIGHT, - spec_time_period: float = SPEC_TIME_PERIOD, + config: PreprocessingConfig = PreprocessingConfig(), ) -> xr.DataArray: """Preprocesses audio clip to generate spectrogram. @@ -47,45 +71,8 @@ def preprocess_audio_clip( ---------- clip The audio clip to preprocess. - target_sampling_rate - Target sampling rate for the audio. If the audio has a different - sampling rate, it will be resampled to this rate. - scale_audio - Whether to scale the audio amplitudes to a range of [-1, 1]. - By default, the audio is not scaled. - fft_win_length - Length of the FFT window in seconds. - fft_overlap - Amount of overlap between FFT windows as a fraction of the window - length. - max_freq - Maximum frequency for spectrogram. Anything above this frequency will - be cropped. - min_freq - Minimum frequency for spectrogram. Anything below this frequency will - be cropped. - spec_scale - Scaling method for the spectrogram. Can be "pcen", "log" or - "amplitude". - denoise_spec_avg - Whether to denoise the spectrogram. 
Denoising is done by subtracting - the average of the spectrogram from the spectrogram and clipping - negative values to 0. - max_scale_spec - Whether to max scale the spectrogram. Max scaling is done by dividing - the spectrogram by its maximum value thus scaling values to [0, 1]. - duration - Duration of the spectrogram in seconds. If the clip duration is - different from this value, the spectrogram will be cropped or extended - to match this duration. If None, the spectrogram will have the same - duration as the clip. - spec_height - Number of frequency bins for the spectrogram. This is the height of - the final spectrogram. - spec_time_period - Time period for each spectrogram bin in seconds. The spectrogram array - will be resized (using bilinear interpolation) to have this time - period. + config + Configuration for preprocessing. Returns ------- @@ -95,35 +82,29 @@ def preprocess_audio_clip( """ wav = load_clip_audio( clip, - target_sampling_rate=target_sampling_rate, - scale=scale_audio, - ) - - wav = wav.assign_attrs( - recording_id=str(wav.attrs["recording_id"]), - clip_id=str(wav.attrs["clip_id"]), - path=str(wav.attrs["path"]), + target_sampling_rate=config.target_samplerate, + scale=config.scale_audio, ) spec = compute_spectrogram( wav, - fft_win_length=fft_win_length, - fft_overlap=fft_overlap, - max_freq=max_freq, - min_freq=min_freq, - spec_scale=spec_scale, - denoise_spec_avg=denoise_spec_avg, - max_scale_spec=max_scale_spec, + fft_win_length=config.fft_win_length, + fft_overlap=config.fft_overlap, + max_freq=config.max_freq, + min_freq=config.min_freq, + spec_scale=config.spec_scale, + denoise_spec_avg=config.denoise_spec_avg, + max_scale_spec=config.max_scale_spec, ) - if duration is not None: - spec = adjust_spec_duration(clip, spec, duration) + if config.duration is not None: + spec = adjust_spec_duration(clip, spec, config.duration) - duration = get_dim_width(spec, dim="time") - return resize_spectrogram( + duration = arrays.get_dim_width(spec, dim="time") + return ops.resize( spec, - time_bins=int(np.ceil(duration / spec_time_period)), - freq_bins=spec_height, + time=int(np.ceil(duration / config.spec_time_period)), + frequency=config.spec_height, ) @@ -138,18 +119,18 @@ def adjust_spec_duration( return spec if current_duration > duration: - return crop_axis( + return arrays.crop_dim( spec, dim="time", start=clip.start_time, - end=clip.start_time + duration, + stop=clip.start_time + duration, ) - return extend_axis( + return arrays.extend_dim( spec, dim="time", start=clip.start_time, - end=clip.start_time + duration, + stop=clip.start_time + duration, ) @@ -159,21 +140,15 @@ def load_clip_audio( scale: bool = SCALE_RAW_AUDIO, dtype: DTypeLike = np.float32, ) -> xr.DataArray: - wav = audio.load_clip(clip).sel(channel=0) + wav = audio.load_clip(clip).sel(channel=0).astype(dtype) wav = resample_audio(wav, target_sampling_rate, dtype=dtype) if scale: - wav = scale_audio(wav) + wav = ops.center(wav) + wav = ops.scale(wav, 1 / (10e-6 + np.max(np.abs(wav)))) - wav.coords["time"] = wav.time.assign_attrs( - unit="s", - long_name="Seconds since start of recording", - min=clip.start_time, - max=clip.end_time, - ) - - return wav + return wav.astype(dtype) def resample_audio( @@ -181,14 +156,14 @@ def resample_audio( target_samplerate: int = TARGET_SAMPLERATE_HZ, dtype: DTypeLike = np.float32, ) -> xr.DataArray: - if "samplerate" not in wav.attrs: - raise ValueError("Audio must have a 'samplerate' attribute") - if "time" not in wav.dims: raise ValueError("Audio must have a time 
dimension") time_axis: int = wav.get_axis_num("time") # type: ignore - original_samplerate = wav.attrs["samplerate"] + + start, stop = arrays.get_dim_range(wav, dim="time") + step = arrays.get_dim_step(wav, dim="time") + original_samplerate = int(1 / step) if original_samplerate == target_samplerate: return wav.astype(dtype) @@ -202,8 +177,8 @@ def resample_audio( ) resampled_times = np.linspace( - wav.time[0], - wav.time[-1], + start, + stop + step, len(resampled), endpoint=False, dtype=dtype, @@ -214,23 +189,15 @@ def resample_audio( dims=wav.dims, coords={ **wav.coords, - "time": resampled_times, - }, - attrs={ - **wav.attrs, - "samplerate": target_samplerate, + "time": arrays.create_time_dim_from_array( + resampled_times, + samplerate=target_samplerate, + ), }, + attrs=wav.attrs, ) -def scale_audio( - audio: xr.DataArray, - eps: float = 10e-6, -) -> xr.DataArray: - audio = audio - audio.mean() - return audio / np.add(np.abs(audio).max(), eps, dtype=audio.dtype) - - def compute_spectrogram( wav: xr.DataArray, fft_win_length: float = FFT_WIN_LENGTH_S, @@ -249,12 +216,12 @@ def compute_spectrogram( dtype=dtype, ) - spec = crop_axis( + spec = arrays.crop_dim( spec, dim="frequency", start=min_freq, - end=max_freq, - ) + stop=max_freq, + ).astype(dtype) spec = scale_spectrogram(spec, scale=spec_scale) @@ -262,172 +229,67 @@ def compute_spectrogram( spec = denoise_spectrogram(spec) if max_scale_spec: - spec = max_scale_spectrogram(spec) + spec = ops.scale(spec, 1 / (10e-6 + np.max(spec))) - return spec - - -def crop_axis( - arr: xr.DataArray, - dim: str, - start: float, - end: float, - right_closed: bool = False, - left_closed: bool = True, - eps: float = 10e-6, -) -> xr.DataArray: - coord = arr.coords[dim] - - if not all(attr in coord.attrs for attr in ["min", "max"]): - raise ValueError( - f"Coordinate '{dim}' must have 'min' and 'max' attributes" - ) - - current_min = coord.attrs["min"] - current_max = coord.attrs["max"] - - if start < current_min or end > current_max: - raise ValueError( - f"Cannot select axis '{dim}' from {start} to {end}. 
" - f"Axis range is {current_min} to {current_max}" - ) - - slice_end = end - if not right_closed: - slice_end = end - eps - - slice_start = start - if not left_closed: - slice_start = start + eps - - arr = arr.sel({dim: slice(slice_start, slice_end)}) - - arr.coords[dim].attrs.update( - min=start, - max=end, - ) - - return arr - - -def extend_axis( - arr: xr.DataArray, - dim: str, - start: float, - end: float, - fill_value: float = 0, -) -> xr.DataArray: - coord = arr.coords[dim] - - if not all(attr in coord.attrs for attr in ["min", "max", "period"]): - raise ValueError( - f"Coordinate '{dim}' must have 'min', 'max' and 'period' attributes" - " to extend axis" - ) - - current_min = coord.attrs["min"] - current_max = coord.attrs["max"] - period = coord.attrs["period"] - - coords = coord.data - - if start < current_min: - new_coords = np.arange( - current_min, - start, - -period, - dtype=coord.dtype, - )[1:][::-1] - coords = np.concatenate([new_coords, coords]) - - if end > current_max: - new_coords = np.arange( - current_max, - end, - period, - dtype=coord.dtype, - )[1:] - coords = np.concatenate([coords, new_coords]) - - arr = arr.reindex( - {dim: coords}, - fill_value=fill_value, # type: ignore - ) - - arr.coords[dim].attrs.update( - min=start, - max=end, - ) - - return arr + return spec.astype(dtype) def gen_mag_spectrogram( - audio: xr.DataArray, + wave: xr.DataArray, window_len: float, overlap_perc: float, dtype: DTypeLike = np.float32, ) -> xr.DataArray: - sampling_rate = audio.attrs["samplerate"] + start_time, end_time = arrays.get_dim_range(wave, dim="time") + step = arrays.get_dim_step(wave, dim="time") + sampling_rate = 1 / step + hop_len = window_len * (1 - overlap_perc) nfft = int(window_len * sampling_rate) noverlap = int(overlap_perc * nfft) - start_time = audio.time.attrs["min"] - end_time = audio.time.attrs["max"] # compute spec spec, _ = librosa.core.spectrum._spectrogram( - y=audio.data, + y=wave.data, power=1, n_fft=nfft, hop_length=nfft - noverlap, center=False, ) - spec = xr.DataArray( + return xr.DataArray( data=spec.astype(dtype), dims=["frequency", "time"], coords={ - "frequency": np.linspace( - 0, - sampling_rate / 2, - spec.shape[0], - endpoint=False, - dtype=dtype, + "frequency": arrays.create_frequency_dim_from_array( + np.linspace( + 0, + sampling_rate / 2, + spec.shape[0], + endpoint=False, + dtype=dtype, + ), + step=sampling_rate / nfft, ), - "time": np.linspace( - start_time, - end_time - (window_len - hop_len), - spec.shape[1], - endpoint=False, - dtype=dtype, + "time": arrays.create_time_dim_from_array( + np.linspace( + start_time, + end_time - (window_len - hop_len), + spec.shape[1], + endpoint=False, + dtype=dtype, + ), + step=hop_len, ), }, attrs={ - **audio.attrs, + **wave.attrs, + "original_samplerate": sampling_rate, "nfft": nfft, "noverlap": noverlap, }, ) - # Add metadata to coordinates - spec.coords["time"].attrs.update( - unit="s", - long_name="Time", - min=start_time, - max=end_time - (window_len - hop_len), - period=(nfft - noverlap) / sampling_rate, - ) - spec.coords["frequency"].attrs.update( - unit="Hz", - long_name="Frequency", - period=(sampling_rate / nfft), - min=0, - max=sampling_rate / 2, - ) - - return spec - def denoise_spectrogram( spec: xr.DataArray, @@ -436,10 +298,7 @@ def denoise_spectrogram( data=(spec - spec.mean("time")).clip(0), dims=spec.dims, coords=spec.coords, - attrs={ - **spec.attrs, - "denoised": 1, - }, + attrs=spec.attrs, ) @@ -448,8 +307,14 @@ def scale_spectrogram( scale: str = SPEC_SCALE, dtype: DTypeLike = 
np.float32, ) -> xr.DataArray: + samplerate = spec.attrs["original_samplerate"] + if scale == "pcen": - return pcen(spec, dtype=dtype) + smoothing_constant = get_pcen_smoothing_constant(samplerate / 10) + return audio.pcen( + spec * (2**31), + smooth=smoothing_constant, + ).astype(dtype) if scale == "log": return log_scale(spec, dtype=dtype) @@ -461,126 +326,25 @@ def log_scale( spec: xr.DataArray, dtype: DTypeLike = np.float32, ) -> xr.DataArray: + samplerate = spec.attrs["original_samplerate"] nfft = spec.attrs["nfft"] - sampling_rate = spec.attrs["samplerate"] log_scaling = ( 2.0 - * (1.0 / sampling_rate) + * (1.0 / samplerate) * (1.0 / (np.abs(np.hanning(nfft)) ** 2).sum()) ) return xr.DataArray( data=np.log1p(log_scaling * spec).astype(dtype), dims=spec.dims, coords=spec.coords, - attrs={ - **spec.attrs, - "scale": "log", - }, + attrs=spec.attrs, ) -def pcen(spec: xr.DataArray, dtype: DTypeLike = np.float32) -> xr.DataArray: - sampling_rate = spec.attrs["samplerate"] - data = librosa.pcen( - spec.data * (2**31), - sr=sampling_rate / 10, - ) - return xr.DataArray( - data=data.astype(dtype), - dims=spec.dims, - coords=spec.coords, - attrs={ - **spec.attrs, - "scale": "pcen", - }, - ) - - -def max_scale_spectrogram(spec: xr.DataArray, eps=10e-6) -> xr.DataArray: - return xr.DataArray( - data=spec / np.add(spec.max(), eps, dtype=spec.dtype), - dims=spec.dims, - coords=spec.coords, - attrs={ - **spec.attrs, - "max_scaled": 1, - }, - ) - - -def resize_spectrogram( - spec: xr.DataArray, - time_bins: int, - freq_bins: int, -) -> xr.DataArray: - new_times = np.linspace( - spec.time[0], - spec.time[-1], - time_bins, - dtype=spec.time.dtype, - endpoint=True, - ) - new_frequencies = np.linspace( - spec.frequency[0], - spec.frequency[-1], - freq_bins, - dtype=spec.frequency.dtype, - endpoint=True, - ) - - return spec.interp( - coords=dict( - time=new_times, - frequency=new_frequencies, - ), - method="linear", - ) - - -def get_dim_width(arr: xr.DataArray, dim: str) -> float: - coord = arr.coords[dim] - attrs = coord.attrs - if "min" in attrs and "max" in attrs: - return attrs["max"] - attrs["min"] - - coord_min = coord.min() - coord_max = coord.max() - return float(coord_max - coord_min) - - -class RandomClipProvider: - def __init__( - self, - clip_annotations: List[data.ClipAnnotation], - target_sampling_rate: int = TARGET_SAMPLERATE_HZ, - scale_audio: bool = SCALE_RAW_AUDIO, - ): - self.target_sampling_rate = target_sampling_rate - self.scale_audio = scale_audio - self.clip_annotations = clip_annotations - - def get_next_clip(self, clip: data.ClipAnnotation) -> data.ClipAnnotation: - tries = 0 - while True: - random_clip = random.choice(self.clip_annotations) - - if random_clip.clip != clip.clip: - return random_clip - - tries += 1 - if tries > 4: - raise ValueError("Could not find a different clip") - - def __call__( - self, - clip: data.ClipAnnotation, - ) -> Tuple[xr.DataArray, data.ClipAnnotation]: - random_clip = self.get_next_clip(clip) - - wav = load_clip_audio( - random_clip.clip, - target_sampling_rate=self.target_sampling_rate, - scale=self.scale_audio, - ) - - return wav, random_clip +def get_pcen_smoothing_constant( + sr: int, + time_constant: float = 0.4, + hop_length: int = 512, +) -> float: + t_frames = time_constant * sr / float(hop_length) + return (np.sqrt(1 + 4 * t_frames**2) - 1) / (2 * t_frames**2) diff --git a/batdetect2/detector/post_process.py b/batdetect2/detector/post_process.py index 1cf44fe..b47eec6 100644 --- a/batdetect2/detector/post_process.py +++ 
b/batdetect2/detector/post_process.py @@ -68,7 +68,6 @@ def run_nms( params["fft_win_length"], params["fft_overlap"], ) - print("duration", duration) top_k = int(duration * params["nms_top_k_per_sec"]) scores, y_pos, x_pos = get_topk_scores(pred_det_nms, top_k) diff --git a/batdetect2/models/__init__.py b/batdetect2/models/__init__.py index 8d026ca..ef37e70 100644 --- a/batdetect2/models/__init__.py +++ b/batdetect2/models/__init__.py @@ -1,91 +1,11 @@ -import os -from typing import Tuple, Union - -import torch - -from batdetect2.models.encoders import ( +from batdetect2.models.feature_extractors import ( Net2DFast, Net2DFastNoAttn, Net2DFastNoCoordConv, ) -from batdetect2.models.typing import DetectionModel __all__ = [ - "load_model", "Net2DFast", "Net2DFastNoAttn", "Net2DFastNoCoordConv", ] - -DEFAULT_MODEL_PATH = os.path.join( - os.path.dirname(os.path.dirname(__file__)), - "models", - "checkpoints", - "Net2DFast_UK_same.pth.tar", -) - - -def load_model( - model_path: str = DEFAULT_MODEL_PATH, - load_weights: bool = True, - device: Union[torch.device, str, None] = None, -) -> Tuple[DetectionModel, dict]: - """Load model from file. - - Args: - model_path (str): Path to model file. Defaults to DEFAULT_MODEL_PATH. - load_weights (bool, optional): Load weights. Defaults to True. - - Returns: - model, params: Model and parameters. - - Raises: - FileNotFoundError: Model file not found. - ValueError: Unknown model name. - """ - if device is None: - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - if not os.path.isfile(model_path): - raise FileNotFoundError("Model file not found.") - - net_params = torch.load(model_path, map_location=device) - - params = net_params["params"] - - model: DetectionModel - - if params["model_name"] == "Net2DFast": - model = Net2DFast( - params["num_filters"], - num_classes=len(params["class_names"]), - emb_dim=params["emb_dim"], - ip_height=params["ip_height"], - resize_factor=params["resize_factor"], - ) - elif params["model_name"] == "Net2DFastNoAttn": - model = Net2DFastNoAttn( - params["num_filters"], - num_classes=len(params["class_names"]), - emb_dim=params["emb_dim"], - ip_height=params["ip_height"], - resize_factor=params["resize_factor"], - ) - elif params["model_name"] == "Net2DFastNoCoordConv": - model = Net2DFastNoCoordConv( - params["num_filters"], - num_classes=len(params["class_names"]), - emb_dim=params["emb_dim"], - ip_height=params["ip_height"], - resize_factor=params["resize_factor"], - ) - else: - raise ValueError("Unknown model.") - - if load_weights: - model.load_state_dict(net_params["state_dict"]) - - model = model.to(device) - model.eval() - - return model, params diff --git a/batdetect2/models/detectors.py b/batdetect2/models/detectors.py index e324c3e..992e04d 100644 --- a/batdetect2/models/detectors.py +++ b/batdetect2/models/detectors.py @@ -1,100 +1,104 @@ +from typing import Type + import pytorch_lightning as L import torch import xarray as xr from soundevent import data from torch import nn, optim -from batdetect2.data.preprocessing import preprocess_audio_clip -from batdetect2.models.typing import EncoderModel, ModelOutput -from batdetect2.train import losses -from batdetect2.train.dataset import TrainExample +from batdetect2.data.preprocessing import ( + preprocess_audio_clip, + PreprocessingConfig, +) +from batdetect2.data.labels import ClassMapper +from batdetect2.models.feature_extractors import Net2DFast from batdetect2.models.post_process import ( PostprocessConfig, postprocess_model_outputs, ) -from 
batdetect2.train.preprocess import PreprocessingConfig +from batdetect2.models.typing import FeatureExtractorModel, ModelOutput +from batdetect2.train import losses +from batdetect2.train.dataset import TrainExample class DetectorModel(L.LightningModule): def __init__( self, - encoder: EncoderModel, - num_classes: int, + class_mapper: ClassMapper, + feature_extractor_class: Type[FeatureExtractorModel] = Net2DFast, learning_rate: float = 1e-3, + input_height: int = 128, + num_features: int = 32, preprocessing_config: PreprocessingConfig = PreprocessingConfig(), postprocessing_config: PostprocessConfig = PostprocessConfig(), ): super().__init__() + self.save_hyperparameters() + self.preprocessing_config = preprocessing_config self.postprocessing_config = postprocessing_config - self.num_classes = num_classes + self.class_mapper = class_mapper self.learning_rate = learning_rate + self.input_height = input_height + self.num_features = num_features + self.num_classes = class_mapper.num_classes - self.encoder = encoder + self.feature_extractor = feature_extractor_class( + input_height=input_height, + num_features=num_features, + ) self.classifier = nn.Conv2d( - self.encoder.num_filts // 4, + self.feature_extractor.num_features // 4, self.num_classes + 1, kernel_size=1, padding=0, ) self.bbox = nn.Conv2d( - self.encoder.num_filts // 4, + self.feature_extractor.num_features // 4, 2, kernel_size=1, padding=0, ) def forward(self, spec: torch.Tensor) -> ModelOutput: # type: ignore - features = self.encoder(spec) - + features = self.feature_extractor(spec) classification_logits = self.classifier(features) classification_probs = torch.softmax(classification_logits, dim=1) detection_probs = classification_probs[:, :-1].sum(dim=1, keepdim=True) - return ModelOutput( detection_probs=detection_probs, size_preds=self.bbox(features), - class_probs=classification_probs, + class_probs=classification_probs[:, :-1], features=features, ) def compute_spectrogram(self, clip: data.Clip) -> xr.DataArray: - config = self.preprocessing_config - return preprocess_audio_clip( clip, - target_sampling_rate=config.target_samplerate, - scale_audio=config.scale_audio, - fft_win_length=config.fft_win_length, - fft_overlap=config.fft_overlap, - max_freq=config.max_freq, - min_freq=config.min_freq, - spec_scale=config.spec_scale, - denoise_spec_avg=config.denoise_spec_avg, - max_scale_spec=config.max_scale_spec, + config=self.preprocessing_config, ) - def process_clip(self, clip: data.Clip): + def compute_clip_features(self, clip: data.Clip) -> torch.Tensor: + spectrogram = self.compute_spectrogram(clip) + return self.feature_extractor( + torch.tensor(spectrogram.values).unsqueeze(0).unsqueeze(0) + ) + + def compute_clip_predictions(self, clip: data.Clip) -> data.ClipPrediction: spectrogram = self.compute_spectrogram(clip) spec_tensor = ( torch.tensor(spectrogram.values).unsqueeze(0).unsqueeze(0) ) - outputs = self(spec_tensor) - - config = self.postprocessing_config return postprocess_model_outputs( outputs, [clip], - nms_kernel_size=config.nms_kernel_size, - detection_threshold=config.detection_threshold, - min_freq=config.min_freq, - max_freq=config.max_freq, - top_k_per_sec=config.top_k_per_sec, - ) + class_mapper=self.class_mapper, + config=self.postprocessing_config, + )[0] def compute_loss( self, @@ -124,21 +128,8 @@ class DetectorModel(L.LightningModule): self, batch: TrainExample, ): - features = self.encoder(batch.spec) - - classification_logits = self.classifier(features) - classification_probs = 
torch.softmax(classification_logits, dim=1) - detection_probs = classification_probs[:, :-1].sum(dim=1, keepdim=True) - - loss = self.compute_loss( - ModelOutput( - detection_probs=detection_probs, - size_preds=self.bbox(features), - class_probs=classification_probs, - features=features, - ), - batch, - ) + outputs = self.forward(batch.spec) + loss = self.compute_loss(outputs, batch) self.log("train_loss", loss) return loss diff --git a/batdetect2/models/encoders.py b/batdetect2/models/feature_extractors.py similarity index 67% rename from batdetect2/models/encoders.py rename to batdetect2/models/feature_extractors.py index 8b289ae..4c437b9 100644 --- a/batdetect2/models/encoders.py +++ b/batdetect2/models/feature_extractors.py @@ -5,7 +5,6 @@ import torch.fft import torch.nn.functional as F from torch import nn -from batdetect2.models.typing import EncoderModel from batdetect2.models.blocks import ( ConvBlockDownCoordF, ConvBlockDownStandard, @@ -13,6 +12,7 @@ from batdetect2.models.blocks import ( ConvBlockUpStandard, SelfAttention, ) +from batdetect2.models.typing import FeatureExtractorModel __all__ = [ "Net2DFast", @@ -21,84 +21,84 @@ __all__ = [ ] -class Net2DFast(EncoderModel): +class Net2DFast(FeatureExtractorModel): def __init__( self, - num_filts: int, + num_features: int, input_height: int = 128, ): super().__init__() - self.num_filts = num_filts + self.num_features = num_features self.input_height = input_height self.bottleneck_height = self.input_height // 32 # encoder self.conv_dn_0 = ConvBlockDownCoordF( 1, - self.num_filts // 4, + self.num_features // 4, self.input_height, k_size=3, pad_size=1, stride=1, ) self.conv_dn_1 = ConvBlockDownCoordF( - self.num_filts // 4, - self.num_filts // 2, + self.num_features // 4, + self.num_features // 2, self.input_height // 2, k_size=3, pad_size=1, stride=1, ) self.conv_dn_2 = ConvBlockDownCoordF( - self.num_filts // 2, - self.num_filts, + self.num_features // 2, + self.num_features, self.input_height // 4, k_size=3, pad_size=1, stride=1, ) self.conv_dn_3 = nn.Conv2d( - self.num_filts, - self.num_filts * 2, + self.num_features, + self.num_features * 2, 3, padding=1, ) - self.conv_dn_3_bn = nn.BatchNorm2d(self.num_filts * 2) + self.conv_dn_3_bn = nn.BatchNorm2d(self.num_features * 2) # bottleneck self.conv_1d = nn.Conv2d( - self.num_filts * 2, - self.num_filts * 2, + self.num_features * 2, + self.num_features * 2, (self.input_height // 8, 1), padding=0, ) - self.conv_1d_bn = nn.BatchNorm2d(self.num_filts * 2) - self.att = SelfAttention(self.num_filts * 2, self.num_filts * 2) + self.conv_1d_bn = nn.BatchNorm2d(self.num_features * 2) + self.att = SelfAttention(self.num_features * 2, self.num_features * 2) # decoder self.conv_up_2 = ConvBlockUpF( - self.num_filts * 2, - self.num_filts // 2, + self.num_features * 2, + self.num_features // 2, self.input_height // 8, ) self.conv_up_3 = ConvBlockUpF( - self.num_filts // 2, - self.num_filts // 4, + self.num_features // 2, + self.num_features // 4, self.input_height // 4, ) self.conv_up_4 = ConvBlockUpF( - self.num_filts // 4, - self.num_filts // 4, + self.num_features // 4, + self.num_features // 4, self.input_height // 2, ) self.conv_op = nn.Conv2d( - self.num_filts // 4, - self.num_filts // 4, + self.num_features // 4, + self.num_features // 4, kernel_size=3, padding=1, ) - self.conv_op_bn = nn.BatchNorm2d(self.num_filts // 4) + self.conv_op_bn = nn.BatchNorm2d(self.num_features // 4) def pad_adjust(self, spec: torch.Tensor) -> Tuple[torch.Tensor, int, int]: h, w = spec.shape[2:] @@ -135,81 
+135,81 @@ class Net2DFast(EncoderModel): return F.relu_(self.conv_op_bn(self.conv_op(x))) -class Net2DFastNoAttn(EncoderModel): +class Net2DFastNoAttn(FeatureExtractorModel): def __init__( self, - num_filts: int, + num_features: int, input_height: int = 128, ): super().__init__() - self.num_filts = num_filts + self.num_features = num_features self.input_height = input_height self.bottleneck_height = self.input_height // 32 self.conv_dn_0 = ConvBlockDownCoordF( 1, - self.num_filts // 4, + self.num_features // 4, self.input_height, k_size=3, pad_size=1, stride=1, ) self.conv_dn_1 = ConvBlockDownCoordF( - self.num_filts // 4, - self.num_filts // 2, + self.num_features // 4, + self.num_features // 2, self.input_height // 2, k_size=3, pad_size=1, stride=1, ) self.conv_dn_2 = ConvBlockDownCoordF( - self.num_filts // 2, - self.num_filts, + self.num_features // 2, + self.num_features, self.input_height // 4, k_size=3, pad_size=1, stride=1, ) self.conv_dn_3 = nn.Conv2d( - self.num_filts, - self.num_filts * 2, + self.num_features, + self.num_features * 2, 3, padding=1, ) - self.conv_dn_3_bn = nn.BatchNorm2d(self.num_filts * 2) + self.conv_dn_3_bn = nn.BatchNorm2d(self.num_features * 2) self.conv_1d = nn.Conv2d( - self.num_filts * 2, - self.num_filts * 2, + self.num_features * 2, + self.num_features * 2, (self.input_height // 8, 1), padding=0, ) - self.conv_1d_bn = nn.BatchNorm2d(self.num_filts * 2) + self.conv_1d_bn = nn.BatchNorm2d(self.num_features * 2) self.conv_up_2 = ConvBlockUpF( - self.num_filts * 2, - self.num_filts // 2, + self.num_features * 2, + self.num_features // 2, self.input_height // 8, ) self.conv_up_3 = ConvBlockUpF( - self.num_filts // 2, - self.num_filts // 4, + self.num_features // 2, + self.num_features // 4, self.input_height // 4, ) self.conv_up_4 = ConvBlockUpF( - self.num_filts // 4, - self.num_filts // 4, + self.num_features // 4, + self.num_features // 4, self.input_height // 2, ) self.conv_op = nn.Conv2d( - self.num_filts // 4, - self.num_filts // 4, + self.num_features // 4, + self.num_features // 4, kernel_size=3, padding=1, ) - self.conv_op_bn = nn.BatchNorm2d(self.num_filts // 4) + self.conv_op_bn = nn.BatchNorm2d(self.num_features // 4) def forward(self, spec: torch.Tensor) -> torch.Tensor: x1 = self.conv_dn_0(spec) @@ -227,80 +227,80 @@ class Net2DFastNoAttn(EncoderModel): return F.relu_(self.conv_op_bn(self.conv_op(x))) -class Net2DFastNoCoordConv(EncoderModel): +class Net2DFastNoCoordConv(FeatureExtractorModel): def __init__( self, - num_filts: int, + num_features: int, input_height: int = 128, ): super().__init__() - self.num_filts = num_filts + self.num_features = num_features self.input_height = input_height self.bottleneck_height = self.input_height // 32 self.conv_dn_0 = ConvBlockDownStandard( 1, - self.num_filts // 4, + self.num_features // 4, k_size=3, pad_size=1, stride=1, ) self.conv_dn_1 = ConvBlockDownStandard( - self.num_filts // 4, - self.num_filts // 2, + self.num_features // 4, + self.num_features // 2, k_size=3, pad_size=1, stride=1, ) self.conv_dn_2 = ConvBlockDownStandard( - self.num_filts // 2, - self.num_filts, + self.num_features // 2, + self.num_features, k_size=3, pad_size=1, stride=1, ) self.conv_dn_3 = nn.Conv2d( - self.num_filts, - self.num_filts * 2, + self.num_features, + self.num_features * 2, 3, padding=1, ) - self.conv_dn_3_bn = nn.BatchNorm2d(self.num_filts * 2) + self.conv_dn_3_bn = nn.BatchNorm2d(self.num_features * 2) self.conv_1d = nn.Conv2d( - self.num_filts * 2, - self.num_filts * 2, + self.num_features * 2, + 
self.num_features * 2, (self.input_height // 8, 1), padding=0, ) - self.conv_1d_bn = nn.BatchNorm2d(self.num_filts * 2) + self.conv_1d_bn = nn.BatchNorm2d(self.num_features * 2) - self.att = SelfAttention(self.num_filts * 2, self.num_filts * 2) + self.att = SelfAttention(self.num_features * 2, self.num_features * 2) self.conv_up_2 = ConvBlockUpStandard( - self.num_filts * 2, - self.num_filts // 2, + self.num_features * 2, + self.num_features // 2, self.input_height // 8, ) self.conv_up_3 = ConvBlockUpStandard( - self.num_filts // 2, - self.num_filts // 4, + self.num_features // 2, + self.num_features // 4, self.input_height // 4, ) self.conv_up_4 = ConvBlockUpStandard( - self.num_filts // 4, - self.num_filts // 4, + self.num_features // 4, + self.num_features // 4, self.input_height // 2, ) self.conv_op = nn.Conv2d( - self.num_filts // 4, - self.num_filts // 4, + self.num_features // 4, + self.num_features // 4, kernel_size=3, padding=1, ) - self.conv_op_bn = nn.BatchNorm2d(self.num_filts // 4) + self.conv_op_bn = nn.BatchNorm2d(self.num_features // 4) def forward(self, spec: torch.Tensor) -> torch.Tensor: x1 = self.conv_dn_0(spec) diff --git a/batdetect2/models/post_process.py b/batdetect2/models/post_process.py index 08b02fa..df3a47e 100644 --- a/batdetect2/models/post_process.py +++ b/batdetect2/models/post_process.py @@ -8,6 +8,7 @@ import torch from soundevent import data from torch import nn +from batdetect2.data.labels import ClassMapper from batdetect2.models.typing import ModelOutput __all__ = [ @@ -36,11 +37,8 @@ TagFunction = Callable[[int], List[data.Tag]] def postprocess_model_outputs( outputs: ModelOutput, clips: List[data.Clip], - nms_kernel_size: int = NMS_KERNEL_SIZE, - detection_threshold: float = DETECTION_THRESHOLD, - min_freq: int = 10000, - max_freq: int = 120000, - top_k_per_sec: int = TOP_K_PER_SEC, + class_mapper: ClassMapper, + config: PostprocessConfig, ) -> List[data.ClipPrediction]: """Postprocesses model outputs to generate clip predictions. @@ -57,16 +55,8 @@ def postprocess_model_outputs( clips List of clips for which predictions are made. The number of clips must match the batch dimension of the model outputs. - nms_kernel_size - Size of the non-maximum suppression kernel. Default is 9. - detection_threshold - Detection threshold. Default is 0.01. - min_freq - Minimum frequency. Default is 10000. - max_freq - Maximum frequency. Default is 120000. - top_k_per_sec - Top k per second. Default is 200. + config + Configuration for postprocessing model outputs. 
Returns ------- @@ -90,14 +80,14 @@ def postprocess_model_outputs( detection_probs = non_max_suppression( outputs.detection_probs, - kernel_size=nms_kernel_size, + kernel_size=config.nms_kernel_size, ) duration = clips[0].end_time - clips[0].start_time scores_batch, y_pos_batch, x_pos_batch = get_topk_scores( detection_probs, - int(top_k_per_sec * duration / 2), + int(config.top_k_per_sec * duration / 2), ) predictions: List[data.ClipPrediction] = [] @@ -118,9 +108,10 @@ def postprocess_model_outputs( size_preds, class_probs, features, - min_freq=min_freq, - max_freq=max_freq, - detection_threshold=detection_threshold, + class_mapper=class_mapper, + min_freq=config.min_freq, + max_freq=config.max_freq, + detection_threshold=config.detection_threshold, ) predictions.append( @@ -141,7 +132,7 @@ def compute_sound_events_from_outputs( size_preds: torch.Tensor, class_probs: torch.Tensor, features: torch.Tensor, - tag_fn: TagFunction = lambda _: [], + class_mapper: ClassMapper, min_freq: int = 10000, max_freq: int = 120000, detection_threshold: float = DETECTION_THRESHOLD, @@ -160,7 +151,6 @@ def compute_sound_events_from_outputs( predictions: List[data.SoundEventPrediction] = [] for score, x, y in zip(scores, x_pos, y_pos): width, height = size_preds[:, y, x] - print(width, height) class_prob = class_probs[:, y, x] feature = features[:, y, x] @@ -191,7 +181,7 @@ def compute_sound_events_from_outputs( predicted_tags: List[data.PredictedTag] = [] for label_id, class_score in enumerate(class_prob): - corresponding_tags = tag_fn(label_id) + corresponding_tags = class_mapper.inverse_transform(label_id) predicted_tags.extend( [ data.PredictedTag( diff --git a/batdetect2/models/typing.py b/batdetect2/models/typing.py index 40fbd4c..7a0d0ee 100644 --- a/batdetect2/models/typing.py +++ b/batdetect2/models/typing.py @@ -4,6 +4,11 @@ from typing import NamedTuple import torch import torch.nn as nn +__all__ = [ + "ModelOutput", + "FeatureExtractorModel", +] + class ModelOutput(NamedTuple): """Output of the detection model. 
@@ -36,12 +41,11 @@ class ModelOutput(NamedTuple): """Tensor with intermediate features.""" -class EncoderModel(ABC, nn.Module): - +class FeatureExtractorModel(ABC, nn.Module): input_height: int """Height of the input spectrogram.""" - num_filts: int + num_features: int """Dimension of the feature tensor.""" @abstractmethod diff --git a/batdetect2/train/augmentations.py b/batdetect2/train/augmentations.py new file mode 100644 index 0000000..f0b0130 --- /dev/null +++ b/batdetect2/train/augmentations.py @@ -0,0 +1,244 @@ +from functools import wraps +from typing import Callable, List, Optional, Tuple + +import numpy as np +import xarray as xr +from soundevent import data +from soundevent.geometry import compute_bounds + + +Augmentation = Callable[[xr.Dataset], xr.Dataset] + + +AUGMENTATION_PROBABILITY = 0.2 +MAX_DELAY = 0.005 +STRETCH_SQUEEZE_DELTA = 0.04 +MASK_MAX_TIME_PERC: float = 0.05 +MASK_MAX_FREQ_PERC: float = 0.10 + + +def maybe_apply( + augmentation: Callable, + prob: float = AUGMENTATION_PROBABILITY, +) -> Callable: + """Apply an augmentation with a given probability.""" + + @wraps(augmentation) + def _augmentation(x): + if np.random.rand() > prob: + return x + return augmentation(x) + + return _augmentation + + +def select_random_subclip( + train_example: xr.Dataset, + duration: Optional[float] = None, + proportion: float = 0.9, +) -> xr.Dataset: + """Select a random subclip from a clip.""" + + time_coords = train_example.coords["time"] + + start_time = time_coords.attrs.get("min", time_coords.min()) + end_time = time_coords.attrs.get("max", time_coords.max()) + + if duration is None: + duration = (end_time - start_time) * proportion + + start_time = np.random.uniform(start_time, end_time - duration) + return train_example.sel(time=slice(start_time, start_time + duration)) + + +def combine_audio( + audio1: xr.DataArray, + audio2: xr.DataArray, + alpha: Optional[float] = None, + min_alpha: float = 0.3, + max_alpha: float = 0.7, +) -> xr.DataArray: + """Combine two audio clips.""" + + if alpha is None: + alpha = np.random.uniform(min_alpha, max_alpha) + + return alpha * audio1 + (1 - alpha) * audio2.data + + +# def random_mix( +# audio: xr.DataArray, +# clip: data.ClipAnnotation, +# provider: Optional[ClipProvider] = None, +# alpha: Optional[float] = None, +# min_alpha: float = 0.3, +# max_alpha: float = 0.7, +# join_annotations: bool = True, +# ) -> Tuple[xr.DataArray, data.ClipAnnotation]: +# """Mix two audio clips.""" +# if provider is None: +# raise ValueError("No audio provider given.") +# +# try: +# other_audio, other_clip = provider(clip) +# except (StopIteration, ValueError): +# raise ValueError("No more audio sources available.") +# +# new_audio = combine_audio( +# audio, +# other_audio, +# alpha=alpha, +# min_alpha=min_alpha, +# max_alpha=max_alpha, +# ) +# +# if join_annotations: +# clip = clip.model_copy( +# update=dict( +# sound_events=clip.sound_events + other_clip.sound_events, +# ) +# ) +# +# return new_audio, clip + + +def add_echo( + train_example: xr.Dataset, + delay: Optional[float] = None, + alpha: Optional[float] = None, + min_alpha: float = 0.0, + max_alpha: float = 1.0, + max_delay: float = MAX_DELAY, +) -> xr.Dataset: + """Add a delay to the audio.""" + if delay is None: + delay = np.random.uniform(0, max_delay) + + if alpha is None: + alpha = np.random.uniform(min_alpha, max_alpha) + + spec = train_example["spectrogram"] + + time_coords = spec.coords["time"] + start_time = time_coords.attrs["min"] + end_time = time_coords.attrs["max"] + step = (end_time - 
start_time) / time_coords.size + + spec_delay = spec.shift(time=int(delay / step), fill_value=0) + + return train_example.assign(spectrogram=spec + alpha * spec_delay) + + +def scale_volume( + train_example: xr.Dataset, + factor: Optional[float] = None, + max_scaling: float = 2, + min_scaling: float = 0, +) -> xr.Dataset: + """Scale the volume of a spectrogram.""" + if factor is None: + factor = np.random.uniform(min_scaling, max_scaling) + + return train_example.assign( + spectrogram=train_example["spectrogram"] * factor + ) + + +def warp_spectrogram( + train_example: xr.Dataset, + factor: Optional[float] = None, + delta: float = STRETCH_SQUEEZE_DELTA, +) -> xr.Dataset: + """Warp a spectrogram.""" + if factor is None: + factor = np.random.uniform(1 - delta, 1 + delta) + + time_coords = train_example.coords["time"] + start_time = time_coords.attrs["min"] + end_time = time_coords.attrs["max"] + duration = end_time - start_time + + new_time = np.linspace( + start_time, + start_time + duration * factor, + train_example.time.size, + ) + + return train_example.interp(time=new_time) + + +def mask_axis( + train_example: xr.Dataset, + dim: str, + start: float, + end: float, + mask_all: bool = False, + mask_value: float = 0, +) -> xr.Dataset: + if dim not in train_example.dims: + raise ValueError(f"Axis {dim} not found in array") + + coord = train_example.coords[dim] + condition = (coord < start) | (coord > end) + + if mask_all: + return train_example.where(condition, other=mask_value) + + return train_example.assign( + spectrogram=train_example.spectrogram.where( + condition, other=mask_value + ) + ) + + +def mask_time( + train_example: xr.Dataset, + max_time_mask: float = MASK_MAX_TIME_PERC, + max_num_masks: int = 3, +) -> xr.Dataset: + """Mask a random section of the time axis.""" + + num_masks = np.random.randint(1, max_num_masks + 1) + + time_coord = train_example.coords["time"] + start_time = time_coord.attrs.get("min", time_coord.min()) + end_time = time_coord.attrs.get("max", time_coord.max()) + + for _ in range(num_masks): + mask_size = np.random.uniform(0, max_time_mask) + start = np.random.uniform(start_time, end_time - mask_size) + end = start + mask_size + train_example = mask_axis(train_example, "time", start, end) + + return train_example + + +def mask_frequency( + train_example: xr.Dataset, + max_freq_mask: float = MASK_MAX_FREQ_PERC, + max_num_masks: int = 3, +) -> xr.Dataset: + """Mask a random section of the frequency axis.""" + + num_masks = np.random.randint(1, max_num_masks + 1) + + freq_coord = train_example.coords["frequency"] + min_freq = freq_coord.min() + max_freq = freq_coord.max() + + for _ in range(num_masks): + mask_size = np.random.uniform(0, max_freq_mask) + start = np.random.uniform(min_freq, max_freq - mask_size) + end = start + mask_size + train_example = mask_axis(train_example, "frequency", start, end) + + return train_example + + +AUGMENTATIONS: List[Augmentation] = [ + select_random_subclip, + add_echo, + scale_volume, + mask_time, + mask_frequency, +] diff --git a/batdetect2/train/dataset.py b/batdetect2/train/dataset.py index 6b8df02..f51c507 100644 --- a/batdetect2/train/dataset.py +++ b/batdetect2/train/dataset.py @@ -1,16 +1,14 @@ import os -from typing import NamedTuple from pathlib import Path -from typing import Sequence, Union, Dict -from soundevent import data +from typing import Callable, Dict, NamedTuple, Optional, Sequence, Union -from torch.utils.data import Dataset import torch import xarray as xr +from soundevent import data +from 
torch.utils.data import Dataset from batdetect2.train.preprocess import PreprocessingConfig - __all__ = [ "TrainExample", "LabeledDataset", @@ -33,8 +31,13 @@ def get_files(directory: PathLike, extension: str = ".nc") -> Sequence[Path]: class LabeledDataset(Dataset): - def __init__(self, filenames: Sequence[PathLike]): + def __init__( + self, + filenames: Sequence[PathLike], + transform: Optional[Callable[[xr.Dataset], xr.Dataset]] = None, + ): self.filenames = filenames + self.transform = transform def __len__(self): return len(self.filenames) @@ -54,7 +57,7 @@ class LabeledDataset(Dataset): return cls(get_files(directory, extension)) def load(self, filename: PathLike) -> Dict[str, torch.Tensor]: - dataset = xr.open_dataset(filename) + dataset = self.get_dataset(filename) spectrogram = torch.tensor(dataset["spectrogram"].values).unsqueeze(0) return { "spectrogram": spectrogram, @@ -63,6 +66,15 @@ class LabeledDataset(Dataset): "size": torch.tensor(dataset["size"].values), } + def apply_augmentation(self, dataset: xr.Dataset) -> xr.Dataset: + if self.transform is not None: + return self.transform(dataset) + + return dataset + + def get_dataset(self, idx): + return xr.open_dataset(self.filenames[idx]) + def get_spectrogram(self, idx): return xr.open_dataset(self.filenames[idx])["spectrogram"] diff --git a/batdetect2/train/preprocess.py b/batdetect2/train/preprocess.py index e7284df..bfbcc64 100644 --- a/batdetect2/train/preprocess.py +++ b/batdetect2/train/preprocess.py @@ -9,21 +9,12 @@ from tqdm.auto import tqdm from multiprocessing import Pool import xarray as xr -from pydantic import BaseModel, Field from soundevent import data -from batdetect2.data.labels import TARGET_SIGMA, LabelFn, generate_heatmaps +from batdetect2.data.labels import TARGET_SIGMA, ClassMapper, generate_heatmaps from batdetect2.data.preprocessing import ( - DENOISE_SPEC_AVG, - FFT_OVERLAP, - FFT_WIN_LENGTH_S, - MAX_FREQ_HZ, - MAX_SCALE_SPEC, - MIN_FREQ_HZ, - SCALE_RAW_AUDIO, - SPEC_SCALE, - TARGET_SAMPLERATE_HZ, preprocess_audio_clip, + PreprocessingConfig, ) PathLike = Union[Path, str, os.PathLike] @@ -34,61 +25,24 @@ __all__ = [ ] -class PreprocessingConfig(BaseModel): - """Configuration for preprocessing data.""" - - target_samplerate: int = Field(default=TARGET_SAMPLERATE_HZ, gt=0) - - scale_audio: bool = Field(default=SCALE_RAW_AUDIO) - - fft_win_length: float = Field(default=FFT_WIN_LENGTH_S, gt=0) - - fft_overlap: float = Field(default=FFT_OVERLAP, ge=0, lt=1) - - max_freq: int = Field(default=MAX_FREQ_HZ, gt=0) - - min_freq: int = Field(default=MIN_FREQ_HZ, gt=0) - - spec_scale: str = Field(default=SPEC_SCALE) - - denoise_spec_avg: bool = DENOISE_SPEC_AVG - - max_scale_spec: bool = MAX_SCALE_SPEC - - target_sigma: float = Field(default=TARGET_SIGMA, gt=0) - - class_labels: Sequence[str] = ["bat"] - def generate_train_example( clip_annotation: data.ClipAnnotation, - label_fn: LabelFn = lambda _: None, - config: Optional[PreprocessingConfig] = None, + class_mapper: ClassMapper, + preprocessing_config: PreprocessingConfig = PreprocessingConfig(), + target_sigma: float = TARGET_SIGMA, ) -> xr.Dataset: """Generate a training example.""" - if config is None: - config = PreprocessingConfig() - spectrogram = preprocess_audio_clip( clip_annotation.clip, - target_sampling_rate=config.target_samplerate, - scale_audio=config.scale_audio, - fft_win_length=config.fft_win_length, - fft_overlap=config.fft_overlap, - max_freq=config.max_freq, - min_freq=config.min_freq, - spec_scale=config.spec_scale, - 
denoise_spec_avg=config.denoise_spec_avg, - max_scale_spec=config.max_scale_spec, + config=preprocessing_config, ) detection_heatmap, class_heatmap, size_heatmap = generate_heatmaps( clip_annotation, spectrogram, - target_sigma=config.target_sigma, - num_classes=len(config.class_labels), - class_labels=list(config.class_labels), - label_fn=label_fn, + class_mapper, + target_sigma=target_sigma, ) dataset = xr.Dataset( @@ -102,7 +56,8 @@ def generate_train_example( return dataset.assign_attrs( title=f"Training example for {clip_annotation.uuid}", - configuration=config.model_dump_json(), + preprocessing_configuration=preprocessing_config.model_dump_json(), + target_sigma=target_sigma, clip_annotation=clip_annotation.model_dump_json(), ) @@ -148,9 +103,10 @@ def preprocess_single_annotation( clip_annotation: data.ClipAnnotation, output_dir: PathLike, config: PreprocessingConfig, + class_mapper: ClassMapper, filename_fn: FilenameFn = _get_filename, replace: bool = False, - label_fn: LabelFn = lambda _: None, + target_sigma: float = TARGET_SIGMA, ) -> None: output_dir = Path(output_dir) @@ -162,8 +118,9 @@ def preprocess_single_annotation( sample = generate_train_example( clip_annotation, - label_fn=label_fn, - config=config, + class_mapper, + preprocessing_config=config, + target_sigma=target_sigma, ) save_to_file(sample, path) @@ -172,10 +129,11 @@ def preprocess_single_annotation( def preprocess_annotations( clip_annotations: Sequence[data.ClipAnnotation], output_dir: PathLike, + class_mapper: ClassMapper, + target_sigma: float = TARGET_SIGMA, filename_fn: FilenameFn = _get_filename, replace: bool = False, config_file: Optional[PathLike] = None, - label_fn: LabelFn = lambda _: None, max_workers: Optional[int] = None, **kwargs, ) -> None: @@ -198,9 +156,10 @@ def preprocess_annotations( preprocess_single_annotation, output_dir=output_dir, config=config, + class_mapper=class_mapper, filename_fn=filename_fn, replace=replace, - label_fn=label_fn, + target_sigma=target_sigma, ), clip_annotations, ), diff --git a/pyproject.toml b/pyproject.toml index f116a06..83f7312 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "torch>=1.13.1", "torchaudio", "torchvision", - "soundevent[audio,geometry,plot]>=1.3.5", + "soundevent[audio,geometry,plot]>=2.0", "click>=8.1.7", "netcdf4>=1.6.5", "tqdm>=4.66.2", diff --git a/requirements-dev.lock b/requirements-dev.lock index 930cdda..0fb3832 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -10,6 +10,8 @@ -e file:. 
absl-py==2.1.0 # via tensorboard +affine==2.4.0 + # via rasterio aiobotocore==2.12.3 # via s3fs aiohttp==3.9.5 @@ -37,6 +39,7 @@ async-timeout==4.0.3 # via redis attrs==23.2.0 # via aiohttp + # via rasterio audioread==3.0.1 # via librosa backcall==0.2.0 @@ -57,6 +60,7 @@ botocore==1.34.69 # via s3transfer certifi==2024.2.2 # via netcdf4 + # via rasterio # via requests cf-xarray==0.9.0 # via batdetect2 @@ -68,9 +72,16 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via batdetect2 + # via click-plugins + # via cligj # via lightning # via lightning-cloud + # via rasterio # via uvicorn +click-plugins==1.1.1 + # via rasterio +cligj==0.7.2 + # via rasterio comm==0.2.2 # via ipykernel contourpy==1.1.1 @@ -136,6 +147,7 @@ idna==3.7 importlib-metadata==7.1.0 # via jupyter-client # via markdown + # via rasterio importlib-resources==6.4.0 # via matplotlib # via typeshed-client @@ -229,9 +241,11 @@ numpy==1.24.4 # via onnx # via pandas # via pytorch-lightning + # via rasterio # via scikit-learn # via scipy # via shapely + # via snuggs # via soxr # via tensorboard # via tensorboardx @@ -335,6 +349,7 @@ pyjwt==2.8.0 # via lightning-cloud pyparsing==3.1.2 # via matplotlib + # via snuggs pytest==8.1.1 python-dateutil==2.9.0.post0 # via arrow @@ -361,6 +376,8 @@ pyyaml==6.0.1 pyzmq==26.0.0 # via ipykernel # via jupyter-client +rasterio==1.3.10 + # via soundevent readchar==4.0.6 # via inquirer redis==5.0.4 @@ -390,6 +407,7 @@ scipy==1.10.1 # via soundevent setuptools==69.5.1 # via lightning-utilities + # via rasterio # via readchar # via tensorboard shapely==2.0.3 @@ -402,7 +420,9 @@ six==1.16.0 # via tensorboard sniffio==1.3.1 # via anyio -soundevent==1.3.5 +snuggs==1.4.7 + # via rasterio +soundevent==2.0.0 # via batdetect2 soundfile==0.12.1 # via librosa diff --git a/requirements.lock b/requirements.lock index f2594d1..132dbc7 100644 --- a/requirements.lock +++ b/requirements.lock @@ -10,6 +10,8 @@ -e file:. 
absl-py==2.1.0 # via tensorboard +affine==2.4.0 + # via rasterio aiobotocore==2.12.3 # via s3fs aiohttp==3.9.5 @@ -35,6 +37,7 @@ async-timeout==4.0.3 # via redis attrs==23.2.0 # via aiohttp + # via rasterio audioread==3.0.1 # via librosa backoff==2.2.1 @@ -53,6 +56,7 @@ botocore==1.34.69 # via s3transfer certifi==2024.2.2 # via netcdf4 + # via rasterio # via requests cf-xarray==0.9.0 # via batdetect2 @@ -64,9 +68,16 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via batdetect2 + # via click-plugins + # via cligj # via lightning # via lightning-cloud + # via rasterio # via uvicorn +click-plugins==1.1.1 + # via rasterio +cligj==0.7.2 + # via rasterio contourpy==1.1.1 # via matplotlib croniter==1.4.1 @@ -123,6 +134,7 @@ idna==3.7 # via yarl importlib-metadata==7.1.0 # via markdown + # via rasterio importlib-resources==6.4.0 # via matplotlib # via typeshed-client @@ -199,9 +211,11 @@ numpy==1.24.4 # via onnx # via pandas # via pytorch-lightning + # via rasterio # via scikit-learn # via scipy # via shapely + # via snuggs # via soxr # via tensorboard # via tensorboardx @@ -286,6 +300,7 @@ pyjwt==2.8.0 # via lightning-cloud pyparsing==3.1.2 # via matplotlib + # via snuggs python-dateutil==2.9.0.post0 # via arrow # via botocore @@ -307,6 +322,8 @@ pyyaml==6.0.1 # via lightning # via omegaconf # via pytorch-lightning +rasterio==1.3.10 + # via soundevent readchar==4.0.6 # via inquirer redis==5.0.4 @@ -336,6 +353,7 @@ scipy==1.10.1 # via soundevent setuptools==69.5.1 # via lightning-utilities + # via rasterio # via readchar # via tensorboard shapely==2.0.3 @@ -347,7 +365,9 @@ six==1.16.0 # via tensorboard sniffio==1.3.1 # via anyio -soundevent==1.3.5 +snuggs==1.4.7 + # via rasterio +soundevent==2.0.0 # via batdetect2 soundfile==0.12.1 # via librosa
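
The patch above replaces the old clip-annotation based augmentations with functions that operate directly on the preprocessed xr.Dataset training examples, and LabeledDataset now accepts an optional transform callable. The sketch below shows one way the pieces could be wired together; it is illustrative only, not part of the patch. It assumes the dataset applies its transform when loading an example (e.g. via apply_augmentation), and the compose_augmentations helper and the example directory path are hypothetical names introduced here.

# Sketch: composing the functional augmentations into a single transform
# for LabeledDataset. Assumes the dataset calls its `transform` when
# loading examples; `compose_augmentations` and the path are placeholders.
from functools import reduce

import xarray as xr

from batdetect2.train.augmentations import AUGMENTATIONS, maybe_apply
from batdetect2.train.dataset import LabeledDataset, get_files


def compose_augmentations(augmentations, prob=0.2):
    """Chain augmentations, applying each one with probability `prob`."""
    steps = [maybe_apply(aug, prob=prob) for aug in augmentations]

    def _transform(example: xr.Dataset) -> xr.Dataset:
        # Each step takes and returns an xr.Dataset training example.
        return reduce(lambda ds, step: step(ds), steps, example)

    return _transform


# Each .nc file holds one preprocessed example (spectrogram + heatmaps).
filenames = get_files("path/to/preprocessed/examples")
dataset = LabeledDataset(
    filenames,
    transform=compose_augmentations(AUGMENTATIONS),
)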