2026-07-08 05:10:09 +02:00
13 changed files with 88 additions and 120 deletions
--- a/src/batdetect2/data/annotations/legacy.py
+++ b/src/batdetect2/data/annotations/legacy.py
@ -89,9 +89,18 @@ def annotation_to_sound_event(
        uuid=uuid.uuid5(NAMESPACE, f"{sound_event.uuid}_annotation"),
        sound_event=sound_event,
        tags=[
-            data.Tag(key=label_key, value=annotation.label),
+            data.Tag(
-            data.Tag(key=event_key, value=annotation.event),
+                key=label_key,  # type: ignore
-            data.Tag(key=individual_key, value=str(annotation.individual)),
+                value=annotation.label,
            ),
            data.Tag(
                key=event_key,  # type: ignore
                value=annotation.event,
            ),
            data.Tag(
                key=individual_key,  # type: ignore
                value=str(annotation.individual),
            ),
        ],
    )
@ -112,7 +121,12 @@ def file_annotation_to_clip(
    recording = data.Recording.from_file(
        full_path,
        time_expansion=file_annotation.time_exp,
-        tags=[data.Tag(key=label_key, value=file_annotation.label)],
+        tags=[
            data.Tag(
                key=label_key,  # type: ignore
                value=file_annotation.label,
            )
        ],
    )
    return data.Clip(
@ -139,7 +153,12 @@ def file_annotation_to_clip_annotation(
        uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip_annotation"),
        clip=clip,
        notes=notes,
-        tags=[data.Tag(key=label_key, value=file_annotation.label)],
+        tags=[
            data.Tag(
                key=label_key,  # type: ignore
                value=file_annotation.label,
            )
        ],
        sound_events=[
            annotation_to_sound_event(
                annotation,
--- a/src/batdetect2/evaluate/match.py
+++ b/src/batdetect2/evaluate/match.py
@ -57,7 +57,6 @@ class MatchConfig(BaseConfig):
    affinity_threshold: float = 0.0
    time_buffer: float = 0.005
    frequency_buffer: float = 1_000
    ignore_start_end: float = 0.01
 def _to_bbox(geometry: data.Geometry) -> data.BoundingBox:
@ -274,17 +273,6 @@ def greedy_match(
        yield None, target_idx, 0
 def _is_in_bounds(
    geometry: data.Geometry,
    clip: data.Clip,
    buffer: float,
 ) -> bool:
    start_time = compute_bounds(geometry)[0]
    return (start_time >= clip.start_time + buffer) and (
        start_time <= clip.end_time - buffer
    )
 def match_sound_events_and_raw_predictions(
    clip_annotation: data.ClipAnnotation,
    raw_predictions: List[RawPrediction],
@ -298,29 +286,14 @@ def match_sound_events_and_raw_predictions(
        for sound_event_annotation in clip_annotation.sound_events
        if targets.filter(sound_event_annotation)
        and sound_event_annotation.sound_event.geometry is not None
        and _is_in_bounds(
            sound_event_annotation.sound_event.geometry,
            clip=clip_annotation.clip,
            buffer=config.ignore_start_end,
        )
    ]
-    target_geometries: List[data.Geometry] = [
+    target_geometries: List[data.Geometry] = [  # type: ignore
        sound_event_annotation.sound_event.geometry
        for sound_event_annotation in target_sound_events
        if sound_event_annotation.sound_event.geometry is not None
    ]
    raw_predictions = [
        raw_prediction
        for raw_prediction in raw_predictions
        if _is_in_bounds(
            raw_prediction.geometry,
            clip=clip_annotation.clip,
            buffer=config.ignore_start_end,
        )
    ]
    predicted_geometries = [
        raw_prediction.geometry for raw_prediction in raw_predictions
    ]
--- a/src/batdetect2/models/blocks.py
+++ b/src/batdetect2/models/blocks.py
@ -225,7 +225,7 @@ class ConvBlock(nn.Module):
            kernel_size=kernel_size,
            padding=pad_size,
        )
-        self.batch_norm = nn.BatchNorm2d(out_channels)
+        self.conv_bn = nn.BatchNorm2d(out_channels)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply Conv -> BN -> ReLU.
@ -240,7 +240,7 @@ class ConvBlock(nn.Module):
        torch.Tensor
            Output tensor, shape `(B, C_out, H, W)`.
        """
-        return F.relu_(self.batch_norm(self.conv(x)))
+        return F.relu_(self.conv_bn(self.conv(x)))
 class VerticalConv(nn.Module):
@ -364,7 +364,7 @@ class FreqCoordConvDownBlock(nn.Module):
            padding=pad_size,
            stride=1,
        )
-        self.batch_norm = nn.BatchNorm2d(out_channels)
+        self.conv_bn = nn.BatchNorm2d(out_channels)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply CoordF -> Conv -> MaxPool -> BN -> ReLU.
@ -383,7 +383,7 @@ class FreqCoordConvDownBlock(nn.Module):
        freq_info = self.coords.repeat(x.shape[0], 1, 1, x.shape[3])
        x = torch.cat((x, freq_info), 1)
        x = F.max_pool2d(self.conv(x), 2, 2)
-        x = F.relu(self.batch_norm(x), inplace=True)
+        x = F.relu(self.conv_bn(x), inplace=True)
        return x
@ -438,7 +438,7 @@ class StandardConvDownBlock(nn.Module):
            padding=pad_size,
            stride=1,
        )
-        self.batch_norm = nn.BatchNorm2d(out_channels)
+        self.conv_bn = nn.BatchNorm2d(out_channels)
    def forward(self, x):
        """Apply Conv -> MaxPool -> BN -> ReLU.
@ -454,7 +454,7 @@ class StandardConvDownBlock(nn.Module):
            Output tensor, shape `(B, C_out, H/2, W/2)`.
        """
        x = F.max_pool2d(self.conv(x), 2, 2)
-        return F.relu(self.batch_norm(x), inplace=True)
+        return F.relu(self.conv_bn(x), inplace=True)
 class FreqCoordConvUpConfig(BaseConfig):
@ -534,7 +534,7 @@ class FreqCoordConvUpBlock(nn.Module):
            kernel_size=kernel_size,
            padding=pad_size,
        )
-        self.batch_norm = nn.BatchNorm2d(out_channels)
+        self.conv_bn = nn.BatchNorm2d(out_channels)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply Interpolate -> Concat Coords -> Conv -> BN -> ReLU.
@ -562,7 +562,7 @@ class FreqCoordConvUpBlock(nn.Module):
        freq_info = self.coords.repeat(op.shape[0], 1, 1, op.shape[3])
        op = torch.cat((op, freq_info), 1)
        op = self.conv(op)
-        op = F.relu(self.batch_norm(op), inplace=True)
+        op = F.relu(self.conv_bn(op), inplace=True)
        return op
@ -625,7 +625,7 @@ class StandardConvUpBlock(nn.Module):
            kernel_size=kernel_size,
            padding=pad_size,
        )
-        self.batch_norm = nn.BatchNorm2d(out_channels)
+        self.conv_bn = nn.BatchNorm2d(out_channels)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply Interpolate -> Conv -> BN -> ReLU.
@ -650,7 +650,7 @@ class StandardConvUpBlock(nn.Module):
            align_corners=False,
        )
        op = self.conv(op)
-        op = F.relu(self.batch_norm(op), inplace=True)
+        op = F.relu(self.conv_bn(op), inplace=True)
        return op
--- a/src/batdetect2/plotting/common.py
+++ b/src/batdetect2/plotting/common.py
@ -32,12 +32,9 @@ def plot_spectrogram(
    max_freq: Optional[float] = None,
    ax: Optional[axes.Axes] = None,
    figsize: Optional[Tuple[int, int]] = None,
    add_colorbar: bool = False,
    colorbar_kwargs: Optional[dict] = None,
    vmin: Optional[float] = None,
    vmax: Optional[float] = None,
    cmap="gray",
 ) -> axes.Axes:
    if isinstance(spec, torch.Tensor):
        spec = spec.numpy()
@ -57,16 +54,10 @@ def plot_spectrogram(
    if max_freq is None:
        max_freq = spec.shape[-2]
-    mappable = ax.pcolormesh(
+    ax.pcolormesh(
        np.linspace(start_time, end_time, spec.shape[-1] + 1, endpoint=True),
        np.linspace(min_freq, max_freq, spec.shape[-2] + 1, endpoint=True),
        spec,
        cmap=cmap,
        vmin=vmin,
        vmax=vmax,
    )
    if add_colorbar:
        plt.colorbar(mappable, ax=ax, **(colorbar_kwargs or {}))
    return ax
--- a/src/batdetect2/plotting/evaluation.py
+++ b/src/batdetect2/plotting/evaluation.py
@ -136,7 +136,7 @@ def plot_class_examples(
                preprocessor=preprocessor,
                duration=duration,
            )
-        except (ValueError, AssertionError, RuntimeError, FileNotFoundError):
+        except (ValueError, AssertionError, RuntimeError):
            continue
    return fig
--- a/src/batdetect2/postprocess/init.py
+++ b/src/batdetect2/postprocess/init.py
@ -51,7 +51,7 @@ __all__ = [
 DEFAULT_DETECTION_THRESHOLD = 0.01
-TOP_K_PER_SEC = 100
+TOP_K_PER_SEC = 200
 class PostprocessConfig(BaseConfig):
@ -206,13 +206,11 @@ class Postprocessor(torch.nn.Module, PostprocessorProtocol):
        if clips is None:
            return detections
        width = output.detection_probs.shape[-1]
        duration = width / self.samplerate
        return [
            map_detection_to_clip(
                detection,
                start_time=clip.start_time,
-                end_time=clip.start_time + duration,
+                end_time=clip.end_time,
                min_freq=self.min_freq,
                max_freq=self.max_freq,
            )
@ -222,9 +220,9 @@ class Postprocessor(torch.nn.Module, PostprocessorProtocol):
 def get_raw_predictions(
    output: ModelOutput,
    clips: List[data.Clip],
    targets: TargetProtocol,
    postprocessor: PostprocessorProtocol,
    clips: Optional[List[data.Clip]] = None,
 ) -> List[List[RawPrediction]]:
    """Extract intermediate RawPrediction objects for a batch.
@ -261,9 +259,9 @@ def get_sound_event_predictions(
 ) -> List[List[BatDetect2Prediction]]:
    raw_predictions = get_raw_predictions(
        output,
        clips,
        targets=targets,
        postprocessor=postprocessor,
        clips=clips,
    )
    return [
        [
@ -310,9 +308,9 @@ def get_predictions(
    """
    raw_predictions = get_raw_predictions(
        output,
        clips,
        targets=targets,
        postprocessor=postprocessor,
        clips=clips,
    )
    return [
        convert_raw_predictions_to_clip_prediction(
--- a/src/batdetect2/targets/init.py
+++ b/src/batdetect2/targets/init.py
@ -28,17 +28,12 @@ from batdetect2.targets.rois import (
    ROITargetMapper,
    build_roi_mapper,
 )
-from batdetect2.targets.terms import (
+from batdetect2.targets.terms import call_type, individual
    call_type,
    data_source,
    generic_class,
    individual,
 )
 from batdetect2.typing.targets import Position, Size, TargetProtocol
 __all__ = [
    "AnchorBBoxMapperConfig",
    "DEFAULT_TARGET_CONFIG",
    "AnchorBBoxMapperConfig",
    "ROITargetMapper",
    "SoundEventDecoder",
    "SoundEventEncoder",
@ -49,8 +44,6 @@ __all__ = [
    "build_sound_event_decoder",
    "build_sound_event_encoder",
    "call_type",
    "data_source",
    "generic_class",
    "get_class_names_from_config",
    "individual",
    "load_target_config",
--- a/src/batdetect2/targets/classes.py
+++ b/src/batdetect2/targets/classes.py
@ -14,7 +14,7 @@ from batdetect2.data.conditions import (
    SoundEventConditionConfig,
    build_sound_event_condition,
 )
-from batdetect2.targets.rois import ROIMapperConfig
+from batdetect2.targets.rois import AnchorBBoxMapperConfig, ROIMapperConfig
 from batdetect2.typing.targets import SoundEventDecoder, SoundEventEncoder
 __all__ = [
@ -140,6 +140,7 @@ DEFAULT_CLASSES = [
    TargetClassConfig(
        name="rhihip",
        tags=[data.Tag(key="class", value="Rhinolophus hipposideros")],
        roi=AnchorBBoxMapperConfig(anchor="top-left"),
    ),
    TargetClassConfig(
        name="nyclei",
@ -148,6 +149,7 @@ DEFAULT_CLASSES = [
    TargetClassConfig(
        name="rhifer",
        tags=[data.Tag(key="class", value="Rhinolophus ferrumequinum")],
        roi=AnchorBBoxMapperConfig(anchor="top-left"),
    ),
    TargetClassConfig(
        name="pleaur",
--- a/src/batdetect2/targets/terms.py
+++ b/src/batdetect2/targets/terms.py
@ -6,7 +6,6 @@ __all__ = [
    "call_type",
    "individual",
    "data_source",
    "generic_class",
 ]
 # The default key used to reference the 'generic_class' term.
--- a/src/batdetect2/train/config.py
+++ b/src/batdetect2/train/config.py
@ -52,7 +52,7 @@ class ValLoaderConfig(BaseConfig):
    num_workers: int = 0
    clipping_strategy: ClipConfig = Field(
-        default_factory=lambda: PaddedClipConfig()
+        default_factory=lambda: RandomClipConfig()
    )
--- a/src/batdetect2/train/labels.py
+++ b/src/batdetect2/train/labels.py
@ -14,8 +14,7 @@ from loguru import logger
 from soundevent import data
 from batdetect2.configs import BaseConfig, load_config
-from batdetect2.preprocess import MAX_FREQ, MIN_FREQ
+from batdetect2.targets import iterate_encoded_sound_events
 from batdetect2.targets import build_targets, iterate_encoded_sound_events
 from batdetect2.typing import (
    ClipLabeller,
    Heatmaps,
@ -46,9 +45,9 @@ class LabelConfig(BaseConfig):
 def build_clip_labeler(
-    targets: Optional[TargetProtocol] = None,
+    targets: TargetProtocol,
-    min_freq: float = MIN_FREQ,
+    min_freq: float,
-    max_freq: float = MAX_FREQ,
+    max_freq: float,
    config: Optional[LabelConfig] = None,
 ) -> ClipLabeller:
    """Construct the final clip labelling function."""
@ -57,10 +56,6 @@ def build_clip_labeler(
        "Building clip labeler with config: \n{}",
        lambda: config.to_yaml_string(),
    )
    if targets is None:
        targets = build_targets()
    return partial(
        generate_heatmaps,
        targets=targets,
--- a/src/batdetect2/train/train.py
+++ b/src/batdetect2/train/train.py
@ -226,9 +226,9 @@ def build_trainer(
 def build_train_loader(
    clip_annotations: Sequence[data.ClipAnnotation],
-    audio_loader: Optional[AudioLoader] = None,
+    audio_loader: AudioLoader,
-    labeller: Optional[ClipLabeller] = None,
+    labeller: ClipLabeller,
-    preprocessor: Optional[PreprocessorProtocol] = None,
+    preprocessor: PreprocessorProtocol,
    config: Optional[TrainLoaderConfig] = None,
    num_workers: Optional[int] = None,
 ) -> DataLoader:
@ -260,9 +260,9 @@ def build_train_loader(
 def build_val_loader(
    clip_annotations: Sequence[data.ClipAnnotation],
-    audio_loader: Optional[AudioLoader] = None,
+    audio_loader: AudioLoader,
-    labeller: Optional[ClipLabeller] = None,
+    labeller: ClipLabeller,
-    preprocessor: Optional[PreprocessorProtocol] = None,
+    preprocessor: PreprocessorProtocol,
    config: Optional[ValLoaderConfig] = None,
    num_workers: Optional[int] = None,
 ):
@ -293,9 +293,9 @@ def build_val_loader(
 def build_train_dataset(
    clip_annotations: Sequence[data.ClipAnnotation],
-    audio_loader: Optional[AudioLoader] = None,
+    audio_loader: AudioLoader,
-    labeller: Optional[ClipLabeller] = None,
+    labeller: ClipLabeller,
-    preprocessor: Optional[PreprocessorProtocol] = None,
+    preprocessor: PreprocessorProtocol,
    config: Optional[TrainLoaderConfig] = None,
 ) -> TrainingDataset:
    logger.info("Building training dataset...")
@ -303,18 +303,6 @@ def build_train_dataset(
    clipper = build_clipper(config=config.clipping_strategy)
    if audio_loader is None:
        audio_loader = build_audio_loader()
    if preprocessor is None:
        preprocessor = build_preprocessor()
    if labeller is None:
        labeller = build_clip_labeler(
            min_freq=preprocessor.min_freq,
            max_freq=preprocessor.max_freq,
        )
    random_example_source = RandomAudioSource(
        clip_annotations,
        audio_loader=audio_loader,
@ -344,26 +332,14 @@ def build_train_dataset(
 def build_val_dataset(
    clip_annotations: Sequence[data.ClipAnnotation],
-    audio_loader: Optional[AudioLoader] = None,
+    audio_loader: AudioLoader,
-    labeller: Optional[ClipLabeller] = None,
+    labeller: ClipLabeller,
-    preprocessor: Optional[PreprocessorProtocol] = None,
+    preprocessor: PreprocessorProtocol,
    config: Optional[ValLoaderConfig] = None,
 ) -> ValidationDataset:
    logger.info("Building validation dataset...")
    config = config or ValLoaderConfig()
    if audio_loader is None:
        audio_loader = build_audio_loader()
    if preprocessor is None:
        preprocessor = build_preprocessor()
    if labeller is None:
        labeller = build_clip_labeler(
            min_freq=preprocessor.min_freq,
            max_freq=preprocessor.max_freq,
        )
    clipper = build_clipper(config.clipping_strategy)
    return ValidationDataset(
        clip_annotations,
--- a/src/batdetect2/typing/postprocess.py
+++ b/src/batdetect2/typing/postprocess.py
@ -47,7 +47,29 @@ class GeometryDecoder(Protocol):
 class RawPrediction(NamedTuple):
-    """Intermediate representation of a single detected sound event."""
+    """Intermediate representation of a single detected sound event.
    Holds extracted information about a detection after initial processing
    (like peak finding, coordinate remapping, geometry recovery) but before
    final class decoding and conversion into a `SoundEventPrediction`. This
    can be useful for evaluation or simpler data handling formats.
    Attributes
    ----------
    geometry: data.Geometry
        The recovered estimated geometry of the detected sound event.
        Usually a bounding box.
    detection_score : float
        The confidence score associated with this detection, typically from
        the detection heatmap peak.
    class_scores : xr.DataArray
        An xarray DataArray containing the predicted probabilities or scores
        for each target class at the detection location. Indexed by a
        'category' coordinate containing class names.
    features : xr.DataArray
        An xarray DataArray containing extracted feature vectors at the
        detection location. Indexed by a 'feature' coordinate.
    """
    geometry: data.Geometry
    detection_score: float