Updated postprocess module with docstrings

2025-06-29 14:41:58 +02:00 · 2025-04-20 13:56:18 +01:00 · 2025-04-20 13:56:18 +01:00 · bcf339c40d
commit bcf339c40d
parent 089328a4f0
10 changed files with 1828 additions and 161 deletions
--- a/batdetect2/postprocess/__init__.py
+++ b/batdetect2/postprocess/__init__.py
@ -0,0 +1,566 @@
+"""Main entry point for the BatDetect2 Postprocessing pipeline.
+
+This package (`batdetect2.postprocess`) takes the raw outputs from a trained
+BatDetect2 neural network model and transforms them into meaningful, structured
+predictions, typically in the form of `soundevent.data.ClipPrediction` objects
+containing detected sound events with associated class tags and geometry.
+
+The pipeline involves several configurable steps, implemented in submodules:
+1.  Non-Maximum Suppression (`.nms`): Isolates distinct detection peaks.
+2.  Coordinate Remapping (`.remapping`): Adds real-world time/frequency
+    coordinates to raw model output arrays.
+3.  Detection Extraction (`.detection`): Identifies candidate detection points
+    (location and score) based on thresholds and score ranking (top-k).
+4.  Data Extraction (`.extraction`): Gathers associated model outputs (size,
+    class probabilities, features) at the detected locations.
+5.  Decoding & Formatting (`.decoding`): Converts extracted numerical data and
+    class predictions into interpretable `soundevent` objects, including
+    recovering geometry (ROIs) and decoding class names back to standard tags.
+
+This module provides the primary interface:
+- `PostprocessConfig`: A configuration object for postprocessing parameters
+  (thresholds, NMS kernel size, etc.).
+- `load_postprocess_config`: Function to load the configuration from a file.
+- `Postprocessor`: The main class (implementing `PostprocessorProtocol`) that
+  holds the configured pipeline logic.
+- `build_postprocessor`: A factory function to create a `Postprocessor`
+  instance, linking it to the necessary target definitions (`TargetProtocol`).
+It also re-exports key components from submodules for convenience.
+"""
+
+from typing import List, Optional
+
+import xarray as xr
+from pydantic import Field
+from soundevent import data
+
+from batdetect2.configs import BaseConfig, load_config
+from batdetect2.models.types import ModelOutput
+from batdetect2.postprocess.decoding import (
+    DEFAULT_CLASSIFICATION_THRESHOLD,
+    convert_raw_predictions_to_clip_prediction,
+    convert_xr_dataset_to_raw_prediction,
+)
+from batdetect2.postprocess.detection import (
+    DEFAULT_DETECTION_THRESHOLD,
+    TOP_K_PER_SEC,
+    extract_detections_from_array,
+    get_max_detections,
+)
+from batdetect2.postprocess.extraction import (
+    extract_detection_xr_dataset,
+)
+from batdetect2.postprocess.nms import (
+    NMS_KERNEL_SIZE,
+    non_max_suppression,
+)
+from batdetect2.postprocess.remapping import (
+    classification_to_xarray,
+    detection_to_xarray,
+    features_to_xarray,
+    sizes_to_xarray,
+)
+from batdetect2.postprocess.types import PostprocessorProtocol, RawPrediction
+from batdetect2.preprocess import MAX_FREQ, MIN_FREQ
+from batdetect2.targets.types import TargetProtocol
+
+__all__ = [
+    "DEFAULT_CLASSIFICATION_THRESHOLD",
+    "DEFAULT_DETECTION_THRESHOLD",
+    "MAX_FREQ",
+    "MIN_FREQ",
+    "ModelOutput",
+    "NMS_KERNEL_SIZE",
+    "PostprocessConfig",
+    "Postprocessor",
+    "PostprocessorProtocol",
+    "RawPrediction",
+    "TOP_K_PER_SEC",
+    "build_postprocessor",
+    "classification_to_xarray",
+    "convert_raw_predictions_to_clip_prediction",
+    "convert_xr_dataset_to_raw_prediction",
+    "detection_to_xarray",
+    "extract_detection_xr_dataset",
+    "extract_detections_from_array",
+    "features_to_xarray",
+    "get_max_detections",
+    "load_postprocess_config",
+    "non_max_suppression",
+    "sizes_to_xarray",
+]
+
+
+class PostprocessConfig(BaseConfig):
+    """Configuration settings for the postprocessing pipeline.
+
+    Defines the tunable parameters that control how raw model outputs are
+    converted into final detections. Each field carries a `Field` constraint
+    (declared below) that bounds the accepted values.
+
+    The frequency range of the model output is not part of this
+    configuration; it is passed separately (`min_freq` / `max_freq`) to
+    `build_postprocessor` and `Postprocessor`.
+
+    Attributes
+    ----------
+    nms_kernel_size : int, default=NMS_KERNEL_SIZE
+        Size (pixels) of the kernel/neighborhood for Non-Maximum Suppression.
+        Used to suppress weaker detections near stronger peaks. Must be
+        strictly positive (`gt=0`).
+    detection_threshold : float, default=DEFAULT_DETECTION_THRESHOLD
+        Minimum confidence score from the detection heatmap required to
+        consider a point as a potential detection. Must be >= 0; no upper
+        bound is enforced.
+    classification_threshold : float, default=DEFAULT_CLASSIFICATION_THRESHOLD
+        Minimum confidence score for a specific class prediction to be included
+        in the decoded tags for a detection. Must be >= 0; no upper bound is
+        enforced.
+    top_k_per_sec : int, default=TOP_K_PER_SEC
+        Desired maximum number of detections per second of audio. Used by
+        `get_max_detections` to calculate an absolute limit based on clip
+        duration before applying `extract_detections_from_array`. Must be
+        strictly positive (`gt=0`).
+    """
+
+    nms_kernel_size: int = Field(default=NMS_KERNEL_SIZE, gt=0)
+    detection_threshold: float = Field(
+        default=DEFAULT_DETECTION_THRESHOLD,
+        ge=0,
+    )
+    classification_threshold: float = Field(
+        default=DEFAULT_CLASSIFICATION_THRESHOLD,
+        ge=0,
+    )
+    top_k_per_sec: int = Field(default=TOP_K_PER_SEC, gt=0)
+
+
+def load_postprocess_config(
+    path: data.PathLike,
+    field: Optional[str] = None,
+) -> PostprocessConfig:
+    """Read a postprocessing configuration from a file.
+
+    Thin wrapper around `load_config` that fixes the schema to
+    `PostprocessConfig`.
+
+    Parameters
+    ----------
+    path : PathLike
+        Location of the configuration file.
+    field : str, optional
+        Forwarded to `load_config` to select a nested section of the file
+        that holds the postprocessing settings (e.g.
+        "inference.postprocessing"). When None, no section is selected.
+
+    Returns
+    -------
+    PostprocessConfig
+        The configuration object produced by `load_config`.
+
+    Raises
+    ------
+    Exception
+        Any error raised by `load_config` propagates unchanged. Presumably
+        this covers a missing file, unparsable content, data that does not
+        satisfy the `PostprocessConfig` schema, and an invalid `field`
+        path -- confirm against `load_config`.
+    """
+    loaded = load_config(path, schema=PostprocessConfig, field=field)
+    return loaded
+
+
+def build_postprocessor(
+    targets: TargetProtocol,
+    config: Optional[PostprocessConfig] = None,
+    max_freq: int = MAX_FREQ,
+    min_freq: int = MIN_FREQ,
+) -> PostprocessorProtocol:
+    """Factory function to build the standard postprocessor.
+
+    Creates a `Postprocessor` from the given `targets`, configuration and
+    frequency range.
+
+    Parameters
+    ----------
+    targets : TargetProtocol
+        An initialized object conforming to the `TargetProtocol`. The
+        postprocessor uses its `.decode()` and `.recover_roi()` methods and
+        its `.class_names` and `.generic_class_tags` attributes.
+    config : PostprocessConfig, optional
+        Configuration object specifying postprocessing parameters (thresholds,
+        NMS kernel size, etc.). If None, a `PostprocessConfig` with default
+        settings is created.
+    max_freq : int, default=MAX_FREQ
+        The maximum frequency (Hz) corresponding to the frequency axis of the
+        model outputs. Used for coordinate remapping.
+    min_freq : int, default=MIN_FREQ
+        The minimum frequency (Hz) corresponding to the frequency axis of the
+        model outputs. Used for coordinate remapping.
+
+    Returns
+    -------
+    PostprocessorProtocol
+        An initialized `Postprocessor` instance ready to process model outputs.
+
+    Notes
+    -----
+    `max_freq` precedes `min_freq` in the signature; pass both by keyword to
+    avoid swapping them. The frequency range is not a `PostprocessConfig`
+    field, so it can only be set through these arguments.
+    """
+    # Explicit None check: only a missing config triggers the default,
+    # independent of how the config object evaluates in a boolean context.
+    if config is None:
+        config = PostprocessConfig()
+
+    return Postprocessor(
+        targets=targets,
+        config=config,
+        min_freq=min_freq,
+        max_freq=max_freq,
+    )
+
+
+class Postprocessor(PostprocessorProtocol):
+    """Standard implementation of the postprocessing pipeline.
+
+    Orchestrates the steps that turn raw model outputs into `soundevent`
+    predictions, delegating each stage (NMS, remapping, detection,
+    extraction, decoding) to functions from the `batdetect2.postprocess`
+    submodules.
+
+    A `TargetProtocol` object is required at construction. Its `class_names`,
+    `recover_roi`, `decode` and `generic_class_tags` members are used to
+    label class maps, rebuild geometries and decode class tags, which keeps
+    the output consistent with the target definitions in use.
+
+    Instances are typically created using the `build_postprocessor` factory
+    function.
+
+    Attributes
+    ----------
+    targets : TargetProtocol
+        The configured target definition object providing decoding and ROI
+        recovery.
+    config : PostprocessConfig
+        Configuration object holding parameters for NMS, thresholds, etc.
+    min_freq : int
+        Minimum frequency (Hz) assumed for the model output's frequency axis.
+    max_freq : int
+        Maximum frequency (Hz) assumed for the model output's frequency axis.
+    """
+
+    targets: TargetProtocol
+
+    def __init__(
+        self,
+        targets: TargetProtocol,
+        config: PostprocessConfig,
+        min_freq: int = MIN_FREQ,
+        max_freq: int = MAX_FREQ,
+    ):
+        """Initialize the Postprocessor.
+
+        Parameters
+        ----------
+        targets : TargetProtocol
+            Initialized target definition object.
+        config : PostprocessConfig
+            Configuration for postprocessing parameters.
+        min_freq : int, default=MIN_FREQ
+            Minimum frequency (Hz) for coordinate remapping.
+        max_freq : int, default=MAX_FREQ
+            Maximum frequency (Hz) for coordinate remapping.
+        """
+        self.targets = targets
+        self.config = config
+        self.min_freq = min_freq
+        self.max_freq = max_freq
+
+    @staticmethod
+    def _check_batch_size(
+        clips: List[data.Clip],
+        batch,
+        array_name: str,
+        label: str,
+    ) -> None:
+        """Raise ValueError unless `batch` has one item per clip.
+
+        Parameters
+        ----------
+        clips : List[data.Clip]
+            Clips corresponding to the batch items.
+        batch
+            Any sized batch of model outputs (its `len` is the batch size).
+        array_name : str
+            Name of the array used in the first sentence of the message.
+        label : str
+            Label used for the batch size in the parenthesised counts.
+
+        Raises
+        ------
+        ValueError
+            If `len(clips)` differs from `len(batch)`.
+        """
+        if len(clips) != len(batch):
+            raise ValueError(
+                f"Number of clips and batch size of {array_name} array "
+                "do not match. "
+                f"(clips: {len(clips)}, {label}: {len(batch)})"
+            )
+
+    def get_feature_arrays(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[xr.DataArray]:
+        """Extract and remap raw feature tensors for a batch.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            Raw model output containing `output.features` tensor for the batch.
+        clips : List[data.Clip]
+            List of Clip objects corresponding to the batch items.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            One array per clip, as returned by `features_to_xarray` for the
+            clip's time span and the configured frequency range.
+
+        Raises
+        ------
+        ValueError
+            If batch sizes of `output.features` and `clips` do not match.
+        """
+        self._check_batch_size(clips, output.features, "feature", "features")
+
+        return [
+            features_to_xarray(
+                feats,
+                start_time=clip.start_time,
+                end_time=clip.end_time,
+                min_freq=self.min_freq,
+                max_freq=self.max_freq,
+            )
+            for feats, clip in zip(output.features, clips)
+        ]
+
+    def get_detection_arrays(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[xr.DataArray]:
+        """Apply NMS and remap detection heatmaps for a batch.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            Raw model output containing `output.detection_probs` tensor for the
+            batch.
+        clips : List[data.Clip]
+            List of Clip objects corresponding to the batch items.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            One array per clip, as returned by `detection_to_xarray` for the
+            NMS-filtered heatmap of that clip.
+
+        Raises
+        ------
+        ValueError
+            If batch sizes of `output.detection_probs` and `clips` do not match.
+        """
+        detections = output.detection_probs
+        self._check_batch_size(clips, detections, "detection", "detection")
+
+        # NMS is applied to the whole batch before it is split per clip.
+        detections = non_max_suppression(
+            detections,
+            kernel_size=self.config.nms_kernel_size,
+        )
+
+        return [
+            detection_to_xarray(
+                dets,
+                start_time=clip.start_time,
+                end_time=clip.end_time,
+                min_freq=self.min_freq,
+                max_freq=self.max_freq,
+            )
+            for dets, clip in zip(detections, clips)
+        ]
+
+    def get_classification_arrays(
+        self, output: ModelOutput, clips: List[data.Clip]
+    ) -> List[xr.DataArray]:
+        """Extract and remap raw classification tensors for a batch.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            Raw model output containing `output.class_probs` tensor for the
+            batch.
+        clips : List[data.Clip]
+            List of Clip objects corresponding to the batch items.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            One array per clip, as returned by `classification_to_xarray`
+            with `self.targets.class_names` as the class labels.
+
+        Raises
+        ------
+        ValueError
+            If batch sizes of `output.class_probs` and `clips` do not match.
+            NOTE(review): `classification_to_xarray` presumably also raises
+            when the number of classes differs from
+            `self.targets.class_names` -- confirm in `.remapping`.
+        """
+        classifications = output.class_probs
+        self._check_batch_size(
+            clips,
+            classifications,
+            "classification",
+            "classification",
+        )
+
+        return [
+            classification_to_xarray(
+                class_probs,
+                start_time=clip.start_time,
+                end_time=clip.end_time,
+                class_names=self.targets.class_names,
+                min_freq=self.min_freq,
+                max_freq=self.max_freq,
+            )
+            for class_probs, clip in zip(classifications, clips)
+        ]
+
+    def get_sizes_arrays(
+        self, output: ModelOutput, clips: List[data.Clip]
+    ) -> List[xr.DataArray]:
+        """Extract and remap raw size prediction tensors for a batch.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            Raw model output containing `output.size_preds` tensor for the
+            batch.
+        clips : List[data.Clip]
+            List of Clip objects corresponding to the batch items.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            One array per clip, as returned by `sizes_to_xarray` for the
+            clip's time span and the configured frequency range.
+
+        Raises
+        ------
+        ValueError
+            If batch sizes of `output.size_preds` and `clips` do not match.
+        """
+        sizes = output.size_preds
+        self._check_batch_size(clips, sizes, "sizes", "sizes")
+
+        return [
+            sizes_to_xarray(
+                size_preds,
+                start_time=clip.start_time,
+                end_time=clip.end_time,
+                min_freq=self.min_freq,
+                max_freq=self.max_freq,
+            )
+            for size_preds, clip in zip(sizes, clips)
+        ]
+
+    def get_detection_datasets(
+        self, output: ModelOutput, clips: List[data.Clip]
+    ) -> List[xr.Dataset]:
+        """Perform NMS, remapping, detection, and data extraction for a batch.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            Raw output from the neural network model for a batch.
+        clips : List[data.Clip]
+            List of `soundevent.data.Clip` objects corresponding to the batch.
+
+        Returns
+        -------
+        List[xr.Dataset]
+            One Dataset per clip, as returned by
+            `extract_detection_xr_dataset` for the detection positions found
+            in that clip together with its size, class and feature arrays.
+
+        Raises
+        ------
+        ValueError
+            If any of the model output arrays has a batch size different
+            from `len(clips)`.
+        """
+        detection_arrays = self.get_detection_arrays(output, clips)
+        classification_arrays = self.get_classification_arrays(output, clips)
+        size_arrays = self.get_sizes_arrays(output, clips)
+        features_arrays = self.get_feature_arrays(output, clips)
+
+        datasets = []
+        for det_array, class_array, sizes_array, feats_array in zip(
+            detection_arrays,
+            classification_arrays,
+            size_arrays,
+            features_arrays,
+        ):
+            # Cap on the number of detections for this clip, derived from
+            # the configured per-second rate.
+            max_detections = get_max_detections(
+                det_array,
+                top_k_per_sec=self.config.top_k_per_sec,
+            )
+
+            positions = extract_detections_from_array(
+                det_array,
+                max_detections=max_detections,
+                threshold=self.config.detection_threshold,
+            )
+
+            datasets.append(
+                extract_detection_xr_dataset(
+                    positions,
+                    sizes_array,
+                    class_array,
+                    feats_array,
+                )
+            )
+
+        return datasets
+
+    def get_raw_predictions(
+        self, output: ModelOutput, clips: List[data.Clip]
+    ) -> List[List[RawPrediction]]:
+        """Extract intermediate RawPrediction objects for a batch.
+
+        Builds the per-clip detection datasets and converts each one with
+        `convert_xr_dataset_to_raw_prediction`, using
+        `self.targets.recover_roi` as the geometry builder.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            Raw output from the neural network model for a batch.
+        clips : List[data.Clip]
+            List of `soundevent.data.Clip` objects corresponding to the batch.
+
+        Returns
+        -------
+        List[List[RawPrediction]]
+            List of lists (one inner list per input clip). Each inner list
+            contains `RawPrediction` objects for detections in that clip.
+        """
+        detection_datasets = self.get_detection_datasets(output, clips)
+        return [
+            convert_xr_dataset_to_raw_prediction(
+                dataset,
+                self.targets.recover_roi,
+            )
+            for dataset in detection_datasets
+        ]
+
+    def get_predictions(
+        self, output: ModelOutput, clips: List[data.Clip]
+    ) -> List[data.ClipPrediction]:
+        """Perform the full postprocessing pipeline for a batch.
+
+        Computes the raw predictions for every clip and converts them with
+        `convert_raw_predictions_to_clip_prediction`, using
+        `self.targets.decode`, `self.targets.generic_class_tags` and the
+        configured classification threshold.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            Raw output from the neural network model for a batch.
+        clips : List[data.Clip]
+            List of `soundevent.data.Clip` objects corresponding to the batch.
+
+        Returns
+        -------
+        List[data.ClipPrediction]
+            List containing one `ClipPrediction` object for each input clip.
+        """
+        raw_predictions = self.get_raw_predictions(output, clips)
+        return [
+            convert_raw_predictions_to_clip_prediction(
+                prediction,
+                clip,
+                sound_event_decoder=self.targets.decode,
+                generic_class_tags=self.targets.generic_class_tags,
+                classification_threshold=self.config.classification_threshold,
+            )
+            for prediction, clip in zip(raw_predictions, clips)
+        ]
--- a/batdetect2/postprocess/arrays.py
+++ b/batdetect2/postprocess/arrays.py
@ -1,73 +0,0 @@
-import numpy as np
-import xarray as xr
-from soundevent.arrays import Dimensions
-
-from batdetect2.models import ModelOutput
-from batdetect2.preprocess import MAX_FREQ, MIN_FREQ
-
-
-def to_xarray(
-    output: ModelOutput,
-    start_time: float,
-    end_time: float,
-    class_names: list[str],
-    min_freq: float = MIN_FREQ,
-    max_freq: float = MAX_FREQ,
-):
-    detection = output.detection_probs
-    size = output.size_preds
-    classes = output.class_probs
-    features = output.features
-
-    if len(detection.shape) == 4:
-        if detection.shape[0] != 1:
-            raise ValueError(
-                "Expected a non-batched output or a batch of size 1, instead "
-                f"got an input of shape {detection.shape}"
-            )
-
-        detection = detection.squeeze(dim=0)
-        size = size.squeeze(dim=0)
-        classes = classes.squeeze(dim=0)
-        features = features.squeeze(dim=0)
-
-    _, width, height = detection.shape
-
-    times = np.linspace(start_time, end_time, width, endpoint=False)
-    freqs = np.linspace(min_freq, max_freq, height, endpoint=False)
-
-    if classes.shape[0] != len(class_names):
-        raise ValueError(
-            f"The number of classes does not coincide with the number of class names provided: ({classes.shape[0] = }) != ({len(class_names) = })"
-        )
-
-    return xr.Dataset(
-        data_vars={
-            "detection": (
-                [Dimensions.time.value, Dimensions.frequency.value],
-                detection.squeeze(dim=0).detach().numpy(),
-            ),
-            "size": (
-                [
-                    "dimension",
-                    Dimensions.time.value,
-                    Dimensions.frequency.value,
-                ],
-                detection.detach().numpy(),
-            ),
-            "classes": (
-                [
-                    "category",
-                    Dimensions.time.value,
-                    Dimensions.frequency.value,
-                ],
-                classes.detach().numpy(),
-            ),
-        },
-        coords={
-            Dimensions.time.value: times,
-            Dimensions.frequency.value: freqs,
-            "dimension": ["width", "height"],
-            "category": class_names,
-        },
-    )
--- a/batdetect2/postprocess/config.py
+++ b/batdetect2/postprocess/config.py
@ -1,32 +0,0 @@
-from typing import Optional
-
-from pydantic import Field
-from soundevent import data
-
-from batdetect2.configs import BaseConfig, load_config
-
-__all__ = [
-    "PostprocessConfig",
-    "load_postprocess_config",
-]
-
-NMS_KERNEL_SIZE = 9
-DETECTION_THRESHOLD = 0.01
-TOP_K_PER_SEC = 200
-
-
-class PostprocessConfig(BaseConfig):
-    """Configuration for postprocessing model outputs."""
-
-    nms_kernel_size: int = Field(default=NMS_KERNEL_SIZE, gt=0)
-    detection_threshold: float = Field(default=DETECTION_THRESHOLD, ge=0)
-    min_freq: int = Field(default=10000, gt=0)
-    max_freq: int = Field(default=120000, gt=0)
-    top_k_per_sec: int = Field(default=TOP_K_PER_SEC, gt=0)
-
-
-def load_postprocess_config(
-    path: data.PathLike,
-    field: Optional[str] = None,
-) -> PostprocessConfig:
-    return load_config(path, schema=PostprocessConfig, field=field)
--- a/batdetect2/postprocess/decoding.py
+++ b/batdetect2/postprocess/decoding.py
@ -0,0 +1,297 @@
+"""Decodes extracted detection data into standard soundevent predictions.
+
+This module handles the final stages of the BatDetect2 postprocessing pipeline.
+It takes the structured detection data extracted by the `extraction` module
+(typically an `xarray.Dataset` containing scores, positions, predicted sizes,
+class probabilities, and features for each detection point) and converts it
+into meaningful, standardized prediction objects based on the `soundevent` data
+model.
+
+The process involves:
+1.  Converting the `xarray.Dataset` into a list of intermediate `RawPrediction`
+    objects, using a configured geometry builder to recover bounding boxes from
+    predicted positions and sizes (`convert_xr_dataset_to_raw_prediction`).
+2.  Converting each `RawPrediction` into a
+    `soundevent.data.SoundEventPrediction`, which involves:
+    - Creating the `soundevent.data.SoundEvent` with geometry and features.
+    - Decoding the predicted class probabilities into representative tags using
+      a configured class decoder (`SoundEventDecoder`).
+    - Applying a classification threshold.
+    - Optionally selecting only the single highest-scoring class (top-1) or
+      including tags for all classes above the threshold (multi-label).
+    - Adding generic class tags as a baseline.
+    - Associating scores with the final prediction and tags.
+    (`convert_raw_prediction_to_sound_event_prediction`)
+3.  Grouping the `SoundEventPrediction` objects for a given audio segment into
+    a `soundevent.data.ClipPrediction`
+    (`convert_raw_predictions_to_clip_prediction`).
+"""
+
+from typing import List, Optional
+
+import xarray as xr
+from soundevent import data
+from soundevent.geometry import compute_bounds
+
+from batdetect2.postprocess.types import GeometryBuilder, RawPrediction
+from batdetect2.targets.classes import SoundEventDecoder
+
+__all__ = [
+    "convert_xr_dataset_to_raw_prediction",
+    "convert_raw_predictions_to_clip_prediction",
+    "convert_raw_prediction_to_sound_event_prediction",
+    "DEFAULT_CLASSIFICATION_THRESHOLD",
+]
+
+
+DEFAULT_CLASSIFICATION_THRESHOLD = 0.1
+"""Default threshold applied to classification scores.
+
+Class predictions with scores below this value are typically ignored during
+decoding.
+"""
+
+
+def convert_xr_dataset_to_raw_prediction(
+    detection_dataset: xr.Dataset,
+    geometry_builder: GeometryBuilder,
+) -> List[RawPrediction]:
+    """Convert an xarray.Dataset of detections to RawPrediction objects.
+
+    Walks the 'detection' dimension of `detection_dataset` and builds one
+    `RawPrediction` per entry. The geometry of each detection is rebuilt by
+    `geometry_builder` from its position and dimensions, and the bounds of
+    that geometry (via `compute_bounds`) become the prediction's
+    start/end time and low/high frequency.
+
+    Parameters
+    ----------
+    detection_dataset : xr.Dataset
+        Dataset of aligned detection information, typically the output of
+        `extract_detection_xr_dataset`. Must have a 'detection' dimension and
+        expose `time`, `freq`, `dimensions`, `classes`, `features` and
+        `score` for each detection.
+        NOTE(review): the score is read as `.score`, while the extraction
+        step is described elsewhere as producing 'scores' -- confirm the
+        variable name against `extract_detection_xr_dataset`.
+    geometry_builder : GeometryBuilder
+        A function that takes a position tuple `(time, freq)` and the
+        predicted dimensions, and returns the corresponding reconstructed
+        geometry.
+
+    Returns
+    -------
+    List[RawPrediction]
+        One `RawPrediction` per detection, holding the detection score, the
+        recovered bounds (start/end time, low/high freq), the class scores
+        and the features.
+
+    Raises
+    ------
+    AttributeError, KeyError, ValueError
+        If `detection_dataset` lacks the 'detection' dimension or any of the
+        members read above, or if `geometry_builder` fails.
+    """
+    # `Dataset.sizes` is the supported name -> length mapping; indexing
+    # `Dataset.dims` by name is deprecated in recent xarray releases.
+    num_detections = detection_dataset.sizes["detection"]
+
+    detections = []
+    for det_num in range(num_detections):
+        # NOTE(review): `.sel` is label-based; this assumes 'detection' has
+        # no index coordinate (or one equal to 0..n-1) -- confirm.
+        det_info = detection_dataset.sel(detection=det_num)
+
+        geom = geometry_builder(
+            (det_info.time, det_info.freq),
+            det_info.dimensions,
+        )
+
+        start_time, low_freq, end_time, high_freq = compute_bounds(geom)
+
+        detections.append(
+            RawPrediction(
+                detection_score=det_info.score,
+                start_time=start_time,
+                end_time=end_time,
+                low_freq=low_freq,
+                high_freq=high_freq,
+                class_scores=det_info.classes,
+                features=det_info.features,
+            )
+        )
+
+    return detections
+
+
+def convert_raw_predictions_to_clip_prediction(
+    raw_predictions: List[RawPrediction],
+    clip: data.Clip,
+    sound_event_decoder: SoundEventDecoder,
+    generic_class_tags: List[data.Tag],
+    classification_threshold: float = DEFAULT_CLASSIFICATION_THRESHOLD,
+    top_class_only: bool = False,
+) -> data.ClipPrediction:
+    """Bundle the raw predictions of one clip into a ClipPrediction.
+
+    Each entry of `raw_predictions` is passed, in order, to
+    `convert_raw_prediction_to_sound_event_prediction` together with the
+    clip's recording and the decoding options. The results are collected
+    into a `soundevent.data.ClipPrediction` attached to `clip`.
+
+    Parameters
+    ----------
+    raw_predictions : List[RawPrediction]
+        Raw prediction objects, assumed to all belong to `clip`.
+    clip : data.Clip
+        The clip the predictions belong to; its `recording` is forwarded to
+        the per-prediction conversion.
+    sound_event_decoder : SoundEventDecoder
+        Forwarded unchanged to the per-prediction conversion.
+    generic_class_tags : List[data.Tag]
+        Forwarded unchanged to the per-prediction conversion.
+    classification_threshold : float, default=DEFAULT_CLASSIFICATION_THRESHOLD
+        Forwarded unchanged to the per-prediction conversion.
+    top_class_only : bool, default=False
+        Forwarded unchanged to the per-prediction conversion.
+
+    Returns
+    -------
+    data.ClipPrediction
+        Prediction for `clip` whose `sound_events` holds one converted
+        entry per item of `raw_predictions`.
+    """
+    sound_events = []
+    for raw_prediction in raw_predictions:
+        sound_event = convert_raw_prediction_to_sound_event_prediction(
+            raw_prediction,
+            recording=clip.recording,
+            sound_event_decoder=sound_event_decoder,
+            generic_class_tags=generic_class_tags,
+            classification_threshold=classification_threshold,
+            top_class_only=top_class_only,
+        )
+        sound_events.append(sound_event)
+
+    return data.ClipPrediction(clip=clip, sound_events=sound_events)
+
+
+def convert_raw_prediction_to_sound_event_prediction(
+    raw_prediction: RawPrediction,
+    recording: data.Recording,
+    sound_event_decoder: SoundEventDecoder,
+    generic_class_tags: List[data.Tag],
+    classification_threshold: Optional[
+        float
+    ] = DEFAULT_CLASSIFICATION_THRESHOLD,
+    top_class_only: bool = False,
+) -> data.SoundEventPrediction:
+    """Convert a single RawPrediction into a soundevent SoundEventPrediction.
+
+    This function performs the core decoding steps for a single detected event:
+    1. Creates a `soundevent.data.SoundEvent` containing the geometry
+       (BoundingBox built from the `raw_prediction` time/frequency bounds)
+       and one `data.Feature` per entry of `raw_prediction.features`.
+    2. Initializes a list of predicted tags using the provided
+       `generic_class_tags`, assigning the overall `detection_score` from the
+       `raw_prediction` to these generic tags.
+    3. Processes the `class_scores` from the `raw_prediction`:
+        a. If `classification_threshold` is not None, keeps only classes
+           whose score is strictly greater than the threshold.
+        b. Sorts the remaining scores in descending order.
+        c. For each class, uses the `sound_event_decoder` to get the
+           representative base tags for that class name and wraps them in
+           `soundevent.data.PredictedTag` with that class's score.
+        d. If `top_class_only` is True, stops after processing the first
+           (highest-scoring) class that passed the threshold.
+    4. Creates and returns the final `soundevent.data.SoundEventPrediction`,
+       associating the `SoundEvent`, the overall `detection_score`, and the
+       compiled list of `PredictedTag` objects.
+
+    Parameters
+    ----------
+    raw_prediction : RawPrediction
+        The raw prediction object containing score, bounds, class scores,
+        features. `class_scores` and `features` are expected to be 1D
+        `xr.DataArray` objects whose leading dimension carries a coordinate
+        with the class names / feature names respectively.
+    recording : data.Recording
+        The recording the sound event belongs to.
+    sound_event_decoder : SoundEventDecoder
+        Configured function mapping class names (str) to lists of base
+        `data.Tag` objects.
+    generic_class_tags : List[data.Tag]
+        List of base tags representing the generic category.
+    classification_threshold : float, optional
+        The minimum score a class prediction must exceed to have its tags
+        decoded and added. If None, no thresholding is applied based on
+        class score (all predicted classes, or the top one if
+        `top_class_only` is True, will be processed).
+        Defaults to `DEFAULT_CLASSIFICATION_THRESHOLD`.
+    top_class_only : bool, default=False
+        If True, only includes tags for the single highest-scoring class that
+        exceeds the threshold. If False (default), includes tags for all classes
+        exceeding the threshold.
+
+    Returns
+    -------
+    data.SoundEventPrediction
+        The fully formed sound event prediction object.
+
+    Raises
+    ------
+    IndexError
+        If `raw_prediction.features` or `raw_prediction.class_scores` has no
+        dimensions (the leading dimension is used to look up labels).
+
+    Notes
+    -----
+    Errors raised by `data.term_from_key` or by `sound_event_decoder` are not
+    caught here and propagate to the caller.
+    """
+    features = raw_prediction.features
+    # Iterating a DataArray directly yields 0-d arrays rather than
+    # (label, value) pairs, so pair the coordinate labels of the leading
+    # dimension with the underlying values explicitly.
+    feature_names = features.coords[features.dims[0]].values
+
+    sound_event = data.SoundEvent(
+        recording=recording,
+        geometry=data.BoundingBox(
+            coordinates=[
+                raw_prediction.start_time,
+                raw_prediction.low_freq,
+                raw_prediction.end_time,
+                raw_prediction.high_freq,
+            ]
+        ),
+        features=[
+            data.Feature(
+                term=data.term_from_key(str(feat_name)),
+                value=float(value),
+            )
+            for feat_name, value in zip(feature_names, features.values)
+        ],
+    )
+
+    tags = [
+        data.PredictedTag(tag=tag, score=raw_prediction.detection_score)
+        for tag in generic_class_tags
+    ]
+
+    class_scores = raw_prediction.class_scores
+
+    if classification_threshold is not None:
+        class_scores = class_scores.where(
+            class_scores > classification_threshold,
+            drop=True,
+        )
+
+    ranked_scores = class_scores.sortby(class_scores, ascending=False)
+    class_names = ranked_scores.coords[ranked_scores.dims[0]].values
+
+    for class_name, score in zip(class_names, ranked_scores.values):
+        # Convert numpy scalars to plain Python types before handing them
+        # to the decoder and the soundevent data objects.
+        class_tags = sound_event_decoder(str(class_name))
+
+        for tag in class_tags:
+            tags.append(
+                data.PredictedTag(
+                    tag=tag,
+                    score=float(score),
+                )
+            )
+
+        if top_class_only:
+            break
+
+    return data.SoundEventPrediction(
+        sound_event=sound_event,
+        score=raw_prediction.detection_score,
+        tags=tags,
+    )
--- a/batdetect2/postprocess/detection.py
+++ b/batdetect2/postprocess/detection.py
@ -0,0 +1,162 @@
+"""Extracts candidate detection points from a model output heatmap.
+
+This module implements a specific step within the BatDetect2 postprocessing
+pipeline. Its primary function is to identify potential sound event locations
+by finding peaks (local maxima or high-scoring points) in the detection heatmap
+produced by the neural network (usually after Non-Maximum Suppression and
+coordinate remapping have been applied).
+
+It provides functionality to:
+- Identify the locations (time, frequency) of the highest-scoring points.
+- Filter these points based on a minimum confidence score threshold.
+- Limit the maximum number of detection points returned (top-k).
+
+The main output is an `xarray.DataArray` containing the scores and
+corresponding time/frequency coordinates for the extracted detection points.
+This output serves as the input for subsequent postprocessing steps, such as
+extracting predicted class probabilities and bounding box sizes at these
+specific locations.
+"""
+
+from typing import Optional
+
+import numpy as np
+import xarray as xr
+from soundevent.arrays import Dimensions, get_dim_width
+
+__all__ = [
+    "extract_detections_from_array",
+    "get_max_detections",
+    "DEFAULT_DETECTION_THRESHOLD",
+    "TOP_K_PER_SEC",
+]
+
+DEFAULT_DETECTION_THRESHOLD = 0.01
+"""Default confidence score threshold used for filtering detections."""
+
+TOP_K_PER_SEC = 200
+"""Default desired maximum number of detections per second of audio."""
+
+
+def extract_detections_from_array(
+    detection_array: xr.DataArray,
+    max_detections: Optional[int] = None,
+    threshold: Optional[float] = DEFAULT_DETECTION_THRESHOLD,
+) -> xr.DataArray:
+    """Extract detection locations (time, freq) and scores from a heatmap.
+
+    Identifies the pixels with the highest scores in the input detection
+    heatmap, filters them based on an optional score `threshold`, limits the
+    number to an optional `max_detections`, and returns their scores along with
+    their corresponding time and frequency coordinates.
+
+    Parameters
+    ----------
+    detection_array : xr.DataArray
+        A 2D xarray DataArray representing the detection heatmap. Must have
+        dimensions and coordinates named 'time' and 'frequency'; the two
+        dimensions may appear in either order. Higher values are assumed to
+        indicate higher detection confidence.
+    max_detections : int, optional
+        The absolute maximum number of detections to return. If specified, only
+        the top `max_detections` highest-scoring pixels are considered (before
+        thresholding). If it is greater than or equal to the number of pixels,
+        every pixel is considered. If None (default), all detections passing
+        the threshold are returned, sorted by score.
+    threshold : float, optional
+        The minimum confidence score required for a detection peak to be
+        kept. Only detections with scores strictly greater than this value
+        are retained. Defaults to `DEFAULT_DETECTION_THRESHOLD`. If set to
+        None, no thresholding is applied.
+
+    Returns
+    -------
+    xr.DataArray
+        A 1D xarray DataArray named 'score' with a 'detection' dimension.
+        - The data values are the scores of the extracted detections, sorted
+          in descending order.
+        - It has coordinates 'time' and 'frequency' (also indexed by the
+          'detection' dimension) indicating the location of each detection
+          peak in the original coordinate system.
+        - Returns an empty DataArray if no detections pass the criteria.
+
+    Raises
+    ------
+    ValueError
+        If `max_detections` is not None and not a positive integer, or if
+        `detection_array` does not have exactly the 'time' and 'frequency'
+        dimensions.
+    """
+    if max_detections is not None and max_detections <= 0:
+        raise ValueError("Max detections must be positive")
+
+    # Normalise the axis order to (time, frequency) so the flat indices
+    # below always unravel into (time index, frequency index), regardless
+    # of how the incoming heatmap is laid out.
+    ordered = detection_array.transpose(
+        Dimensions.time.value,
+        Dimensions.frequency.value,
+    )
+    values = ordered.values.flatten()
+
+    # `np.argpartition` requires kth < size, so only use the partial
+    # selection when it actually reduces the number of candidates.
+    if max_detections is not None and max_detections < values.size:
+        top_indices = np.argpartition(-values, max_detections)[:max_detections]
+        top_sorted_indices = top_indices[np.argsort(-values[top_indices])]
+    else:
+        top_sorted_indices = np.argsort(-values)
+
+    top_values = values[top_sorted_indices]
+
+    if threshold is not None:
+        mask = top_values > threshold
+        top_values = top_values[mask]
+        top_sorted_indices = top_sorted_indices[mask]
+
+    time_indices, freq_indices = np.unravel_index(
+        top_sorted_indices,
+        ordered.shape,
+    )
+
+    times = ordered.coords[Dimensions.time.value].values[time_indices]
+    freqs = ordered.coords[Dimensions.frequency.value].values[freq_indices]
+
+    return xr.DataArray(
+        data=top_values,
+        coords={
+            Dimensions.frequency.value: ("detection", freqs),
+            Dimensions.time.value: ("detection", times),
+        },
+        dims="detection",
+        name="score",
+    )
+
+
+def get_max_detections(
+    detection_array: xr.DataArray,
+    top_k_per_sec: int = TOP_K_PER_SEC,
+) -> int:
+    """Compute the detection budget for a heatmap from its time span.
+
+    The width of the 'time' dimension of `detection_array` (as reported by
+    `soundevent.arrays.get_dim_width`) is multiplied by `top_k_per_sec` and
+    truncated to an integer.
+
+    Parameters
+    ----------
+    detection_array : xr.DataArray
+        The detection heatmap. Its 'time' dimension is used to measure the
+        covered duration.
+    top_k_per_sec : int, default=TOP_K_PER_SEC
+        The desired maximum number of detections per unit of the 'time'
+        dimension (per second of audio).
+
+    Returns
+    -------
+    int
+        The total number of detections allowed over the whole
+        `detection_array`, i.e. `int(duration * top_k_per_sec)`.
+
+    Raises
+    ------
+    ValueError
+        If `top_k_per_sec` is negative. Errors raised by `get_dim_width`
+        (e.g. for a missing 'time' dimension) propagate unchanged.
+    """
+    if top_k_per_sec < 0:
+        raise ValueError("top_k_per_sec cannot be negative.")
+
+    time_span = get_dim_width(detection_array, Dimensions.time.value)
+    budget = time_span * top_k_per_sec
+    return int(budget)
--- a/batdetect2/postprocess/extraction.py
+++ b/batdetect2/postprocess/extraction.py
@ -0,0 +1,122 @@
+"""Extracts associated data for detected points from model output arrays.
+
+This module implements a key step (Step 4) in the BatDetect2 postprocessing
+pipeline. After candidate detection points (time, frequency, score) have been
+identified, this module extracts the corresponding values from other raw model
+output arrays, such as:
+
+- Predicted bounding box sizes (width, height).
+- Class probability scores for each defined target class.
+- Intermediate feature vectors.
+
+It uses coordinate-based indexing provided by `xarray` to ensure that the
+correct values are retrieved from the original heatmaps/feature maps at the
+precise time-frequency location of each detection. The final output aggregates
+all extracted information into a structured `xarray.Dataset`.
+"""
+
+import xarray as xr
+from soundevent.arrays import Dimensions
+
+__all__ = [
+    "extract_values_at_positions",
+    "extract_detection_xr_dataset",
+]
+
+
+def extract_values_at_positions(
+    array: xr.DataArray,
+    positions: xr.DataArray,
+) -> xr.DataArray:
+    """Look up values of `array` at the time/frequency points of `positions`.
+
+    Performs a label-based `sel` on `array` using the 'frequency' and 'time'
+    coordinates stored on `positions` as indexers.
+
+    Parameters
+    ----------
+    array : xr.DataArray
+        Source DataArray (e.g. class probabilities, size predictions or
+        features). Must be selectable along 'time' and 'frequency'.
+    positions : xr.DataArray
+        DataArray whose 'time' and 'frequency' coordinates give the locations
+        to sample.
+
+    Returns
+    -------
+    xr.DataArray
+        The values of `array` selected at the requested positions.
+
+    Raises
+    ------
+    ValueError, IndexError, KeyError
+        Propagated from the coordinate lookup or from `sel` when the
+        required coordinates are missing or the labels cannot be matched.
+    """
+    freq_key = Dimensions.frequency.value
+    time_key = Dimensions.time.value
+
+    # Frequency is looked up first, then time, mirroring the selection order.
+    selection = {
+        freq_key: positions.coords[freq_key],
+        time_key: positions.coords[time_key],
+    }
+    return array.sel(**selection)
+
+
+def extract_detection_xr_dataset(
+    positions: xr.DataArray,
+    sizes: xr.DataArray,
+    classes: xr.DataArray,
+    features: xr.DataArray,
+) -> xr.Dataset:
+    """Bundle per-detection scores, sizes, classes and features together.
+
+    Samples the full `sizes`, `classes` and `features` maps at the detection
+    locations held in `positions` (via `extract_values_at_positions`),
+    transposes each sampled array, and assembles the results with the
+    detection scores into one `xarray.Dataset`.
+
+    Parameters
+    ----------
+    positions : xr.DataArray
+        Output from `extract_detections_from_array`, containing detection
+        scores as data and 'time', 'frequency' coordinates along the
+        'detection' dimension.
+    sizes : xr.DataArray
+        The full size prediction heatmap from the model, with dimensions like
+        ('dimension', 'time', 'frequency').
+    classes : xr.DataArray
+        The full class probability heatmap from the model, with dimensions like
+        ('category', 'time', 'frequency').
+    features : xr.DataArray
+        The full feature map from the model, with dimensions like
+        ('feature', 'time', 'frequency').
+
+    Returns
+    -------
+    xr.Dataset
+        Dataset with four variables:
+        - 'scores': the `positions` array itself.
+        - 'dimensions': transposed size values sampled at the detections.
+        - 'classes': transposed class probabilities sampled at the
+          detections.
+        - 'features': transposed feature vectors sampled at the detections.
+    """
+    sampled = {
+        "dimensions": extract_values_at_positions(sizes, positions).T,
+        "classes": extract_values_at_positions(classes, positions).T,
+        "features": extract_values_at_positions(features, positions).T,
+    }
+    return xr.Dataset({"scores": positions, **sampled})
--- a/batdetect2/postprocess/nms.py
+++ b/batdetect2/postprocess/nms.py
@ -0,0 +1,96 @@
+"""Performs Non-Maximum Suppression (NMS) on detection heatmaps.
+
+This module provides functionality to apply Non-Maximum Suppression, a common
+technique used after model inference, particularly in object detection and peak
+detection tasks.
+
+In the context of BatDetect2 postprocessing, NMS is applied
+to the raw detection heatmap output by the neural network. Its purpose is to
+isolate distinct detection peaks by suppressing (setting to zero) nearby heatmap
+activations that have lower scores than a local maximum. This helps prevent
+multiple, overlapping detections originating from the same sound event.
+"""
+
+from typing import Tuple, Union
+
+import torch
+
+NMS_KERNEL_SIZE = 9
+"""Default kernel size (pixels) for Non-Maximum Suppression.
+
+Specifies the side length of the square neighborhood used by default in
+`non_max_suppression` to find local maxima. A 9x9 neighborhood is often
+a reasonable starting point for typical spectrogram resolutions used in
+BatDetect2.
+"""
+
+
+def non_max_suppression(
+    tensor: torch.Tensor,
+    kernel_size: Union[int, Tuple[int, int]] = NMS_KERNEL_SIZE,
+) -> torch.Tensor:
+    """Zero out every value that is not the maximum of its neighbourhood.
+
+    A stride-1 2D max pooling with padding `(k - 1) // 2` per axis computes,
+    for each position, the largest value inside the surrounding window.
+    Positions whose value equals that local maximum keep their value; all
+    others are multiplied by zero.
+
+    Parameters
+    ----------
+    tensor : torch.Tensor
+        Input tensor, typically a detection heatmap, in a layout accepted by
+        `torch.nn.functional.max_pool2d` (e.g. (C, H, W) or (N, C, H, W)).
+    kernel_size : Union[int, Tuple[int, int]], default=NMS_KERNEL_SIZE
+        Window used to search for local maxima. An int `k` means a square
+        `(k, k)` window; a pair `(h, w)` means a window of height `h` and
+        width `w`. Use odd sizes: with an even size the pooled output no
+        longer matches the input shape.
+
+    Returns
+    -------
+    torch.Tensor
+        Tensor in which local maxima keep their original values and every
+        other entry is zero.
+
+    Raises
+    ------
+    TypeError
+        If `kernel_size` is neither an int nor something that can be
+        unpacked into two values.
+
+    Notes
+    -----
+    - Higher values are treated as stronger peaks.
+    - Errors raised by `max_pool2d` for unsupported inputs propagate
+      unchanged.
+    - A window that is too small may leave several peaks per event, while
+      one that is too large may suppress distinct neighbouring events.
+    """
+    if isinstance(kernel_size, int):
+        window = (kernel_size, kernel_size)
+    else:
+        window_h, window_w = kernel_size
+        window = (window_h, window_w)
+
+    # "Same"-style padding for odd window sizes.
+    padding = ((window[0] - 1) // 2, (window[1] - 1) // 2)
+
+    local_max = torch.nn.functional.max_pool2d(
+        tensor,
+        window,
+        stride=1,
+        padding=padding,
+    )
+    is_peak = (local_max == tensor).float()
+    return tensor * is_peak
--- a/batdetect2/postprocess/non_max_supression.py
+++ b/batdetect2/postprocess/non_max_supression.py
@ -1,50 +0,0 @@
-from typing import Tuple, Union
-
-import torch
-
-NMS_KERNEL_SIZE = 9
-
-
-def non_max_suppression(
-    tensor: torch.Tensor,
-    kernel_size: Union[int, Tuple[int, int]] = NMS_KERNEL_SIZE,
-) -> torch.Tensor:
-    """Run non-maximum suppression on a tensor.
-
-    This function removes values from the input tensor that are not local
-    maxima in the neighborhood of the given kernel size.
-
-    All non-maximum values are set to zero.
-
-    Parameters
-    ----------
-    tensor : torch.Tensor
-        Input tensor.
-    kernel_size : Union[int, Tuple[int, int]], optional
-        Size of the neighborhood to consider for non-maximum suppression.
-        If an integer is given, the neighborhood will be a square of the
-        given size. If a tuple is given, the neighborhood will be a
-        rectangle with the given height and width.
-
-    Returns
-    -------
-    torch.Tensor
-        Tensor with non-maximum suppressed values.
-    """
-    if isinstance(kernel_size, int):
-        kernel_size_h = kernel_size
-        kernel_size_w = kernel_size
-    else:
-        kernel_size_h, kernel_size_w = kernel_size
-
-    pad_h = (kernel_size_h - 1) // 2
-    pad_w = (kernel_size_w - 1) // 2
-
-    hmax = torch.nn.functional.max_pool2d(
-        tensor,
-        (kernel_size_h, kernel_size_w),
-        stride=1,
-        padding=(pad_h, pad_w),
-    )
-    keep = (hmax == tensor).float()
-    return tensor * keep
--- a/batdetect2/postprocess/remapping.py
+++ b/batdetect2/postprocess/remapping.py
@ -0,0 +1,316 @@
+"""Remaps raw model output tensors to coordinate-aware xarray DataArrays.
+
+This module provides utility functions to convert the raw numerical outputs
+(typically PyTorch tensors) from the BatDetect2 DNN model into
+`xarray.DataArray` objects. This step adds coordinate information
+(time in seconds, frequency in Hz) back to the model's predictions, making them
+interpretable in the context of the original audio signal and facilitating
+subsequent processing steps.
+
+Functions are provided for common BatDetect2 output types: detection heatmaps,
+classification probability maps, size prediction maps, and potentially
+intermediate features.
+"""
+
+from typing import List
+
+import numpy as np
+import torch
+import xarray as xr
+from soundevent.arrays import Dimensions
+
+from batdetect2.preprocess import MAX_FREQ, MIN_FREQ
+
+__all__ = [
+    "features_to_xarray",
+    "detection_to_xarray",
+    "classification_to_xarray",
+    "sizes_to_xarray",
+]
+
+
+def features_to_xarray(
+    features: torch.Tensor,
+    start_time: float,
+    end_time: float,
+    min_freq: float = MIN_FREQ,
+    max_freq: float = MAX_FREQ,
+    features_prefix: str = "batdetect2_feature_",
+) -> xr.DataArray:
+    """Convert a multi-channel feature tensor to a coordinate-aware DataArray.
+
+    Assigns time, frequency, and feature coordinates to a raw feature tensor
+    output by the model.
+
+    Parameters
+    ----------
+    features : torch.Tensor
+        The raw feature tensor from the model. Expected shape is
+        (num_features, num_freq_bins, num_time_bins). The tensor may live on
+        any device; it is detached and copied to the CPU before conversion.
+    start_time : float
+        The start time (in seconds) corresponding to the first time bin of
+        the tensor.
+    end_time : float
+        The end time (in seconds) corresponding to the *end* of the last time
+        bin.
+    min_freq : float, default=MIN_FREQ
+        The minimum frequency (in Hz) corresponding to the first frequency bin.
+    max_freq : float, default=MAX_FREQ
+        The maximum frequency (in Hz) corresponding to the *end* of the last
+        frequency bin.
+    features_prefix : str, default="batdetect2_feature_"
+        Prefix used to generate names for the feature coordinate dimension
+        (e.g., "batdetect2_feature_0", "batdetect2_feature_1", ...).
+
+    Returns
+    -------
+    xr.DataArray
+        An xarray DataArray named 'features' containing the feature data with
+        named dimensions ('feature', 'frequency', 'time') and calculated
+        coordinates.
+
+    Raises
+    ------
+    ValueError
+        If the input tensor does not have 3 dimensions.
+    """
+    if features.ndim != 3:
+        raise ValueError(
+            "Input features tensor must have 3 dimensions (C, F, T), "
+            f"got shape {features.shape}"
+        )
+
+    num_features, height, width = features.shape
+    # Bin coordinates mark the start of each bin, hence endpoint=False.
+    times = np.linspace(start_time, end_time, width, endpoint=False)
+    freqs = np.linspace(min_freq, max_freq, height, endpoint=False)
+
+    return xr.DataArray(
+        # `.cpu()` is a no-op for CPU tensors and makes the conversion work
+        # for tensors on other devices, where `.numpy()` would fail.
+        data=features.detach().cpu().numpy(),
+        dims=[
+            Dimensions.feature.value,
+            Dimensions.frequency.value,
+            Dimensions.time.value,
+        ],
+        coords={
+            Dimensions.feature.value: [
+                f"{features_prefix}{i}" for i in range(num_features)
+            ],
+            Dimensions.frequency.value: freqs,
+            Dimensions.time.value: times,
+        },
+        name="features",
+    )
+
+
+def detection_to_xarray(
+    detection: torch.Tensor,
+    start_time: float,
+    end_time: float,
+    min_freq: float = MIN_FREQ,
+    max_freq: float = MAX_FREQ,
+) -> xr.DataArray:
+    """Convert a single-channel detection heatmap tensor to a DataArray.
+
+    Assigns time and frequency coordinates to a raw detection heatmap tensor.
+
+    Parameters
+    ----------
+    detection : torch.Tensor
+        Raw detection heatmap tensor from the model. Expected shape is
+        (1, num_freq_bins, num_time_bins). The tensor may live on any
+        device; it is detached and copied to the CPU before conversion.
+    start_time : float
+        Start time (seconds) corresponding to the first time bin.
+    end_time : float
+        End time (seconds) corresponding to the end of the last time bin.
+    min_freq : float, default=MIN_FREQ
+        Minimum frequency (Hz) corresponding to the first frequency bin.
+    max_freq : float, default=MAX_FREQ
+        Maximum frequency (Hz) corresponding to the end of the last frequency
+        bin.
+
+    Returns
+    -------
+    xr.DataArray
+        An xarray DataArray named 'detection_score' containing the detection
+        scores with named dimensions ('frequency', 'time') and calculated
+        coordinates.
+
+    Raises
+    ------
+    ValueError
+        If the input tensor does not have 3 dimensions or if the first
+        dimension size is not 1.
+    """
+    if detection.ndim != 3:
+        raise ValueError(
+            "Input detection tensor must have 3 dimensions (1, F, T), "
+            f"got shape {detection.shape}"
+        )
+
+    num_channels, height, width = detection.shape
+
+    if num_channels != 1:
+        raise ValueError(
+            "Expected a single channel output, instead got "
+            f"{num_channels} channels"
+        )
+
+    # Bin coordinates mark the start of each bin, hence endpoint=False.
+    times = np.linspace(start_time, end_time, width, endpoint=False)
+    freqs = np.linspace(min_freq, max_freq, height, endpoint=False)
+
+    return xr.DataArray(
+        # Drop the channel axis; `.cpu()` is a no-op for CPU tensors and
+        # makes the conversion work for tensors on other devices.
+        data=detection.squeeze(dim=0).detach().cpu().numpy(),
+        dims=[
+            Dimensions.frequency.value,
+            Dimensions.time.value,
+        ],
+        coords={
+            Dimensions.frequency.value: freqs,
+            Dimensions.time.value: times,
+        },
+        name="detection_score",
+    )
+
+
+def classification_to_xarray(
+    classes: torch.Tensor,
+    start_time: float,
+    end_time: float,
+    class_names: List[str],
+    min_freq: float = MIN_FREQ,
+    max_freq: float = MAX_FREQ,
+) -> xr.DataArray:
+    """Convert multi-channel class probability tensor to a DataArray.
+
+    Assigns category (class name), frequency, and time coordinates to a raw
+    class probability tensor output by the model.
+
+    Parameters
+    ----------
+    classes : torch.Tensor
+        Raw class probability tensor. Expected shape is
+        (num_classes, num_freq_bins, num_time_bins). The tensor may live on
+        any device; it is detached and copied to the CPU before conversion.
+    start_time : float
+        Start time (seconds) corresponding to the first time bin.
+    end_time : float
+        End time (seconds) corresponding to the end of the last time bin.
+    class_names : List[str]
+        Ordered list of class names corresponding to the first dimension
+        of the `classes` tensor. The length must match `classes.shape[0]`.
+    min_freq : float, default=MIN_FREQ
+        Minimum frequency (Hz) corresponding to the first frequency bin.
+    max_freq : float, default=MAX_FREQ
+        Maximum frequency (Hz) corresponding to the end of the last frequency
+        bin.
+
+    Returns
+    -------
+    xr.DataArray
+        An xarray DataArray named 'class_scores' containing class
+        probabilities with named dimensions ('category', 'frequency', 'time')
+        and calculated coordinates.
+
+    Raises
+    ------
+    ValueError
+        If the input tensor does not have 3 dimensions, or if the size of the
+        first dimension does not match the length of `class_names`.
+    """
+    if classes.ndim != 3:
+        raise ValueError(
+            "Input classes tensor must have 3 dimensions (C, F, T), "
+            f"got shape {classes.shape}"
+        )
+
+    num_classes, height, width = classes.shape
+
+    if num_classes != len(class_names):
+        raise ValueError(
+            "The number of classes does not coincide with the number of "
+            "class names provided: "
+            f"({num_classes = }) != ({len(class_names) = })"
+        )
+
+    # Bin coordinates mark the start of each bin, hence endpoint=False.
+    times = np.linspace(start_time, end_time, width, endpoint=False)
+    freqs = np.linspace(min_freq, max_freq, height, endpoint=False)
+
+    return xr.DataArray(
+        # `.cpu()` is a no-op for CPU tensors and makes the conversion work
+        # for tensors on other devices, where `.numpy()` would fail.
+        data=classes.detach().cpu().numpy(),
+        dims=[
+            "category",
+            Dimensions.frequency.value,
+            Dimensions.time.value,
+        ],
+        coords={
+            "category": class_names,
+            Dimensions.frequency.value: freqs,
+            Dimensions.time.value: times,
+        },
+        name="class_scores",
+    )
+
+
+def sizes_to_xarray(
+    sizes: torch.Tensor,
+    start_time: float,
+    end_time: float,
+    min_freq: float = MIN_FREQ,
+    max_freq: float = MAX_FREQ,
+) -> xr.DataArray:
+    """Convert the 2-channel size prediction tensor to a DataArray.
+
+    Assigns dimension ('width', 'height'), frequency, and time coordinates
+    to the raw size prediction tensor output by the model.
+
+    Parameters
+    ----------
+    sizes : torch.Tensor
+        Raw size prediction tensor. Expected shape is
+        (2, num_freq_bins, num_time_bins), where the first dimension
+        corresponds to predicted width and height respectively. The tensor
+        may live on any device; it is detached and copied to the CPU before
+        conversion.
+    start_time : float
+        Start time (seconds) corresponding to the first time bin.
+    end_time : float
+        End time (seconds) corresponding to the end of the last time bin.
+    min_freq : float, default=MIN_FREQ
+        Minimum frequency (Hz) corresponding to the first frequency bin.
+    max_freq : float, default=MAX_FREQ
+        Maximum frequency (Hz) corresponding to the end of the last frequency
+        bin.
+
+    Returns
+    -------
+    xr.DataArray
+        An xarray DataArray containing predicted sizes with named dimensions
+        ('dimension', 'frequency', 'time') and calculated time/frequency
+        coordinates. The 'dimension' coordinate will have values
+        ['width', 'height'].
+
+    Raises
+    ------
+    ValueError
+        If the input tensor does not have 3 dimensions or if the first
+        dimension size is not exactly 2.
+    """
+    # Validate the rank explicitly so a wrong-rank input produces a clear
+    # message instead of an opaque tuple-unpacking error.
+    if sizes.ndim != 3:
+        raise ValueError(
+            "Input sizes tensor must have 3 dimensions (2, F, T), "
+            f"got shape {sizes.shape}"
+        )
+
+    num_channels, height, width = sizes.shape
+
+    if num_channels != 2:
+        raise ValueError(
+            "Expected a two-channel output, instead got "
+            f"{num_channels} channels"
+        )
+
+    # Bin coordinates mark the start of each bin, hence endpoint=False.
+    times = np.linspace(start_time, end_time, width, endpoint=False)
+    freqs = np.linspace(min_freq, max_freq, height, endpoint=False)
+
+    return xr.DataArray(
+        # `.cpu()` is a no-op for CPU tensors and makes the conversion work
+        # for tensors on other devices, where `.numpy()` would fail.
+        data=sizes.detach().cpu().numpy(),
+        dims=[
+            "dimension",
+            Dimensions.frequency.value,
+            Dimensions.time.value,
+        ],
+        coords={
+            "dimension": ["width", "height"],
+            Dimensions.frequency.value: freqs,
+            Dimensions.time.value: times,
+        },
+    )
--- a/batdetect2/postprocess/types.py
+++ b/batdetect2/postprocess/types.py
@ -1,21 +1,284 @@
-from typing import Dict, NamedTuple, Protocol
+"""Defines shared interfaces and data structures for postprocessing.
+
+This module centralizes the Protocol definitions and common data structures
+used throughout the `batdetect2.postprocess` package.
+
+The main component is the `PostprocessorProtocol`, which outlines the standard
+interface for an object responsible for executing the entire postprocessing
+pipeline. This pipeline transforms raw neural network outputs into interpretable
+detections represented as `soundevent` objects. Using protocols ensures
+modularity and consistent interaction between different parts of the BatDetect2
+system that deal with model predictions.
+"""
+
+from typing import Callable, List, NamedTuple, Protocol

 import numpy as np
+import xarray as xr
+from soundevent import data
+
+from batdetect2.models.types import ModelOutput

 __all__ = [
-    "BatDetect2Prediction",
+    "RawPrediction",
+    "PostprocessorProtocol",
+    "GeometryBuilder",
 ]


-class BatDetect2Prediction(NamedTuple):
+# Signature of a geometry-recovery callback: (position, sizes) -> Geometry.
+# NOTE(review): `tuple[float, float]` is evaluated at import time (this is a
+# module-level assignment, not an annotation), which requires Python 3.9+
+# (PEP 585). The rest of this module still imports `List` from `typing`;
+# confirm the supported Python range or switch to one style consistently.
+GeometryBuilder = Callable[[tuple[float, float], np.ndarray], data.Geometry]
+"""Type alias for a function that recovers geometry from position and size.
+
+This callable takes:
+1.  A position tuple `(time, frequency)`.
+2.  A NumPy array of size dimensions (e.g., `[width, height]`).
+It should return the reconstructed `soundevent.data.Geometry` (typically a
+`BoundingBox`).
+"""
+
+
+class RawPrediction(NamedTuple):
+    """Intermediate representation of a single detected sound event.
+
+    Holds extracted information about a detection after initial processing
+    (like peak finding, coordinate remapping, geometry recovery) but before
+    final class decoding and conversion into a `SoundEventPrediction`. This
+    can be useful for evaluation or simpler data handling formats.
+
+    Attributes
+    ----------
+    start_time : float
+        Start time of the recovered bounding box in seconds.
+    end_time : float
+        End time of the recovered bounding box in seconds.
+    low_freq : float
+        Lowest frequency of the recovered bounding box in Hz.
+    high_freq : float
+        Highest frequency of the recovered bounding box in Hz.
+    detection_score : float
+        The confidence score associated with this detection, typically from
+        the detection heatmap peak.
+    class_scores : xr.DataArray
+        An xarray DataArray containing the predicted probabilities or scores
+        for each target class at the detection location. Indexed by a
+        'category' coordinate containing class names.
+    features : xr.DataArray
+        An xarray DataArray containing extracted feature vectors at the
+        detection location. Indexed by a 'feature' coordinate.
+
+    Notes
+    -----
+    This is a `typing.NamedTuple`, so instances are immutable and can be
+    unpacked positionally. The positional order is the field order declared
+    below: `start_time`, `end_time`, `low_freq`, `high_freq`,
+    `detection_score`, `class_scores`, `features`.
+    """
+
    start_time: float
    end_time: float
    low_freq: float
    high_freq: float
    detection_score: float
-    class_scores: Dict[str, float]
-    features: np.ndarray
+    class_scores: xr.DataArray
+    features: xr.DataArray


 class PostprocessorProtocol(Protocol):
-    pass
+    """Protocol defining the interface for the full postprocessing pipeline.
+
+    This protocol outlines the standard methods for an object that takes raw
+    output from a BatDetect2 model and the corresponding input clip metadata,
+    and processes it through various stages (e.g., coordinate remapping, NMS,
+    detection extraction, data extraction, decoding) to produce interpretable
+    results at different levels of completion.
+
+    Implementations manage the configured logic for all postprocessing steps.
+
+    Every method takes the same two arguments (`output`, `clips`) and returns
+    a list with one entry per input clip. The methods differ in how far
+    along the pipeline the returned data is:
+
+    - `get_feature_arrays`, `get_detection_arrays`,
+      `get_classification_arrays`, `get_sizes_arrays`: coordinate-aware
+      `xr.DataArray` versions of the individual model outputs.
+    - `get_detection_datasets`: one `xr.Dataset` per clip.
+    - `get_raw_predictions`: one list of `RawPrediction` tuples per clip.
+    - `get_predictions`: one `soundevent.data.ClipPrediction` per clip.
+
+    Notes
+    -----
+    This is a `typing.Protocol`: the method bodies here are placeholders
+    (`...`) and conformance is structural, so an implementation does not
+    need to subclass this class explicitly.
+    """
+
+    def get_feature_arrays(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[xr.DataArray]:
+        """Remap feature tensors to coordinate-aware DataArrays.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            The raw output from the neural network model for a batch, expected
+            to contain the necessary feature tensors.
+        clips : List[data.Clip]
+            A list of `soundevent.data.Clip` objects, one for each item in the
+            processed batch. This list provides the timing, recording, and
+            other metadata context needed to calculate real-world coordinates
+            (seconds, Hz) for the output arrays. The length of this list must
+            correspond to the batch size of the `output`.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            A list of xarray DataArrays, one for each input clip in the batch,
+            in the same order. Each DataArray contains the feature vectors
+            with dimensions like ('feature', 'time', 'frequency') and
+            corresponding real-world coordinates. The tuple names the
+            dimensions only; the actual axis order is defined by the
+            implementation (TODO confirm).
+        """
+        ...
+
+    def get_detection_arrays(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[xr.DataArray]:
+        """Remap detection tensors to coordinate-aware DataArrays.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            The raw output from the neural network model for a batch,
+            containing detection heatmaps.
+        clips : List[data.Clip]
+            A list of `soundevent.data.Clip` objects corresponding to the batch
+            items, providing coordinate context. Must match the batch size of
+            `output`.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            A list of 2D xarray DataArrays (one per input clip, in order),
+            representing the detection heatmap with 'time' and 'frequency'
+            coordinates. Values typically indicate detection confidence.
+        """
+        ...
+
+    def get_classification_arrays(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[xr.DataArray]:
+        """Remap classification tensors to coordinate-aware DataArrays.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            The raw output from the neural network model for a batch,
+            containing class probability tensors.
+        clips : List[data.Clip]
+            A list of `soundevent.data.Clip` objects corresponding to the batch
+            items, providing coordinate context. Must match the batch size of
+            `output`.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            A list of 3D xarray DataArrays (one per input clip, in order),
+            representing class probabilities with 'category', 'time', and
+            'frequency' dimensions and coordinates.
+        """
+        ...
+
+    def get_sizes_arrays(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[xr.DataArray]:
+        """Remap size prediction tensors to coordinate-aware DataArrays.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            The raw output from the neural network model for a batch,
+            containing predicted size tensors (e.g., width and height).
+        clips : List[data.Clip]
+            A list of `soundevent.data.Clip` objects corresponding to the batch
+            items, providing coordinate context. Must match the batch size of
+            `output`.
+
+        Returns
+        -------
+        List[xr.DataArray]
+            A list of 3D xarray DataArrays (one per input clip, in order),
+            representing predicted sizes with 'dimension'
+            (e.g., ['width', 'height']), 'time', and 'frequency' dimensions and
+            coordinates. Values represent estimated detection sizes.
+
+        Notes
+        -----
+        NOTE(review): the size remapping code elsewhere in this package
+        appears to order the dims as ('dimension', 'frequency', 'time').
+        Treat the list above as dimension names rather than an axis order,
+        and confirm against the implementation.
+        """
+        ...
+
+    def get_detection_datasets(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[xr.Dataset]:
+        """Perform remapping, NMS, detection, and data extraction for a batch.
+
+        Processes the raw model output for a batch to identify detection peaks
+        and extract all associated information (score, position, size, class
+        probs, features) at those peak locations, returning a structured
+        dataset for each input clip in the batch.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            The raw output from the neural network model for a batch.
+        clips : List[data.Clip]
+            A list of `soundevent.data.Clip` objects corresponding to the batch
+            items, providing context. Must match the batch size of `output`.
+
+        Returns
+        -------
+        List[xr.Dataset]
+            A list of xarray Datasets (one per input clip, in order). Each
+            Dataset contains multiple DataArrays ('scores', 'dimensions',
+            'classes', 'features') sharing a common 'detection' dimension,
+            providing aligned data for each detected event in that clip.
+        """
+        ...
+
+    def get_raw_predictions(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[List[RawPrediction]]:
+        """Extract intermediate RawPrediction objects for a batch.
+
+        Processes the raw model output for a batch through remapping, NMS,
+        detection, data extraction, and geometry recovery to produce a list of
+        `RawPrediction` objects for each corresponding input clip. This provides
+        a simplified, intermediate representation before final tag decoding.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            The raw output from the neural network model for a batch.
+        clips : List[data.Clip]
+            A list of `soundevent.data.Clip` objects corresponding to the batch
+            items, providing context. Must match the batch size of `output`.
+
+        Returns
+        -------
+        List[List[RawPrediction]]
+            A list of lists (one inner list per input clip, in order). Each
+            inner list contains the `RawPrediction` objects extracted for the
+            corresponding input clip.
+        """
+        ...
+
+    def get_predictions(
+        self,
+        output: ModelOutput,
+        clips: List[data.Clip],
+    ) -> List[data.ClipPrediction]:
+        """Perform the full postprocessing pipeline for a batch.
+
+        Takes raw model output for a batch and corresponding clips, applies the
+        entire postprocessing chain, and returns the final, interpretable
+        predictions as a list of `soundevent.data.ClipPrediction` objects.
+
+        Parameters
+        ----------
+        output : ModelOutput
+            The raw output from the neural network model for a batch.
+        clips : List[data.Clip]
+            A list of `soundevent.data.Clip` objects corresponding to the batch
+            items, providing context. Must match the batch size of `output`.
+
+        Returns
+        -------
+        List[data.ClipPrediction]
+            A list containing one `ClipPrediction` object for each input clip
+            (in the same order), populated with `SoundEventPrediction` objects
+            representing the final detections with decoded tags and geometry.
+        """
+        ...