Added postprocess function to API

2025-06-29 14:41:58 +02:00 · 2023-02-26 20:48:52 +00:00 · 2023-02-26 20:48:52 +00:00 · acf01f4970
commit acf01f4970
parent b0d9576a24
5 changed files with 140 additions and 12 deletions
--- a/bat_detect/api.py
+++ b/bat_detect/api.py
@ -15,6 +15,7 @@ from bat_detect.detector.parameters import (
 from bat_detect.types import (
    Annotation,
    DetectionModel,
+    ModelOutput,
    ProcessingConfiguration,
    SpectrogramParameters,
 )
@ -24,16 +25,17 @@ from bat_detect.utils.detector_utils import list_audio_files, load_model
 warnings.filterwarnings("ignore", category=UserWarning, module="torch")

 __all__ = [
-    "load_model",
-    "load_audio",
-    "list_audio_files",
+    "config",
    "generate_spectrogram",
    "get_config",
+    "list_audio_files",
+    "load_audio",
+    "load_model",
+    "model",
+    "postprocess",
+    "process_audio",
    "process_file",
    "process_spectrogram",
-    "process_audio",
-    "model",
-    "config",
 ]


@ -248,6 +250,48 @@ def process_audio(
    )


+def postprocess(
+    outputs: ModelOutput,
+    samp_rate: int = TARGET_SAMPLERATE_HZ,
+    config: Optional[ProcessingConfiguration] = None,
+) -> Tuple[List[Annotation], np.ndarray]:
+    """Postprocess model outputs.
+
+    Convert model tensor outputs to predicted bounding boxes and
+    extracted features.
+
+    Will run non-maximum suppression and remove overlapping annotations.
+
+    Parameters
+    ----------
+    outputs : ModelOutput
+        Model raw outputs.
+    samp_rate : int, Optional
+        Sample rate of the audio from which the spectrogram was generated.
+        Defaults to 256000 which is the target sample rate of the default
+        model. Only change if you generated outputs from a spectrogram with
+        sample rate.
+    config : Optional[ProcessingConfiguration], Optional
+        Processing configuration, by default None (uses default parameters).
+
+    Returns
+    -------
+    annotations : List[Annotation]
+        List of predicted annotations.
+    features: np.ndarray
+        An array of extracted features for each annotation. The shape of the
+        array is (n_annotations, n_features).
+    """
+    if config is None:
+        config = CONFIG
+
+    return du.postprocess_model_outputs(
+        outputs,
+        samp_rate,
+        config,
+    )
+
+
 model: DetectionModel = MODEL
 """Base detection model."""

--- a/bat_detect/detector/post_process.py
+++ b/bat_detect/detector/post_process.py
@ -134,12 +134,12 @@ def run_nms(
                y_pos[num_detection, valid_inds],
                x_pos[num_detection, valid_inds],
            ].transpose(0, 1)
-            feat = feat.cpu().numpy().astype(np.float32)
+            feat = feat.detach().numpy().astype(np.float32)
            feats.append(feat)

        # convert to numpy
        for key, value in pred.items():
-            pred[key] = value.cpu().numpy().astype(np.float32)
+            pred[key] = value.detach().numpy().astype(np.float32)

        preds.append(pred)  # type: ignore

--- a/bat_detect/types.py
+++ b/bat_detect/types.py
@ -278,7 +278,23 @@ class ProcessingConfiguration(TypedDict):


 class ModelOutput(NamedTuple):
-    """Output of the detection model."""
+    """Output of the detection model.
+
+    Each of the tensors has a shape of
+
+        `(batch_size, num_channels,spec_height, spec_width)`.
+
+    Where `spec_height` and `spec_width` are the height and width of the
+    input spectrograms.
+
+    They contain localised information of:
+
+    1. The probability of a bounding box detection at the given location.
+    2. The predicted size of the bounding box at the given location.
+    3. The probabilities of each class at the given location.
+    4. Same as 3. but before softmax.
+    5. Features used to make the predictions at the given location.
+    """

    pred_det: torch.Tensor
    """Tensor with predict detection probabilities."""
@ -330,7 +346,7 @@ class PredictionResults(TypedDict):
    high_freqs: np.ndarray
    """High frequencies of the detections in Hz."""

-    class_probs: Optional[np.ndarray]
+    class_probs: np.ndarray
    """Class probabilities."""


--- a/bat_detect/utils/detector_utils.py
+++ b/bat_detect/utils/detector_utils.py
@ -16,6 +16,7 @@ from bat_detect.types import (
    Annotation,
    DetectionModel,
    FileAnnotations,
+    ModelOutput,
    ModelParameters,
    PredictionResults,
    ProcessingConfiguration,
@ -148,7 +149,7 @@ def _merge_results(predictions, spec_feats, cnn_feats, spec_slices):


 def get_annotations_from_preds(
-    predictions,
+    predictions: PredictionResults,
    class_names: List[str],
 ) -> List[Annotation]:
    """Get list of annotations from predictions."""
@ -194,7 +195,7 @@ def format_single_result(
    file_id: str,
    time_exp: float,
    duration: float,
-    predictions,
+    predictions: PredictionResults,
    class_names: List[str],
 ) -> FileAnnotations:
    """Format results into the format expected by the annotation tool.
@ -506,6 +507,44 @@ def _process_spectrogram(
    return pred_nms, features


+def postprocess_model_outputs(
+    outputs: ModelOutput,
+    samp_rate: int,
+    config: ProcessingConfiguration,
+) -> Tuple[List[Annotation], np.ndarray]:
+    # run non-max suppression
+    pred_nms_list, features = pp.run_nms(
+        outputs,
+        {
+            "nms_kernel_size": config["nms_kernel_size"],
+            "max_freq": config["max_freq"],
+            "min_freq": config["min_freq"],
+            "fft_win_length": config["fft_win_length"],
+            "fft_overlap": config["fft_overlap"],
+            "resize_factor": config["resize_factor"],
+            "nms_top_k_per_sec": config["nms_top_k_per_sec"],
+            "detection_threshold": config["detection_threshold"],
+        },
+        np.array([float(samp_rate)]),
+    )
+
+    pred_nms = pred_nms_list[0]
+
+    # if we have a background class
+    class_probs = pred_nms.get("class_probs")
+    if (class_probs is not None) and (
+        class_probs.shape[0] > len(config["class_names"])
+    ):
+        pred_nms["class_probs"] = class_probs[:-1, :]
+
+    annotations = get_annotations_from_preds(
+        pred_nms,
+        config["class_names"],
+    )
+
+    return annotations, features[0]
+
+
 def process_spectrogram(
    spec: torch.Tensor,
    samplerate: int,
--- a/tests/test_api.py
+++ b/tests/test_api.py
@ -224,3 +224,32 @@ def test_process_audio_with_default_model():
    assert spec is not None
    assert isinstance(spec, torch.Tensor)
    assert spec.shape == (1, 1, 128, 512)
+
+
+def test_postprocess_model_outputs():
+    """Test postprocessing model outputs."""
+    # Load model outputs
+    audio = api.load_audio(TEST_DATA[1])
+    spec = api.generate_spectrogram(audio)
+    model_outputs = api.model(spec)
+
+    # Postprocess outputs
+    predictions, features = api.postprocess(model_outputs)
+
+    assert predictions is not None
+    assert isinstance(predictions, list)
+    assert len(predictions) > 0
+    sample_pred = predictions[0]
+    assert isinstance(sample_pred, dict)
+    assert "class" in sample_pred
+    assert "class_prob" in sample_pred
+    assert "det_prob" in sample_pred
+    assert "start_time" in sample_pred
+    assert "end_time" in sample_pred
+    assert "low_freq" in sample_pred
+    assert "high_freq" in sample_pred
+
+    assert features is not None
+    assert isinstance(features, np.ndarray)
+    assert features.shape[0] == len(predictions)
+    assert features.shape[1] == 32