# batdetect2/tests/test_data/test_predictions/test_parquet.py

from pathlib import Path
from uuid import uuid4

import numpy as np
import pytest
from soundevent import data

from batdetect2.data.predictions import ParquetOutputConfig, build_output_formatter
from batdetect2.typing import (
    BatDetect2Prediction,
    RawPrediction,
    TargetProtocol,
)


@pytest.fixture
def sample_formatter(sample_targets: TargetProtocol):
    """Provide an output formatter built from a default parquet config.

    The formatter is created by ``build_output_formatter`` using a
    ``ParquetOutputConfig`` with no overrides and the ``sample_targets``
    fixture as its targets.
    """
    parquet_config = ParquetOutputConfig()
    formatter = build_output_formatter(
        config=parquet_config,
        targets=sample_targets,
    )
    return formatter


def test_roundtrip(
    sample_formatter,
    clip: data.Clip,
    sample_targets: TargetProtocol,
    tmp_path: Path,
):
    """Check that saving and loading one clip's detections is lossless.

    Ten detections with random bounding boxes, class scores and features
    are attached to ``clip``, written to ``predictions.parquet`` under
    ``tmp_path`` and read back with the same formatter. The recovered clip,
    the number of detections, and every detection's score, class scores,
    features and geometry must match what was saved.
    """
    num_classes = len(sample_targets.class_names)

    detections = [
        RawPrediction(
            geometry=data.BoundingBox(
                coordinates=list(np.random.uniform(size=[4]))
            ),
            detection_score=0.5,
            class_scores=np.random.uniform(size=num_classes),
            features=np.random.uniform(size=32),
        )
        for _ in range(10)
    ]

    prediction = BatDetect2Prediction(clip=clip, predictions=detections)

    path = tmp_path / "predictions.parquet"

    sample_formatter.save(predictions=[prediction], path=path)

    assert path.exists()

    recovered = sample_formatter.load(path=path)

    assert len(recovered) == 1
    assert recovered[0].clip == prediction.clip

    recovered_detections = recovered[0].predictions

    # zip() stops at the shorter input, so without this check a loader that
    # dropped detections (or returned none at all) would still pass the
    # per-detection comparisons below.
    assert len(recovered_detections) == len(detections)

    for recovered_prediction, detection in zip(
        recovered_detections, detections
    ):
        assert (
            recovered_prediction.detection_score == detection.detection_score
        )
        # Array-valued fields are compared with a tolerance rather than
        # bit-for-bit equality.
        assert np.allclose(
            recovered_prediction.class_scores, detection.class_scores
        )
        assert np.allclose(recovered_prediction.features, detection.features)
        assert recovered_prediction.geometry == detection.geometry


def test_multiple_clips(
    sample_formatter,
    clip: data.Clip,
    sample_targets: TargetProtocol,
    tmp_path: Path,
):
    """Check that detections from two clips stay attached to their clip.

    Two predictions, each holding a single detection with a distinct
    detection score, are written to ``multi_predictions.parquet`` under
    ``tmp_path`` and loaded back. Both clips must be recovered, and each
    must carry exactly the one detection that was saved for it.
    """
    # Same clip content under a fresh UUID, so the two predictions are
    # distinguishable only by clip identity.
    clip2 = clip.model_copy(update={"uuid": uuid4()})

    detections1 = [
        RawPrediction(
            geometry=data.BoundingBox(
                coordinates=list(np.random.uniform(size=[4]))
            ),
            detection_score=0.8,
            class_scores=np.random.uniform(
                size=len(sample_targets.class_names)
            ),
            features=np.random.uniform(size=32),
        )
    ]

    detections2 = [
        RawPrediction(
            geometry=data.BoundingBox(
                coordinates=list(np.random.uniform(size=[4]))
            ),
            detection_score=0.9,
            class_scores=np.random.uniform(
                size=len(sample_targets.class_names)
            ),
            features=np.random.uniform(size=32),
        )
    ]

    predictions = [
        BatDetect2Prediction(clip=clip, predictions=detections1),
        BatDetect2Prediction(clip=clip2, predictions=detections2),
    ]

    path = tmp_path / "multi_predictions.parquet"
    sample_formatter.save(predictions=predictions, path=path)

    recovered = sample_formatter.load(path=path)

    assert len(recovered) == 2

    # Index by clip UUID so the checks do not rely on the order in which
    # the formatter returns the clips.
    recovered_by_uuid = {p.clip.uuid: p for p in recovered}
    assert set(recovered_by_uuid) == {clip.uuid, clip2.uuid}

    # Matching UUIDs alone would not catch detections being dropped or
    # attached to the wrong clip, so verify each clip's detection as well.
    expected_by_uuid = {clip.uuid: detections1, clip2.uuid: detections2}
    for clip_uuid, expected_detections in expected_by_uuid.items():
        loaded_detections = recovered_by_uuid[clip_uuid].predictions
        assert len(loaded_detections) == len(expected_detections)
        # Tolerant comparison: 0.8 and 0.9 are not exactly representable in
        # binary floating point, so allow for rounding during storage.
        assert np.isclose(
            loaded_detections[0].detection_score,
            expected_detections[0].detection_score,
        )


def test_complex_geometry(
    sample_formatter,
    clip: data.Clip,
    sample_targets: TargetProtocol,
    tmp_path: Path,
):
    """Check that a polygon geometry survives a save/load cycle.

    A single detection whose geometry is a closed triangular
    ``data.Polygon`` is written to ``complex_geometry.parquet`` under
    ``tmp_path``. After loading, the geometry must still be a
    ``data.Polygon`` equal to the one that was saved.
    """
    # One closed ring: the last vertex repeats the first.
    ring = [
        [0.0, 10000.0],
        [0.1, 20000.0],
        [0.2, 10000.0],
        [0.0, 10000.0],
    ]
    polygon = data.Polygon(coordinates=[ring])

    class_scores = np.random.uniform(size=len(sample_targets.class_names))
    features = np.random.uniform(size=32)
    detection = RawPrediction(
        geometry=polygon,
        detection_score=0.95,
        class_scores=class_scores,
        features=features,
    )
    prediction = BatDetect2Prediction(clip=clip, predictions=[detection])

    out_path = tmp_path / "complex_geometry.parquet"
    sample_formatter.save(predictions=[prediction], path=out_path)
    loaded = sample_formatter.load(path=out_path)

    assert len(loaded) == 1
    assert len(loaded[0].predictions) == 1

    loaded_geometry = loaded[0].predictions[0].geometry

    # The geometry must come back with its concrete type, not only with
    # matching coordinates.
    assert isinstance(loaded_geometry, data.Polygon)
    assert loaded_geometry == polygon