# NOTE(review): This artifact is a `git format-patch` email whose newlines
# were collapsed to spaces in transit.  The reconstruction below restores the
# ten new Python modules the patch creates, each behind a
# `# --- file: <path> ---` marker, with conventional 4-space indentation
# (the original indentation is unrecoverable from the mangled text).
#
# Patch header:
#   From 451093f2da06e6d7b4b4a032ad064496d7e17d44 Mon Sep 17 00:00:00 2001
#   From: mbsantiago
#   Date: Thu, 3 Apr 2025 16:47:03 +0100
#   Subject: [PATCH] More structured data module
#
# Review fixes applied (everything else is a faithful reconstruction):
#   * `annotations/types.py` exported an undefined name in `__all__`.
#   * `data/types.py` docstring referenced a non-existent `DatasetSource`
#     class; the actual class is `AnnotatedDataset`.
#   * `legacy.py` `list_file_annotations` used an identity comprehension.
#   * `data.py` `load_dataset_from_config` gained a return annotation.
# Unresolved issues are marked inline with NOTE(review).


# --- file: batdetect2/data/__init__.py ---

from batdetect2.data.annotations import (
    AnnotatedDataset,
    load_annotated_dataset,
)
from batdetect2.data.data import load_dataset, load_dataset_from_config
from batdetect2.data.types import Dataset

__all__ = [
    "AnnotatedDataset",
    "Dataset",
    "load_annotated_dataset",
    "load_dataset",
    "load_dataset_from_config",
]


# --- file: batdetect2/data/annotations.py ---
# NOTE(review): this flat module shares the import name
# `batdetect2.data.annotations` with the package created below; Python will
# resolve the package and silently shadow this file.  It looks like an older
# draft that should be dropped from the patch — confirm with the author.
# The `json` import is also unused here.

import json
from pathlib import Path
from typing import Literal, Union

from batdetect2.configs import BaseConfig

__all__ = [
    "AOEFAnnotationFile",
    "AnnotationFormats",
    "BatDetect2AnnotationFile",
    "BatDetect2AnnotationFiles",
]


class BatDetect2AnnotationFiles(BaseConfig):
    """Pointer to a directory of per-recording legacy annotation files."""

    # Discriminator value used to select this variant of the union below.
    format: Literal["batdetect2"] = "batdetect2"
    path: Path


class BatDetect2AnnotationFile(BaseConfig):
    """Pointer to a single merged legacy annotation JSON file."""

    format: Literal["batdetect2_file"] = "batdetect2_file"
    path: Path


class AOEFAnnotationFile(BaseConfig):
    """Pointer to an annotation file in the soundevent AOEF format."""

    format: Literal["aoef"] = "aoef"
    path: Path


AnnotationFormats = Union[
    BatDetect2AnnotationFiles,
    BatDetect2AnnotationFile,
    AOEFAnnotationFile,
]


# --- file: batdetect2/data/annotations/__init__.py ---

from pathlib import Path
from typing import Optional, Union

from soundevent import data

from batdetect2.data.annotations.aeof import (
    AOEFAnnotations,
    load_aoef_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_files import (
    BatDetect2FilesAnnotations,
    load_batdetect2_files_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_merged import (
    BatDetect2MergedAnnotations,
    load_batdetect2_merged_annotated_dataset,
)
from batdetect2.data.annotations.types import AnnotatedDataset

__all__ = [
    "load_annotated_dataset",
    "AnnotatedDataset",
    "AOEFAnnotations",
    "BatDetect2FilesAnnotations",
    "BatDetect2MergedAnnotations",
    "AnnotationFormats",
]


# Union of every concrete annotation-source config; `format` is the
# discriminator field (see `Dataset.sources` in batdetect2/data/types.py).
AnnotationFormats = Union[
    BatDetect2MergedAnnotations,
    BatDetect2FilesAnnotations,
    AOEFAnnotations,
]


def load_annotated_dataset(
    dataset: AnnotatedDataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load one annotated source, dispatching on its concrete subclass.

    Args:
        dataset: Any of the `AnnotationFormats` configuration objects.
        base_dir: Optional root against which the source's relative
            `audio_dir`/annotation paths are resolved.

    Raises:
        NotImplementedError: If `dataset` is not a recognised subclass.
    """
    if isinstance(dataset, AOEFAnnotations):
        return load_aoef_annotated_dataset(dataset, base_dir=base_dir)

    if isinstance(dataset, BatDetect2MergedAnnotations):
        return load_batdetect2_merged_annotated_dataset(
            dataset,
            base_dir=base_dir,
        )

    if isinstance(dataset, BatDetect2FilesAnnotations):
        return load_batdetect2_files_annotated_dataset(
            dataset,
            base_dir=base_dir,
        )

    # NOTE(review): the message interpolates `dataset.name` (the source's
    # human-readable name), not the unrecognised format value — confirm
    # whether `dataset.format` was intended.
    raise NotImplementedError(f"Unknown annotation format: {dataset.name}")


# --- file: batdetect2/data/annotations/aeof.py ---
# NOTE(review): the file is named "aeof" but the format is "AOEF" — likely a
# transposed-letters typo; renaming it would also touch the imports above.

from pathlib import Path
from typing import Literal, Optional

from soundevent import data, io

from batdetect2.data.annotations.types import AnnotatedDataset

__all__ = [
    "AOEFAnnotations",
    "load_aoef_annotated_dataset",
]


class AOEFAnnotations(AnnotatedDataset):
    """An annotated source stored as a single AOEF file."""

    format: Literal["aoef"] = "aoef"
    # Path to the AOEF file, relative to `base_dir` when one is given.
    annotations_path: Path


def load_aoef_annotated_dataset(
    dataset: AOEFAnnotations,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load an AOEF annotation set, resolving paths against `base_dir`.

    Raises:
        ValueError: If the file holds something other than an
            AnnotationSet/AnnotationProject (e.g. a bare recording set).
    """
    audio_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    loaded = io.load(path, audio_dir=audio_dir)

    if not isinstance(loaded, (data.AnnotationSet, data.AnnotationProject)):
        raise ValueError(
            f"The AOEF file at {path} does not contain a set of annotations"
        )

    return loaded


# --- file: batdetect2/data/annotations/batdetect2_files.py ---

import os
from pathlib import Path
from typing import Literal, Optional, Union

from soundevent import data

from batdetect2.data.annotations.legacy import (
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    list_file_annotations,
    load_file_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset

PathLike = Union[Path, str, os.PathLike]


__all__ = [
    "load_batdetect2_files_annotated_dataset",
    "BatDetect2FilesAnnotations",
]


class BatDetect2FilesAnnotations(AnnotatedDataset):
    """An annotated source stored as one legacy JSON file per recording."""

    # NOTE(review): "batdetect2" here vs "batdetect2_file" for the *merged*
    # variant reads swapped, but it matches the flat annotations.py module,
    # so it appears deliberate — confirm before renaming.
    format: Literal["batdetect2"] = "batdetect2"
    annotations_dir: Path


def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Convert annotations to annotation project."""
    audio_dir = dataset.audio_dir
    path = dataset.annotations_dir

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    paths = list_file_annotations(path)

    annotations = []
    tasks = []

    for p in paths:
        # Best-effort loading: unreadable annotation files and annotations
        # whose audio file is missing are silently skipped.
        try:
            file_annotation = load_file_annotation(p)
        except FileNotFoundError:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            continue

        annotations.append(
            file_annotation_to_clip_annotation(
                file_annotation,
                clip,
            )
        )

        tasks.append(
            file_annotation_to_annotation_task(
                file_annotation,
                clip,
            )
        )

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
        tasks=tasks,
    )


# --- file: batdetect2/data/annotations/batdetect2_merged.py ---

import json
import os
from pathlib import Path
from typing import Literal, Optional, Union

from soundevent import data

from batdetect2.data.annotations.legacy import (
    FileAnnotation,
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset

PathLike = Union[Path, str, os.PathLike]

__all__ = [
    "BatDetect2MergedAnnotations",
    "load_batdetect2_merged_annotated_dataset",
]


class BatDetect2MergedAnnotations(AnnotatedDataset):
    """An annotated source stored as a single merged legacy JSON file."""

    format: Literal["batdetect2_file"] = "batdetect2_file"
    annotations_path: Path


def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Load a merged legacy annotation file into an AnnotationProject.

    Entries that fail validation, or whose audio file cannot be found under
    `audio_dir`, are skipped silently (best-effort import).
    """
    audio_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    content = json.loads(Path(path).read_text())

    annotations = []
    tasks = []

    for ann in content:
        try:
            ann = FileAnnotation.model_validate(ann)
        except ValueError:
            continue

        try:
            clip = file_annotation_to_clip(ann, audio_dir=audio_dir)
        except FileNotFoundError:
            continue

        annotations.append(file_annotation_to_clip_annotation(ann, clip))
        tasks.append(file_annotation_to_annotation_task(ann, clip))

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
        tasks=tasks,
    )


# --- file: batdetect2/data/annotations/legacy.py ---

"""Compatibility functions between old and new data structures."""

import os
import uuid
from pathlib import Path
from typing import Callable, List, Optional, Union

import numpy as np
from pydantic import BaseModel, Field
from soundevent import data
from soundevent.geometry import compute_bounds
from soundevent.types import ClassMapper

from batdetect2 import types

PathLike = Union[Path, str, os.PathLike]

__all__ = [
    "convert_to_annotation_group",
]

SPECIES_TAG_KEY = "species"
ECHOLOCATION_EVENT = "Echolocation"
UNKNOWN_CLASS = "__UNKNOWN__"

# Fixed namespace so uuid5-derived identifiers are stable across runs.
NAMESPACE = uuid.UUID("97a9776b-c0fd-4c68-accb-0b0ecd719242")


EventFn = Callable[[data.SoundEventAnnotation], Optional[str]]

ClassFn = Callable[[data.Recording], int]

IndividualFn = Callable[[data.SoundEventAnnotation], int]


def get_recording_class_name(recording: data.Recording) -> str:
    """Get the class name for a recording."""
    tag = data.find_tag(recording.tags, SPECIES_TAG_KEY)
    if tag is None:
        return UNKNOWN_CLASS
    return tag.value


def get_annotation_notes(annotation: data.ClipAnnotation) -> str:
    """Get the notes for a ClipAnnotation."""
    all_notes = [
        *annotation.notes,
        *annotation.clip.recording.notes,
    ]
    messages = [note.message for note in all_notes if note.message is not None]
    return "\n".join(messages)


def convert_to_annotation_group(
    annotation: data.ClipAnnotation,
    class_mapper: ClassMapper,
    event_fn: EventFn = lambda _: ECHOLOCATION_EVENT,
    class_fn: ClassFn = lambda _: 0,
    individual_fn: IndividualFn = lambda _: 0,
) -> types.AudioLoaderAnnotationGroup:
    """Convert a ClipAnnotation to an AudioLoaderAnnotationGroup."""
    recording = annotation.clip.recording

    start_times = []
    end_times = []
    low_freqs = []
    high_freqs = []
    class_ids = []
    x_inds = []
    y_inds = []
    individual_ids = []
    annotations: List[types.Annotation] = []
    class_id_file = class_fn(recording)

    for sound_event in annotation.sound_events:
        geometry = sound_event.sound_event.geometry

        # Sound events without geometry cannot contribute bounds; skip them.
        if geometry is None:
            continue

        start_time, low_freq, end_time, high_freq = compute_bounds(geometry)
        class_id = class_mapper.transform(sound_event) or -1
        event = event_fn(sound_event) or ""
        individual_id = individual_fn(sound_event) or -1

        start_times.append(start_time)
        end_times.append(end_time)
        low_freqs.append(low_freq)
        high_freqs.append(high_freq)
        class_ids.append(class_id)
        individual_ids.append(individual_id)

        # NOTE: This will be computed later so we just put a placeholder
        # here for now.
        x_inds.append(0)
        y_inds.append(0)

        annotations.append(
            {
                "start_time": start_time,
                "end_time": end_time,
                "low_freq": low_freq,
                "high_freq": high_freq,
                "class_prob": 1.0,
                "det_prob": 1.0,
                "individual": "0",
                "event": event,
                "class_id": class_id,  # type: ignore
            }
        )

    return {
        "id": str(recording.path),
        "duration": recording.duration,
        "issues": False,
        "file_path": str(recording.path),
        "time_exp": recording.time_expansion,
        "class_name": get_recording_class_name(recording),
        "notes": get_annotation_notes(annotation),
        "annotated": True,
        "start_times": np.array(start_times),
        "end_times": np.array(end_times),
        "low_freqs": np.array(low_freqs),
        "high_freqs": np.array(high_freqs),
        "class_ids": np.array(class_ids),
        "x_inds": np.array(x_inds),
        "y_inds": np.array(y_inds),
        "individual_ids": np.array(individual_ids),
        "annotation": annotations,
        "class_id_file": class_id_file,
    }


class Annotation(BaseModel):
    """Annotation class to hold batdetect annotations."""

    # Legacy JSON uses the reserved word "class" for the label.
    label: str = Field(alias="class")
    event: str
    individual: int = 0

    start_time: float
    end_time: float
    low_freq: float
    high_freq: float


class FileAnnotation(BaseModel):
    """FileAnnotation class to hold batdetect annotations for a file."""

    id: str
    duration: float
    time_exp: float = 1

    label: str = Field(alias="class_name")

    annotation: List[Annotation]

    annotated: bool = False
    issues: bool = False
    notes: str = ""


def load_file_annotation(path: PathLike) -> FileAnnotation:
    """Load annotation from batdetect format."""
    path = Path(path)
    return FileAnnotation.model_validate_json(path.read_text())


def annotation_to_sound_event(
    annotation: Annotation,
    recording: data.Recording,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
) -> data.SoundEventAnnotation:
    """Convert annotation to sound event annotation."""
    # Deterministic UUID: the same recording + time span always maps to the
    # same sound event.
    sound_event = data.SoundEvent(
        uuid=uuid.uuid5(
            NAMESPACE,
            f"{recording.hash}_{annotation.start_time}_{annotation.end_time}",
        ),
        recording=recording,
        geometry=data.BoundingBox(
            coordinates=[
                annotation.start_time,
                annotation.low_freq,
                annotation.end_time,
                annotation.high_freq,
            ],
        ),
    )

    return data.SoundEventAnnotation(
        uuid=uuid.uuid5(NAMESPACE, f"{sound_event.uuid}_annotation"),
        sound_event=sound_event,
        tags=[
            data.Tag(
                term=data.term_from_key(label_key),
                value=annotation.label,
            ),
            data.Tag(
                term=data.term_from_key(event_key),
                value=annotation.event,
            ),
            data.Tag(
                term=data.term_from_key(individual_key),
                value=str(annotation.individual),
            ),
        ],
    )


def file_annotation_to_clip(
    file_annotation: FileAnnotation,
    audio_dir: Optional[PathLike] = None,
    label_key: str = "class",
) -> data.Clip:
    """Convert file annotation to recording."""
    audio_dir = audio_dir or Path.cwd()

    full_path = Path(audio_dir) / file_annotation.id

    if not full_path.exists():
        raise FileNotFoundError(f"File {full_path} not found.")

    recording = data.Recording.from_file(
        full_path,
        time_expansion=file_annotation.time_exp,
        tags=[
            data.Tag(
                term=data.term_from_key(label_key),
                value=file_annotation.label,
            )
        ],
    )

    return data.Clip(
        uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip"),
        recording=recording,
        start_time=0,
        end_time=recording.duration,
    )


def file_annotation_to_clip_annotation(
    file_annotation: FileAnnotation,
    clip: data.Clip,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
) -> data.ClipAnnotation:
    """Convert file annotation to clip annotation."""
    notes = []
    if file_annotation.notes:
        notes.append(data.Note(message=file_annotation.notes))

    return data.ClipAnnotation(
        uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip_annotation"),
        clip=clip,
        notes=notes,
        tags=[
            data.Tag(
                term=data.term_from_key(label_key),
                value=file_annotation.label,
            )
        ],
        sound_events=[
            annotation_to_sound_event(
                annotation,
                clip.recording,
                label_key=label_key,
                event_key=event_key,
                individual_key=individual_key,
            )
            for annotation in file_annotation.annotation
        ],
    )


def file_annotation_to_annotation_task(
    file_annotation: FileAnnotation,
    clip: data.Clip,
) -> data.AnnotationTask:
    """Derive an AnnotationTask (with review status) from a file annotation."""
    status_badges = []

    # `issues` wins over `annotated`: a file flagged with problems is
    # treated as rejected even if it was marked annotated.
    if file_annotation.issues:
        status_badges.append(
            data.StatusBadge(state=data.AnnotationState.rejected)
        )
    elif file_annotation.annotated:
        status_badges.append(
            data.StatusBadge(state=data.AnnotationState.completed)
        )

    # NOTE(review): every sibling helper seeds uuid5 with the module-level
    # NAMESPACE, but this one uses uuid.NAMESPACE_URL — confirm whether that
    # is intentional (changing it would alter all existing task UUIDs).
    return data.AnnotationTask(
        uuid=uuid.uuid5(uuid.NAMESPACE_URL, f"{file_annotation.id}_task"),
        clip=clip,
        status_badges=status_badges,
    )


def list_file_annotations(path: PathLike) -> List[Path]:
    """List all annotations in a directory."""
    path = Path(path)
    # Identity comprehension replaced with a direct list() of the glob.
    return list(path.glob("*.json"))


# --- file: batdetect2/data/annotations/types.py ---

from pathlib import Path
from typing import Literal, Union

from batdetect2.configs import BaseConfig

# Fixed: the original also exported "BatDetect2MergedAnnotations", which is
# defined in batdetect2_merged.py, not in this module.
__all__ = [
    "AnnotatedDataset",
]


class AnnotatedDataset(BaseConfig):
    """Represents a single, cohesive source of audio recordings and annotations.

    A source typically groups recordings originating from a specific context,
    such as a single project, site, deployment, or recordist. All audio files
    belonging to a source should be located within a single directory,
    specified by `audio_dir`.

    Annotations associated with these recordings are defined by the
    `annotations` field, which supports various formats (e.g., AOEF files,
    specific CSV structures).
    Crucially, file paths referenced within the annotation data *must* be
    relative to the `audio_dir`. This ensures that the dataset definition
    remains portable across different systems and base directories.

    Attributes:
        name: A unique identifier for this data source.
        description: Detailed information about the source, including recording
            methods, annotation procedures, equipment used, potential biases,
            or any important caveats for users.
        audio_dir: The file system path to the directory containing the audio
            recordings for this source.
    """

    name: str
    audio_dir: Path
    description: str = ""


# --- file: batdetect2/data/data.py ---

from pathlib import Path
from typing import Optional

from soundevent import data

from batdetect2.configs import load_config
from batdetect2.data.annotations import load_annotated_dataset
from batdetect2.data.types import Dataset

__all__ = [
    "load_dataset",
    "load_dataset_from_config",
]


def load_dataset(
    dataset: Dataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load every source of `dataset` and merge their clip annotations."""
    clip_annotations = []
    for source in dataset.sources:
        annotated_source = load_annotated_dataset(source, base_dir=base_dir)
        clip_annotations.extend(annotated_source.clip_annotations)
    return data.AnnotationSet(clip_annotations=clip_annotations)


def load_dataset_from_config(
    path: data.PathLike,
    field: Optional[str] = None,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Parse a Dataset config from `path` (optionally a sub-`field`) and load it."""
    config = load_config(
        path=path,
        schema=Dataset,
        field=field,
    )
    return load_dataset(config, base_dir=base_dir)


# --- file: batdetect2/data/types.py ---

from typing import Annotated, List

from pydantic import Field

from batdetect2.configs import BaseConfig
from batdetect2.data.annotations import AnnotationFormats


class Dataset(BaseConfig):
    """Represents a collection of one or more annotated sources.

    In the context of batdetect2, a Dataset aggregates multiple
    `AnnotatedDataset` instances. It serves as the primary unit for defining
    data splits, typically used for model training, validation, or testing
    phases.

    Attributes:
        name: A descriptive name for the overall dataset
            (e.g., "UK Training Set").
        description: A detailed explanation of the dataset's purpose,
            composition, how it was assembled, or any specific characteristics.
        sources: A list containing the `AnnotatedDataset` objects included in
            this dataset.
    """

    name: str
    description: str
    # The `format` field of each source discriminates the union, so pydantic
    # can pick the right concrete config class when parsing.
    sources: List[
        Annotated[AnnotationFormats, Field(..., discriminator="format")]
    ]