More structured data module

2025-06-29 22:51:58 +02:00 · 2025-04-03 16:47:03 +01:00 · 2025-04-03 16:47:03 +01:00 · 451093f2da
commit 451093f2da
parent 30d3a2c92e
10 changed files with 697 additions and 0 deletions
--- a/batdetect2/data/__init__.py
+++ b/batdetect2/data/__init__.py
@ -0,0 +1,14 @@
 from batdetect2.data.annotations import (
    AnnotatedDataset,
    load_annotated_dataset,
 )
 from batdetect2.data.data import load_dataset, load_dataset_from_config
 from batdetect2.data.types import Dataset
 __all__ = [
    "AnnotatedDataset",
    "Dataset",
    "load_annotated_dataset",
    "load_dataset",
    "load_dataset_from_config",
 ]
--- a/batdetect2/data/annotations.py
+++ b/batdetect2/data/annotations.py
@ -0,0 +1,36 @@
 import json
 from pathlib import Path
 from typing import Literal, Union
 from batdetect2.configs import BaseConfig
 __all__ = [
    "AOEFAnnotationFile",
    "AnnotationFormats",
    "BatDetect2AnnotationFile",
    "BatDetect2AnnotationFiles",
 ]
 class BatDetect2AnnotationFiles(BaseConfig):
    """Config entry for annotations declared with the ``batdetect2`` format."""
    # Literal tag identifying this annotation format in a config.
    format: Literal["batdetect2"] = "batdetect2"
    # Location of the annotations.
    # NOTE(review): presumably a directory of per-recording files, judging
    # by the plural class name -- confirm against the loader.
    path: Path
 class BatDetect2AnnotationFile(BaseConfig):
    """Config entry for annotations declared with ``batdetect2_file``."""
    # Literal tag identifying this annotation format in a config.
    format: Literal["batdetect2_file"] = "batdetect2_file"
    # Location of the annotations.
    # NOTE(review): presumably a single annotation file -- confirm.
    path: Path
 class AOEFAnnotationFile(BaseConfig):
    """Config entry for annotations declared with the ``aoef`` format."""
    # Literal tag identifying this annotation format in a config.
    format: Literal["aoef"] = "aoef"
    # Location of the annotations.
    # NOTE(review): presumably a single AOEF file -- confirm.
    path: Path
 # Union of every annotation config model declared in this module. Each
 # member carries a distinct literal ``format`` value.
 AnnotationFormats = Union[
    BatDetect2AnnotationFiles,
    BatDetect2AnnotationFile,
    AOEFAnnotationFile,
 ]
--- a/batdetect2/data/annotations/__init__.py
+++ b/batdetect2/data/annotations/__init__.py
@ -0,0 +1,55 @@
 from pathlib import Path
 from typing import Optional, Union
 from soundevent import data
 from batdetect2.data.annotations.aeof import (
    AOEFAnnotations,
    load_aoef_annotated_dataset,
 )
 from batdetect2.data.annotations.batdetect2_files import (
    BatDetect2FilesAnnotations,
    load_batdetect2_files_annotated_dataset,
 )
 from batdetect2.data.annotations.batdetect2_merged import (
    BatDetect2MergedAnnotations,
    load_batdetect2_merged_annotated_dataset,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset
 __all__ = [
    "load_annotated_dataset",
    "AnnotatedDataset",
    "AOEFAnnotations",
    "BatDetect2FilesAnnotations",
    "BatDetect2MergedAnnotations",
    "AnnotationFormats",
 ]
 # Union of every supported annotated-dataset config model. Each member
 # declares a distinct literal ``format`` value ("batdetect2_file",
 # "batdetect2" and "aoef" respectively).
 AnnotationFormats = Union[
    BatDetect2MergedAnnotations,
    BatDetect2FilesAnnotations,
    AOEFAnnotations,
 ]
 def load_annotated_dataset(
    dataset: AnnotatedDataset,
    base_dir: Optional[Path] = None,
 ) -> data.AnnotationSet:
    """Load the annotations described by an annotated-dataset config.
    The config type selects the loader: AOEF, merged batdetect2 file, or
    a directory of batdetect2 files.
    Args:
        dataset: The annotated-dataset configuration to load.
        base_dir: Optional directory forwarded to the selected loader,
            which uses it as a prefix for the paths in ``dataset``.
    Returns:
        Whatever the selected loader returns for ``dataset``.
    Raises:
        NotImplementedError: If ``dataset`` is not an instance of any of
            the supported config types.
    """
    # Checked in this order; the first matching config type wins.
    loaders = (
        (AOEFAnnotations, load_aoef_annotated_dataset),
        (
            BatDetect2MergedAnnotations,
            load_batdetect2_merged_annotated_dataset,
        ),
        (
            BatDetect2FilesAnnotations,
            load_batdetect2_files_annotated_dataset,
        ),
    )
    for config_type, loader in loaders:
        if isinstance(dataset, config_type):
            return loader(dataset, base_dir=base_dir)
    raise NotImplementedError(f"Unknown annotation format: {dataset.name}")
--- a/batdetect2/data/annotations/aeof.py
+++ b/batdetect2/data/annotations/aeof.py
@ -0,0 +1,37 @@
 from pathlib import Path
 from typing import Literal, Optional
 from soundevent import data, io
 from batdetect2.data.annotations.types import AnnotatedDataset
 __all__ = [
    "AOEFAnnotations",
    "load_aoef_annotated_dataset",
 ]
 class AOEFAnnotations(AnnotatedDataset):
    """Annotated dataset whose annotations are stored in the AOEF format."""
    # Literal tag identifying this annotation format in a config.
    format: Literal["aoef"] = "aoef"
    # Path to the AOEF file; the loader joins it onto ``base_dir`` when one
    # is given.
    annotations_path: Path
 def load_aoef_annotated_dataset(
    dataset: AOEFAnnotations,
    base_dir: Optional[Path] = None,
 ) -> data.AnnotationSet:
    """Load an AOEF file and return the annotations it contains.
    Args:
        dataset: Config giving the AOEF file and the audio directory.
        base_dir: Optional directory prepended to both paths of
            ``dataset``.
    Returns:
        The loaded object, which is an annotation set or an annotation
        project.
    Raises:
        ValueError: If the file holds any other kind of object.
    """
    annotations_file = dataset.annotations_path
    recordings_dir = dataset.audio_dir
    if base_dir:
        recordings_dir = base_dir / recordings_dir
        annotations_file = base_dir / annotations_file
    result = io.load(annotations_file, audio_dir=recordings_dir)
    if isinstance(result, (data.AnnotationSet, data.AnnotationProject)):
        return result
    raise ValueError(
        f"The AOEF file at {annotations_file} does not contain a set of annotations"
    )
--- a/batdetect2/data/annotations/batdetect2_files.py
+++ b/batdetect2/data/annotations/batdetect2_files.py
@ -0,0 +1,80 @@
 import os
 from pathlib import Path
 from typing import Literal, Optional, Union
 from soundevent import data
 from batdetect2.data.annotations.legacy import (
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    list_file_annotations,
    load_file_annotation,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset
 PathLike = Union[Path, str, os.PathLike]
 __all__ = [
    "load_batdetect2_files_annotated_dataset",
    "BatDetect2FilesAnnotations",
 ]
 class BatDetect2FilesAnnotations(AnnotatedDataset):
    """Annotated dataset stored as a directory of batdetect2 JSON files."""
    # Literal tag identifying this annotation format in a config.
    format: Literal["batdetect2"] = "batdetect2"
    # Directory that is scanned for annotation files; the loader joins it
    # onto ``base_dir`` when one is given.
    annotations_dir: Path
 def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
 ) -> data.AnnotationProject:
    """Build an annotation project from a directory of annotation files.
    Every file listed in the annotations directory is loaded and converted
    into a clip annotation plus an annotation task. A file is skipped
    silently when loading it, or locating its recording, raises
    ``FileNotFoundError``.
    Args:
        dataset: Config giving the annotations and audio directories.
        base_dir: Optional directory prepended to both directories.
    Returns:
        A project named and described after ``dataset`` holding the
        converted clip annotations and tasks.
    """
    recordings_dir = dataset.audio_dir
    annotations_dir = dataset.annotations_dir
    if base_dir:
        recordings_dir = base_dir / recordings_dir
        annotations_dir = base_dir / annotations_dir
    clip_annotations = []
    annotation_tasks = []
    for annotation_file in list_file_annotations(annotations_dir):
        # Both steps are best-effort: a missing file just drops the entry.
        try:
            parsed = load_file_annotation(annotation_file)
            clip = file_annotation_to_clip(parsed, audio_dir=recordings_dir)
        except FileNotFoundError:
            continue
        clip_annotations.append(
            file_annotation_to_clip_annotation(parsed, clip)
        )
        annotation_tasks.append(
            file_annotation_to_annotation_task(parsed, clip)
        )
    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
        tasks=annotation_tasks,
    )
--- a/batdetect2/data/annotations/batdetect2_merged.py
+++ b/batdetect2/data/annotations/batdetect2_merged.py
@ -0,0 +1,64 @@
 import json
 import os
 from pathlib import Path
 from typing import Literal, Optional, Union
 from soundevent import data
 from batdetect2.data.annotations.legacy import (
    FileAnnotation,
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset
 PathLike = Union[Path, str, os.PathLike]
 __all__ = [
    "BatDetect2MergedAnnotations",
    "load_batdetect2_merged_annotated_dataset",
 ]
 class BatDetect2MergedAnnotations(AnnotatedDataset):
    """Annotated dataset stored as one merged batdetect2 JSON file."""
    # Literal tag identifying this annotation format in a config.
    format: Literal["batdetect2_file"] = "batdetect2_file"
    # Path to the JSON file holding all file annotations; the loader joins
    # it onto ``base_dir`` when one is given.
    annotations_path: Path
 def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
 ) -> data.AnnotationProject:
    """Build an annotation project from a merged batdetect2 JSON file.
    The file is parsed as JSON and iterated entry by entry. Entries that
    fail validation, or whose recording cannot be found, are skipped
    silently.
    Args:
        dataset: Config giving the merged annotation file and audio
            directory.
        base_dir: Optional directory prepended to both paths.
    Returns:
        A project named and described after ``dataset`` holding the
        converted clip annotations and tasks.
    """
    recordings_dir = dataset.audio_dir
    annotations_file = dataset.annotations_path
    if base_dir:
        recordings_dir = base_dir / recordings_dir
        annotations_file = base_dir / annotations_file
    entries = json.loads(Path(annotations_file).read_text())
    clip_annotations = []
    annotation_tasks = []
    for entry in entries:
        try:
            parsed = FileAnnotation.model_validate(entry)
        except ValueError:
            # Entry does not match the expected schema; drop it.
            continue
        try:
            clip = file_annotation_to_clip(parsed, audio_dir=recordings_dir)
        except FileNotFoundError:
            # The recording for this entry is not available; drop it.
            continue
        clip_annotations.append(
            file_annotation_to_clip_annotation(parsed, clip)
        )
        annotation_tasks.append(
            file_annotation_to_annotation_task(parsed, clip)
        )
    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
        tasks=annotation_tasks,
    )
--- a/batdetect2/data/annotations/legacy.py
+++ b/batdetect2/data/annotations/legacy.py
@ -0,0 +1,304 @@
 """Compatibility functions between old and new data structures."""
 import os
 import uuid
 from pathlib import Path
 from typing import Callable, List, Optional, Union
 import numpy as np
 from pydantic import BaseModel, Field
 from soundevent import data
 from soundevent.geometry import compute_bounds
 from soundevent.types import ClassMapper
 from batdetect2 import types
 PathLike = Union[Path, str, os.PathLike]
 __all__ = [
    "convert_to_annotation_group",
 ]
 # Key used to look up the species tag on a recording.
 SPECIES_TAG_KEY = "species"
 # Event name produced by the default ``event_fn`` of
 # ``convert_to_annotation_group``.
 ECHOLOCATION_EVENT = "Echolocation"
 # Class name reported for recordings that carry no species tag.
 UNKNOWN_CLASS = "__UNKNOWN__"
 # Fixed namespace for the uuid5 identifiers generated in this module.
 NAMESPACE = uuid.UUID("97a9776b-c0fd-4c68-accb-0b0ecd719242")
 # Callable signatures accepted by ``convert_to_annotation_group``.
 EventFn = Callable[[data.SoundEventAnnotation], Optional[str]]
 ClassFn = Callable[[data.Recording], int]
 IndividualFn = Callable[[data.SoundEventAnnotation], int]
 def get_recording_class_name(recording: data.Recording) -> str:
    """Return the species tag value of a recording.
    Falls back to ``UNKNOWN_CLASS`` when the recording has no tag under
    ``SPECIES_TAG_KEY``.
    """
    species_tag = data.find_tag(recording.tags, SPECIES_TAG_KEY)
    return UNKNOWN_CLASS if species_tag is None else species_tag.value
 def get_annotation_notes(annotation: data.ClipAnnotation) -> str:
    """Join the notes of a clip annotation and its recording into one string.
    Notes attached to the clip annotation come first, followed by the notes
    of the underlying recording. Notes whose message is ``None`` are left
    out; the remaining messages are separated by newlines.
    """
    combined = (*annotation.notes, *annotation.clip.recording.notes)
    return "\n".join(
        note.message for note in combined if note.message is not None
    )
 def convert_to_annotation_group(
    annotation: data.ClipAnnotation,
    class_mapper: ClassMapper,
    event_fn: EventFn = lambda _: ECHOLOCATION_EVENT,
    class_fn: ClassFn = lambda _: 0,
    individual_fn: IndividualFn = lambda _: 0,
 ) -> types.AudioLoaderAnnotationGroup:
    """Convert a ClipAnnotation to an AudioLoaderAnnotationGroup.
    Sound events without a geometry are skipped. For every remaining sound
    event the bounds of its geometry are recorded together with its class
    id, individual id and event name.
    Args:
        annotation: The clip annotation to convert.
        class_mapper: Maps a sound event annotation to a class id. A result
            of ``None`` is stored as ``-1``.
        event_fn: Returns the event name of a sound event annotation. A
            falsy result is stored as an empty string.
        class_fn: Returns the file-level class id of the recording.
        individual_fn: Returns the individual id of a sound event
            annotation. A result of ``None`` is stored as ``-1``.
    Returns:
        A dictionary describing the recording and its annotations, with
        the per-event values stored as numpy arrays.
    """
    recording = annotation.clip.recording
    start_times = []
    end_times = []
    low_freqs = []
    high_freqs = []
    class_ids = []
    x_inds = []
    y_inds = []
    individual_ids = []
    annotations: List[types.Annotation] = []
    class_id_file = class_fn(recording)
    for sound_event in annotation.sound_events:
        geometry = sound_event.sound_event.geometry
        if geometry is None:
            continue
        start_time, low_freq, end_time, high_freq = compute_bounds(geometry)
        # Compare against None explicitly: 0 is a valid id and must not be
        # replaced by the -1 "missing" marker, which `value or -1` would do.
        class_id = class_mapper.transform(sound_event)
        if class_id is None:
            class_id = -1
        event = event_fn(sound_event) or ""
        individual_id = individual_fn(sound_event)
        if individual_id is None:
            individual_id = -1
        start_times.append(start_time)
        end_times.append(end_time)
        low_freqs.append(low_freq)
        high_freqs.append(high_freq)
        class_ids.append(class_id)
        individual_ids.append(individual_id)
        # NOTE: This will be computed later so we just put a placeholder
        # here for now.
        x_inds.append(0)
        y_inds.append(0)
        annotations.append(
            {
                "start_time": start_time,
                "end_time": end_time,
                "low_freq": low_freq,
                "high_freq": high_freq,
                "class_prob": 1.0,
                "det_prob": 1.0,
                "individual": "0",
                "event": event,
                "class_id": class_id,  # type: ignore
            }
        )
    return {
        "id": str(recording.path),
        "duration": recording.duration,
        "issues": False,
        "file_path": str(recording.path),
        "time_exp": recording.time_expansion,
        "class_name": get_recording_class_name(recording),
        "notes": get_annotation_notes(annotation),
        "annotated": True,
        "start_times": np.array(start_times),
        "end_times": np.array(end_times),
        "low_freqs": np.array(low_freqs),
        "high_freqs": np.array(high_freqs),
        "class_ids": np.array(class_ids),
        "x_inds": np.array(x_inds),
        "y_inds": np.array(y_inds),
        "individual_ids": np.array(individual_ids),
        "annotation": annotations,
        "class_id_file": class_id_file,
    }
 class Annotation(BaseModel):
    """Annotation class to hold batdetect annotations."""
    # Read from the "class" key of the input data.
    label: str = Field(alias="class")
    # Event name attached to the annotation.
    event: str
    # Individual identifier; 0 when the input omits it.
    individual: int = 0
    # Bounds of the annotated call; they become the bounding box
    # coordinates in ``annotation_to_sound_event``.
    # NOTE(review): presumably seconds and Hz -- confirm.
    start_time: float
    end_time: float
    low_freq: float
    high_freq: float
 class FileAnnotation(BaseModel):
    """FileAnnotation class to hold batdetect annotations for a file."""
    # Joined onto the audio directory to locate the recording, so it acts
    # as the recording's relative file path.
    id: str
    # NOTE(review): presumably the recording duration in seconds -- confirm.
    duration: float
    # Time expansion factor passed on to the recording; defaults to 1.
    time_exp: float = 1
    # File-level label, read from the "class_name" key of the input data.
    label: str = Field(alias="class_name")
    # Individual annotations contained in the file.
    annotation: List[Annotation]
    # Marks the file as annotated; maps to a "completed" status badge.
    annotated: bool = False
    # Marks the file as problematic; maps to a "rejected" status badge and
    # takes precedence over ``annotated``.
    issues: bool = False
    # Free-text notes; turned into a note on the clip annotation when set.
    notes: str = ""
 def load_file_annotation(path: PathLike) -> FileAnnotation:
    """Read a batdetect annotation file and validate it as a FileAnnotation."""
    raw_json = Path(path).read_text()
    return FileAnnotation.model_validate_json(raw_json)
 def annotation_to_sound_event(
    annotation: Annotation,
    recording: data.Recording,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
 ) -> data.SoundEventAnnotation:
    """Turn a batdetect annotation into a sound event annotation.
    The sound event is a bounding box over the annotation's time and
    frequency bounds. Its UUID is derived from the recording hash and the
    start and end times, so repeated conversions give the same identifier.
    Args:
        annotation: The batdetect annotation to convert.
        recording: The recording the annotation belongs to.
        label_key: Term key for the tag holding the annotation label.
        event_key: Term key for the tag holding the event name.
        individual_key: Term key for the tag holding the individual id.
    Returns:
        A sound event annotation with label, event and individual tags.
    """
    event_uuid = uuid.uuid5(
        NAMESPACE,
        f"{recording.hash}_{annotation.start_time}_{annotation.end_time}",
    )
    bounding_box = data.BoundingBox(
        coordinates=[
            annotation.start_time,
            annotation.low_freq,
            annotation.end_time,
            annotation.high_freq,
        ],
    )
    sound_event = data.SoundEvent(
        uuid=event_uuid,
        recording=recording,
        geometry=bounding_box,
    )
    annotation_uuid = uuid.uuid5(NAMESPACE, f"{sound_event.uuid}_annotation")
    keyed_values = (
        (label_key, annotation.label),
        (event_key, annotation.event),
        (individual_key, str(annotation.individual)),
    )
    tags = [
        data.Tag(term=data.term_from_key(key), value=value)
        for key, value in keyed_values
    ]
    return data.SoundEventAnnotation(
        uuid=annotation_uuid,
        sound_event=sound_event,
        tags=tags,
    )
 def file_annotation_to_clip(
    file_annotation: FileAnnotation,
    audio_dir: Optional[PathLike] = None,
    label_key: str = "class",
 ) -> data.Clip:
    """Build a clip covering the whole recording of a file annotation.
    Args:
        file_annotation: The file annotation naming the recording.
        audio_dir: Directory containing the recording. The current working
            directory is used when it is not given.
        label_key: Term key for the recording tag holding the file label.
    Returns:
        A clip running from 0 to the duration of the recording.
    Raises:
        FileNotFoundError: If the recording does not exist in
            ``audio_dir``.
    """
    recordings_dir = Path(audio_dir) if audio_dir else Path.cwd()
    recording_path = recordings_dir / file_annotation.id
    if not recording_path.exists():
        raise FileNotFoundError(f"File {recording_path} not found.")
    label_tag = data.Tag(
        term=data.term_from_key(label_key),
        value=file_annotation.label,
    )
    recording = data.Recording.from_file(
        recording_path,
        time_expansion=file_annotation.time_exp,
        tags=[label_tag],
    )
    clip_uuid = uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip")
    return data.Clip(
        uuid=clip_uuid,
        recording=recording,
        start_time=0,
        end_time=recording.duration,
    )
 def file_annotation_to_clip_annotation(
    file_annotation: FileAnnotation,
    clip: data.Clip,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
 ) -> data.ClipAnnotation:
    """Build the clip annotation for a file annotation.
    Args:
        file_annotation: The file annotation to convert.
        clip: The clip the annotations refer to.
        label_key: Term key for tags holding a label.
        event_key: Term key for tags holding an event name.
        individual_key: Term key for tags holding an individual id.
    Returns:
        A clip annotation tagged with the file label, carrying the file
        notes (when non-empty) and one sound event per annotation.
    """
    annotation_uuid = uuid.uuid5(
        NAMESPACE, f"{file_annotation.id}_clip_annotation"
    )
    # An empty notes string produces no note at all.
    notes = (
        [data.Note(message=file_annotation.notes)]
        if file_annotation.notes
        else []
    )
    label_tag = data.Tag(
        term=data.term_from_key(label_key),
        value=file_annotation.label,
    )
    sound_events = []
    for entry in file_annotation.annotation:
        sound_events.append(
            annotation_to_sound_event(
                entry,
                clip.recording,
                label_key=label_key,
                event_key=event_key,
                individual_key=individual_key,
            )
        )
    return data.ClipAnnotation(
        uuid=annotation_uuid,
        clip=clip,
        notes=notes,
        tags=[label_tag],
        sound_events=sound_events,
    )
 def file_annotation_to_annotation_task(
    file_annotation: FileAnnotation,
    clip: data.Clip,
 ) -> data.AnnotationTask:
    """Build the annotation task describing the status of a file annotation.
    A file flagged with issues gets a "rejected" badge. Otherwise a file
    marked as annotated gets a "completed" badge. Any other file gets no
    badge at all.
    Args:
        file_annotation: The file annotation whose status is reported.
        clip: The clip the task refers to.
    Returns:
        An annotation task for ``clip`` with at most one status badge.
    """
    if file_annotation.issues:
        state = data.AnnotationState.rejected
    elif file_annotation.annotated:
        state = data.AnnotationState.completed
    else:
        state = None
    badges = [] if state is None else [data.StatusBadge(state=state)]
    # NOTE(review): the task UUID is derived from uuid.NAMESPACE_URL while
    # the other converters in this module use NAMESPACE -- confirm intended.
    task_uuid = uuid.uuid5(uuid.NAMESPACE_URL, f"{file_annotation.id}_task")
    return data.AnnotationTask(
        uuid=task_uuid,
        clip=clip,
        status_badges=badges,
    )
 def list_file_annotations(path: PathLike) -> List[Path]:
    """List all annotations in a directory.
    Args:
        path: Directory to search. Only its direct children are
            considered.
    Returns:
        The paths of all ``*.json`` entries in ``path``, sorted so that
        the result does not depend on the order in which the file system
        lists them.
    """
    return sorted(Path(path).glob("*.json"))
--- a/batdetect2/data/annotations/types.py
+++ b/batdetect2/data/annotations/types.py
@ -0,0 +1,41 @@
 from pathlib import Path
 from typing import Literal, Union
 from batdetect2.configs import BaseConfig
 # Only names defined in this module may be listed here: a star-import
 # raises AttributeError for any entry of ``__all__`` that is missing.
 __all__ = [
    "AnnotatedDataset",
 ]
 class AnnotatedDataset(BaseConfig):
    """Represents a single, cohesive source of audio recordings and annotations.
    A source typically groups recordings originating from a specific context,
    such as a single project, site, deployment, or recordist. All audio files
    belonging to a source should be located within a single directory,
    specified by `audio_dir`.
    This base class only holds the fields shared by every source. The
    annotation format and the location of the annotation data are declared
    by format-specific subclasses (e.g. AOEF files or batdetect2 JSON
    files), each of which adds its own `format` literal and a path field.
    Crucially, file paths referenced within the annotation data *must* be
    relative to the `audio_dir`. This ensures that the dataset definition
    remains portable across different systems and base directories.
    Attributes:
        name: A unique identifier for this data source.
        description: Detailed information about the source, including recording
            methods, annotation procedures, equipment used, potential biases,
            or any important caveats for users. Defaults to an empty string.
        audio_dir: The file system path to the directory containing the audio
            recordings for this source.
    """
    name: str
    audio_dir: Path
    description: str = ""
--- a/batdetect2/data/data.py
+++ b/batdetect2/data/data.py
@ -0,0 +1,37 @@
 from pathlib import Path
 from typing import Optional
 from soundevent import data
 from batdetect2.configs import load_config
 from batdetect2.data.annotations import load_annotated_dataset
 from batdetect2.data.types import Dataset
 __all__ = [
    "load_dataset",
    "load_dataset_from_config",
 ]
 def load_dataset(
    dataset: Dataset,
    base_dir: Optional[Path] = None,
 ) -> data.AnnotationSet:
    """Load every source of a dataset into a single annotation set.
    Args:
        dataset: The dataset configuration listing the sources to load.
        base_dir: Optional directory forwarded to the loader of each
            source.
    Returns:
        An annotation set holding the clip annotations of all sources, in
        source order.
    """
    merged = [
        clip_annotation
        for source in dataset.sources
        for clip_annotation in load_annotated_dataset(
            source, base_dir=base_dir
        ).clip_annotations
    ]
    return data.AnnotationSet(clip_annotations=merged)
 def load_dataset_from_config(
    path: data.PathLike,
    field: Optional[str] = None,
    base_dir: Optional[Path] = None,
 ):
    """Read a dataset configuration from a file and load its annotations.
    Args:
        path: Path to the configuration file.
        field: Optional field forwarded to ``load_config``.
        base_dir: Optional directory forwarded to ``load_dataset``.
    Returns:
        The result of ``load_dataset`` for the parsed configuration.
    """
    dataset_config = load_config(path=path, schema=Dataset, field=field)
    return load_dataset(dataset_config, base_dir=base_dir)
--- a/batdetect2/data/types.py
+++ b/batdetect2/data/types.py
@ -0,0 +1,29 @@
 from typing import Annotated, List
 from pydantic import Field
 from batdetect2.configs import BaseConfig
 from batdetect2.data.annotations import AnnotationFormats
 class Dataset(BaseConfig):
    """Represents a collection of one or more annotated data sources.
    In the context of batdetect2, a Dataset aggregates multiple sources,
    each described by one of the `AnnotationFormats` config models. It
    serves as the primary unit for defining data splits, typically used for
    model training, validation, or testing phases.
    Attributes:
        name: A descriptive name for the overall dataset
            (e.g., "UK Training Set").
        description: A detailed explanation of the dataset's purpose,
            composition, how it was assembled, or any specific characteristics.
        sources: A list containing the source configs included in this
            dataset. The `format` field of each entry selects which
            `AnnotationFormats` model it is parsed as.
    """
    name: str
    description: str
    sources: List[
        Annotated[AnnotationFormats, Field(..., discriminator="format")]
    ]