# NOTE(review): This artifact is a `git format-patch` email whose newlines
# were collapsed to spaces in transit.  The reconstruction below restores the
# ten new Python modules the patch creates, each behind a
# `# --- file: <path> ---` marker, with conventional 4-space indentation
# (the original indentation is unrecoverable from the mangled text).
#
# Patch header:
#   From 451093f2da06e6d7b4b4a032ad064496d7e17d44 Mon Sep 17 00:00:00 2001
#   From: mbsantiago
#   Date: Thu, 3 Apr 2025 16:47:03 +0100
#   Subject: [PATCH] More structured data module
#
# Review fixes applied (everything else is a faithful reconstruction):
#   * `annotations/types.py` exported an undefined name in `__all__`.
#   * `data/types.py` docstring referenced a non-existent `DatasetSource`
#     class; the actual class is `AnnotatedDataset`.
#   * `legacy.py` `list_file_annotations` used an identity comprehension.
#   * `data.py` `load_dataset_from_config` gained a return annotation.
# Unresolved issues are marked inline with NOTE(review).


# --- file: batdetect2/data/__init__.py ---

from batdetect2.data.annotations import (
    AnnotatedDataset,
    load_annotated_dataset,
)
from batdetect2.data.data import load_dataset, load_dataset_from_config
from batdetect2.data.types import Dataset

__all__ = [
    "AnnotatedDataset",
    "Dataset",
    "load_annotated_dataset",
    "load_dataset",
    "load_dataset_from_config",
]


# --- file: batdetect2/data/annotations.py ---
# NOTE(review): this flat module shares the import name
# `batdetect2.data.annotations` with the package created below; Python will
# resolve the package and silently shadow this file.  It looks like an older
# draft that should be dropped from the patch — confirm with the author.
# The `json` import is also unused here.

import json
from pathlib import Path
from typing import Literal, Union

from batdetect2.configs import BaseConfig

__all__ = [
    "AOEFAnnotationFile",
    "AnnotationFormats",
    "BatDetect2AnnotationFile",
    "BatDetect2AnnotationFiles",
]


class BatDetect2AnnotationFiles(BaseConfig):
    """Pointer to a directory of per-recording legacy annotation files."""

    # Discriminator value used to select this variant of the union below.
    format: Literal["batdetect2"] = "batdetect2"
    path: Path


class BatDetect2AnnotationFile(BaseConfig):
    """Pointer to a single merged legacy annotation JSON file."""

    format: Literal["batdetect2_file"] = "batdetect2_file"
    path: Path


class AOEFAnnotationFile(BaseConfig):
    """Pointer to an annotation file in the soundevent AOEF format."""

    format: Literal["aoef"] = "aoef"
    path: Path


AnnotationFormats = Union[
    BatDetect2AnnotationFiles,
    BatDetect2AnnotationFile,
    AOEFAnnotationFile,
]


# --- file: batdetect2/data/annotations/__init__.py ---

from pathlib import Path
from typing import Optional, Union

from soundevent import data

from batdetect2.data.annotations.aeof import (
    AOEFAnnotations,
    load_aoef_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_files import (
    BatDetect2FilesAnnotations,
    load_batdetect2_files_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_merged import (
    BatDetect2MergedAnnotations,
    load_batdetect2_merged_annotated_dataset,
)
from batdetect2.data.annotations.types import AnnotatedDataset

__all__ = [
    "load_annotated_dataset",
    "AnnotatedDataset",
    "AOEFAnnotations",
    "BatDetect2FilesAnnotations",
    "BatDetect2MergedAnnotations",
    "AnnotationFormats",
]


# Union of every concrete annotation-source config; `format` is the
# discriminator field (see `Dataset.sources` in batdetect2/data/types.py).
AnnotationFormats = Union[
    BatDetect2MergedAnnotations,
    BatDetect2FilesAnnotations,
    AOEFAnnotations,
]


def load_annotated_dataset(
    dataset: AnnotatedDataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load one annotated source, dispatching on its concrete subclass.

    Args:
        dataset: Any of the `AnnotationFormats` configuration objects.
        base_dir: Optional root against which the source's relative
            `audio_dir`/annotation paths are resolved.

    Raises:
        NotImplementedError: If `dataset` is not a recognised subclass.
    """
    if isinstance(dataset, AOEFAnnotations):
        return load_aoef_annotated_dataset(dataset, base_dir=base_dir)

    if isinstance(dataset, BatDetect2MergedAnnotations):
        return load_batdetect2_merged_annotated_dataset(
            dataset,
            base_dir=base_dir,
        )

    if isinstance(dataset, BatDetect2FilesAnnotations):
        return load_batdetect2_files_annotated_dataset(
            dataset,
            base_dir=base_dir,
        )

    # NOTE(review): the message interpolates `dataset.name` (the source's
    # human-readable name), not the unrecognised format value — confirm
    # whether `dataset.format` was intended.
    raise NotImplementedError(f"Unknown annotation format: {dataset.name}")


# --- file: batdetect2/data/annotations/aeof.py ---
# NOTE(review): the file is named "aeof" but the format is "AOEF" — likely a
# transposed-letters typo; renaming it would also touch the imports above.

from pathlib import Path
from typing import Literal, Optional

from soundevent import data, io

from batdetect2.data.annotations.types import AnnotatedDataset

__all__ = [
    "AOEFAnnotations",
    "load_aoef_annotated_dataset",
]


class AOEFAnnotations(AnnotatedDataset):
    """An annotated source stored as a single AOEF file."""

    format: Literal["aoef"] = "aoef"
    # Path to the AOEF file, relative to `base_dir` when one is given.
    annotations_path: Path


def load_aoef_annotated_dataset(
    dataset: AOEFAnnotations,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load an AOEF annotation set, resolving paths against `base_dir`.

    Raises:
        ValueError: If the file holds something other than an
            AnnotationSet/AnnotationProject (e.g. a bare recording set).
    """
    audio_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    loaded = io.load(path, audio_dir=audio_dir)

    if not isinstance(loaded, (data.AnnotationSet, data.AnnotationProject)):
        raise ValueError(
            f"The AOEF file at {path} does not contain a set of annotations"
        )

    return loaded


# --- file: batdetect2/data/annotations/batdetect2_files.py ---

import os
from pathlib import Path
from typing import Literal, Optional, Union

from soundevent import data

from batdetect2.data.annotations.legacy import (
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    list_file_annotations,
    load_file_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset

PathLike = Union[Path, str, os.PathLike]


__all__ = [
    "load_batdetect2_files_annotated_dataset",
    "BatDetect2FilesAnnotations",
]


class BatDetect2FilesAnnotations(AnnotatedDataset):
    """An annotated source stored as one legacy JSON file per recording."""

    # NOTE(review): "batdetect2" here vs "batdetect2_file" for the *merged*
    # variant reads swapped, but it matches the flat annotations.py module,
    # so it appears deliberate — confirm before renaming.
    format: Literal["batdetect2"] = "batdetect2"
    annotations_dir: Path


def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Convert annotations to annotation project."""
    audio_dir = dataset.audio_dir
    path = dataset.annotations_dir

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    paths = list_file_annotations(path)

    annotations = []
    tasks = []

    for p in paths:
        # Best-effort loading: unreadable annotation files and annotations
        # whose audio file is missing are silently skipped.
        try:
            file_annotation = load_file_annotation(p)
        except FileNotFoundError:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            continue

        annotations.append(
            file_annotation_to_clip_annotation(
                file_annotation,
                clip,
            )
        )

        tasks.append(
            file_annotation_to_annotation_task(
                file_annotation,
                clip,
            )
        )

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
        tasks=tasks,
    )


# --- file: batdetect2/data/annotations/batdetect2_merged.py ---

import json
import os
from pathlib import Path
from typing import Literal, Optional, Union

from soundevent import data

from batdetect2.data.annotations.legacy import (
    FileAnnotation,
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset

PathLike = Union[Path, str, os.PathLike]

__all__ = [
    "BatDetect2MergedAnnotations",
    "load_batdetect2_merged_annotated_dataset",
]


class BatDetect2MergedAnnotations(AnnotatedDataset):
    """An annotated source stored as a single merged legacy JSON file."""

    format: Literal["batdetect2_file"] = "batdetect2_file"
    annotations_path: Path


def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Load a merged legacy annotation file into an AnnotationProject.

    Entries that fail validation, or whose audio file cannot be found under
    `audio_dir`, are skipped silently (best-effort import).
    """
    audio_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    content = json.loads(Path(path).read_text())

    annotations = []
    tasks = []

    for ann in content:
        try:
            ann = FileAnnotation.model_validate(ann)
        except ValueError:
            continue

        try:
            clip = file_annotation_to_clip(ann, audio_dir=audio_dir)
        except FileNotFoundError:
            continue

        annotations.append(file_annotation_to_clip_annotation(ann, clip))
        tasks.append(file_annotation_to_annotation_task(ann, clip))

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
        tasks=tasks,
    )


# --- file: batdetect2/data/annotations/legacy.py ---

"""Compatibility functions between old and new data structures."""

import os
import uuid
from pathlib import Path
from typing import Callable, List, Optional, Union

import numpy as np
from pydantic import BaseModel, Field
from soundevent import data
from soundevent.geometry import compute_bounds
from soundevent.types import ClassMapper

from batdetect2 import types

PathLike = Union[Path, str, os.PathLike]

__all__ = [
    "convert_to_annotation_group",
]

SPECIES_TAG_KEY = "species"
ECHOLOCATION_EVENT = "Echolocation"
UNKNOWN_CLASS = "__UNKNOWN__"

# Fixed namespace so uuid5-derived identifiers are stable across runs.
NAMESPACE = uuid.UUID("97a9776b-c0fd-4c68-accb-0b0ecd719242")


EventFn = Callable[[data.SoundEventAnnotation], Optional[str]]

ClassFn = Callable[[data.Recording], int]

IndividualFn = Callable[[data.SoundEventAnnotation], int]


def get_recording_class_name(recording: data.Recording) -> str:
    """Get the class name for a recording."""
    tag = data.find_tag(recording.tags, SPECIES_TAG_KEY)
    if tag is None:
        return UNKNOWN_CLASS
    return tag.value


def get_annotation_notes(annotation: data.ClipAnnotation) -> str:
    """Get the notes for a ClipAnnotation."""
    all_notes = [
        *annotation.notes,
        *annotation.clip.recording.notes,
    ]
    messages = [note.message for note in all_notes if note.message is not None]
    return "\n".join(messages)


def convert_to_annotation_group(
    annotation: data.ClipAnnotation,
    class_mapper: ClassMapper,
    event_fn: EventFn = lambda _: ECHOLOCATION_EVENT,
    class_fn: ClassFn = lambda _: 0,
    individual_fn: IndividualFn = lambda _: 0,
) -> types.AudioLoaderAnnotationGroup:
    """Convert a ClipAnnotation to an AudioLoaderAnnotationGroup."""
    recording = annotation.clip.recording

    start_times = []
    end_times = []
    low_freqs = []
    high_freqs = []
    class_ids = []
    x_inds = []
    y_inds = []
    individual_ids = []
    annotations: List[types.Annotation] = []
    class_id_file = class_fn(recording)

    for sound_event in annotation.sound_events:
        geometry = sound_event.sound_event.geometry

        # Sound events without geometry cannot contribute bounds; skip them.
        if geometry is None:
            continue

        start_time, low_freq, end_time, high_freq = compute_bounds(geometry)
        class_id = class_mapper.transform(sound_event) or -1
        event = event_fn(sound_event) or ""
        individual_id = individual_fn(sound_event) or -1

        start_times.append(start_time)
        end_times.append(end_time)
        low_freqs.append(low_freq)
        high_freqs.append(high_freq)
        class_ids.append(class_id)
        individual_ids.append(individual_id)

        # NOTE: This will be computed later so we just put a placeholder
        # here for now.
        x_inds.append(0)
        y_inds.append(0)

        annotations.append(
            {
                "start_time": start_time,
                "end_time": end_time,
                "low_freq": low_freq,
                "high_freq": high_freq,
                "class_prob": 1.0,
                "det_prob": 1.0,
                "individual": "0",
                "event": event,
                "class_id": class_id,  # type: ignore
            }
        )

    return {
        "id": str(recording.path),
        "duration": recording.duration,
        "issues": False,
        "file_path": str(recording.path),
        "time_exp": recording.time_expansion,
        "class_name": get_recording_class_name(recording),
        "notes": get_annotation_notes(annotation),
        "annotated": True,
        "start_times": np.array(start_times),
        "end_times": np.array(end_times),
        "low_freqs": np.array(low_freqs),
        "high_freqs": np.array(high_freqs),
        "class_ids": np.array(class_ids),
        "x_inds": np.array(x_inds),
        "y_inds": np.array(y_inds),
        "individual_ids": np.array(individual_ids),
        "annotation": annotations,
        "class_id_file": class_id_file,
    }


class Annotation(BaseModel):
    """Annotation class to hold batdetect annotations."""

    # Legacy JSON uses the reserved word "class" for the label.
    label: str = Field(alias="class")
    event: str
    individual: int = 0

    start_time: float
    end_time: float
    low_freq: float
    high_freq: float


class FileAnnotation(BaseModel):
    """FileAnnotation class to hold batdetect annotations for a file."""

    id: str
    duration: float
    time_exp: float = 1

    label: str = Field(alias="class_name")

    annotation: List[Annotation]

    annotated: bool = False
    issues: bool = False
    notes: str = ""


def load_file_annotation(path: PathLike) -> FileAnnotation:
    """Load annotation from batdetect format."""
    path = Path(path)
    return FileAnnotation.model_validate_json(path.read_text())


def annotation_to_sound_event(
    annotation: Annotation,
    recording: data.Recording,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
) -> data.SoundEventAnnotation:
    """Convert annotation to sound event annotation."""
    # Deterministic UUID: the same recording + time span always maps to the
    # same sound event.
    sound_event = data.SoundEvent(
        uuid=uuid.uuid5(
            NAMESPACE,
            f"{recording.hash}_{annotation.start_time}_{annotation.end_time}",
        ),
        recording=recording,
        geometry=data.BoundingBox(
            coordinates=[
                annotation.start_time,
                annotation.low_freq,
                annotation.end_time,
                annotation.high_freq,
            ],
        ),
    )

    return data.SoundEventAnnotation(
        uuid=uuid.uuid5(NAMESPACE, f"{sound_event.uuid}_annotation"),
        sound_event=sound_event,
        tags=[
            data.Tag(
                term=data.term_from_key(label_key),
                value=annotation.label,
            ),
            data.Tag(
                term=data.term_from_key(event_key),
                value=annotation.event,
            ),
            data.Tag(
                term=data.term_from_key(individual_key),
                value=str(annotation.individual),
            ),
        ],
    )


def file_annotation_to_clip(
    file_annotation: FileAnnotation,
    audio_dir: Optional[PathLike] = None,
    label_key: str = "class",
) -> data.Clip:
    """Convert file annotation to recording."""
    audio_dir = audio_dir or Path.cwd()

    full_path = Path(audio_dir) / file_annotation.id

    if not full_path.exists():
        raise FileNotFoundError(f"File {full_path} not found.")

    recording = data.Recording.from_file(
        full_path,
        time_expansion=file_annotation.time_exp,
        tags=[
            data.Tag(
                term=data.term_from_key(label_key),
                value=file_annotation.label,
            )
        ],
    )

    return data.Clip(
        uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip"),
        recording=recording,
        start_time=0,
        end_time=recording.duration,
    )


def file_annotation_to_clip_annotation(
    file_annotation: FileAnnotation,
    clip: data.Clip,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
) -> data.ClipAnnotation:
    """Convert file annotation to clip annotation."""
    notes = []
    if file_annotation.notes:
        notes.append(data.Note(message=file_annotation.notes))

    return data.ClipAnnotation(
        uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip_annotation"),
        clip=clip,
        notes=notes,
        tags=[
            data.Tag(
                term=data.term_from_key(label_key),
                value=file_annotation.label,
            )
        ],
        sound_events=[
            annotation_to_sound_event(
                annotation,
                clip.recording,
                label_key=label_key,
                event_key=event_key,
                individual_key=individual_key,
            )
            for annotation in file_annotation.annotation
        ],
    )


def file_annotation_to_annotation_task(
    file_annotation: FileAnnotation,
    clip: data.Clip,
) -> data.AnnotationTask:
    """Derive an AnnotationTask (with review status) from a file annotation."""
    status_badges = []

    # `issues` wins over `annotated`: a file flagged with problems is
    # treated as rejected even if it was marked annotated.
    if file_annotation.issues:
        status_badges.append(
            data.StatusBadge(state=data.AnnotationState.rejected)
        )
    elif file_annotation.annotated:
        status_badges.append(
            data.StatusBadge(state=data.AnnotationState.completed)
        )

    # NOTE(review): every sibling helper seeds uuid5 with the module-level
    # NAMESPACE, but this one uses uuid.NAMESPACE_URL — confirm whether that
    # is intentional (changing it would alter all existing task UUIDs).
    return data.AnnotationTask(
        uuid=uuid.uuid5(uuid.NAMESPACE_URL, f"{file_annotation.id}_task"),
        clip=clip,
        status_badges=status_badges,
    )


def list_file_annotations(path: PathLike) -> List[Path]:
    """List all annotations in a directory."""
    path = Path(path)
    # Identity comprehension replaced with a direct list() of the glob.
    return list(path.glob("*.json"))


# --- file: batdetect2/data/annotations/types.py ---

from pathlib import Path
from typing import Literal, Union

from batdetect2.configs import BaseConfig

# Fixed: the original also exported "BatDetect2MergedAnnotations", which is
# defined in batdetect2_merged.py, not in this module.
__all__ = [
    "AnnotatedDataset",
]


class AnnotatedDataset(BaseConfig):
    """Represents a single, cohesive source of audio recordings and annotations.

    A source typically groups recordings originating from a specific context,
    such as a single project, site, deployment, or recordist. All audio files
    belonging to a source should be located within a single directory,
    specified by `audio_dir`.

    Annotations associated with these recordings are defined by the
    `annotations` field, which supports various formats (e.g., AOEF files,
    specific CSV structures).
    Crucially, file paths referenced within the annotation data *must* be
    relative to the `audio_dir`. This ensures that the dataset definition
    remains portable across different systems and base directories.

    Attributes:
        name: A unique identifier for this data source.
        description: Detailed information about the source, including recording
            methods, annotation procedures, equipment used, potential biases,
            or any important caveats for users.
        audio_dir: The file system path to the directory containing the audio
            recordings for this source.
    """

    name: str
    audio_dir: Path
    description: str = ""


# --- file: batdetect2/data/data.py ---

from pathlib import Path
from typing import Optional

from soundevent import data

from batdetect2.configs import load_config
from batdetect2.data.annotations import load_annotated_dataset
from batdetect2.data.types import Dataset

__all__ = [
    "load_dataset",
    "load_dataset_from_config",
]


def load_dataset(
    dataset: Dataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load every source of `dataset` and merge their clip annotations."""
    clip_annotations = []
    for source in dataset.sources:
        annotated_source = load_annotated_dataset(source, base_dir=base_dir)
        clip_annotations.extend(annotated_source.clip_annotations)
    return data.AnnotationSet(clip_annotations=clip_annotations)


def load_dataset_from_config(
    path: data.PathLike,
    field: Optional[str] = None,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Parse a Dataset config from `path` (optionally a sub-`field`) and load it."""
    config = load_config(
        path=path,
        schema=Dataset,
        field=field,
    )
    return load_dataset(config, base_dir=base_dir)


# --- file: batdetect2/data/types.py ---

from typing import Annotated, List

from pydantic import Field

from batdetect2.configs import BaseConfig
from batdetect2.data.annotations import AnnotationFormats


class Dataset(BaseConfig):
    """Represents a collection of one or more annotated sources.

    In the context of batdetect2, a Dataset aggregates multiple
    `AnnotatedDataset` instances. It serves as the primary unit for defining
    data splits, typically used for model training, validation, or testing
    phases.

    Attributes:
        name: A descriptive name for the overall dataset
            (e.g., "UK Training Set").
        description: A detailed explanation of the dataset's purpose,
            composition, how it was assembled, or any specific characteristics.
        sources: A list containing the `AnnotatedDataset` objects included in
            this dataset.
    """

    name: str
    description: str
    # The `format` field of each source discriminates the union, so pydantic
    # can pick the right concrete config class when parsing.
    sources: List[
        Annotated[AnnotationFormats, Field(..., discriminator="format")]
    ]