More structured data module

mbsantiago 2025-04-03 16:47:03 +01:00
parent 30d3a2c92e
commit 451093f2da
10 changed files with 697 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
from batdetect2.data.annotations import (
    AnnotatedDataset,
    load_annotated_dataset,
)
from batdetect2.data.data import load_dataset, load_dataset_from_config
from batdetect2.data.types import Dataset

__all__ = [
    "AnnotatedDataset",
    "Dataset",
    "load_annotated_dataset",
    "load_dataset",
    "load_dataset_from_config",
]

View File

@@ -0,0 +1,36 @@
import json
from pathlib import Path
from typing import Literal, Union

from batdetect2.configs import BaseConfig

__all__ = [
    "AOEFAnnotationFile",
    "AnnotationFormats",
    "BatDetect2AnnotationFile",
    "BatDetect2AnnotationFiles",
]


class BatDetect2AnnotationFiles(BaseConfig):
    format: Literal["batdetect2"] = "batdetect2"
    path: Path


class BatDetect2AnnotationFile(BaseConfig):
    format: Literal["batdetect2_file"] = "batdetect2_file"
    path: Path


class AOEFAnnotationFile(BaseConfig):
    format: Literal["aoef"] = "aoef"
    path: Path


AnnotationFormats = Union[
    BatDetect2AnnotationFiles,
    BatDetect2AnnotationFile,
    AOEFAnnotationFile,
]
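The three config classes above share a `path` field and differ only in their `format` literal, which is what makes them usable as a pydantic discriminated union. A minimal, self-contained sketch of that pattern (stand-in class names, not the module's own):

```python
# Sketch of the discriminated-union pattern used above: the `format`
# Literal routes a plain dict to the matching model. Stand-in names only.
from pathlib import Path
from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter


class AOEFFile(BaseModel):  # stand-in for AOEFAnnotationFile
    format: Literal["aoef"] = "aoef"
    path: Path


class BD2File(BaseModel):  # stand-in for BatDetect2AnnotationFile
    format: Literal["batdetect2_file"] = "batdetect2_file"
    path: Path


Formats = Annotated[Union[AOEFFile, BD2File], Field(discriminator="format")]

# The "format" key selects the concrete model during validation.
parsed = TypeAdapter(Formats).validate_python(
    {"format": "aoef", "path": "annotations.json"}
)
assert isinstance(parsed, AOEFFile)
```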

View File

@@ -0,0 +1,55 @@
from pathlib import Path
from typing import Optional, Union

from soundevent import data

from batdetect2.data.annotations.aeof import (
    AOEFAnnotations,
    load_aoef_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_files import (
    BatDetect2FilesAnnotations,
    load_batdetect2_files_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_merged import (
    BatDetect2MergedAnnotations,
    load_batdetect2_merged_annotated_dataset,
)
from batdetect2.data.annotations.types import AnnotatedDataset

__all__ = [
    "load_annotated_dataset",
    "AnnotatedDataset",
    "AOEFAnnotations",
    "BatDetect2FilesAnnotations",
    "BatDetect2MergedAnnotations",
    "AnnotationFormats",
]

AnnotationFormats = Union[
    BatDetect2MergedAnnotations,
    BatDetect2FilesAnnotations,
    AOEFAnnotations,
]


def load_annotated_dataset(
    dataset: AnnotatedDataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    if isinstance(dataset, AOEFAnnotations):
        return load_aoef_annotated_dataset(dataset, base_dir=base_dir)

    if isinstance(dataset, BatDetect2MergedAnnotations):
        return load_batdetect2_merged_annotated_dataset(
            dataset, base_dir=base_dir
        )

    if isinstance(dataset, BatDetect2FilesAnnotations):
        return load_batdetect2_files_annotated_dataset(
            dataset,
            base_dir=base_dir,
        )
raise NotImplementedError(f"Unknown annotation format: {dataset.name}")

View File

@@ -0,0 +1,37 @@
from pathlib import Path
from typing import Literal, Optional

from soundevent import data, io

from batdetect2.data.annotations.types import AnnotatedDataset

__all__ = [
    "AOEFAnnotations",
    "load_aoef_annotated_dataset",
]


class AOEFAnnotations(AnnotatedDataset):
    format: Literal["aoef"] = "aoef"
    annotations_path: Path


def load_aoef_annotated_dataset(
    dataset: AOEFAnnotations,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    audio_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    loaded = io.load(path, audio_dir=audio_dir)

    if not isinstance(loaded, (data.AnnotationSet, data.AnnotationProject)):
        raise ValueError(
            f"The AOEF file at {path} does not contain a set of annotations"
        )

    return loaded

View File

@@ -0,0 +1,80 @@
import os
from pathlib import Path
from typing import Literal, Optional, Union

from soundevent import data

from batdetect2.data.annotations.legacy import (
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    list_file_annotations,
    load_file_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset

PathLike = Union[Path, str, os.PathLike]

__all__ = [
    "load_batdetect2_files_annotated_dataset",
    "BatDetect2FilesAnnotations",
]


class BatDetect2FilesAnnotations(AnnotatedDataset):
    format: Literal["batdetect2"] = "batdetect2"
    annotations_dir: Path


def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Convert annotations to annotation project."""
    audio_dir = dataset.audio_dir
    path = dataset.annotations_dir

    if base_dir:
        # Coerce to Path so string base directories also work.
        audio_dir = Path(base_dir) / audio_dir
        path = Path(base_dir) / path

    paths = list_file_annotations(path)

    annotations = []
    tasks = []

    for p in paths:
        try:
            file_annotation = load_file_annotation(p)
        except FileNotFoundError:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            continue

        annotations.append(
            file_annotation_to_clip_annotation(
                file_annotation,
                clip,
            )
        )
        tasks.append(
            file_annotation_to_annotation_task(
                file_annotation,
                clip,
            )
        )

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
        tasks=tasks,
    )
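A usage sketch, assuming a hypothetical layout with one BatDetect2 JSON file per recording. Note that annotations whose audio file cannot be found are skipped rather than aborting the whole load.

```python
from pathlib import Path

from batdetect2.data.annotations import (
    BatDetect2FilesAnnotations,
    load_annotated_dataset,
)

source = BatDetect2FilesAnnotations(
    name="legacy-per-file",
    audio_dir=Path("recordings"),  # WAV files live here
    annotations_dir=Path("anns"),  # one JSON per recording
)
project = load_annotated_dataset(source, base_dir=Path("/data"))
```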

View File

@@ -0,0 +1,64 @@
import json
import os
from pathlib import Path
from typing import Literal, Optional, Union

from soundevent import data

from batdetect2.data.annotations.legacy import (
    FileAnnotation,
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset

PathLike = Union[Path, str, os.PathLike]

__all__ = [
    "BatDetect2MergedAnnotations",
    "load_batdetect2_merged_annotated_dataset",
]


class BatDetect2MergedAnnotations(AnnotatedDataset):
    format: Literal["batdetect2_file"] = "batdetect2_file"
    annotations_path: Path


def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    audio_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        # Coerce to Path so string base directories also work.
        audio_dir = Path(base_dir) / audio_dir
        path = Path(base_dir) / path

    content = json.loads(Path(path).read_text())

    annotations = []
    tasks = []

    for ann in content:
        try:
            ann = FileAnnotation.model_validate(ann)
        except ValueError:
            continue

        try:
            clip = file_annotation_to_clip(ann, audio_dir=audio_dir)
        except FileNotFoundError:
            continue

        annotations.append(file_annotation_to_clip_annotation(ann, clip))
        tasks.append(file_annotation_to_annotation_task(ann, clip))

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
        tasks=tasks,
    )
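For reference, a sketch of the merged-file layout this loader expects: a single JSON array of per-recording records. Field names follow the `FileAnnotation` and `Annotation` models in the legacy module below; the values are purely illustrative.

```python
# Hypothetical content of a merged annotations file, as a Python literal.
merged = [
    {
        "id": "20230601_000001.wav",
        "duration": 3.0,
        "time_exp": 1,
        "class_name": "Myotis daubentonii",
        "annotated": True,
        "issues": False,
        "notes": "",
        "annotation": [
            {
                "class": "Myotis daubentonii",
                "event": "Echolocation",
                "individual": 0,
                "start_time": 0.10,
                "end_time": 0.12,
                "low_freq": 35000.0,
                "high_freq": 80000.0,
            }
        ],
    }
]
```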

View File

@@ -0,0 +1,304 @@
"""Compatibility functions between old and new data structures."""
import os
import uuid
from pathlib import Path
from typing import Callable, List, Optional, Union
import numpy as np
from pydantic import BaseModel, Field
from soundevent import data
from soundevent.geometry import compute_bounds
from soundevent.types import ClassMapper
from batdetect2 import types
PathLike = Union[Path, str, os.PathLike]
__all__ = [
"convert_to_annotation_group",
]
SPECIES_TAG_KEY = "species"
ECHOLOCATION_EVENT = "Echolocation"
UNKNOWN_CLASS = "__UNKNOWN__"
NAMESPACE = uuid.UUID("97a9776b-c0fd-4c68-accb-0b0ecd719242")
EventFn = Callable[[data.SoundEventAnnotation], Optional[str]]
ClassFn = Callable[[data.Recording], int]
IndividualFn = Callable[[data.SoundEventAnnotation], int]
def get_recording_class_name(recording: data.Recording) -> str:
"""Get the class name for a recording."""
tag = data.find_tag(recording.tags, SPECIES_TAG_KEY)
if tag is None:
return UNKNOWN_CLASS
return tag.value
def get_annotation_notes(annotation: data.ClipAnnotation) -> str:
"""Get the notes for a ClipAnnotation."""
all_notes = [
*annotation.notes,
*annotation.clip.recording.notes,
]
messages = [note.message for note in all_notes if note.message is not None]
return "\n".join(messages)
def convert_to_annotation_group(
annotation: data.ClipAnnotation,
class_mapper: ClassMapper,
event_fn: EventFn = lambda _: ECHOLOCATION_EVENT,
class_fn: ClassFn = lambda _: 0,
individual_fn: IndividualFn = lambda _: 0,
) -> types.AudioLoaderAnnotationGroup:
"""Convert a ClipAnnotation to an AudioLoaderAnnotationGroup."""
recording = annotation.clip.recording
start_times = []
end_times = []
low_freqs = []
high_freqs = []
class_ids = []
x_inds = []
y_inds = []
individual_ids = []
annotations: List[types.Annotation] = []
class_id_file = class_fn(recording)
for sound_event in annotation.sound_events:
geometry = sound_event.sound_event.geometry
if geometry is None:
continue
start_time, low_freq, end_time, high_freq = compute_bounds(geometry)
class_id = class_mapper.transform(sound_event) or -1
event = event_fn(sound_event) or ""
individual_id = individual_fn(sound_event) or -1
start_times.append(start_time)
end_times.append(end_time)
low_freqs.append(low_freq)
high_freqs.append(high_freq)
class_ids.append(class_id)
individual_ids.append(individual_id)
# NOTE: This will be computed later so we just put a placeholder
# here for now.
x_inds.append(0)
y_inds.append(0)
annotations.append(
{
"start_time": start_time,
"end_time": end_time,
"low_freq": low_freq,
"high_freq": high_freq,
"class_prob": 1.0,
"det_prob": 1.0,
"individual": "0",
"event": event,
"class_id": class_id, # type: ignore
}
)
return {
"id": str(recording.path),
"duration": recording.duration,
"issues": False,
"file_path": str(recording.path),
"time_exp": recording.time_expansion,
"class_name": get_recording_class_name(recording),
"notes": get_annotation_notes(annotation),
"annotated": True,
"start_times": np.array(start_times),
"end_times": np.array(end_times),
"low_freqs": np.array(low_freqs),
"high_freqs": np.array(high_freqs),
"class_ids": np.array(class_ids),
"x_inds": np.array(x_inds),
"y_inds": np.array(y_inds),
"individual_ids": np.array(individual_ids),
"annotation": annotations,
"class_id_file": class_id_file,
}
class Annotation(BaseModel):
"""Annotation class to hold batdetect annotations."""
label: str = Field(alias="class")
event: str
individual: int = 0
start_time: float
end_time: float
low_freq: float
high_freq: float
class FileAnnotation(BaseModel):
"""FileAnnotation class to hold batdetect annotations for a file."""
id: str
duration: float
time_exp: float = 1
label: str = Field(alias="class_name")
annotation: List[Annotation]
annotated: bool = False
issues: bool = False
notes: str = ""
def load_file_annotation(path: PathLike) -> FileAnnotation:
"""Load annotation from batdetect format."""
path = Path(path)
return FileAnnotation.model_validate_json(path.read_text())
def annotation_to_sound_event(
annotation: Annotation,
recording: data.Recording,
label_key: str = "class",
event_key: str = "event",
individual_key: str = "individual",
) -> data.SoundEventAnnotation:
"""Convert annotation to sound event annotation."""
sound_event = data.SoundEvent(
uuid=uuid.uuid5(
NAMESPACE,
f"{recording.hash}_{annotation.start_time}_{annotation.end_time}",
),
recording=recording,
geometry=data.BoundingBox(
coordinates=[
annotation.start_time,
annotation.low_freq,
annotation.end_time,
annotation.high_freq,
],
),
)
return data.SoundEventAnnotation(
uuid=uuid.uuid5(NAMESPACE, f"{sound_event.uuid}_annotation"),
sound_event=sound_event,
tags=[
data.Tag(
term=data.term_from_key(label_key),
value=annotation.label,
),
data.Tag(
term=data.term_from_key(event_key),
value=annotation.event,
),
data.Tag(
term=data.term_from_key(individual_key),
value=str(annotation.individual),
),
],
)
def file_annotation_to_clip(
file_annotation: FileAnnotation,
audio_dir: Optional[PathLike] = None,
label_key: str = "class",
) -> data.Clip:
"""Convert file annotation to recording."""
audio_dir = audio_dir or Path.cwd()
full_path = Path(audio_dir) / file_annotation.id
if not full_path.exists():
raise FileNotFoundError(f"File {full_path} not found.")
recording = data.Recording.from_file(
full_path,
time_expansion=file_annotation.time_exp,
tags=[
data.Tag(
term=data.term_from_key(label_key),
value=file_annotation.label,
)
],
)
return data.Clip(
uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip"),
recording=recording,
start_time=0,
end_time=recording.duration,
)
def file_annotation_to_clip_annotation(
file_annotation: FileAnnotation,
clip: data.Clip,
label_key: str = "class",
event_key: str = "event",
individual_key: str = "individual",
) -> data.ClipAnnotation:
"""Convert file annotation to clip annotation."""
notes = []
if file_annotation.notes:
notes.append(data.Note(message=file_annotation.notes))
return data.ClipAnnotation(
uuid=uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip_annotation"),
clip=clip,
notes=notes,
tags=[
data.Tag(
term=data.term_from_key(label_key), value=file_annotation.label
)
],
sound_events=[
annotation_to_sound_event(
annotation,
clip.recording,
label_key=label_key,
event_key=event_key,
individual_key=individual_key,
)
for annotation in file_annotation.annotation
],
)
def file_annotation_to_annotation_task(
file_annotation: FileAnnotation,
clip: data.Clip,
) -> data.AnnotationTask:
status_badges = []
if file_annotation.issues:
status_badges.append(
data.StatusBadge(state=data.AnnotationState.rejected)
)
elif file_annotation.annotated:
status_badges.append(
data.StatusBadge(state=data.AnnotationState.completed)
)
return data.AnnotationTask(
uuid=uuid.uuid5(uuid.NAMESPACE_URL, f"{file_annotation.id}_task"),
clip=clip,
status_badges=status_badges,
)
def list_file_annotations(path: PathLike) -> List[Path]:
"""List all annotations in a directory."""
path = Path(path)
return [file for file in path.glob("*.json")]
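A minimal round-trip sketch for the legacy helpers (all paths hypothetical): load a per-file annotation, resolve it against the audio directory, and convert it into soundevent objects.

```python
from batdetect2.data.annotations.legacy import (
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    load_file_annotation,
)

# Hypothetical paths: one legacy JSON per recording, audio alongside.
file_annotation = load_file_annotation("anns/20230601_000001.wav.json")
clip = file_annotation_to_clip(file_annotation, audio_dir="recordings")
clip_annotation = file_annotation_to_clip_annotation(file_annotation, clip)
print(len(clip_annotation.sound_events))
```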

View File

@@ -0,0 +1,41 @@
from pathlib import Path

from batdetect2.configs import BaseConfig

__all__ = [
    "AnnotatedDataset",
]


class AnnotatedDataset(BaseConfig):
    """Represents a single, cohesive source of audio recordings and annotations.

    A source typically groups recordings originating from a specific context,
    such as a single project, site, deployment, or recordist. All audio files
    belonging to a source should be located within a single directory,
    specified by `audio_dir`.

    Annotations associated with these recordings are defined by the
    `annotations` field, which supports various formats (e.g., AOEF files,
    specific CSV structures).

    Crucially, file paths referenced within the annotation data *must* be
    relative to the `audio_dir`. This ensures that the dataset definition
    remains portable across different systems and base directories.

    Attributes:
        name: A unique identifier for this data source.
        description: Detailed information about the source, including recording
            methods, annotation procedures, equipment used, potential biases,
            or any important caveats for users.
        audio_dir: The file system path to the directory containing the audio
            recordings for this source.
    """

    name: str
    audio_dir: Path
    description: str = ""

batdetect2/data/data.py Normal file
View File

@@ -0,0 +1,37 @@
from pathlib import Path
from typing import Optional

from soundevent import data

from batdetect2.configs import load_config
from batdetect2.data.annotations import load_annotated_dataset
from batdetect2.data.types import Dataset

__all__ = [
    "load_dataset",
    "load_dataset_from_config",
]


def load_dataset(
    dataset: Dataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    clip_annotations = []

    for source in dataset.sources:
        annotated_source = load_annotated_dataset(source, base_dir=base_dir)
        clip_annotations.extend(annotated_source.clip_annotations)

    return data.AnnotationSet(clip_annotations=clip_annotations)


def load_dataset_from_config(
    path: data.PathLike,
    field: Optional[str] = None,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    config = load_config(
        path=path,
        schema=Dataset,
        field=field,
    )
    return load_dataset(config, base_dir=base_dir)
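A usage sketch for the config entry point (file name and field are hypothetical): parse a `Dataset` definition from a config file and merge every source into a single `AnnotationSet`.

```python
from pathlib import Path

from batdetect2.data import load_dataset_from_config

# "datasets.yaml" and the "train" field are illustrative; the exact
# config layout depends on how load_config resolves fields.
annotation_set = load_dataset_from_config(
    "datasets.yaml",
    field="train",
    base_dir=Path("/data"),
)
print(len(annotation_set.clip_annotations))
```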

batdetect2/data/types.py Normal file
View File

@@ -0,0 +1,29 @@
from typing import Annotated, List

from pydantic import Field

from batdetect2.configs import BaseConfig
from batdetect2.data.annotations import AnnotationFormats


class Dataset(BaseConfig):
    """Represents a collection of one or more DatasetSources.

    In the context of batdetect2, a Dataset aggregates multiple
    `DatasetSource` instances. It serves as the primary unit for defining
    data splits, typically used for model training, validation, or testing
    phases.

    Attributes:
        name: A descriptive name for the overall dataset
            (e.g., "UK Training Set").
        description: A detailed explanation of the dataset's purpose,
            composition, how it was assembled, or any specific
            characteristics.
        sources: A list containing the `DatasetSource` objects included in
            this dataset.
    """

    name: str
    description: str
    sources: List[
        Annotated[AnnotationFormats, Field(..., discriminator="format")]
    ]
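Putting the pieces together, a sketch assembling a `Dataset` programmatically (names and paths illustrative); the `format` discriminator is what lets the same `sources` list mix annotation formats when the definition is parsed from a config file.

```python
from pathlib import Path

from batdetect2.data import Dataset, load_dataset
from batdetect2.data.annotations import (
    AOEFAnnotations,
    BatDetect2MergedAnnotations,
)

# Hypothetical dataset mixing two annotation formats in one sources list.
dataset = Dataset(
    name="UK Training Set",
    description="Two example sources with different annotation formats.",
    sources=[
        AOEFAnnotations(
            name="site-a",
            audio_dir=Path("site_a/audio"),
            annotations_path=Path("site_a/annotations.json"),
        ),
        BatDetect2MergedAnnotations(
            name="site-b",
            audio_dir=Path("site_b/audio"),
            annotations_path=Path("site_b/merged.json"),
        ),
    ],
)
annotations = load_dataset(dataset, base_dir=Path("/data"))
```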