mirror of
https://github.com/macaodha/batdetect2.git
synced 2025-06-29 22:51:58 +02:00
More structured data module
This commit is contained in:
parent
30d3a2c92e
commit
451093f2da
14
batdetect2/data/__init__.py
Normal file
14
batdetect2/data/__init__.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from batdetect2.data.annotations import (
|
||||||
|
AnnotatedDataset,
|
||||||
|
load_annotated_dataset,
|
||||||
|
)
|
||||||
|
from batdetect2.data.data import load_dataset, load_dataset_from_config
|
||||||
|
from batdetect2.data.types import Dataset
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AnnotatedDataset",
|
||||||
|
"Dataset",
|
||||||
|
"load_annotated_dataset",
|
||||||
|
"load_dataset",
|
||||||
|
"load_dataset_from_config",
|
||||||
|
]
|
36
batdetect2/data/annotations.py
Normal file
36
batdetect2/data/annotations.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal, Union
|
||||||
|
|
||||||
|
from batdetect2.configs import BaseConfig
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AOEFAnnotationFile",
|
||||||
|
"AnnotationFormats",
|
||||||
|
"BatDetect2AnnotationFile",
|
||||||
|
"BatDetect2AnnotationFiles",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class BatDetect2AnnotationFiles(BaseConfig):
    """Config entry for annotations tagged with the ``"batdetect2"`` format.

    NOTE(review): the plural name suggests `path` is a directory holding
    several annotation files -- confirm against the code that consumes it.
    """

    # Literal tag that identifies this annotation format.
    format: Literal["batdetect2"] = "batdetect2"

    # Location of the annotation data.
    path: Path
|
||||||
|
|
||||||
|
|
||||||
|
class BatDetect2AnnotationFile(BaseConfig):
    """Config entry for annotations tagged with the ``"batdetect2_file"`` format.

    NOTE(review): presumably `path` is a single file containing all the
    annotations -- confirm against the code that consumes it.
    """

    # Literal tag that identifies this annotation format.
    format: Literal["batdetect2_file"] = "batdetect2_file"

    # Location of the annotation data.
    path: Path
|
||||||
|
|
||||||
|
|
||||||
|
class AOEFAnnotationFile(BaseConfig):
    """Config entry for annotations tagged with the ``"aoef"`` format."""

    # Literal tag that identifies this annotation format.
    format: Literal["aoef"] = "aoef"

    # Location of the annotation data.
    path: Path
|
||||||
|
|
||||||
|
|
||||||
|
AnnotationFormats = Union[
|
||||||
|
BatDetect2AnnotationFiles,
|
||||||
|
BatDetect2AnnotationFile,
|
||||||
|
AOEFAnnotationFile,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
55
batdetect2/data/annotations/__init__.py
Normal file
55
batdetect2/data/annotations/__init__.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from soundevent import data
|
||||||
|
|
||||||
|
from batdetect2.data.annotations.aeof import (
|
||||||
|
AOEFAnnotations,
|
||||||
|
load_aoef_annotated_dataset,
|
||||||
|
)
|
||||||
|
from batdetect2.data.annotations.batdetect2_files import (
|
||||||
|
BatDetect2FilesAnnotations,
|
||||||
|
load_batdetect2_files_annotated_dataset,
|
||||||
|
)
|
||||||
|
from batdetect2.data.annotations.batdetect2_merged import (
|
||||||
|
BatDetect2MergedAnnotations,
|
||||||
|
load_batdetect2_merged_annotated_dataset,
|
||||||
|
)
|
||||||
|
from batdetect2.data.annotations.types import AnnotatedDataset
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"load_annotated_dataset",
|
||||||
|
"AnnotatedDataset",
|
||||||
|
"AOEFAnnotations",
|
||||||
|
"BatDetect2FilesAnnotations",
|
||||||
|
"BatDetect2MergedAnnotations",
|
||||||
|
"AnnotationFormats",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
AnnotationFormats = Union[
|
||||||
|
BatDetect2MergedAnnotations,
|
||||||
|
BatDetect2FilesAnnotations,
|
||||||
|
AOEFAnnotations,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def load_annotated_dataset(
    dataset: AnnotatedDataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load the annotations described by an annotated dataset config.

    Dispatches on the concrete type of `dataset` to the loader for that
    annotation format.

    Args:
        dataset: One of the supported annotated dataset configurations
            (`AOEFAnnotations`, `BatDetect2MergedAnnotations` or
            `BatDetect2FilesAnnotations`).
        base_dir: Optional directory forwarded to the format-specific loader.

    Returns:
        The annotations produced by the format-specific loader.

    Raises:
        NotImplementedError: If `dataset` is not an instance of any of the
            supported configuration types.
    """
    if isinstance(dataset, AOEFAnnotations):
        return load_aoef_annotated_dataset(dataset, base_dir=base_dir)

    if isinstance(dataset, BatDetect2MergedAnnotations):
        return load_batdetect2_merged_annotated_dataset(
            dataset,
            base_dir=base_dir,
        )

    if isinstance(dataset, BatDetect2FilesAnnotations):
        return load_batdetect2_files_annotated_dataset(
            dataset,
            base_dir=base_dir,
        )

    # Report the offending *format* (or, failing that, the type), not the
    # user-chosen dataset name, so the message matches what actually went
    # wrong and cannot itself fail on objects lacking a `name` attribute.
    unknown_format = getattr(dataset, "format", type(dataset).__name__)
    raise NotImplementedError(f"Unknown annotation format: {unknown_format}")
|
37
batdetect2/data/annotations/aeof.py
Normal file
37
batdetect2/data/annotations/aeof.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal, Optional
|
||||||
|
|
||||||
|
from soundevent import data, io
|
||||||
|
|
||||||
|
from batdetect2.data.annotations.types import AnnotatedDataset
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AOEFAnnotations",
|
||||||
|
"load_aoef_annotated_dataset",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class AOEFAnnotations(AnnotatedDataset):
    """Annotated dataset whose annotations are stored in an AOEF file."""

    # Literal tag that identifies this annotation format.
    format: Literal["aoef"] = "aoef"

    # Path of the AOEF file handed to `soundevent.io.load` by the loader.
    annotations_path: Path
|
||||||
|
|
||||||
|
|
||||||
|
def load_aoef_annotated_dataset(
    dataset: AOEFAnnotations,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load the annotations of an AOEF-backed dataset.

    Args:
        dataset: Configuration providing `audio_dir` and `annotations_path`.
        base_dir: When given (and truthy), both configured paths are joined
            onto this directory before loading.

    Returns:
        The object loaded from the AOEF file.

    Raises:
        ValueError: If the loaded object is neither an annotation set nor an
            annotation project.
    """
    recordings_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        recordings_dir = base_dir / recordings_dir
        path = base_dir / path

    contents = io.load(path, audio_dir=recordings_dir)

    # Guard clause inverted: return on the happy path, raise otherwise.
    if isinstance(contents, (data.AnnotationSet, data.AnnotationProject)):
        return contents

    raise ValueError(
        f"The AOEF file at {path} does not contain a set of annotations"
    )
|
80
batdetect2/data/annotations/batdetect2_files.py
Normal file
80
batdetect2/data/annotations/batdetect2_files.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal, Optional, Union
|
||||||
|
|
||||||
|
from soundevent import data
|
||||||
|
|
||||||
|
from batdetect2.data.annotations.legacy import (
|
||||||
|
file_annotation_to_annotation_task,
|
||||||
|
file_annotation_to_clip,
|
||||||
|
file_annotation_to_clip_annotation,
|
||||||
|
list_file_annotations,
|
||||||
|
load_file_annotation,
|
||||||
|
)
|
||||||
|
from batdetect2.data.annotations.types import AnnotatedDataset
|
||||||
|
|
||||||
|
PathLike = Union[Path, str, os.PathLike]
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"load_batdetect2_files_annotated_dataset",
|
||||||
|
"BatDetect2FilesAnnotations",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class BatDetect2FilesAnnotations(AnnotatedDataset):
    """Annotated dataset backed by a directory of batdetect2 annotation files."""

    # Literal tag that identifies this annotation format.
    format: Literal["batdetect2"] = "batdetect2"

    # Directory that the loader scans for annotation files.
    annotations_dir: Path
|
||||||
|
|
||||||
|
|
||||||
|
def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Build an annotation project from a directory of annotation files.

    Every annotation file listed in `dataset.annotations_dir` is loaded and
    converted into a clip annotation plus an annotation task. Entries that
    raise `FileNotFoundError` while being loaded or while resolving their
    recording are skipped silently.

    Args:
        dataset: Configuration providing `audio_dir`, `annotations_dir`,
            `name` and `description`.
        base_dir: When given (and truthy), both configured directories are
            joined onto this directory.

    Returns:
        An annotation project named and described after `dataset`.
    """
    recordings_dir = dataset.audio_dir
    annotations_dir = dataset.annotations_dir

    if base_dir:
        recordings_dir = base_dir / recordings_dir
        annotations_dir = base_dir / annotations_dir

    clip_annotations = []
    annotation_tasks = []

    for annotation_path in list_file_annotations(annotations_dir):
        # Both steps are skipped on the same error, so they share one guard.
        try:
            file_annotation = load_file_annotation(annotation_path)
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=recordings_dir,
            )
        except FileNotFoundError:
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(file_annotation, clip)
        )
        annotation_tasks.append(
            file_annotation_to_annotation_task(file_annotation, clip)
        )

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
        tasks=annotation_tasks,
    )
|
64
batdetect2/data/annotations/batdetect2_merged.py
Normal file
64
batdetect2/data/annotations/batdetect2_merged.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal, Optional, Union
|
||||||
|
|
||||||
|
from soundevent import data
|
||||||
|
|
||||||
|
from batdetect2.data.annotations.legacy import (
|
||||||
|
FileAnnotation,
|
||||||
|
file_annotation_to_annotation_task,
|
||||||
|
file_annotation_to_clip,
|
||||||
|
file_annotation_to_clip_annotation,
|
||||||
|
)
|
||||||
|
from batdetect2.data.annotations.types import AnnotatedDataset
|
||||||
|
|
||||||
|
PathLike = Union[Path, str, os.PathLike]
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"BatDetect2MergedAnnotations",
|
||||||
|
"load_batdetect2_merged_annotated_dataset",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class BatDetect2MergedAnnotations(AnnotatedDataset):
    """Annotated dataset backed by a single merged batdetect2 JSON file."""

    # Literal tag that identifies this annotation format.
    format: Literal["batdetect2_file"] = "batdetect2_file"

    # Path of the JSON file that the loader reads and iterates over.
    annotations_path: Path
|
||||||
|
|
||||||
|
|
||||||
|
def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Build an annotation project from a merged batdetect2 JSON file.

    The JSON document is iterated entry by entry; each entry is validated as
    a `FileAnnotation` and converted into a clip annotation plus an
    annotation task. Entries that fail validation (`ValueError`) or whose
    recording cannot be found (`FileNotFoundError`) are skipped silently.

    Args:
        dataset: Configuration providing `audio_dir`, `annotations_path`,
            `name` and `description`.
        base_dir: When given (and truthy), both configured paths are joined
            onto this directory.

    Returns:
        An annotation project named and described after `dataset`.
    """
    audio_dir = dataset.audio_dir
    path = dataset.annotations_path

    if base_dir:
        audio_dir = base_dir / audio_dir
        path = base_dir / path

    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which differs between platforms.
    content = json.loads(Path(path).read_text(encoding="utf-8"))

    annotations = []
    tasks = []

    for raw_entry in content:
        try:
            file_annotation = FileAnnotation.model_validate(raw_entry)
        except ValueError:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            continue

        annotations.append(
            file_annotation_to_clip_annotation(file_annotation, clip)
        )
        tasks.append(file_annotation_to_annotation_task(file_annotation, clip))

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
        tasks=tasks,
    )
|
304
batdetect2/data/annotations/legacy.py
Normal file
304
batdetect2/data/annotations/legacy.py
Normal file
@ -0,0 +1,304 @@
|
|||||||
|
"""Compatibility functions between old and new data structures."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from soundevent import data
|
||||||
|
from soundevent.geometry import compute_bounds
|
||||||
|
from soundevent.types import ClassMapper
|
||||||
|
|
||||||
|
from batdetect2 import types
|
||||||
|
|
||||||
|
PathLike = Union[Path, str, os.PathLike]
|
||||||
|
|
||||||
|
# Public API of this module. Besides the legacy conversion entry point, the
# file-annotation helpers are listed because sibling loader modules import
# them from here.
__all__ = [
    "FileAnnotation",
    "convert_to_annotation_group",
    "file_annotation_to_annotation_task",
    "file_annotation_to_clip",
    "file_annotation_to_clip_annotation",
    "list_file_annotations",
    "load_file_annotation",
]
|
||||||
|
|
||||||
|
SPECIES_TAG_KEY = "species"
|
||||||
|
ECHOLOCATION_EVENT = "Echolocation"
|
||||||
|
UNKNOWN_CLASS = "__UNKNOWN__"
|
||||||
|
|
||||||
|
NAMESPACE = uuid.UUID("97a9776b-c0fd-4c68-accb-0b0ecd719242")
|
||||||
|
|
||||||
|
|
||||||
|
EventFn = Callable[[data.SoundEventAnnotation], Optional[str]]
|
||||||
|
|
||||||
|
ClassFn = Callable[[data.Recording], int]
|
||||||
|
|
||||||
|
IndividualFn = Callable[[data.SoundEventAnnotation], int]
|
||||||
|
|
||||||
|
|
||||||
|
def get_recording_class_name(recording: data.Recording) -> str:
    """Return the value of the recording's species tag.

    Falls back to `UNKNOWN_CLASS` when the recording carries no tag with the
    `SPECIES_TAG_KEY` key.
    """
    species_tag = data.find_tag(recording.tags, SPECIES_TAG_KEY)
    return UNKNOWN_CLASS if species_tag is None else species_tag.value
|
||||||
|
|
||||||
|
|
||||||
|
def get_annotation_notes(annotation: data.ClipAnnotation) -> str:
    """Collect the note messages of a clip annotation into one string.

    Notes attached to the clip annotation come first, followed by the notes
    of the underlying recording. Notes whose message is `None` are dropped
    and the remaining messages are joined with newlines.
    """
    note_groups = (annotation.notes, annotation.clip.recording.notes)
    return "\n".join(
        note.message
        for group in note_groups
        for note in group
        if note.message is not None
    )
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_annotation_group(
    annotation: data.ClipAnnotation,
    class_mapper: ClassMapper,
    event_fn: EventFn = lambda _: ECHOLOCATION_EVENT,
    class_fn: ClassFn = lambda _: 0,
    individual_fn: IndividualFn = lambda _: 0,
) -> types.AudioLoaderAnnotationGroup:
    """Convert a ClipAnnotation to an AudioLoaderAnnotationGroup.

    Sound events without a geometry are ignored. For every remaining sound
    event the bounds of its geometry are computed and recorded both in the
    per-field arrays and in the list of annotation dictionaries.

    Args:
        annotation: The clip annotation to convert.
        class_mapper: Maps a sound event annotation to a class index. A
            result of `None` is stored as -1.
        event_fn: Returns the event name for a sound event annotation; a
            falsy result is stored as the empty string.
        class_fn: Returns the class index stored as `class_id_file` for the
            recording.
        individual_fn: Returns the individual index for a sound event
            annotation. A result of `None` is stored as -1.

    Returns:
        A dictionary in the legacy annotation-group layout.
    """
    recording = annotation.clip.recording

    start_times = []
    end_times = []
    low_freqs = []
    high_freqs = []
    class_ids = []
    x_inds = []
    y_inds = []
    individual_ids = []
    annotations: List[types.Annotation] = []
    class_id_file = class_fn(recording)

    for sound_event in annotation.sound_events:
        geometry = sound_event.sound_event.geometry

        if geometry is None:
            continue

        start_time, low_freq, end_time, high_freq = compute_bounds(geometry)

        # Only a missing result maps to -1. A plain `or -1` would also
        # discard the perfectly valid index 0.
        class_id = class_mapper.transform(sound_event)
        if class_id is None:
            class_id = -1

        event = event_fn(sound_event) or ""

        individual_id = individual_fn(sound_event)
        if individual_id is None:
            individual_id = -1

        start_times.append(start_time)
        end_times.append(end_time)
        low_freqs.append(low_freq)
        high_freqs.append(high_freq)
        class_ids.append(class_id)
        individual_ids.append(individual_id)

        # NOTE: This will be computed later so we just put a placeholder
        # here for now.
        x_inds.append(0)
        y_inds.append(0)

        annotations.append(
            {
                "start_time": start_time,
                "end_time": end_time,
                "low_freq": low_freq,
                "high_freq": high_freq,
                "class_prob": 1.0,
                "det_prob": 1.0,
                "individual": "0",
                "event": event,
                "class_id": class_id,  # type: ignore
            }
        )

    return {
        "id": str(recording.path),
        "duration": recording.duration,
        "issues": False,
        "file_path": str(recording.path),
        "time_exp": recording.time_expansion,
        "class_name": get_recording_class_name(recording),
        "notes": get_annotation_notes(annotation),
        "annotated": True,
        "start_times": np.array(start_times),
        "end_times": np.array(end_times),
        "low_freqs": np.array(low_freqs),
        "high_freqs": np.array(high_freqs),
        "class_ids": np.array(class_ids),
        "x_inds": np.array(x_inds),
        "y_inds": np.array(y_inds),
        "individual_ids": np.array(individual_ids),
        "annotation": annotations,
        "class_id_file": class_id_file,
    }
|
||||||
|
|
||||||
|
|
||||||
|
class Annotation(BaseModel):
    """A single annotated sound event in the batdetect annotation format."""

    # Class label of the event; read from the "class" key of the input.
    label: str = Field(alias="class")

    # Name of the event type.
    event: str

    # Identifier of the individual; defaults to 0 when absent.
    individual: int = 0

    # Bounds of the event. NOTE(review): presumably seconds for the times
    # and Hz for the frequencies -- confirm against the annotation files.
    start_time: float
    end_time: float
    low_freq: float
    high_freq: float
|
||||||
|
|
||||||
|
|
||||||
|
class FileAnnotation(BaseModel):
    """All batdetect annotations belonging to one audio file."""

    # Identifier of the file; joined onto the audio directory to locate the
    # recording when converting to a clip.
    id: str

    # Duration of the file.
    duration: float

    # Time expansion factor; defaults to 1 when absent.
    time_exp: float = 1

    # File-level class label; read from the "class_name" key of the input.
    label: str = Field(alias="class_name")

    # The individual sound event annotations of the file.
    annotation: List[Annotation]

    # Status flags and free-text notes, all optional in the input.
    annotated: bool = False
    issues: bool = False
    notes: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
def load_file_annotation(path: PathLike) -> FileAnnotation:
    """Load annotation from batdetect format.

    Args:
        path: Location of the JSON annotation file.

    Returns:
        The validated file annotation.
    """
    # JSON is UTF-8 by specification; decoding with the locale's default
    # encoding would corrupt or reject non-ASCII text on some platforms.
    json_text = Path(path).read_text(encoding="utf-8")
    return FileAnnotation.model_validate_json(json_text)
|
||||||
|
|
||||||
|
|
||||||
|
def annotation_to_sound_event(
    annotation: Annotation,
    recording: data.Recording,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
) -> data.SoundEventAnnotation:
    """Turn a batdetect annotation into a sound event annotation.

    The sound event gets a bounding-box geometry built from the annotation
    bounds and a deterministic UUID derived from the recording hash and the
    start/end times. The resulting annotation carries three tags (label,
    event and individual) under the given keys.
    """
    event_uuid = uuid.uuid5(
        NAMESPACE,
        f"{recording.hash}_{annotation.start_time}_{annotation.end_time}",
    )
    bounding_box = data.BoundingBox(
        coordinates=[
            annotation.start_time,
            annotation.low_freq,
            annotation.end_time,
            annotation.high_freq,
        ],
    )
    sound_event = data.SoundEvent(
        uuid=event_uuid,
        recording=recording,
        geometry=bounding_box,
    )

    # (key, value) pairs in the order the tags must appear.
    tag_items = (
        (label_key, annotation.label),
        (event_key, annotation.event),
        (individual_key, str(annotation.individual)),
    )
    tags = [
        data.Tag(term=data.term_from_key(key), value=value)
        for key, value in tag_items
    ]

    return data.SoundEventAnnotation(
        uuid=uuid.uuid5(NAMESPACE, f"{sound_event.uuid}_annotation"),
        sound_event=sound_event,
        tags=tags,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def file_annotation_to_clip(
    file_annotation: FileAnnotation,
    audio_dir: Optional[PathLike] = None,
    label_key: str = "class",
) -> data.Clip:
    """Create a clip spanning the whole recording of a file annotation.

    Args:
        file_annotation: The annotation whose `id` names the audio file.
        audio_dir: Directory containing the audio file; the current working
            directory is used when omitted (or falsy).
        label_key: Key of the tag that stores the file-level label on the
            recording.

    Returns:
        A clip from time 0 to the recording's duration.

    Raises:
        FileNotFoundError: If the audio file does not exist.
    """
    if not audio_dir:
        audio_dir = Path.cwd()

    full_path = Path(audio_dir) / file_annotation.id
    if not full_path.exists():
        raise FileNotFoundError(f"File {full_path} not found.")

    label_tag = data.Tag(
        term=data.term_from_key(label_key),
        value=file_annotation.label,
    )
    recording = data.Recording.from_file(
        full_path,
        time_expansion=file_annotation.time_exp,
        tags=[label_tag],
    )

    clip_uuid = uuid.uuid5(NAMESPACE, f"{file_annotation.id}_clip")
    return data.Clip(
        uuid=clip_uuid,
        recording=recording,
        start_time=0,
        end_time=recording.duration,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def file_annotation_to_clip_annotation(
    file_annotation: FileAnnotation,
    clip: data.Clip,
    label_key: str = "class",
    event_key: str = "event",
    individual_key: str = "individual",
) -> data.ClipAnnotation:
    """Create the clip annotation for a file annotation.

    The result carries the file-level label as a tag, the file notes (if
    non-empty) as a single note, and one sound event annotation per entry in
    `file_annotation.annotation`.
    """
    annotation_uuid = uuid.uuid5(
        NAMESPACE,
        f"{file_annotation.id}_clip_annotation",
    )

    # An empty notes string produces no note at all.
    notes = (
        [data.Note(message=file_annotation.notes)]
        if file_annotation.notes
        else []
    )

    label_tag = data.Tag(
        term=data.term_from_key(label_key), value=file_annotation.label
    )

    sound_events = [
        annotation_to_sound_event(
            entry,
            clip.recording,
            label_key=label_key,
            event_key=event_key,
            individual_key=individual_key,
        )
        for entry in file_annotation.annotation
    ]

    return data.ClipAnnotation(
        uuid=annotation_uuid,
        clip=clip,
        notes=notes,
        tags=[label_tag],
        sound_events=sound_events,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def file_annotation_to_annotation_task(
    file_annotation: FileAnnotation,
    clip: data.Clip,
) -> data.AnnotationTask:
    """Create the annotation task for a file annotation.

    The task gets at most one status badge: "rejected" when the file is
    flagged with issues, otherwise "completed" when it is flagged as
    annotated, otherwise none.
    """
    if file_annotation.issues:
        state = data.AnnotationState.rejected
    elif file_annotation.annotated:
        state = data.AnnotationState.completed
    else:
        state = None

    status_badges = [] if state is None else [data.StatusBadge(state=state)]

    # NOTE(review): this UUID is derived from `uuid.NAMESPACE_URL`, whereas
    # the other helpers in this module use the module-level `NAMESPACE`.
    # Confirm whether that is intentional before changing it, since it
    # alters the generated identifiers.
    task_uuid = uuid.uuid5(uuid.NAMESPACE_URL, f"{file_annotation.id}_task")

    return data.AnnotationTask(
        uuid=task_uuid,
        clip=clip,
        status_badges=status_badges,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def list_file_annotations(path: PathLike) -> List[Path]:
    """Return the `*.json` files found directly inside a directory."""
    directory = Path(path)
    return list(directory.glob("*.json"))
|
41
batdetect2/data/annotations/types.py
Normal file
41
batdetect2/data/annotations/types.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal, Union
|
||||||
|
|
||||||
|
from batdetect2.configs import BaseConfig
|
||||||
|
|
||||||
|
# Only names defined in this module may be listed here: a name that does not
# exist makes `from <module> import *` fail with an AttributeError.
__all__ = [
    "AnnotatedDataset",
]
|
||||||
|
|
||||||
|
|
||||||
|
class AnnotatedDataset(BaseConfig):
    """Represents a single, cohesive source of audio recordings and annotations.

    A source typically groups recordings originating from a specific context,
    such as a single project, site, deployment, or recordist. All audio files
    belonging to a source should be located within a single directory,
    specified by `audio_dir`.

    This base class only describes the source itself; format-specific
    subclasses add the fields that locate the annotations (for example an
    AOEF file or batdetect2 annotation files).

    Crucially, file paths referenced within the annotation data *must* be
    relative to the `audio_dir`. This ensures that the dataset definition
    remains portable across different systems and base directories.

    Attributes:
        name: A unique identifier for this data source.
        audio_dir: The file system path to the directory containing the audio
            recordings for this source.
        description: Detailed information about the source, including recording
            methods, annotation procedures, equipment used, potential biases,
            or any important caveats for users. Defaults to an empty string.
    """

    name: str
    audio_dir: Path
    description: str = ""
|
||||||
|
|
||||||
|
|
37
batdetect2/data/data.py
Normal file
37
batdetect2/data/data.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from soundevent import data
|
||||||
|
|
||||||
|
from batdetect2.configs import load_config
|
||||||
|
from batdetect2.data.annotations import load_annotated_dataset
|
||||||
|
from batdetect2.data.types import Dataset
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"load_dataset",
|
||||||
|
"load_dataset_from_config",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(
    dataset: Dataset,
    base_dir: Optional[Path] = None,
) -> data.AnnotationSet:
    """Load every source of a dataset into one annotation set.

    Each entry of `dataset.sources` is loaded in order and the clip
    annotations of all sources are concatenated.

    Args:
        dataset: The dataset configuration listing the sources.
        base_dir: Optional directory forwarded to the loader of each source.

    Returns:
        An annotation set containing the clip annotations of all sources.
    """
    merged = [
        clip_annotation
        for source in dataset.sources
        for clip_annotation in load_annotated_dataset(
            source, base_dir=base_dir
        ).clip_annotations
    ]
    return data.AnnotationSet(clip_annotations=merged)
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset_from_config(
    path: data.PathLike,
    field: Optional[str] = None,
    base_dir: Optional[Path] = None,
):
    """Read a dataset configuration from a file and load its annotations.

    Args:
        path: Location of the configuration file.
        field: Optional field selector forwarded to `load_config`.
        base_dir: Optional directory forwarded to `load_dataset`.

    Returns:
        Whatever `load_dataset` returns for the parsed configuration.
    """
    dataset_config = load_config(path=path, schema=Dataset, field=field)
    return load_dataset(dataset_config, base_dir=base_dir)
|
29
batdetect2/data/types.py
Normal file
29
batdetect2/data/types.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
from typing import Annotated, List
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from batdetect2.configs import BaseConfig
|
||||||
|
from batdetect2.data.annotations import AnnotationFormats
|
||||||
|
|
||||||
|
|
||||||
|
class Dataset(BaseConfig):
    """Represents a collection of one or more annotated dataset sources.

    In the context of batdetect2, a Dataset aggregates multiple annotated
    sources. It serves as the primary unit for defining data splits,
    typically used for model training, validation, or testing phases.

    Attributes:
        name: A descriptive name for the overall dataset
            (e.g., "UK Training Set").
        description: A detailed explanation of the dataset's purpose,
            composition, how it was assembled, or any specific characteristics.
        sources: A list containing the source configurations included in this
            dataset. Each entry is one of the `AnnotationFormats` types and
            the concrete type is selected by its `format` field.
    """

    name: str
    description: str
    sources: List[
        Annotated[AnnotationFormats, Field(..., discriminator="format")]
    ]
|
Loading…
Reference in New Issue
Block a user