Added docstrings for the batdetect2 legacy annotation format

This commit is contained in:
mbsantiago 2025-04-18 15:14:48 +01:00
parent b78e5a3a2f
commit bf14f4d37e
4 changed files with 298 additions and 151 deletions

View File

@ -22,23 +22,23 @@ from batdetect2.data.annotations.aoef import (
AOEFAnnotations,
load_aoef_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_files import (
from batdetect2.data.annotations.batdetect2 import (
AnnotationFilter,
BatDetect2FilesAnnotations,
load_batdetect2_files_annotated_dataset,
)
from batdetect2.data.annotations.batdetect2_merged import (
BatDetect2MergedAnnotations,
load_batdetect2_files_annotated_dataset,
load_batdetect2_merged_annotated_dataset,
)
from batdetect2.data.annotations.types import AnnotatedDataset
__all__ = [
"load_annotated_dataset",
"AnnotatedDataset",
"AOEFAnnotations",
"AnnotatedDataset",
"AnnotationFilter",
"AnnotationFormats",
"BatDetect2FilesAnnotations",
"BatDetect2MergedAnnotations",
"AnnotationFormats",
"load_annotated_dataset",
]

View File

@ -0,0 +1,291 @@
"""Loads annotation data from legacy BatDetect2 JSON formats.
This module provides backward compatibility for loading annotation data stored
in two related formats used by older BatDetect2 tools:
1. **`batdetect2` format** (Directory-based): Annotations are stored in
individual JSON files (one per audio recording) within a specified
directory.
Each JSON file contains a `FileAnnotation` structure. Loaded via
`load_batdetect2_files_annotated_dataset`, configured by
`BatDetect2FilesAnnotations`.
2. **`batdetect2_file` format** (Single-file): Annotations for multiple
recordings are merged into a single JSON file, containing a list of
`FileAnnotation` objects. Loaded via
`load_batdetect2_merged_annotated_dataset`, configured by
`BatDetect2MergedAnnotations`.
Both formats use the same internal structure for annotations per file and
support filtering based on `annotated` and `issues` flags within that
structure.
The loading functions convert data from these legacy formats into the modern
`soundevent` data model (primarily `ClipAnnotation`) and return the results
aggregated into a `soundevent.data.AnnotationSet`.
"""
import json
import logging
import os
from pathlib import Path
from typing import Literal, Optional, Union

from pydantic import Field, field_validator
from soundevent import data

from batdetect2.configs import BaseConfig
from batdetect2.data.annotations.legacy import (
    FileAnnotation,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    list_file_annotations,
    load_file_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset
PathLike = Union[Path, str, os.PathLike]
__all__ = [
"load_batdetect2_files_annotated_dataset",
"load_batdetect2_merged_annotated_dataset",
"BatDetect2FilesAnnotations",
"BatDetect2MergedAnnotations",
"AnnotationFilter",
]
class AnnotationFilter(BaseConfig):
    """Flag-based selection criteria for legacy `FileAnnotation` entries.

    Legacy `FileAnnotation` records carry two boolean flags, `annotated` and
    `issues`. The loaders in this module consult this configuration to decide
    which records (individual files, or entries inside a merged file) are
    converted and which are skipped.

    Attributes
    ----------
    only_annotated : bool, default=True
        When True, records whose `annotated` flag is falsy are skipped.
    exclude_issues : bool, default=True
        When True, records whose `issues` flag is truthy are skipped.
    """

    only_annotated: bool = True
    exclude_issues: bool = True
class BatDetect2FilesAnnotations(AnnotatedDataset):
    """Configuration for the legacy 'batdetect2' format (directory-based).

    Defines a data source where annotations are stored as individual JSON
    files (one per recording, each containing a `FileAnnotation` structure)
    inside `annotations_dir`. The fields inherited from `AnnotatedDataset`
    (`name`, `description`, `audio_dir`) are also used by the loader. How an
    audio file is matched to its JSON file is decided by the helpers in
    `batdetect2.data.annotations.legacy`, not by this class.

    Attributes
    ----------
    format : Literal["batdetect2"]
        The fixed format identifier for this configuration type.
    annotations_dir : Path
        Path to the directory containing the individual JSON annotation
        files.
    filter : AnnotationFilter, optional
        Configuration for filtering which files to process based on their
        `annotated` and `issues` flags. Defaults to `AnnotationFilter()`,
        i.e. requiring `annotated=True` and `issues=False`. Setting it
        explicitly to `None` in the config (e.g., `filter: null`) disables
        filtering: the value is replaced by a filter with both checks
        switched off.
    """

    format: Literal["batdetect2"] = "batdetect2"
    annotations_dir: Path
    filter: AnnotationFilter = Field(
        default_factory=AnnotationFilter,
    )

    @field_validator("filter", mode="before")
    @classmethod
    def none_disables_filtering(cls, value: object) -> object:
        """Translate an explicit `None` into a filter that accepts everything.

        Runs before regular field validation so that `filter: null` is
        accepted even though the field type itself is not optional. Any other
        value is passed through untouched for normal validation.
        """
        if value is None:
            return AnnotationFilter(only_annotated=False, exclude_issues=False)
        return value
class BatDetect2MergedAnnotations(AnnotatedDataset):
    """Configuration for the legacy 'batdetect2_file' format (merged file).

    Defines a data source where annotations for multiple recordings (each as
    a `FileAnnotation` structure) are stored within a single JSON file
    specified by `annotations_path`. Audio files are expected in `audio_dir`.

    Inherits `name`, `description`, and `audio_dir` from `AnnotatedDataset`.

    Attributes
    ----------
    format : Literal["batdetect2_file"]
        The fixed format identifier for this configuration type.
    annotations_path : Path
        Path to the single JSON file containing a list of `FileAnnotation`
        objects.
    filter : AnnotationFilter, optional
        Configuration for filtering which `FileAnnotation` entries within the
        merged file to process based on their `annotated` and `issues` flags.
        Defaults to `AnnotationFilter()`, i.e. requiring `annotated=True` and
        `issues=False`. Setting it explicitly to `None` in the config (e.g.,
        `filter: null`) disables filtering: the value is replaced by a filter
        with both checks switched off.
    """

    format: Literal["batdetect2_file"] = "batdetect2_file"
    annotations_path: Path
    filter: AnnotationFilter = Field(
        default_factory=AnnotationFilter,
    )

    @field_validator("filter", mode="before")
    @classmethod
    def none_disables_filtering(cls, value: object) -> object:
        """Translate an explicit `None` into a filter that accepts everything.

        Runs before regular field validation so that `filter: null` is
        accepted even though the field type itself is not optional. Any other
        value is passed through untouched for normal validation.
        """
        if value is None:
            return AnnotationFilter(only_annotated=False, exclude_issues=False)
        return value
def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationSet:
    """Load legacy 'batdetect2' (directory) annotations as an AnnotationSet.

    Lists the JSON annotation files under `dataset.annotations_dir` with
    `list_file_annotations`. For each file: loads the legacy
    `FileAnnotation`, applies the `annotated`/`issues` checks configured in
    `dataset.filter`, builds the clip for the corresponding audio file, and
    converts the entry to a clip annotation. All converted entries are
    collected into a single `soundevent.data.AnnotationSet`.

    Parameters
    ----------
    dataset : BatDetect2FilesAnnotations
        Configuration describing the 'batdetect2' (directory) data source.
    base_dir : PathLike, optional
        Optional base directory against which `dataset.audio_dir` and
        `dataset.annotations_dir` are resolved. Defaults to None, in which
        case both paths are used as given.

    Returns
    -------
    soundevent.data.AnnotationSet
        An AnnotationSet named and described after `dataset`, containing the
        clip annotations of every entry that passed the filter and could be
        loaded.

    Notes
    -----
    A `FileNotFoundError` raised while loading an individual JSON file or
    while building its clip is logged as a warning and that entry is skipped.
    Errors raised by `list_file_annotations` itself (for instance for a
    missing directory, depending on that helper) are not caught here.
    """
    logger = logging.getLogger(__name__)

    audio_dir = dataset.audio_dir
    annotations_dir = dataset.annotations_dir
    if base_dir is not None:
        # Normalise first so plain strings and other path-likes join reliably.
        root = Path(base_dir)
        audio_dir = root / audio_dir
        annotations_dir = root / annotations_dir

    only_annotated = dataset.filter.only_annotated
    exclude_issues = dataset.filter.exclude_issues

    clip_annotations = []
    for json_path in list_file_annotations(annotations_dir):
        try:
            file_annotation = load_file_annotation(json_path)
        except FileNotFoundError:
            logger.warning(
                "Annotation file not found, skipping: %s",
                json_path,
            )
            continue

        if only_annotated and not file_annotation.annotated:
            continue

        if exclude_issues and file_annotation.issues:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            logger.warning(
                "Audio file for annotation %s not found in %s, skipping.",
                json_path,
                audio_dir,
            )
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(file_annotation, clip)
        )

    return data.AnnotationSet(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
    )
def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationSet:
    """Load legacy 'batdetect2_file' (merged) annotations as an AnnotationSet.

    Reads the single JSON file at `dataset.annotations_path`, which must
    contain a list of legacy `FileAnnotation` objects. For each entry:
    validates it, applies the `annotated`/`issues` checks configured in
    `dataset.filter`, builds the clip for the corresponding audio file, and
    converts the entry to a clip annotation. All converted entries are
    collected into a single `soundevent.data.AnnotationSet`.

    Parameters
    ----------
    dataset : BatDetect2MergedAnnotations
        Configuration describing the 'batdetect2_file' (merged) data source.
    base_dir : PathLike, optional
        Optional base directory against which `dataset.audio_dir` and
        `dataset.annotations_path` are resolved. Defaults to None, in which
        case both paths are used as given.

    Returns
    -------
    soundevent.data.AnnotationSet
        An AnnotationSet named and described after `dataset`, containing the
        clip annotations of every entry that passed validation and the
        filter and whose clip could be built.

    Raises
    ------
    FileNotFoundError
        If the annotations file itself does not exist.
    json.JSONDecodeError
        If the annotations file is not valid JSON.
    TypeError
        If the root JSON structure is not a list.

    Notes
    -----
    Entries that fail `FileAnnotation` validation (any `ValueError`, which
    includes pydantic validation errors) and entries for which building the
    clip raises `FileNotFoundError` are logged as warnings and skipped
    rather than aborting the whole load.
    """
    logger = logging.getLogger(__name__)

    audio_dir = dataset.audio_dir
    annotations_path = dataset.annotations_path
    if base_dir is not None:
        # Normalise first so plain strings and other path-likes join reliably.
        root = Path(base_dir)
        audio_dir = root / audio_dir
        annotations_path = root / annotations_path

    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    content = json.loads(Path(annotations_path).read_text(encoding="utf-8"))

    # Iterating a dict would yield its keys, every one of which would fail
    # validation and be skipped, silently producing an empty result.
    if not isinstance(content, list):
        raise TypeError(
            f"Expected a JSON list of file annotations in "
            f"{annotations_path}, got {type(content).__name__}."
        )

    only_annotated = dataset.filter.only_annotated
    exclude_issues = dataset.filter.exclude_issues

    clip_annotations = []
    for index, raw_entry in enumerate(content):
        try:
            file_annotation = FileAnnotation.model_validate(raw_entry)
        except ValueError as error:
            logger.warning(
                "Skipping entry %d of %s: not a valid FileAnnotation (%s)",
                index,
                annotations_path,
                error,
            )
            continue

        if only_annotated and not file_annotation.annotated:
            continue

        if exclude_issues and file_annotation.issues:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            logger.warning(
                "Skipping entry %d of %s: audio file not found in %s",
                index,
                annotations_path,
                audio_dir,
            )
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(file_annotation, clip)
        )

    return data.AnnotationSet(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
    )

View File

@ -1,80 +0,0 @@
import os
from pathlib import Path
from typing import Literal, Optional, Union
from soundevent import data
from batdetect2.data.annotations.legacy import (
file_annotation_to_annotation_task,
file_annotation_to_clip,
file_annotation_to_clip_annotation,
list_file_annotations,
load_file_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset
PathLike = Union[Path, str, os.PathLike]
__all__ = [
"load_batdetect2_files_annotated_dataset",
"BatDetect2FilesAnnotations",
]
class BatDetect2FilesAnnotations(AnnotatedDataset):
    """Dataset configuration for the directory-based 'batdetect2' format.

    Attributes
    ----------
    format : Literal["batdetect2"]
        Fixed identifier of this configuration type.
    annotations_dir : Path
        Directory holding the legacy annotation files.
    """

    format: Literal["batdetect2"] = "batdetect2"
    annotations_dir: Path
def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Build an annotation project from a directory of legacy annotations.

    Every path returned by `list_file_annotations` is loaded and turned into
    a clip, a clip annotation and an annotation task. An entry is dropped
    without notice when loading it or building its clip raises
    `FileNotFoundError`.

    Parameters
    ----------
    dataset : BatDetect2FilesAnnotations
        Configuration of the data source.
    base_dir : PathLike, optional
        When truthy, `dataset.audio_dir` and `dataset.annotations_dir` are
        joined onto it.

    Returns
    -------
    soundevent.data.AnnotationProject
        Project carrying the dataset name and description together with the
        collected clip annotations and tasks.
    """
    audio_root = dataset.audio_dir
    annotations_root = dataset.annotations_dir

    if base_dir:
        audio_root = base_dir / audio_root
        annotations_root = base_dir / annotations_root

    clip_annotations = []
    annotation_tasks = []

    for json_path in list_file_annotations(annotations_root):
        # Both steps give up on the entry for the same reason, so they share
        # one handler.
        try:
            legacy = load_file_annotation(json_path)
            clip = file_annotation_to_clip(legacy, audio_dir=audio_root)
        except FileNotFoundError:
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(legacy, clip)
        )
        annotation_tasks.append(
            file_annotation_to_annotation_task(legacy, clip)
        )

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
        tasks=annotation_tasks,
    )

View File

@ -1,64 +0,0 @@
import json
import os
from pathlib import Path
from typing import Literal, Optional, Union
from soundevent import data
from batdetect2.data.annotations.legacy import (
FileAnnotation,
file_annotation_to_annotation_task,
file_annotation_to_clip,
file_annotation_to_clip_annotation,
)
from batdetect2.data.annotations.types import AnnotatedDataset
PathLike = Union[Path, str, os.PathLike]
__all__ = [
"BatDetect2MergedAnnotations",
"load_batdetect2_merged_annotated_dataset",
]
class BatDetect2MergedAnnotations(AnnotatedDataset):
    """Dataset configuration for the single-file 'batdetect2_file' format.

    Attributes
    ----------
    format : Literal["batdetect2_file"]
        Fixed identifier of this configuration type.
    annotations_path : Path
        JSON file holding the merged legacy annotations.
    """

    format: Literal["batdetect2_file"] = "batdetect2_file"
    annotations_path: Path
def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationProject:
    """Build an annotation project from a merged legacy annotation file.

    The JSON document at `dataset.annotations_path` is parsed and iterated.
    Each item is validated as a `FileAnnotation` and converted into a clip,
    a clip annotation and an annotation task. Items that raise `ValueError`
    during validation, or `FileNotFoundError` while the clip is built, are
    dropped without notice.

    Parameters
    ----------
    dataset : BatDetect2MergedAnnotations
        Configuration of the data source.
    base_dir : PathLike, optional
        When truthy, `dataset.audio_dir` and `dataset.annotations_path` are
        joined onto it.

    Returns
    -------
    soundevent.data.AnnotationProject
        Project carrying the dataset name and description together with the
        collected clip annotations and tasks.
    """
    audio_root = dataset.audio_dir
    merged_file = dataset.annotations_path

    if base_dir:
        audio_root = base_dir / audio_root
        merged_file = base_dir / merged_file

    raw_text = Path(merged_file).read_text()
    records = json.loads(raw_text)

    clip_annotations = []
    annotation_tasks = []

    for record in records:
        try:
            legacy = FileAnnotation.model_validate(record)
        except ValueError:
            continue

        try:
            clip = file_annotation_to_clip(legacy, audio_dir=audio_root)
        except FileNotFoundError:
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(legacy, clip)
        )
        annotation_tasks.append(
            file_annotation_to_annotation_task(legacy, clip)
        )

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
        tasks=annotation_tasks,
    )