mirror of
https://github.com/macaodha/batdetect2.git
synced 2025-06-29 22:51:58 +02:00
Added docstrings for the batdetect2 legacy annotation format
This commit is contained in:
parent
b78e5a3a2f
commit
bf14f4d37e
@ -22,23 +22,23 @@ from batdetect2.data.annotations.aoef import (
|
|||||||
AOEFAnnotations,
|
AOEFAnnotations,
|
||||||
load_aoef_annotated_dataset,
|
load_aoef_annotated_dataset,
|
||||||
)
|
)
|
||||||
from batdetect2.data.annotations.batdetect2_files import (
|
from batdetect2.data.annotations.batdetect2 import (
|
||||||
|
AnnotationFilter,
|
||||||
BatDetect2FilesAnnotations,
|
BatDetect2FilesAnnotations,
|
||||||
load_batdetect2_files_annotated_dataset,
|
|
||||||
)
|
|
||||||
from batdetect2.data.annotations.batdetect2_merged import (
|
|
||||||
BatDetect2MergedAnnotations,
|
BatDetect2MergedAnnotations,
|
||||||
|
load_batdetect2_files_annotated_dataset,
|
||||||
load_batdetect2_merged_annotated_dataset,
|
load_batdetect2_merged_annotated_dataset,
|
||||||
)
|
)
|
||||||
from batdetect2.data.annotations.types import AnnotatedDataset
|
from batdetect2.data.annotations.types import AnnotatedDataset
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"load_annotated_dataset",
|
|
||||||
"AnnotatedDataset",
|
|
||||||
"AOEFAnnotations",
|
"AOEFAnnotations",
|
||||||
|
"AnnotatedDataset",
|
||||||
|
"AnnotationFilter",
|
||||||
|
"AnnotationFormats",
|
||||||
"BatDetect2FilesAnnotations",
|
"BatDetect2FilesAnnotations",
|
||||||
"BatDetect2MergedAnnotations",
|
"BatDetect2MergedAnnotations",
|
||||||
"AnnotationFormats",
|
"load_annotated_dataset",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
291
batdetect2/data/annotations/batdetect2.py
Normal file
291
batdetect2/data/annotations/batdetect2.py
Normal file
@ -0,0 +1,291 @@
|
|||||||
|
"""Loads annotation data from legacy BatDetect2 JSON formats.
|
||||||
|
|
||||||
|
This module provides backward compatibility for loading annotation data stored
|
||||||
|
in two related formats used by older BatDetect2 tools:
|
||||||
|
|
||||||
|
1. **`batdetect2` format** (Directory-based): Annotations are stored in
|
||||||
|
individual JSON files (one per audio recording) within a specified
|
||||||
|
directory.
|
||||||
|
Each JSON file contains a `FileAnnotation` structure. Loaded via
|
||||||
|
`load_batdetect2_files_annotated_dataset`; configured by
|
||||||
|
`BatDetect2FilesAnnotations`.
|
||||||
|
2. **`batdetect2_file` format** (Single-file): Annotations for multiple
|
||||||
|
recordings are merged into a single JSON file, containing a list of
|
||||||
|
`FileAnnotation` objects. Loaded via
|
||||||
|
`load_batdetect2_merged_annotated_dataset`; configured by
|
||||||
|
`BatDetect2MergedAnnotations`.
|
||||||
|
|
||||||
|
Both formats use the same internal structure for annotations per file and
|
||||||
|
support filtering based on `annotated` and `issues` flags within that
|
||||||
|
structure.
|
||||||
|
|
||||||
|
The loading functions convert data from these legacy formats into the modern
|
||||||
|
`soundevent` data model (primarily `ClipAnnotation`) and return the results
|
||||||
|
aggregated into a `soundevent.data.AnnotationSet`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal, Optional, Union
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
from soundevent import data
|
||||||
|
|
||||||
|
from batdetect2.configs import BaseConfig
|
||||||
|
from batdetect2.data.annotations.legacy import (
|
||||||
|
FileAnnotation,
|
||||||
|
file_annotation_to_clip,
|
||||||
|
file_annotation_to_clip_annotation,
|
||||||
|
list_file_annotations,
|
||||||
|
load_file_annotation,
|
||||||
|
)
|
||||||
|
from batdetect2.data.annotations.types import AnnotatedDataset
|
||||||
|
|
||||||
|
PathLike = Union[Path, str, os.PathLike]
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"load_batdetect2_files_annotated_dataset",
|
||||||
|
"load_batdetect2_merged_annotated_dataset",
|
||||||
|
"BatDetect2FilesAnnotations",
|
||||||
|
"BatDetect2MergedAnnotations",
|
||||||
|
"AnnotationFilter",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class AnnotationFilter(BaseConfig):
    """Flag-based selection criteria for legacy `FileAnnotation` entries.

    Legacy BatDetect2 annotations carry two boolean flags, `annotated` and
    `issues`. This configuration states which combinations of those flags
    qualify an entry (a JSON file, or one record of a merged file) for
    loading and conversion.

    Attributes
    ----------
    only_annotated : bool, default=True
        When True, entries whose `annotated` flag is falsy are skipped.
    exclude_issues : bool, default=True
        When True, entries whose `issues` flag is truthy are skipped.
    """

    only_annotated: bool = True
    exclude_issues: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class BatDetect2FilesAnnotations(AnnotatedDataset):
    """Configuration for the legacy 'batdetect2' format (directory-based).

    Defines a data source where annotations are stored as individual JSON files
    (one per recording, containing a `FileAnnotation` structure) within the
    `annotations_dir`. Requires a corresponding `audio_dir`. Assumes a naming
    convention links audio files to JSON files
    (e.g., `rec.wav` -> `rec.wav.json`).

    Attributes
    ----------
    format : Literal["batdetect2"]
        The fixed format identifier for this configuration type.
    annotations_dir : Path
        Path to the directory containing the individual JSON annotation files.
    filter : AnnotationFilter
        Configuration for filtering which files to process based on their
        `annotated` and `issues` flags. Defaults to `AnnotationFilter()`,
        i.e. `only_annotated=True` and `exclude_issues=True`. The field is
        not declared optional and the loader reads both flags on every
        entry, so to load everything set both flags to `False` rather than
        setting `filter` to `None`.
    """

    format: Literal["batdetect2"] = "batdetect2"
    annotations_dir: Path

    filter: AnnotationFilter = Field(
        default_factory=AnnotationFilter,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class BatDetect2MergedAnnotations(AnnotatedDataset):
    """Configuration for the legacy 'batdetect2_file' format (merged file).

    Defines a data source where annotations for multiple recordings (each as a
    `FileAnnotation` structure) are stored within a single JSON file specified
    by `annotations_path`. Audio files are expected in `audio_dir`.

    Inherits `name`, `description`, and `audio_dir` from `AnnotatedDataset`.

    Attributes
    ----------
    format : Literal["batdetect2_file"]
        The fixed format identifier for this configuration type.
    annotations_path : Path
        Path to the single JSON file containing a list of `FileAnnotation`
        objects.
    filter : AnnotationFilter
        Configuration for filtering which `FileAnnotation` entries within the
        merged file to process based on their `annotated` and `issues` flags.
        Defaults to `AnnotationFilter()`, i.e. `only_annotated=True` and
        `exclude_issues=True`. The field is not declared optional and the
        loader reads both flags on every entry, so to load everything set
        both flags to `False` rather than setting `filter` to `None`.
    """

    format: Literal["batdetect2_file"] = "batdetect2_file"
    annotations_path: Path

    filter: AnnotationFilter = Field(
        default_factory=AnnotationFilter,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationSet:
    """Load 'batdetect2' (directory) annotations into an AnnotationSet.

    Lists the JSON annotation files under `dataset.annotations_dir` with
    `list_file_annotations`. For each file: loads the legacy
    `FileAnnotation`, applies the `annotated`/`issues` filtering configured
    in `dataset.filter`, builds the clip for the corresponding recording in
    `dataset.audio_dir`, converts the entry to a `ClipAnnotation`, and
    collects the results into a single `soundevent.data.AnnotationSet`.

    Parameters
    ----------
    dataset : BatDetect2FilesAnnotations
        Configuration describing the 'batdetect2' (directory) data source.
    base_dir : PathLike, optional
        Optional base directory prepended to `dataset.audio_dir` and
        `dataset.annotations_dir`. Ignored when None or empty.

    Returns
    -------
    soundevent.data.AnnotationSet
        An AnnotationSet named and described after `dataset`, containing
        every entry that was loaded, passed the filter, and was converted.

    Notes
    -----
    Loading is best-effort per entry: a `FileNotFoundError` raised while
    reading an individual JSON file or while building its clip causes that
    entry to be skipped silently (nothing is logged). Exceptions raised by
    `list_file_annotations` while scanning the directory, and any other
    exception type, propagate to the caller.
    """
    audio_dir = dataset.audio_dir
    annotations_dir = dataset.annotations_dir

    if base_dir:
        # `PathLike` also admits `str` and arbitrary `os.PathLike` objects;
        # normalise once so the `/` joins below work for all of them.
        root = Path(base_dir)
        audio_dir = root / audio_dir
        annotations_dir = root / annotations_dir

    selection = dataset.filter
    clip_annotations = []

    for annotation_path in list_file_annotations(annotations_dir):
        try:
            file_annotation = load_file_annotation(annotation_path)
        except FileNotFoundError:
            # Annotation file is missing or unreadable as a path: skip it.
            continue

        if selection.only_annotated and not file_annotation.annotated:
            continue

        if selection.exclude_issues and file_annotation.issues:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            # The recording this annotation refers to was not found.
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(file_annotation, clip)
        )

    return data.AnnotationSet(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
) -> data.AnnotationSet:
    """Load 'batdetect2_file' (merged) annotations into an AnnotationSet.

    Reads the single JSON file at `dataset.annotations_path`, which must
    contain a list of legacy `FileAnnotation` records. For each record:
    validates it, applies the `annotated`/`issues` filtering configured in
    `dataset.filter`, builds the clip for the corresponding recording in
    `dataset.audio_dir`, converts the record to a `ClipAnnotation`, and
    collects the results into a single `soundevent.data.AnnotationSet`.

    Parameters
    ----------
    dataset : BatDetect2MergedAnnotations
        Configuration describing the 'batdetect2_file' (merged) data source.
    base_dir : PathLike, optional
        Optional base directory prepended to `dataset.audio_dir` and
        `dataset.annotations_path`. Ignored when None or empty.

    Returns
    -------
    soundevent.data.AnnotationSet
        An AnnotationSet named and described after `dataset`, containing
        every record that validated, passed the filter, and was converted.

    Raises
    ------
    FileNotFoundError
        If the annotations file does not exist.
    UnicodeDecodeError
        If the annotations file is not valid UTF-8.
    json.JSONDecodeError
        If the annotations file is not valid JSON.
    TypeError
        If the root JSON structure is not a list.

    Notes
    -----
    Loading is best-effort per record: records that fail `FileAnnotation`
    validation, and records whose clip cannot be built because of a
    `FileNotFoundError`, are skipped silently (nothing is logged).
    """
    audio_dir = dataset.audio_dir
    annotations_path = dataset.annotations_path

    if base_dir:
        # `PathLike` also admits `str` and arbitrary `os.PathLike` objects;
        # normalise once so the `/` joins below work for all of them.
        root = Path(base_dir)
        audio_dir = root / audio_dir
        annotations_path = root / annotations_path

    # Decode explicitly as UTF-8 instead of relying on the platform default.
    records = json.loads(Path(annotations_path).read_text(encoding="utf-8"))

    # Without this check a JSON object at the root would be iterated key by
    # key, every key would fail validation, and an empty set would be
    # returned with no indication that the file has the wrong shape.
    if not isinstance(records, list):
        raise TypeError(
            "Expected a JSON list of file annotations in "
            f"{annotations_path}, got {type(records).__name__}."
        )

    selection = dataset.filter
    clip_annotations = []

    for record in records:
        try:
            file_annotation = FileAnnotation.model_validate(record)
        except ValueError:
            # pydantic's ValidationError derives from ValueError, so records
            # that do not match the legacy schema are dropped here.
            continue

        if selection.only_annotated and not file_annotation.annotated:
            continue

        if selection.exclude_issues and file_annotation.issues:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            # The recording this record refers to was not found.
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(file_annotation, clip)
        )

    return data.AnnotationSet(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
    )
|
@ -1,80 +0,0 @@
|
|||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Literal, Optional, Union
|
|
||||||
|
|
||||||
from soundevent import data
|
|
||||||
|
|
||||||
from batdetect2.data.annotations.legacy import (
|
|
||||||
file_annotation_to_annotation_task,
|
|
||||||
file_annotation_to_clip,
|
|
||||||
file_annotation_to_clip_annotation,
|
|
||||||
list_file_annotations,
|
|
||||||
load_file_annotation,
|
|
||||||
)
|
|
||||||
from batdetect2.data.annotations.types import AnnotatedDataset
|
|
||||||
|
|
||||||
PathLike = Union[Path, str, os.PathLike]
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"load_batdetect2_files_annotated_dataset",
|
|
||||||
"BatDetect2FilesAnnotations",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class BatDetect2FilesAnnotations(AnnotatedDataset):
|
|
||||||
format: Literal["batdetect2"] = "batdetect2"
|
|
||||||
annotations_dir: Path
|
|
||||||
|
|
||||||
|
|
||||||
def load_batdetect2_files_annotated_dataset(
|
|
||||||
dataset: BatDetect2FilesAnnotations,
|
|
||||||
base_dir: Optional[PathLike] = None,
|
|
||||||
) -> data.AnnotationProject:
|
|
||||||
"""Convert annotations to annotation project."""
|
|
||||||
audio_dir = dataset.audio_dir
|
|
||||||
path = dataset.annotations_dir
|
|
||||||
|
|
||||||
if base_dir:
|
|
||||||
audio_dir = base_dir / audio_dir
|
|
||||||
path = base_dir / path
|
|
||||||
|
|
||||||
paths = list_file_annotations(path)
|
|
||||||
|
|
||||||
annotations = []
|
|
||||||
tasks = []
|
|
||||||
|
|
||||||
for p in paths:
|
|
||||||
try:
|
|
||||||
file_annotation = load_file_annotation(p)
|
|
||||||
except FileNotFoundError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
clip = file_annotation_to_clip(
|
|
||||||
file_annotation,
|
|
||||||
audio_dir=audio_dir,
|
|
||||||
)
|
|
||||||
except FileNotFoundError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
annotations.append(
|
|
||||||
file_annotation_to_clip_annotation(
|
|
||||||
file_annotation,
|
|
||||||
clip,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
tasks.append(
|
|
||||||
file_annotation_to_annotation_task(
|
|
||||||
file_annotation,
|
|
||||||
clip,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return data.AnnotationProject(
|
|
||||||
name=dataset.name,
|
|
||||||
description=dataset.description,
|
|
||||||
clip_annotations=annotations,
|
|
||||||
tasks=tasks,
|
|
||||||
)
|
|
@ -1,64 +0,0 @@
|
|||||||
import json
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Literal, Optional, Union
|
|
||||||
|
|
||||||
from soundevent import data
|
|
||||||
|
|
||||||
from batdetect2.data.annotations.legacy import (
|
|
||||||
FileAnnotation,
|
|
||||||
file_annotation_to_annotation_task,
|
|
||||||
file_annotation_to_clip,
|
|
||||||
file_annotation_to_clip_annotation,
|
|
||||||
)
|
|
||||||
from batdetect2.data.annotations.types import AnnotatedDataset
|
|
||||||
|
|
||||||
PathLike = Union[Path, str, os.PathLike]
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"BatDetect2MergedAnnotations",
|
|
||||||
"load_batdetect2_merged_annotated_dataset",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class BatDetect2MergedAnnotations(AnnotatedDataset):
|
|
||||||
format: Literal["batdetect2_file"] = "batdetect2_file"
|
|
||||||
annotations_path: Path
|
|
||||||
|
|
||||||
|
|
||||||
def load_batdetect2_merged_annotated_dataset(
|
|
||||||
dataset: BatDetect2MergedAnnotations,
|
|
||||||
base_dir: Optional[PathLike] = None,
|
|
||||||
) -> data.AnnotationProject:
|
|
||||||
audio_dir = dataset.audio_dir
|
|
||||||
path = dataset.annotations_path
|
|
||||||
|
|
||||||
if base_dir:
|
|
||||||
audio_dir = base_dir / audio_dir
|
|
||||||
path = base_dir / path
|
|
||||||
|
|
||||||
content = json.loads(Path(path).read_text())
|
|
||||||
|
|
||||||
annotations = []
|
|
||||||
tasks = []
|
|
||||||
|
|
||||||
for ann in content:
|
|
||||||
try:
|
|
||||||
ann = FileAnnotation.model_validate(ann)
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
clip = file_annotation_to_clip(ann, audio_dir=audio_dir)
|
|
||||||
except FileNotFoundError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
annotations.append(file_annotation_to_clip_annotation(ann, clip))
|
|
||||||
tasks.append(file_annotation_to_annotation_task(ann, clip))
|
|
||||||
|
|
||||||
return data.AnnotationProject(
|
|
||||||
name=dataset.name,
|
|
||||||
description=dataset.description,
|
|
||||||
clip_annotations=annotations,
|
|
||||||
tasks=tasks,
|
|
||||||
)
|
|
Loading…
Reference in New Issue
Block a user