Added docstrings for the batdetect2 legacy annotation format

2025-06-29 22:51:58 +02:00 · 2025-04-18 15:14:48 +01:00 · 2025-04-18 15:14:48 +01:00 · bf14f4d37e
commit bf14f4d37e
parent b78e5a3a2f
4 changed files with 298 additions and 151 deletions
--- a/batdetect2/data/annotations/init.py
+++ b/batdetect2/data/annotations/init.py
@ -22,23 +22,23 @@ from batdetect2.data.annotations.aoef import (
    AOEFAnnotations,
    load_aoef_annotated_dataset,
 )
-from batdetect2.data.annotations.batdetect2_files import (
+from batdetect2.data.annotations.batdetect2 import (
+    AnnotationFilter,
    BatDetect2FilesAnnotations,
-    load_batdetect2_files_annotated_dataset,
-)
-from batdetect2.data.annotations.batdetect2_merged import (
    BatDetect2MergedAnnotations,
+    load_batdetect2_files_annotated_dataset,
    load_batdetect2_merged_annotated_dataset,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset

 __all__ = [
-    "load_annotated_dataset",
-    "AnnotatedDataset",
    "AOEFAnnotations",
+    "AnnotatedDataset",
+    "AnnotationFilter",
+    "AnnotationFormats",
    "BatDetect2FilesAnnotations",
    "BatDetect2MergedAnnotations",
-    "AnnotationFormats",
+    "load_annotated_dataset",
 ]


--- a/batdetect2/data/annotations/batdetect2.py
+++ b/batdetect2/data/annotations/batdetect2.py
@ -0,0 +1,291 @@
+"""Loads annotation data from legacy BatDetect2 JSON formats.
+
+This module provides backward compatibility for loading annotation data stored
+in two related formats used by older BatDetect2 tools:
+
+1.  **`batdetect2` format** (Directory-based): Annotations are stored in
+    individual JSON files (one per audio recording) within a specified
+    directory.
+    Each JSON file contains a `FileAnnotation` structure. Loaded via
+    `load_batdetect2_files_annotated_dataset` defined by
+    `BatDetect2FilesAnnotations`.
+2.  **`batdetect2_file` format** (Single-file): Annotations for multiple
+    recordings are merged into a single JSON file, containing a list of
+    `FileAnnotation` objects. Loaded via
+    `load_batdetect2_merged_annotated_dataset` defined by
+    `BatDetect2MergedAnnotations`.
+
+Both formats use the same internal structure for annotations per file and
+support filtering based on `annotated` and `issues` flags within that
+structure.
+
+The loading functions convert data from these legacy formats into the modern
+`soundevent` data model (primarily `ClipAnnotation`) and return the results
+aggregated into a `soundevent.data.AnnotationSet`.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Literal, Optional, Union
+
+from pydantic import Field
+from soundevent import data
+
+from batdetect2.configs import BaseConfig
+from batdetect2.data.annotations.legacy import (
+    FileAnnotation,
+    file_annotation_to_clip,
+    file_annotation_to_clip_annotation,
+    list_file_annotations,
+    load_file_annotation,
+)
+from batdetect2.data.annotations.types import AnnotatedDataset
+
+PathLike = Union[Path, str, os.PathLike]
+
+
+__all__ = [
+    "load_batdetect2_files_annotated_dataset",
+    "load_batdetect2_merged_annotated_dataset",
+    "BatDetect2FilesAnnotations",
+    "BatDetect2MergedAnnotations",
+    "AnnotationFilter",
+]
+
+
+class AnnotationFilter(BaseConfig):
+    """Configuration for filtering legacy FileAnnotations based on flags.
+
+    Specifies criteria based on boolean flags (`annotated` and `issues`)
+    present within the legacy `FileAnnotation` JSON structure to select which
+    entries (either files or records within a merged file) should be loaded and
+    converted.
+
+    Attributes
+    ----------
+    only_annotated : bool, default=True
+        If True, only process entries where the `annotated` flag in the JSON
+        is set to `True`.
+    exclude_issues : bool, default=True
+        If True, skip processing entries where the `issues` flag in the JSON
+        is set to `True`.
+    """
+
+    only_annotated: bool = True
+    exclude_issues: bool = True
+
+
+class BatDetect2FilesAnnotations(AnnotatedDataset):
+    """Configuration for the legacy 'batdetect2' format (directory-based).
+
+    Defines a data source where annotations are stored as individual JSON files
+    (one per recording, containing a `FileAnnotation` structure) within the
+    `annotations_dir`. Requires a corresponding `audio_dir`. Assumes a naming
+    convention links audio files to JSON files
+    (e.g., `rec.wav` -> `rec.wav.json`).
+
+    Attributes
+    ----------
+    format : Literal["batdetect2"]
+        The fixed format identifier for this configuration type.
+    annotations_dir : Path
+        Path to the directory containing the individual JSON annotation files.
+    filter : AnnotationFilter, optional
+        Configuration for filtering which files to process based on their
+        `annotated` and `issues` flags. Defaults to requiring `annotated=True`
+        and `issues=False`. Set explicitly to `None` in config (e.g.,
+        `filter: null`) to disable filtering.
+    """
+
+    format: Literal["batdetect2"] = "batdetect2"
+    annotations_dir: Path
+
+    filter: AnnotationFilter = Field(
+        default_factory=AnnotationFilter,
+    )
+
+
+class BatDetect2MergedAnnotations(AnnotatedDataset):
+    """Configuration for the legacy 'batdetect2_file' format (merged file).
+
+    Defines a data source where annotations for multiple recordings (each as a
+    `FileAnnotation` structure) are stored within a single JSON file specified
+    by `annotations_path`. Audio files are expected in `audio_dir`.
+
+    Inherits `name`, `description`, and `audio_dir` from `AnnotatedDataset`.
+
+    Attributes
+    ----------
+    format : Literal["batdetect2_file"]
+        The fixed format identifier for this configuration type.
+    annotations_path : Path
+        Path to the single JSON file containing a list of `FileAnnotation`
+        objects.
+    filter : AnnotationFilter, optional
+        Configuration for filtering which `FileAnnotation` entries within the
+        merged file to process based on their `annotated` and `issues` flags.
+        Defaults to requiring `annotated=True` and `issues=False`. Set to `None`
+        in config (e.g., `filter: null`) to disable filtering.
+    """
+
+    format: Literal["batdetect2_file"] = "batdetect2_file"
+    annotations_path: Path
+
+    filter: AnnotationFilter = Field(
+        default_factory=AnnotationFilter,
+    )
+
+
+def load_batdetect2_files_annotated_dataset(
+    dataset: BatDetect2FilesAnnotations,
+    base_dir: Optional[PathLike] = None,
+) -> data.AnnotationSet:
+    """Load and convert 'batdetect2_file' annotations into an AnnotationSet.
+
+    Scans the specified `annotations_dir` for individual JSON annotation files.
+    For each file: loads the legacy `FileAnnotation`, applies filtering based
+    on `dataset.filter` (`annotated`/`issues` flags), attempts to find the
+    corresponding audio file, converts valid entries to `ClipAnnotation`, and
+    collects them into a single `soundevent.data.AnnotationSet`.
+
+    Parameters
+    ----------
+    dataset : BatDetect2FilesAnnotations
+        Configuration describing the 'batdetect2' (directory) data source.
+    base_dir : PathLike, optional
+        Optional base directory to resolve relative paths in `dataset.audio_dir`
+        and `dataset.annotations_dir`. Defaults to None.
+
+    Returns
+    -------
+    soundevent.data.AnnotationSet
+        An AnnotationSet containing all successfully loaded, filtered, and
+        converted `ClipAnnotation` objects.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the `annotations_dir` or `audio_dir` does not exist. Errors finding
+        individual JSON or audio files during iteration are logged and skipped.
+    """
+    audio_dir = dataset.audio_dir
+    path = dataset.annotations_dir
+
+    if base_dir:
+        audio_dir = base_dir / audio_dir
+        path = base_dir / path
+
+    paths = list_file_annotations(path)
+
+    annotations = []
+
+    for p in paths:
+        try:
+            file_annotation = load_file_annotation(p)
+        except FileNotFoundError:
+            continue
+
+        if dataset.filter.only_annotated and not file_annotation.annotated:
+            continue
+
+        if dataset.filter.exclude_issues and file_annotation.issues:
+            continue
+
+        try:
+            clip = file_annotation_to_clip(
+                file_annotation,
+                audio_dir=audio_dir,
+            )
+        except FileNotFoundError:
+            continue
+
+        annotations.append(
+            file_annotation_to_clip_annotation(
+                file_annotation,
+                clip,
+            )
+        )
+
+    return data.AnnotationSet(
+        name=dataset.name,
+        description=dataset.description,
+        clip_annotations=annotations,
+    )
+
+
+def load_batdetect2_merged_annotated_dataset(
+    dataset: BatDetect2MergedAnnotations,
+    base_dir: Optional[PathLike] = None,
+) -> data.AnnotationSet:
+    """Load and convert 'batdetect2_merged' annotations into an AnnotationSet.
+
+    Loads a single JSON file containing a list of legacy `FileAnnotation`
+    objects. For each entry in the list: applies filtering based on
+    `dataset.filter` (`annotated`/`issues` flags), attempts to find the
+    corresponding audio file, converts valid entries to `ClipAnnotation`, and
+    collects them into a single `soundevent.data.AnnotationSet`.
+
+    Parameters
+    ----------
+    dataset : BatDetect2MergedAnnotations
+        Configuration describing the 'batdetect2_file' (merged) data source.
+    base_dir : PathLike, optional
+        Optional base directory to resolve relative paths in `dataset.audio_dir`
+        and `dataset.annotations_path`. Defaults to None.
+
+    Returns
+    -------
+    soundevent.data.AnnotationSet
+        An AnnotationSet containing all successfully loaded, filtered, and
+        converted `ClipAnnotation` objects from the merged file.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the `annotations_path` or `audio_dir` does not exist. Errors
+        finding individual audio files referenced within the JSON are logged
+        and skipped.
+    json.JSONDecodeError
+        If the annotations file is not valid JSON.
+    TypeError
+        If the root JSON structure is not a list.
+    pydantic.ValidationError
+        If entries within the JSON list do not conform to the legacy
+        `FileAnnotation` structure.
+    """
+    audio_dir = dataset.audio_dir
+    path = dataset.annotations_path
+
+    if base_dir:
+        audio_dir = base_dir / audio_dir
+        path = base_dir / path
+
+    content = json.loads(Path(path).read_text())
+
+    annotations = []
+
+    for ann in content:
+        try:
+            ann = FileAnnotation.model_validate(ann)
+        except ValueError:
+            continue
+
+        if dataset.filter.only_annotated and not ann.annotated:
+            continue
+
+        if dataset.filter.exclude_issues and ann.issues:
+            continue
+
+        try:
+            clip = file_annotation_to_clip(ann, audio_dir=audio_dir)
+        except FileNotFoundError:
+            continue
+
+        annotations.append(file_annotation_to_clip_annotation(ann, clip))
+
+    return data.AnnotationSet(
+        name=dataset.name,
+        description=dataset.description,
+        clip_annotations=annotations,
+    )
--- a/batdetect2/data/annotations/batdetect2_files.py
+++ b/batdetect2/data/annotations/batdetect2_files.py
@ -1,80 +0,0 @@
-import os
-from pathlib import Path
-from typing import Literal, Optional, Union
-
-from soundevent import data
-
-from batdetect2.data.annotations.legacy import (
-    file_annotation_to_annotation_task,
-    file_annotation_to_clip,
-    file_annotation_to_clip_annotation,
-    list_file_annotations,
-    load_file_annotation,
-)
-from batdetect2.data.annotations.types import AnnotatedDataset
-
-PathLike = Union[Path, str, os.PathLike]
-
-
-__all__ = [
-    "load_batdetect2_files_annotated_dataset",
-    "BatDetect2FilesAnnotations",
-]
-
-
-class BatDetect2FilesAnnotations(AnnotatedDataset):
-    format: Literal["batdetect2"] = "batdetect2"
-    annotations_dir: Path
-
-
-def load_batdetect2_files_annotated_dataset(
-    dataset: BatDetect2FilesAnnotations,
-    base_dir: Optional[PathLike] = None,
-) -> data.AnnotationProject:
-    """Convert annotations to annotation project."""
-    audio_dir = dataset.audio_dir
-    path = dataset.annotations_dir
-
-    if base_dir:
-        audio_dir = base_dir / audio_dir
-        path = base_dir / path
-
-    paths = list_file_annotations(path)
-
-    annotations = []
-    tasks = []
-
-    for p in paths:
-        try:
-            file_annotation = load_file_annotation(p)
-        except FileNotFoundError:
-            continue
-
-        try:
-            clip = file_annotation_to_clip(
-                file_annotation,
-                audio_dir=audio_dir,
-            )
-        except FileNotFoundError:
-            continue
-
-        annotations.append(
-            file_annotation_to_clip_annotation(
-                file_annotation,
-                clip,
-            )
-        )
-
-        tasks.append(
-            file_annotation_to_annotation_task(
-                file_annotation,
-                clip,
-            )
-        )
-
-    return data.AnnotationProject(
-        name=dataset.name,
-        description=dataset.description,
-        clip_annotations=annotations,
-        tasks=tasks,
-    )
--- a/batdetect2/data/annotations/batdetect2_merged.py
+++ b/batdetect2/data/annotations/batdetect2_merged.py
@ -1,64 +0,0 @@
-import json
-import os
-from pathlib import Path
-from typing import Literal, Optional, Union
-
-from soundevent import data
-
-from batdetect2.data.annotations.legacy import (
-    FileAnnotation,
-    file_annotation_to_annotation_task,
-    file_annotation_to_clip,
-    file_annotation_to_clip_annotation,
-)
-from batdetect2.data.annotations.types import AnnotatedDataset
-
-PathLike = Union[Path, str, os.PathLike]
-
-__all__ = [
-    "BatDetect2MergedAnnotations",
-    "load_batdetect2_merged_annotated_dataset",
-]
-
-
-class BatDetect2MergedAnnotations(AnnotatedDataset):
-    format: Literal["batdetect2_file"] = "batdetect2_file"
-    annotations_path: Path
-
-
-def load_batdetect2_merged_annotated_dataset(
-    dataset: BatDetect2MergedAnnotations,
-    base_dir: Optional[PathLike] = None,
-) -> data.AnnotationProject:
-    audio_dir = dataset.audio_dir
-    path = dataset.annotations_path
-
-    if base_dir:
-        audio_dir = base_dir / audio_dir
-        path = base_dir / path
-
-    content = json.loads(Path(path).read_text())
-
-    annotations = []
-    tasks = []
-
-    for ann in content:
-        try:
-            ann = FileAnnotation.model_validate(ann)
-        except ValueError:
-            continue
-
-        try:
-            clip = file_annotation_to_clip(ann, audio_dir=audio_dir)
-        except FileNotFoundError:
-            continue
-
-        annotations.append(file_annotation_to_clip_annotation(ann, clip))
-        tasks.append(file_annotation_to_annotation_task(ann, clip))
-
-    return data.AnnotationProject(
-        name=dataset.name,
-        description=dataset.description,
-        clip_annotations=annotations,
-        tasks=tasks,
-    )