Added docstrings for the batdetect2 legacy annotation format

2025-06-29 22:51:58 +02:00 · 2025-04-18 15:14:48 +01:00 · 2025-04-18 15:14:48 +01:00 · bf14f4d37e
commit bf14f4d37e
parent b78e5a3a2f
4 changed files with 298 additions and 151 deletions
--- a/batdetect2/data/annotations/__init__.py
+++ b/batdetect2/data/annotations/__init__.py
@ -22,23 +22,23 @@ from batdetect2.data.annotations.aoef import (
    AOEFAnnotations,
    load_aoef_annotated_dataset,
 )
-from batdetect2.data.annotations.batdetect2_files import (
+from batdetect2.data.annotations.batdetect2 import (
    AnnotationFilter,
    BatDetect2FilesAnnotations,
    load_batdetect2_files_annotated_dataset,
 )
 from batdetect2.data.annotations.batdetect2_merged import (
    BatDetect2MergedAnnotations,
    load_batdetect2_files_annotated_dataset,
    load_batdetect2_merged_annotated_dataset,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset
 __all__ = [
    "load_annotated_dataset",
    "AnnotatedDataset",
    "AOEFAnnotations",
    "AnnotatedDataset",
    "AnnotationFilter",
    "AnnotationFormats",
    "BatDetect2FilesAnnotations",
    "BatDetect2MergedAnnotations",
-    "AnnotationFormats",
+    "load_annotated_dataset",
 ]
--- a/batdetect2/data/annotations/batdetect2.py
+++ b/batdetect2/data/annotations/batdetect2.py
@ -0,0 +1,291 @@
 """Loads annotation data from legacy BatDetect2 JSON formats.
 This module provides backward compatibility for loading annotation data stored
 in two related formats used by older BatDetect2 tools:
 1.  **`batdetect2` format** (Directory-based): Annotations are stored in
    individual JSON files (one per audio recording) within a specified
    directory.
    Each JSON file contains a `FileAnnotation` structure. Loaded via
    `load_batdetect2_files_annotated_dataset` and configured by
    `BatDetect2FilesAnnotations`.
 2.  **`batdetect2_file` format** (Single-file): Annotations for multiple
    recordings are merged into a single JSON file, containing a list of
    `FileAnnotation` objects. Loaded via
    `load_batdetect2_merged_annotated_dataset` and configured by
    `BatDetect2MergedAnnotations`.
 Both formats use the same internal structure for annotations per file and
 support filtering based on `annotated` and `issues` flags within that
 structure.
 The loading functions convert data from these legacy formats into the modern
 `soundevent` data model (primarily `ClipAnnotation`) and return the results
 aggregated into a `soundevent.data.AnnotationSet`.
 """
 import json
 import os
 from pathlib import Path
 from typing import Literal, Optional, Union
 from pydantic import Field
 from soundevent import data
 from batdetect2.configs import BaseConfig
 from batdetect2.data.annotations.legacy import (
    FileAnnotation,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    list_file_annotations,
    load_file_annotation,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset
 # Path-like values accepted for the `base_dir` argument of the loaders below.
 PathLike = Union[Path, str, os.PathLike]
 # Public API of this module: the two loader functions, their dataset
 # configuration classes, and the shared filter configuration.
 __all__ = [
    "load_batdetect2_files_annotated_dataset",
    "load_batdetect2_merged_annotated_dataset",
    "BatDetect2FilesAnnotations",
    "BatDetect2MergedAnnotations",
    "AnnotationFilter",
 ]
 class AnnotationFilter(BaseConfig):
    """Configuration for filtering legacy FileAnnotations based on flags.

    Specifies criteria based on the boolean flags (`annotated` and `issues`)
    carried by each legacy `FileAnnotation` to select which entries (either
    individual files or records within a merged file) are loaded and
    converted. Both criteria are applied independently by the loaders in this
    module; an entry is dropped as soon as one of them rejects it.

    Attributes
    ----------
    only_annotated : bool, default=True
        If True, entries whose `annotated` flag is falsy are skipped, so only
        entries marked as annotated are processed.
    exclude_issues : bool, default=True
        If True, entries whose `issues` flag is truthy are skipped.
    """

    only_annotated: bool = True
    exclude_issues: bool = True
 class BatDetect2FilesAnnotations(AnnotatedDataset):
    """Configuration for the legacy 'batdetect2' format (directory-based).

    Defines a data source where annotations are stored as individual JSON files
    (one per recording, containing a `FileAnnotation` structure) within the
    `annotations_dir`. Requires a corresponding `audio_dir`. A naming
    convention linking audio files to JSON files is assumed
    (e.g., `rec.wav` -> `rec.wav.json`).

    Attributes
    ----------
    format : Literal["batdetect2"]
        The fixed format identifier for this configuration type.
    annotations_dir : Path
        Path to the directory containing the individual JSON annotation files.
    filter : AnnotationFilter
        Configuration for filtering which files to process based on their
        `annotated` and `issues` flags. Defaults to `AnnotationFilter()`, i.e.
        requiring `annotated=True` and `issues=False`. The field is typed as
        `AnnotationFilter` (not `Optional`), so `filter: null` is not a valid
        value as declared; to disable filtering, set both
        `only_annotated` and `exclude_issues` to `false`.
        NOTE(review): if `filter: null` is meant to be supported, the
        annotation must be widened to `Optional[AnnotationFilter]` and the
        code reading `filter` must handle `None`.
    """

    format: Literal["batdetect2"] = "batdetect2"
    annotations_dir: Path
    filter: AnnotationFilter = Field(
        default_factory=AnnotationFilter,
    )
 class BatDetect2MergedAnnotations(AnnotatedDataset):
    """Configuration for the legacy 'batdetect2_file' format (merged file).

    Defines a data source where annotations for multiple recordings (each as a
    `FileAnnotation` structure) are stored within a single JSON file specified
    by `annotations_path`. Audio files are expected in `audio_dir`.
    The remaining fields (such as `name`, `description`, and `audio_dir`) come
    from `AnnotatedDataset`.

    Attributes
    ----------
    format : Literal["batdetect2_file"]
        The fixed format identifier for this configuration type.
    annotations_path : Path
        Path to the single JSON file containing a list of `FileAnnotation`
        objects.
    filter : AnnotationFilter
        Configuration for filtering which `FileAnnotation` entries within the
        merged file to process based on their `annotated` and `issues` flags.
        Defaults to `AnnotationFilter()`, i.e. requiring `annotated=True` and
        `issues=False`. The field is typed as `AnnotationFilter` (not
        `Optional`), so `filter: null` is not a valid value as declared; to
        disable filtering, set both `only_annotated` and `exclude_issues` to
        `false`.
        NOTE(review): if `filter: null` is meant to be supported, the
        annotation must be widened to `Optional[AnnotationFilter]` and the
        code reading `filter` must handle `None`.
    """

    format: Literal["batdetect2_file"] = "batdetect2_file"
    annotations_path: Path
    filter: AnnotationFilter = Field(
        default_factory=AnnotationFilter,
    )
 def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
 ) -> data.AnnotationSet:
    """Load and convert 'batdetect2' (directory) annotations.

    Lists the JSON annotation files under `dataset.annotations_dir` (via
    `list_file_annotations`). For each file: loads the legacy
    `FileAnnotation`, applies the `annotated`/`issues` filtering configured in
    `dataset.filter`, builds the clip with `file_annotation_to_clip`, converts
    the entry to a `ClipAnnotation`, and collects the results into a single
    `soundevent.data.AnnotationSet`.

    Parameters
    ----------
    dataset : BatDetect2FilesAnnotations
        Configuration describing the 'batdetect2' (directory) data source.
    base_dir : PathLike, optional
        Base directory prepended to `dataset.audio_dir` and
        `dataset.annotations_dir`. Ignored when None (or otherwise falsy).

    Returns
    -------
    soundevent.data.AnnotationSet
        An AnnotationSet named and described after `dataset`, containing all
        successfully loaded, filtered, and converted `ClipAnnotation` objects.

    Notes
    -----
    Entries for which `load_file_annotation` or `file_annotation_to_clip`
    raises `FileNotFoundError` are skipped and a warning is logged for each.
    Errors raised by `list_file_annotations` itself (presumably for a missing
    annotations directory -- confirm against its implementation) are not
    handled here and propagate to the caller.
    """
    # Local import: `logging` is not part of this module's import block.
    import logging

    logger = logging.getLogger(__name__)

    audio_dir = dataset.audio_dir
    path = dataset.annotations_dir
    if base_dir:
        # Normalise to Path so joining works for str / os.PathLike base dirs
        # regardless of the concrete type of the configured paths.
        root = Path(base_dir)
        audio_dir = root / audio_dir
        path = root / path

    # Resolve the filter flags once. A missing filter (None) is treated as
    # "no filtering" instead of failing on attribute access.
    annotation_filter = dataset.filter
    only_annotated = (
        annotation_filter is not None and annotation_filter.only_annotated
    )
    exclude_issues = (
        annotation_filter is not None and annotation_filter.exclude_issues
    )

    annotations = []
    for p in list_file_annotations(path):
        try:
            file_annotation = load_file_annotation(p)
        except FileNotFoundError:
            logger.warning("Annotation file not found, skipping: %s", p)
            continue

        if only_annotated and not file_annotation.annotated:
            continue
        if exclude_issues and file_annotation.issues:
            continue

        try:
            clip = file_annotation_to_clip(
                file_annotation,
                audio_dir=audio_dir,
            )
        except FileNotFoundError:
            logger.warning(
                "Audio file for annotation %s not found in %s, skipping",
                p,
                audio_dir,
            )
            continue

        annotations.append(
            file_annotation_to_clip_annotation(
                file_annotation,
                clip,
            )
        )

    return data.AnnotationSet(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
    )
 def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
 ) -> data.AnnotationSet:
    """Load and convert 'batdetect2_file' (merged) annotations.

    Reads the single JSON file at `dataset.annotations_path`, which must
    contain a list of legacy `FileAnnotation` records. For each record:
    validates it, applies the `annotated`/`issues` filtering configured in
    `dataset.filter`, builds the clip with `file_annotation_to_clip`, converts
    the record to a `ClipAnnotation`, and collects the results into a single
    `soundevent.data.AnnotationSet`.

    Parameters
    ----------
    dataset : BatDetect2MergedAnnotations
        Configuration describing the 'batdetect2_file' (merged) data source.
    base_dir : PathLike, optional
        Base directory prepended to `dataset.audio_dir` and
        `dataset.annotations_path`. Ignored when None (or otherwise falsy).

    Returns
    -------
    soundevent.data.AnnotationSet
        An AnnotationSet named and described after `dataset`, containing all
        successfully validated, filtered, and converted `ClipAnnotation`
        objects from the merged file.

    Raises
    ------
    FileNotFoundError
        If the annotations file does not exist.
    json.JSONDecodeError
        If the annotations file is not valid JSON.
    TypeError
        If the root JSON structure is not a list.

    Notes
    -----
    Records that fail `FileAnnotation.model_validate` with a `ValueError`
    (which is expected to cover pydantic validation errors) are skipped, as
    are records for which `file_annotation_to_clip` raises
    `FileNotFoundError`. A warning is logged for every skipped record.
    """
    # Local import: `logging` is not part of this module's import block.
    import logging

    logger = logging.getLogger(__name__)

    audio_dir = dataset.audio_dir
    path = dataset.annotations_path
    if base_dir:
        # Normalise to Path so joining works for str / os.PathLike base dirs
        # regardless of the concrete type of the configured paths.
        root = Path(base_dir)
        audio_dir = root / audio_dir
        path = root / path

    # Hand raw bytes to json.loads so the encoding is detected from the data
    # (UTF-8/16/32) rather than taken from the platform locale.
    content = json.loads(Path(path).read_bytes())
    if not isinstance(content, list):
        # Iterating e.g. a dict would silently yield its keys and produce an
        # empty result; fail loudly instead.
        raise TypeError(
            f"Expected a JSON list of file annotations in {path}, "
            f"got {type(content).__name__}"
        )

    # Resolve the filter flags once. A missing filter (None) is treated as
    # "no filtering" instead of failing on attribute access.
    annotation_filter = dataset.filter
    only_annotated = (
        annotation_filter is not None and annotation_filter.only_annotated
    )
    exclude_issues = (
        annotation_filter is not None and annotation_filter.exclude_issues
    )

    annotations = []
    for index, raw in enumerate(content):
        try:
            ann = FileAnnotation.model_validate(raw)
        except ValueError as err:
            logger.warning(
                "Skipping invalid annotation entry %d in %s: %s",
                index,
                path,
                err,
            )
            continue

        if only_annotated and not ann.annotated:
            continue
        if exclude_issues and ann.issues:
            continue

        try:
            clip = file_annotation_to_clip(ann, audio_dir=audio_dir)
        except FileNotFoundError:
            logger.warning(
                "Audio file for annotation entry %d not found in %s, skipping",
                index,
                audio_dir,
            )
            continue

        annotations.append(file_annotation_to_clip_annotation(ann, clip))

    return data.AnnotationSet(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=annotations,
    )
--- a/batdetect2/data/annotations/batdetect2_files.py
+++ b/batdetect2/data/annotations/batdetect2_files.py
@ -1,80 +0,0 @@
 import os
 from pathlib import Path
 from typing import Literal, Optional, Union
 from soundevent import data
 from batdetect2.data.annotations.legacy import (
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
    list_file_annotations,
    load_file_annotation,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset
 # Path-like values accepted for the loader's `base_dir` argument.
 PathLike = Union[Path, str, os.PathLike]
 # Public API of this module: the loader and its dataset configuration class.
 __all__ = [
    "load_batdetect2_files_annotated_dataset",
    "BatDetect2FilesAnnotations",
 ]
 class BatDetect2FilesAnnotations(AnnotatedDataset):
    """Dataset configuration for a directory of 'batdetect2' annotation files."""

    # Fixed format identifier for this configuration type.
    format: Literal["batdetect2"] = "batdetect2"
    # Directory whose annotation files are listed by the loader.
    annotations_dir: Path
 def load_batdetect2_files_annotated_dataset(
    dataset: BatDetect2FilesAnnotations,
    base_dir: Optional[PathLike] = None,
 ) -> data.AnnotationProject:
    """Build an annotation project from a directory of annotation files.

    Every file listed by `list_file_annotations` under
    `dataset.annotations_dir` is loaded and turned into one clip annotation
    and one annotation task. Files for which loading the annotation or
    building the clip raises `FileNotFoundError` are skipped.

    Parameters
    ----------
    dataset : BatDetect2FilesAnnotations
        Configuration of the directory-based data source.
    base_dir : PathLike, optional
        When truthy, prepended to `dataset.audio_dir` and
        `dataset.annotations_dir`.

    Returns
    -------
    soundevent.data.AnnotationProject
        Project named and described after `dataset`, holding the collected
        clip annotations and tasks.
    """
    recordings_root = dataset.audio_dir
    annotations_root = dataset.annotations_dir
    if base_dir:
        recordings_root = base_dir / recordings_root
        annotations_root = base_dir / annotations_root

    clip_annotations = []
    annotation_tasks = []
    for annotation_path in list_file_annotations(annotations_root):
        # Both steps signal a missing file the same way, and either one
        # simply drops the entry.
        try:
            legacy = load_file_annotation(annotation_path)
            clip = file_annotation_to_clip(legacy, audio_dir=recordings_root)
        except FileNotFoundError:
            continue

        clip_annotation = file_annotation_to_clip_annotation(legacy, clip)
        clip_annotations.append(clip_annotation)
        task = file_annotation_to_annotation_task(legacy, clip)
        annotation_tasks.append(task)

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
        tasks=annotation_tasks,
    )
--- a/batdetect2/data/annotations/batdetect2_merged.py
+++ b/batdetect2/data/annotations/batdetect2_merged.py
@ -1,64 +0,0 @@
 import json
 import os
 from pathlib import Path
 from typing import Literal, Optional, Union
 from soundevent import data
 from batdetect2.data.annotations.legacy import (
    FileAnnotation,
    file_annotation_to_annotation_task,
    file_annotation_to_clip,
    file_annotation_to_clip_annotation,
 )
 from batdetect2.data.annotations.types import AnnotatedDataset
 # Path-like values accepted for the loader's `base_dir` argument.
 PathLike = Union[Path, str, os.PathLike]
 # Public API of this module: the dataset configuration class and its loader.
 __all__ = [
    "BatDetect2MergedAnnotations",
    "load_batdetect2_merged_annotated_dataset",
 ]
 class BatDetect2MergedAnnotations(AnnotatedDataset):
    """Dataset configuration for a single merged 'batdetect2_file' JSON file."""

    # Fixed format identifier for this configuration type.
    format: Literal["batdetect2_file"] = "batdetect2_file"
    # JSON file read by the loader; it iterates over the decoded content.
    annotations_path: Path
 def load_batdetect2_merged_annotated_dataset(
    dataset: BatDetect2MergedAnnotations,
    base_dir: Optional[PathLike] = None,
 ) -> data.AnnotationProject:
    """Build an annotation project from a single merged JSON file.

    The file at `dataset.annotations_path` is decoded as JSON and iterated.
    Each item is validated as a `FileAnnotation` and turned into one clip
    annotation and one annotation task. Items that fail validation with a
    `ValueError`, or whose clip cannot be built because of a
    `FileNotFoundError`, are skipped.

    Parameters
    ----------
    dataset : BatDetect2MergedAnnotations
        Configuration of the merged-file data source.
    base_dir : PathLike, optional
        When truthy, prepended to `dataset.audio_dir` and
        `dataset.annotations_path`.

    Returns
    -------
    soundevent.data.AnnotationProject
        Project named and described after `dataset`, holding the collected
        clip annotations and tasks.
    """
    recordings_root = dataset.audio_dir
    merged_file = dataset.annotations_path
    if base_dir:
        recordings_root = base_dir / recordings_root
        merged_file = base_dir / merged_file

    raw_text = Path(merged_file).read_text()
    records = json.loads(raw_text)

    clip_annotations = []
    annotation_tasks = []
    for record in records:
        # Validation and clip construction are guarded separately because
        # each one tolerates a different exception type.
        try:
            legacy = FileAnnotation.model_validate(record)
        except ValueError:
            continue

        try:
            clip = file_annotation_to_clip(legacy, audio_dir=recordings_root)
        except FileNotFoundError:
            continue

        clip_annotations.append(
            file_annotation_to_clip_annotation(legacy, clip)
        )
        annotation_tasks.append(
            file_annotation_to_annotation_task(legacy, clip)
        )

    return data.AnnotationProject(
        name=dataset.name,
        description=dataset.description,
        clip_annotations=clip_annotations,
        tasks=annotation_tasks,
    )