From f353aaa08cffed3c7c3d6559d7973f80c86abfd6 Mon Sep 17 00:00:00 2001
From: mbsantiago <santiago.mbal@gmail.com>
Date: Fri, 18 Apr 2025 18:39:58 +0100
Subject: [PATCH] Added unit tests to legacy annotation loader

---
 batdetect2/data/annotations/batdetect2.py     |  33 +-
 .../test_annotations/test_batdetect2.py       | 611 ++++++++++++++++++
 2 files changed, 636 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_data/test_annotations/test_batdetect2.py

diff --git a/batdetect2/data/annotations/batdetect2.py b/batdetect2/data/annotations/batdetect2.py
index 5362ac7..055d84b 100644
--- a/batdetect2/data/annotations/batdetect2.py
+++ b/batdetect2/data/annotations/batdetect2.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 from typing import Literal, Optional, Union
 
-from pydantic import Field
+from pydantic import Field, ValidationError
 from soundevent import data
 
 from batdetect2.configs import BaseConfig
@@ -101,7 +101,7 @@ class BatDetect2FilesAnnotations(AnnotatedDataset):
     format: Literal["batdetect2"] = "batdetect2"
     annotations_dir: Path
 
-    filter: AnnotationFilter = Field(
+    filter: Optional[AnnotationFilter] = Field(
         default_factory=AnnotationFilter,
     )
 
@@ -132,7 +132,7 @@ class BatDetect2MergedAnnotations(AnnotatedDataset):
     format: Literal["batdetect2_file"] = "batdetect2_file"
     annotations_path: Path
 
-    filter: AnnotationFilter = Field(
+    filter: Optional[AnnotationFilter] = Field(
         default_factory=AnnotationFilter,
     )
 
@@ -183,13 +183,21 @@ def load_batdetect2_files_annotated_dataset(
     for p in paths:
         try:
             file_annotation = load_file_annotation(p)
-        except FileNotFoundError:
+        except (FileNotFoundError, ValidationError):
             continue
 
-        if dataset.filter.only_annotated and not file_annotation.annotated:
+        if (
+            dataset.filter
+            and dataset.filter.only_annotated
+            and not file_annotation.annotated
+        ):
             continue
 
-        if dataset.filter.exclude_issues and file_annotation.issues:
+        if (
+            dataset.filter
+            and dataset.filter.exclude_issues
+            and file_annotation.issues
+        ):
             continue
 
         try:
@@ -263,6 +271,11 @@ def load_batdetect2_merged_annotated_dataset(
 
     content = json.loads(Path(path).read_text())
 
+    if not isinstance(content, list):
+        raise TypeError(
+            f"Expected a list of FileAnnotations, but got {type(content)}",
+        )
+
     annotations = []
 
     for ann in content:
@@ -271,10 +284,14 @@ def load_batdetect2_merged_annotated_dataset(
         except ValueError:
             continue
 
-        if dataset.filter.only_annotated and not ann.annotated:
+        if (
+            dataset.filter
+            and dataset.filter.only_annotated
+            and not ann.annotated
+        ):
             continue
 
-        if dataset.filter.exclude_issues and ann.issues:
+        if dataset.filter and dataset.filter.exclude_issues and ann.issues:
             continue
 
         try:
diff --git a/tests/test_data/test_annotations/test_batdetect2.py b/tests/test_data/test_annotations/test_batdetect2.py
new file mode 100644
index 0000000..8e4beb9
--- /dev/null
+++ b/tests/test_data/test_annotations/test_batdetect2.py
@@ -0,0 +1,611 @@
+import json
+import uuid
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import pytest
+from soundevent import data
+
+from batdetect2.data.annotations.batdetect2 import (
+    AnnotationFilter,
+    BatDetect2FilesAnnotations,
+    BatDetect2MergedAnnotations,
+    load_batdetect2_files_annotated_dataset,
+    load_batdetect2_merged_annotated_dataset,
+)
+
+
+def create_legacy_file_annotation(
+    file_id: str,
+    duration: float = 5.0,
+    time_exp: float = 1.0,
+    class_name: str = "Myotis",
+    annotations: Optional[List[Dict[str, Any]]] = None,
+    annotated: bool = True,
+    issues: bool = False,
+    notes: str = "",
+) -> Dict[str, Any]:
+    if annotations is None:
+        annotations = [
+            {
+                "class": "Myotis",
+                "event": "Echolocation",
+                "individual": 0,
+                "start_time": 1.1,
+                "end_time": 1.2,
+                "low_freq": 30000,
+                "high_freq": 40000,
+            },
+            {
+                "class": "Pipistrellus",
+                "event": "Echolocation",
+                "individual": 0,
+                "start_time": 2.5,
+                "end_time": 2.55,
+                "low_freq": 50000,
+                "high_freq": 55000,
+            },
+        ]
+    return {
+        "id": file_id,
+        "duration": duration,
+        "time_exp": time_exp,
+        "class_name": class_name,
+        "annotation": annotations,
+        "annotated": annotated,
+        "issues": issues,
+        "notes": notes,
+    }
+
+
+@pytest.fixture
+def batdetect2_files_test_setup(
+    tmp_path: Path, wav_factory
+) -> Tuple[Path, Path, List[Dict[str, Any]]]:
+    """Sets up a directory structure for batdetect2 files format tests."""
+    audio_dir = tmp_path / "audio"
+    audio_dir.mkdir()
+    anns_dir = tmp_path / "anns"
+    anns_dir.mkdir()
+
+    files_data = []
+
+    # 1. File with single myotis annotation
+    rec1_path = wav_factory(path=audio_dir / "rec1.wav", duration=5.0)
+    ann1_data = create_legacy_file_annotation(
+        file_id="rec1.wav",
+        annotated=True,
+        issues=False,
+        notes="Standard notes.",
+        class_name="Myotis",
+        annotations=[
+            {
+                "class": "Myotis",
+                "event": "Echolocation",
+                "individual": 0,
+                "start_time": 1.1,
+                "end_time": 1.2,
+                "low_freq": 30000,
+                "high_freq": 40000,
+            }
+        ],
+    )
+    (anns_dir / f"{rec1_path.name}.json").write_text(json.dumps(ann1_data))
+    files_data.append(ann1_data)
+
+    # 2. File that has not been annotated
+    rec2_path = wav_factory(path=audio_dir / "rec2.wav", duration=4.0)
+    ann2_data = create_legacy_file_annotation(
+        file_id="rec2.wav",
+        annotated=False,
+        issues=False,
+        annotations=[],
+        class_name="Unknown",
+    )
+    (anns_dir / f"{rec2_path.name}.json").write_text(json.dumps(ann2_data))
+    files_data.append(ann2_data)
+
+    # 3. File that has been annotated but has issues
+    rec3_path = wav_factory(path=audio_dir / "rec3.wav", duration=6.0)
+    ann3_data = create_legacy_file_annotation(
+        file_id="rec3.wav",
+        annotated=True,
+        issues=True,
+        notes="File has issues.",
+        class_name="Pipistrellus",
+    )
+    (anns_dir / f"{rec3_path.name}.json").write_text(json.dumps(ann3_data))
+    files_data.append(ann3_data)
+
+    # 4. File that has been not been annotated and has issues
+    rec4_path = wav_factory(path=audio_dir / "rec4.wav", duration=3.0)
+    ann4_data = create_legacy_file_annotation(
+        file_id="rec4.wav", annotated=False, issues=True, class_name="Nyctalus"
+    )
+    (anns_dir / f"{rec4_path.name}.json").write_text(json.dumps(ann4_data))
+    files_data.append(ann4_data)
+
+    # 5. File that has been annotated but is missing audio
+    ann5_data = create_legacy_file_annotation(
+        file_id="rec_missing_audio.wav", annotated=True, issues=False
+    )
+    (anns_dir / "rec_missing_audio.wav.json").write_text(json.dumps(ann5_data))
+
+    # 6. File that has missing annotations
+    wav_factory(path=audio_dir / "rec_missing_ann.wav", duration=2.0)
+
+    # 7. A non -JSON file in the annotations directory
+    (anns_dir / "not_a_json.txt").write_text("hello")
+
+    return audio_dir, anns_dir, files_data
+
+
+@pytest.fixture
+def batdetect2_merged_test_setup(
+    tmp_path: Path, batdetect2_files_test_setup
+) -> Tuple[Path, Path, List[Dict[str, Any]]]:
+    """Sets up a directory structure for batdetect2 merged file format tests."""
+    audio_dir, _, files_data = batdetect2_files_test_setup
+    merged_anns_path = tmp_path / "merged_anns.json"
+
+    merged_data = [
+        fd for fd in files_data if fd["id"] != "rec_missing_audio.wav"
+    ]
+    merged_anns_path.write_text(json.dumps(merged_data))
+
+    return audio_dir, merged_anns_path, merged_data
+
+
+def test_annotation_filter_defaults():
+    """Test default values for AnnotationFilter."""
+    filt = AnnotationFilter()
+    assert filt.only_annotated is True
+    assert filt.exclude_issues is True
+
+
+def test_annotation_filter_custom():
+    """Test custom values for AnnotationFilter."""
+    filt = AnnotationFilter(only_annotated=False, exclude_issues=False)
+    assert filt.only_annotated is False
+    assert filt.exclude_issues is False
+
+
+def test_batdetect2_files_annotations_config(tmp_path: Path):
+    """Test initialization of BatDetect2FilesAnnotations."""
+    anns_dir = tmp_path / "annotations"
+    config = BatDetect2FilesAnnotations(
+        name="test_files",
+        description="Test Files Desc",
+        audio_dir=tmp_path / "audio",
+        annotations_dir=anns_dir,
+    )
+    assert config.format == "batdetect2"
+    assert config.name == "test_files"
+    assert config.description == "Test Files Desc"
+    assert config.annotations_dir == anns_dir
+    assert isinstance(config.filter, AnnotationFilter)
+    assert config.filter.only_annotated is True
+    assert config.filter.exclude_issues is True
+
+
+def test_batdetect2_files_annotations_config_no_filter(tmp_path: Path):
+    """Test BatDetect2FilesAnnotations with filter explicitly set to None."""
+    anns_dir = tmp_path / "annotations"
+    data = {
+        "name": "test_files_no_filter",
+        "audio_dir": str(tmp_path / "audio"),
+        "annotations_dir": str(anns_dir),
+        "filter": None,
+    }
+    config = BatDetect2FilesAnnotations.model_validate(data)
+    assert config.filter is None
+
+
+def test_batdetect2_merged_annotations_config(tmp_path: Path):
+    """Test initialization of BatDetect2MergedAnnotations."""
+    anns_path = tmp_path / "annotations.json"
+    config = BatDetect2MergedAnnotations(
+        name="test_merged",
+        description="Test Merged Desc",
+        audio_dir=tmp_path / "audio",
+        annotations_path=anns_path,
+        filter=AnnotationFilter(only_annotated=False, exclude_issues=True),
+    )
+    assert config.format == "batdetect2_file"
+    assert config.name == "test_merged"
+    assert config.description == "Test Merged Desc"
+    assert config.annotations_path == anns_path
+    assert isinstance(config.filter, AnnotationFilter)
+    assert config.filter.only_annotated is False
+    assert config.filter.exclude_issues is True
+
+
+def test_batdetect2_merged_annotations_config_default_filter(tmp_path: Path):
+    """Test BatDetect2MergedAnnotations uses default filter if not provided."""
+    anns_path = tmp_path / "annotations.json"
+    config = BatDetect2MergedAnnotations(
+        name="test_merged_default",
+        audio_dir=tmp_path / "audio",
+        annotations_path=anns_path,
+    )
+    assert isinstance(config.filter, AnnotationFilter)
+    assert config.filter.only_annotated is True
+    assert config.filter.exclude_issues is True
+
+
+class TestLoadBatDetect2Files:
+    def test_load_default_filter(self, batdetect2_files_test_setup):
+        """Test loading with default filter (annotated=True, issues=False)."""
+        audio_dir, anns_dir, _ = batdetect2_files_test_setup
+        config = BatDetect2FilesAnnotations(
+            name="default_load",
+            audio_dir=audio_dir,
+            annotations_dir=anns_dir,
+        )
+
+        result_set = load_batdetect2_files_annotated_dataset(config)
+
+        assert isinstance(result_set, data.AnnotationSet)
+        assert result_set.name == "default_load"
+        assert len(result_set.clip_annotations) == 1
+
+        clip_ann = result_set.clip_annotations[0]
+        assert clip_ann.clip.recording.path.name == "rec1.wav"
+        assert clip_ann.clip.recording.duration == 5.0
+        assert len(clip_ann.sound_events) == 1
+        assert clip_ann.notes[0].message == "Standard notes."
+        clip_tag = data.find_tag(clip_ann.tags, "class")
+        assert clip_tag is not None
+        assert clip_tag.value == "Myotis"
+
+        recording_tag = data.find_tag(clip_ann.clip.recording.tags, "class")
+        assert recording_tag is not None
+        assert recording_tag.value == "Myotis"
+
+        se_ann = clip_ann.sound_events[0]
+        assert se_ann.sound_event.geometry is not None
+        assert se_ann.sound_event.geometry.coordinates == [
+            1.1,
+            30000,
+            1.2,
+            40000,
+        ]
+
+        se_class_tag = data.find_tag(se_ann.tags, "class")
+        assert se_class_tag is not None
+        assert se_class_tag.value == "Myotis"
+
+        se_event_tag = data.find_tag(se_ann.tags, "event")
+        assert se_event_tag is not None
+        assert se_event_tag.value == "Echolocation"
+
+        se_individual_tag = data.find_tag(se_ann.tags, "individual")
+        assert se_individual_tag is not None
+        assert se_individual_tag.value == "0"
+
+    def test_load_only_annotated_false(self, batdetect2_files_test_setup):
+        """Test filter with only_annotated=False."""
+        audio_dir, anns_dir, _ = batdetect2_files_test_setup
+        config = BatDetect2FilesAnnotations(
+            name="ann_false",
+            audio_dir=audio_dir,
+            annotations_dir=anns_dir,
+            filter=AnnotationFilter(only_annotated=False, exclude_issues=True),
+        )
+        result_set = load_batdetect2_files_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 2
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert loaded_files == {"rec1.wav", "rec2.wav"}
+
+    def test_load_exclude_issues_false(self, batdetect2_files_test_setup):
+        """Test filter with exclude_issues=False."""
+        audio_dir, anns_dir, _ = batdetect2_files_test_setup
+        config = BatDetect2FilesAnnotations(
+            name="iss_false",
+            audio_dir=audio_dir,
+            annotations_dir=anns_dir,
+            filter=AnnotationFilter(only_annotated=True, exclude_issues=False),
+        )
+        result_set = load_batdetect2_files_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 2
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert loaded_files == {"rec1.wav", "rec3.wav"}
+
+    def test_load_no_filter(self, batdetect2_files_test_setup):
+        """Test loading with filtering disabled."""
+        audio_dir, anns_dir, _ = batdetect2_files_test_setup
+        config_data = {
+            "name": "no_filter",
+            "audio_dir": str(audio_dir),
+            "annotations_dir": str(anns_dir),
+            "filter": None,
+        }
+        config = BatDetect2FilesAnnotations.model_validate(config_data)
+
+        result_set = load_batdetect2_files_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 4
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert loaded_files == {"rec1.wav", "rec2.wav", "rec3.wav", "rec4.wav"}
+
+    def test_load_with_base_dir(self, tmp_path, batdetect2_files_test_setup):
+        """Test loading with a base_dir."""
+        audio_dir_abs, anns_dir_abs, _ = batdetect2_files_test_setup
+        base_dir = tmp_path
+        audio_dir_rel = audio_dir_abs.relative_to(base_dir)
+        anns_dir_rel = anns_dir_abs.relative_to(base_dir)
+
+        config = BatDetect2FilesAnnotations(
+            name="base_dir_test",
+            audio_dir=audio_dir_rel,
+            annotations_dir=anns_dir_rel,
+        )
+
+        result_set = load_batdetect2_files_annotated_dataset(
+            config, base_dir=base_dir
+        )
+        assert len(result_set.clip_annotations) == 1
+        assert result_set.clip_annotations[0].clip.recording.path.is_absolute()
+        assert (
+            result_set.clip_annotations[0].clip.recording.path
+            == audio_dir_abs / "rec1.wav"
+        )
+
+    def test_load_missing_annotations_dir(self, tmp_path):
+        """Test error when annotations_dir does not exist."""
+        audio_dir = tmp_path / "audio"
+        audio_dir.mkdir()
+        anns_dir = tmp_path / "non_existent_anns"
+        config = BatDetect2FilesAnnotations(
+            name="missing_anns",
+            audio_dir=audio_dir,
+            annotations_dir=anns_dir,
+        )
+        result_set = load_batdetect2_files_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 0
+
+    def test_load_missing_audio_dir(self, batdetect2_files_test_setup):
+        """Test error or skipping when audio_dir does not exist or files missing."""
+        _, anns_dir, _ = batdetect2_files_test_setup
+        missing_audio_dir = Path(
+            "/tmp/non_existent_audio_dir_" + str(uuid.uuid4())
+        )
+        config = BatDetect2FilesAnnotations(
+            name="missing_audio",
+            audio_dir=missing_audio_dir,
+            annotations_dir=anns_dir,
+            filter=None,
+        )
+        result_set = load_batdetect2_files_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 0
+
+    def test_load_skips_invalid_json(self, batdetect2_files_test_setup):
+        """Test that invalid JSON files are skipped."""
+        audio_dir, anns_dir, _ = batdetect2_files_test_setup
+        (anns_dir / "invalid.json").write_text(".invalid json")
+        (anns_dir / "wrong_structure.json").write_text("[1, 2, 3]")
+
+        config = BatDetect2FilesAnnotations(
+            name="invalid_json_test",
+            audio_dir=audio_dir,
+            annotations_dir=anns_dir,
+            filter=None,
+        )
+        result_set = load_batdetect2_files_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 4
+
+    def test_load_skips_missing_individual_audio(
+        self, batdetect2_files_test_setup
+    ):
+        """Test skipping a file if its corresponding audio is missing."""
+        audio_dir, anns_dir, _ = batdetect2_files_test_setup
+        config = BatDetect2FilesAnnotations(
+            name="skip_missing_audio",
+            audio_dir=audio_dir,
+            annotations_dir=anns_dir,
+            filter=None,
+        )
+        result_set = load_batdetect2_files_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 4
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert "rec_missing_audio.wav" not in loaded_files
+
+
+class TestLoadBatDetect2Merged:
+    def test_load_default_filter(self, batdetect2_merged_test_setup):
+        """Test loading merged file with default filter."""
+        audio_dir, anns_path, _ = batdetect2_merged_test_setup
+        config = BatDetect2MergedAnnotations(
+            name="merged_default",
+            audio_dir=audio_dir,
+            annotations_path=anns_path,
+        )
+
+        result_set = load_batdetect2_merged_annotated_dataset(config)
+
+        assert isinstance(result_set, data.AnnotationSet)
+        assert result_set.name == "merged_default"
+        assert len(result_set.clip_annotations) == 1
+
+        clip_ann = result_set.clip_annotations[0]
+        assert clip_ann.clip.recording.path.name == "rec1.wav"
+        assert clip_ann.clip.recording.duration == 5.0
+        assert len(clip_ann.sound_events) == 1
+
+        clip_class_tag = data.find_tag(clip_ann.tags, "class")
+        assert clip_class_tag is not None
+        assert clip_class_tag.value == "Myotis"
+
+    def test_load_only_annotated_false(self, batdetect2_merged_test_setup):
+        """Test merged filter with only_annotated=False."""
+        audio_dir, anns_path, _ = batdetect2_merged_test_setup
+        config = BatDetect2MergedAnnotations(
+            name="merged_ann_false",
+            audio_dir=audio_dir,
+            annotations_path=anns_path,
+            filter=AnnotationFilter(only_annotated=False, exclude_issues=True),
+        )
+        result_set = load_batdetect2_merged_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 2
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert loaded_files == {"rec1.wav", "rec2.wav"}
+
+    def test_load_exclude_issues_false(self, batdetect2_merged_test_setup):
+        """Test merged filter with exclude_issues=False."""
+        audio_dir, anns_path, _ = batdetect2_merged_test_setup
+        config = BatDetect2MergedAnnotations(
+            name="merged_iss_false",
+            audio_dir=audio_dir,
+            annotations_path=anns_path,
+            filter=AnnotationFilter(only_annotated=True, exclude_issues=False),
+        )
+        result_set = load_batdetect2_merged_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 2
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert loaded_files == {"rec1.wav", "rec3.wav"}
+
+    def test_load_no_filter(self, batdetect2_merged_test_setup):
+        """Test loading merged file with filtering disabled."""
+        audio_dir, anns_path, _ = batdetect2_merged_test_setup
+        config_data = {
+            "name": "merged_no_filter",
+            "audio_dir": str(audio_dir),
+            "annotations_path": str(anns_path),
+            "filter": None,
+        }
+        config = BatDetect2MergedAnnotations.model_validate(config_data)
+
+        result_set = load_batdetect2_merged_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 4
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert loaded_files == {"rec1.wav", "rec2.wav", "rec3.wav", "rec4.wav"}
+
+    def test_load_with_base_dir(self, tmp_path, batdetect2_merged_test_setup):
+        """Test loading merged file with a base_dir."""
+        audio_dir_abs, anns_path_abs, _ = batdetect2_merged_test_setup
+        base_dir = tmp_path
+        audio_dir_rel = audio_dir_abs.relative_to(base_dir)
+        anns_path_rel = anns_path_abs.relative_to(base_dir)
+
+        config = BatDetect2MergedAnnotations(
+            name="merged_base_dir",
+            audio_dir=audio_dir_rel,
+            annotations_path=anns_path_rel,
+        )
+
+        result_set = load_batdetect2_merged_annotated_dataset(
+            config, base_dir=base_dir
+        )
+        assert len(result_set.clip_annotations) == 1
+        assert result_set.clip_annotations[0].clip.recording.path.is_absolute()
+        assert (
+            result_set.clip_annotations[0].clip.recording.path
+            == audio_dir_abs / "rec1.wav"
+        )
+
+    def test_load_missing_annotations_path(self, tmp_path):
+        """Test error when annotations_path does not exist."""
+        audio_dir = tmp_path / "audio"
+        audio_dir.mkdir()
+        anns_path = tmp_path / "non_existent_anns.json"
+        config = BatDetect2MergedAnnotations(
+            name="missing_anns_file",
+            audio_dir=audio_dir,
+            annotations_path=anns_path,
+        )
+        with pytest.raises(FileNotFoundError):
+            load_batdetect2_merged_annotated_dataset(config)
+
+    def test_load_missing_audio_dir(self, batdetect2_merged_test_setup):
+        """Test error/skipping when audio_dir does not exist in merged."""
+        _, anns_path, _ = batdetect2_merged_test_setup
+        missing_audio_dir = Path(
+            "/tmp/non_existent_audio_dir_merged_" + str(uuid.uuid4())
+        )
+        config = BatDetect2MergedAnnotations(
+            name="missing_audio_merged",
+            audio_dir=missing_audio_dir,
+            annotations_path=anns_path,
+            filter=None,
+        )
+        result_set = load_batdetect2_merged_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 0
+
+    def test_load_invalid_json_format(self, tmp_path):
+        """Test error for malformed JSON file."""
+        audio_dir = tmp_path / "audio"
+        audio_dir.mkdir()
+        anns_path = tmp_path / "invalid.json"
+        anns_path.write_text(".malformed json")
+        config = BatDetect2MergedAnnotations(
+            name="invalid_json",
+            audio_dir=audio_dir,
+            annotations_path=anns_path,
+        )
+        with pytest.raises(json.JSONDecodeError):
+            load_batdetect2_merged_annotated_dataset(config)
+
+    def test_load_json_not_a_list(self, tmp_path):
+        """Test error if JSON root is not a list."""
+        audio_dir = tmp_path / "audio"
+        audio_dir.mkdir()
+        anns_path = tmp_path / "not_list.json"
+        anns_path.write_text('{"not": "a list"}')
+        config = BatDetect2MergedAnnotations(
+            name="not_list", audio_dir=audio_dir, annotations_path=anns_path
+        )
+        with pytest.raises(TypeError):
+            load_batdetect2_merged_annotated_dataset(config)
+
+    def test_load_invalid_entry_in_list(self, batdetect2_merged_test_setup):
+        """Test skipping entries that don't conform to FileAnnotation."""
+        audio_dir, anns_path, merged_data = batdetect2_merged_test_setup
+        invalid_entry = {"wrong_field": 123}
+        merged_data_with_invalid = merged_data + [invalid_entry]
+        anns_path.write_text(json.dumps(merged_data_with_invalid))
+
+        config = BatDetect2MergedAnnotations(
+            name="invalid_entry",
+            audio_dir=audio_dir,
+            annotations_path=anns_path,
+            filter=None,
+        )
+        result_set = load_batdetect2_merged_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 4
+
+    def test_load_skips_missing_audio_in_merged(
+        self, batdetect2_merged_test_setup
+    ):
+        """Test skipping an entry if its audio file is missing."""
+        audio_dir, anns_path, merged_data = batdetect2_merged_test_setup
+        missing_audio_entry = create_legacy_file_annotation(
+            file_id="non_existent.wav", annotated=True, issues=False
+        )
+        merged_data_with_missing = merged_data + [missing_audio_entry]
+        anns_path.write_text(json.dumps(merged_data_with_missing))
+
+        config = BatDetect2MergedAnnotations(
+            name="skip_missing_audio_merged",
+            audio_dir=audio_dir,
+            annotations_path=anns_path,
+            filter=None,
+        )
+        result_set = load_batdetect2_merged_annotated_dataset(config)
+        assert len(result_set.clip_annotations) == 4
+        loaded_files = {
+            ann.clip.recording.path.name for ann in result_set.clip_annotations
+        }
+        assert "non_existent.wav" not in loaded_files