From 374c62d7abad704ebbb43920422b25ebb51c6b89 Mon Sep 17 00:00:00 2001 From: mbsantiago Date: Mon, 11 Aug 2025 01:35:09 +0100 Subject: [PATCH] Add dataset summary and split functions --- notebooks/data.py | 195 +++++++------------------------- src/batdetect2/data/__init__.py | 8 ++ src/batdetect2/data/split.py | 75 ++++++++++++ src/batdetect2/data/summary.py | 192 +++++++++++++++++++++++++++++++ 4 files changed, 318 insertions(+), 152 deletions(-) create mode 100644 src/batdetect2/data/split.py create mode 100644 src/batdetect2/data/summary.py diff --git a/notebooks/data.py b/notebooks/data.py index ed3e861..dcdfe09 100644 --- a/notebooks/data.py +++ b/notebooks/data.py @@ -12,8 +12,20 @@ def _(): @app.cell def _(): - from batdetect2.data import load_dataset_config, load_dataset - return load_dataset, load_dataset_config + from batdetect2.data import ( + load_dataset_config, + load_dataset, + extract_recordings_df, + extract_sound_events_df, + compute_class_summary, + ) + return ( + compute_class_summary, + extract_recordings_df, + extract_sound_events_df, + load_dataset, + load_dataset_config, + ) @app.cell @@ -72,183 +84,50 @@ def _(build_targets, targets_config): def _(): import pandas as pd from soundevent.geometry import compute_bounds - return compute_bounds, pd + return @app.cell -def _(dataset, pd): - def get_recording_df(dataset): - recordings = [] - - for clip_annotation in dataset: - recordings.append( - { - "recording_id": clip_annotation.clip.recording.uuid, - "duration": clip_annotation.clip.duration, - "clip_annotation_id": clip_annotation.uuid, - "samplerate": clip_annotation.clip.recording.samplerate, - "path": clip_annotation.clip.recording.path.name, - } - ) - - return pd.DataFrame(recordings) - - - recordings = get_recording_df(dataset) +def _(dataset, extract_recordings_df): + recordings = extract_recordings_df(dataset) recordings - return (recordings,) + return @app.cell -def _(compute_bounds, dataset, pd, targets): - def get_sound_event_df(dataset): - sound_events = [] - - for clip_annotation in dataset: - for sound_event in clip_annotation.sound_events: - if not targets.filter(sound_event): - continue - - if sound_event.sound_event.geometry is None: - continue - - class_name = targets.encode_class(sound_event) - - if class_name is None: - continue - - start_time, low_freq, end_time, high_freq = compute_bounds( - sound_event.sound_event.geometry - ) - - sound_events.append( - { - "clip_annotation_id": clip_annotation.uuid, - "sound_event_id": sound_event.uuid, - "class_name": class_name, - "start_time": start_time, - "end_time": end_time, - "low_freq": low_freq, - "high_freq": high_freq, - } - ) - - return pd.DataFrame(sound_events) - - - sound_events = get_sound_event_df(dataset) +def _(dataset, extract_sound_events_df, targets): + sound_events = extract_sound_events_df(dataset, targets) sound_events - return get_sound_event_df, sound_events + return @app.cell -def _(recordings, sound_events): - def produce_summary(sound_events): - num_calls = ( - sound_events.groupby("class_name") - .size() - .sort_values(ascending=False) - .rename("num calls") - ) - num_recs = ( - sound_events.groupby("class_name")["clip_annotation_id"] - .nunique() - .sort_values(ascending=False) - .rename("num recordings") - ) - durations = ( - sound_events.groupby("class_name") - .apply( - lambda group: recordings[ - recordings["clip_annotation_id"].isin( - group["clip_annotation_id"] - ) - ]["duration"].sum(), - include_groups=False, - ) - .sort_values(ascending=False) - .rename("duration") - 
) - return ( - num_calls.to_frame() - .join(num_recs) - .join(durations) - .sort_values("num calls", ascending=False) - .assign(call_rate=lambda df: df["num calls"] / df["duration"]) - ) - - - produce_summary(sound_events) - return (produce_summary,) - - -@app.cell -def _(sound_events): - majority_class = ( - sound_events.groupby("clip_annotation_id") - .apply( - lambda group: group["class_name"] - .value_counts() - .sort_values(ascending=False) - .index[0], - include_groups=False, - ) - .rename("class_name") - .to_frame() - .reset_index() - ) - return (majority_class,) - - -@app.cell -def _(majority_class): - majority_class +def _(compute_class_summary, dataset, targets): + compute_class_summary(dataset, targets) return @app.cell def _(): - from sklearn.model_selection import train_test_split - return (train_test_split,) + from batdetect2.data.split import split_dataset_by_recordings + return (split_dataset_by_recordings,) @app.cell -def _(majority_class, train_test_split): - train, val = train_test_split( - majority_class["clip_annotation_id"], - stratify=majority_class["class_name"], - ) - return train, val - - -@app.cell -def _(dataset, train, val): - train_dataset = [ - clip_annotation - for clip_annotation in dataset - if clip_annotation.uuid in set(train.values) - ] - val_dataset = [ - clip_annotation - for clip_annotation in dataset - if clip_annotation.uuid in set(val.values) - ] +def _(dataset, split_dataset_by_recordings, targets): + train_dataset, val_dataset = split_dataset_by_recordings(dataset, targets, random_state=42) return train_dataset, val_dataset @app.cell -def _(get_sound_event_df, produce_summary, train_dataset): - train_sound_events = get_sound_event_df(train_dataset) - train_summary = produce_summary(train_sound_events) - train_summary +def _(compute_class_summary, targets, train_dataset): + compute_class_summary(train_dataset, targets) return @app.cell -def _(get_sound_event_df, produce_summary, val_dataset): - val_sound_events = get_sound_event_df(val_dataset) - val_summary = produce_summary(val_sound_events) - val_summary +def _(compute_class_summary, targets, val_dataset): + compute_class_summary(val_dataset, targets) return @@ -291,6 +170,18 @@ def _(Path, data, io, val_dataset): def _(load_dataset, load_dataset_config): config = load_dataset_config("../paper/conf/datasets/train/uk_tune.yaml") rec = load_dataset(config, base_dir="../paper/") + return (rec,) + + +@app.cell +def _(rec): + dict(rec[0].sound_events[0].tags[0].term) + return + + +@app.cell +def _(compute_class_summary, rec, targets): + compute_class_summary(rec,targets) return diff --git a/src/batdetect2/data/__init__.py b/src/batdetect2/data/__init__.py index f9a5ee3..54a762a 100644 --- a/src/batdetect2/data/__init__.py +++ b/src/batdetect2/data/__init__.py @@ -11,6 +11,11 @@ from batdetect2.data.datasets import ( load_dataset_config, load_dataset_from_config, ) +from batdetect2.data.summary import ( + compute_class_summary, + extract_recordings_df, + extract_sound_events_df, +) __all__ = [ "AOEFAnnotations", @@ -18,6 +23,9 @@ __all__ = [ "BatDetect2FilesAnnotations", "BatDetect2MergedAnnotations", "DatasetConfig", + "compute_class_summary", + "extract_recordings_df", + "extract_sound_events_df", "load_annotated_dataset", "load_dataset", "load_dataset_config", diff --git a/src/batdetect2/data/split.py b/src/batdetect2/data/split.py new file mode 100644 index 0000000..aaa1c5e --- /dev/null +++ b/src/batdetect2/data/split.py @@ -0,0 +1,75 @@ +from typing import Optional, Tuple + +from 
sklearn.model_selection import train_test_split + +from batdetect2.data.datasets import Dataset +from batdetect2.data.summary import ( + extract_recordings_df, + extract_sound_events_df, +) +from batdetect2.targets.types import TargetProtocol + + +def split_dataset_by_recordings( + dataset: Dataset, + targets: TargetProtocol, + train_size: float = 0.75, + random_state: Optional[int] = None, +) -> Tuple[Dataset, Dataset]: + recordings = extract_recordings_df(dataset) + + sound_events = extract_sound_events_df( + dataset, + targets, + exclude_non_target=True, + exclude_generic=True, + ) + + majority_class = ( + sound_events.groupby("recording_id") + .apply( + lambda group: group["class_name"] # type: ignore + .value_counts() + .sort_values(ascending=False) + .index[0], + include_groups=False, # type: ignore + ) + .rename("class_name") + .to_frame() + .reset_index() + ) + + train, test = train_test_split( + majority_class["recording_id"], + stratify=majority_class["class_name"], + train_size=train_size, + random_state=random_state, + ) + + train_ids_set = set(train.values) # type: ignore + test_ids_set = set(test.values) # type: ignore + + extra = set(recordings["recording_id"]) - train_ids_set - test_ids_set + + if extra: + train_extra, test_extra = train_test_split( + list(extra), + train_size=train_size, + random_state=random_state, + ) + train_ids_set.update(train_extra) + test_ids_set.update(test_extra) + + train_dataset = [ + clip_annotation + for clip_annotation in dataset + if str(clip_annotation.clip.recording.uuid) in train_ids_set + ] + + test_dataset = [ + clip_annotation + for clip_annotation in dataset + if str(clip_annotation.clip.recording.uuid) in test_ids_set + ] + + return train_dataset, test_dataset diff --git a/src/batdetect2/data/summary.py b/src/batdetect2/data/summary.py new file mode 100644 index 0000000..713520b --- /dev/null +++ b/src/batdetect2/data/summary.py @@ -0,0 +1,192 @@ +import pandas as pd +from soundevent.geometry import compute_bounds + +from batdetect2.data.datasets import Dataset +from batdetect2.targets.types import TargetProtocol + +__all__ = [ + "extract_recordings_df", + "extract_sound_events_df", + "compute_class_summary", +] + + +def extract_recordings_df(dataset: Dataset) -> pd.DataFrame: + """Extract recording metadata into a pandas DataFrame. + + Parameters + ---------- + dataset : List[data.ClipAnnotation] + A list of clip annotations from which to extract recording information. + + Returns + ------- + pd.DataFrame + A DataFrame where each row corresponds to a recording, containing + metadata such as duration, path, sample rate, and other properties. + """ + recordings = [] + + for clip_annotation in dataset: + clip = clip_annotation.clip + recording = clip.recording + recordings.append( + { + "clip_annotation_id": str(clip_annotation.uuid), + "recording_id": str(recording.uuid), + "duration": clip.duration, + "filename": recording.path.name, + **recording.model_dump( + mode="json", + include={ + "samplerate", + "hash", + "path", + "date", + "time", + "latitude", + "longitude", + }, + ), + } + ) + + return pd.DataFrame(recordings) + + +def extract_sound_events_df( + dataset: Dataset, + targets: TargetProtocol, + exclude_non_target: bool = True, + exclude_generic: bool = True, +) -> pd.DataFrame: + """Extract sound event data into a pandas DataFrame. 
+ + This function iterates through all sound events in the provided dataset, + applies filtering and classification logic based on the `targets` + protocol, and compiles the results into a structured DataFrame. + + Parameters + ---------- + dataset : List[data.ClipAnnotation] + The dataset containing clip annotations with sound events. + targets : TargetProtocol + An object that provides methods to filter (`filter`) and classify + (`encode_class`) sound events. + exclude_non_target : bool, default=True + If True, sound events that do not pass the `targets.filter()` check + are excluded from the output. + exclude_generic : bool, default=True + If True, sound events that are classified with a `None` class name + by `targets.encode_class()` are excluded. + + Returns + ------- + pd.DataFrame + A DataFrame where each row represents a single sound event, including + its bounding box, class name, and other relevant attributes. + """ + sound_events = [] + + for clip_annotation in dataset: + for sound_event in clip_annotation.sound_events: + is_target = targets.filter(sound_event) + + if not is_target and exclude_non_target: + continue + + if sound_event.sound_event.geometry is None: + continue + + class_name = targets.encode_class(sound_event) + + if class_name is None and exclude_generic: + continue + + start_time, low_freq, end_time, high_freq = compute_bounds( + sound_event.sound_event.geometry + ) + + sound_events.append( + { + "clip_annotation_id": str(clip_annotation.uuid), + "sound_event_id": str(sound_event.uuid), + "recording_id": str( + sound_event.sound_event.recording.uuid + ), + "start_time": start_time, + "end_time": end_time, + "low_freq": low_freq, + "high_freq": high_freq, + "is_target": is_target, + "class_name": class_name, + } + ) + + return pd.DataFrame(sound_events) + + +def compute_class_summary( + dataset: Dataset, + targets: TargetProtocol, +) -> pd.DataFrame: + """Compute a summary of sound event statistics grouped by class. + + This function generates a high-level summary DataFrame that provides + key metrics for each class identified in the dataset. It calculates + the total number of calls, the number of unique recordings containing + each class, the total duration of those recordings, and the call rate. + + Parameters + ---------- + dataset : List[data.ClipAnnotation] + The dataset to be summarized. + targets : TargetProtocol + An object providing the classification logic for sound events. + + Returns + ------- + pd.DataFrame + A DataFrame indexed by class name, with columns for 'num calls', + 'num recordings', 'duration', and 'call_rate'. + """ + sound_events = extract_sound_events_df( + dataset, + targets, + exclude_generic=True, + exclude_non_target=True, + ) + recordings = extract_recordings_df(dataset) + + num_calls = ( + sound_events.groupby("class_name") + .size() + .sort_values(ascending=False) + .rename("num calls") + ) + num_recs = ( + sound_events.groupby("class_name")["clip_annotation_id"] + .nunique() + .sort_values(ascending=False) + .rename("num recordings") + ) + durations = ( + sound_events.groupby("class_name") + .apply( + lambda group: recordings[ + recordings["clip_annotation_id"].isin( + group["clip_annotation_id"] # type: ignore + ) + ]["duration"].sum(), + include_groups=False, # type: ignore + ) + .sort_values(ascending=False) + .rename("duration") + ) + return ( + num_calls.to_frame() + .join(num_recs) + .join(durations) + .sort_values("num calls", ascending=False) + .assign(call_rate=lambda df: df["num calls"] / df["duration"]) + )
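
Usage sketch (not part of the patch): the snippet below shows how the new summary and split helpers introduced in this commit are intended to fit together, mirroring the marimo notebook changes above. The dataset config path is a placeholder, and `targets` stands in for an object implementing TargetProtocol (in the notebook it is produced by `build_targets` from a targets config); adapt both to your own project layout.

from batdetect2.data import (
    load_dataset_config,
    load_dataset,
    extract_recordings_df,
    extract_sound_events_df,
    compute_class_summary,
)
from batdetect2.data.split import split_dataset_by_recordings

# Load an annotated dataset from its config (placeholder path).
config = load_dataset_config("conf/datasets/train/example.yaml")
dataset = load_dataset(config, base_dir=".")

# `targets` must provide `filter` and `encode_class` (TargetProtocol);
# build it with the project's targets module, e.g. `build_targets(...)`.
targets = ...

# Per-recording metadata and per-sound-event bounding boxes as DataFrames.
recordings_df = extract_recordings_df(dataset)
sound_events_df = extract_sound_events_df(dataset, targets)

# Class-level call counts, recording durations, and call rates.
print(compute_class_summary(dataset, targets))

# Stratified train/validation split on each recording's majority class.
train_dataset, val_dataset = split_dataset_by_recordings(
    dataset, targets, train_size=0.75, random_state=42
)
print(compute_class_summary(train_dataset, targets))
print(compute_class_summary(val_dataset, targets))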