From 374c62d7abad704ebbb43920422b25ebb51c6b89 Mon Sep 17 00:00:00 2001 From: mbsantiago Date: Mon, 11 Aug 2025 01:35:09 +0100 Subject: [PATCH] Add dataset summary and split functions --- notebooks/data.py | 195 +++++++------------------------- src/batdetect2/data/__init__.py | 8 ++ src/batdetect2/data/split.py | 75 ++++++++++++ src/batdetect2/data/summary.py | 192 +++++++++++++++++++++++++++++++ 4 files changed, 318 insertions(+), 152 deletions(-) create mode 100644 src/batdetect2/data/split.py create mode 100644 src/batdetect2/data/summary.py diff --git a/notebooks/data.py b/notebooks/data.py index ed3e861..dcdfe09 100644 --- a/notebooks/data.py +++ b/notebooks/data.py @@ -12,8 +12,20 @@ def _(): @app.cell def _(): - from batdetect2.data import load_dataset_config, load_dataset - return load_dataset, load_dataset_config + from batdetect2.data import ( + load_dataset_config, + load_dataset, + extract_recordings_df, + extract_sound_events_df, + compute_class_summary, + ) + return ( + compute_class_summary, + extract_recordings_df, + extract_sound_events_df, + load_dataset, + load_dataset_config, + ) @app.cell @@ -72,183 +84,50 @@ def _(build_targets, targets_config): def _(): import pandas as pd from soundevent.geometry import compute_bounds - return compute_bounds, pd + return @app.cell -def _(dataset, pd): - def get_recording_df(dataset): - recordings = [] - - for clip_annotation in dataset: - recordings.append( - { - "recording_id": clip_annotation.clip.recording.uuid, - "duration": clip_annotation.clip.duration, - "clip_annotation_id": clip_annotation.uuid, - "samplerate": clip_annotation.clip.recording.samplerate, - "path": clip_annotation.clip.recording.path.name, - } - ) - - return pd.DataFrame(recordings) - - - recordings = get_recording_df(dataset) +def _(dataset, extract_recordings_df): + recordings = extract_recordings_df(dataset) recordings - return (recordings,) + return @app.cell -def _(compute_bounds, dataset, pd, targets): - def get_sound_event_df(dataset): - sound_events = [] - - for clip_annotation in dataset: - for sound_event in clip_annotation.sound_events: - if not targets.filter(sound_event): - continue - - if sound_event.sound_event.geometry is None: - continue - - class_name = targets.encode_class(sound_event) - - if class_name is None: - continue - - start_time, low_freq, end_time, high_freq = compute_bounds( - sound_event.sound_event.geometry - ) - - sound_events.append( - { - "clip_annotation_id": clip_annotation.uuid, - "sound_event_id": sound_event.uuid, - "class_name": class_name, - "start_time": start_time, - "end_time": end_time, - "low_freq": low_freq, - "high_freq": high_freq, - } - ) - - return pd.DataFrame(sound_events) - - - sound_events = get_sound_event_df(dataset) +def _(dataset, extract_sound_events_df, targets): + sound_events = extract_sound_events_df(dataset, targets) sound_events - return get_sound_event_df, sound_events + return @app.cell -def _(recordings, sound_events): - def produce_summary(sound_events): - num_calls = ( - sound_events.groupby("class_name") - .size() - .sort_values(ascending=False) - .rename("num calls") - ) - num_recs = ( - sound_events.groupby("class_name")["clip_annotation_id"] - .nunique() - .sort_values(ascending=False) - .rename("num recordings") - ) - durations = ( - sound_events.groupby("class_name") - .apply( - lambda group: recordings[ - recordings["clip_annotation_id"].isin( - group["clip_annotation_id"] - ) - ]["duration"].sum(), - include_groups=False, - ) - .sort_values(ascending=False) - .rename("duration") - 
) - return ( - num_calls.to_frame() - .join(num_recs) - .join(durations) - .sort_values("num calls", ascending=False) - .assign(call_rate=lambda df: df["num calls"] / df["duration"]) - ) - - - produce_summary(sound_events) - return (produce_summary,) - - -@app.cell -def _(sound_events): - majority_class = ( - sound_events.groupby("clip_annotation_id") - .apply( - lambda group: group["class_name"] - .value_counts() - .sort_values(ascending=False) - .index[0], - include_groups=False, - ) - .rename("class_name") - .to_frame() - .reset_index() - ) - return (majority_class,) - - -@app.cell -def _(majority_class): - majority_class +def _(compute_class_summary, dataset, targets): + compute_class_summary(dataset, targets) return @app.cell def _(): - from sklearn.model_selection import train_test_split - return (train_test_split,) + from batdetect2.data.split import split_dataset_by_recordings + return (split_dataset_by_recordings,) @app.cell -def _(majority_class, train_test_split): - train, val = train_test_split( - majority_class["clip_annotation_id"], - stratify=majority_class["class_name"], - ) - return train, val - - -@app.cell -def _(dataset, train, val): - train_dataset = [ - clip_annotation - for clip_annotation in dataset - if clip_annotation.uuid in set(train.values) - ] - val_dataset = [ - clip_annotation - for clip_annotation in dataset - if clip_annotation.uuid in set(val.values) - ] +def _(dataset, split_dataset_by_recordings, targets): + train_dataset, val_dataset = split_dataset_by_recordings(dataset, targets, random_state=42) return train_dataset, val_dataset @app.cell -def _(get_sound_event_df, produce_summary, train_dataset): - train_sound_events = get_sound_event_df(train_dataset) - train_summary = produce_summary(train_sound_events) - train_summary +def _(compute_class_summary, targets, train_dataset): + compute_class_summary(train_dataset, targets) return @app.cell -def _(get_sound_event_df, produce_summary, val_dataset): - val_sound_events = get_sound_event_df(val_dataset) - val_summary = produce_summary(val_sound_events) - val_summary +def _(compute_class_summary, targets, val_dataset): + compute_class_summary(val_dataset, targets) return @@ -291,6 +170,18 @@ def _(Path, data, io, val_dataset): def _(load_dataset, load_dataset_config): config = load_dataset_config("../paper/conf/datasets/train/uk_tune.yaml") rec = load_dataset(config, base_dir="../paper/") + return (rec,) + + +@app.cell +def _(rec): + dict(rec[0].sound_events[0].tags[0].term) + return + + +@app.cell +def _(compute_class_summary, rec, targets): + compute_class_summary(rec,targets) return diff --git a/src/batdetect2/data/__init__.py b/src/batdetect2/data/__init__.py index f9a5ee3..54a762a 100644 --- a/src/batdetect2/data/__init__.py +++ b/src/batdetect2/data/__init__.py @@ -11,6 +11,11 @@ from batdetect2.data.datasets import ( load_dataset_config, load_dataset_from_config, ) +from batdetect2.data.summary import ( + compute_class_summary, + extract_recordings_df, + extract_sound_events_df, +) __all__ = [ "AOEFAnnotations", @@ -18,6 +23,9 @@ __all__ = [ "BatDetect2FilesAnnotations", "BatDetect2MergedAnnotations", "DatasetConfig", + "compute_class_summary", + "extract_recordings_df", + "extract_sound_events_df", "load_annotated_dataset", "load_dataset", "load_dataset_config", diff --git a/src/batdetect2/data/split.py b/src/batdetect2/data/split.py new file mode 100644 index 0000000..aaa1c5e --- /dev/null +++ b/src/batdetect2/data/split.py @@ -0,0 +1,75 @@ +from typing import Optional, Tuple + +from 
sklearn.model_selection import train_test_split + +from batdetect2.data.datasets import Dataset +from batdetect2.data.summary import ( + extract_recordings_df, + extract_sound_events_df, +) +from batdetect2.targets.types import TargetProtocol + + +def split_dataset_by_recordings( + dataset: Dataset, + targets: TargetProtocol, + train_size: float = 0.75, + random_state: Optional[int] = None, +) -> Tuple[Dataset, Dataset]: + recordings = extract_recordings_df(dataset) + + sound_events = extract_sound_events_df( + dataset, + targets, + exclude_non_target=True, + exclude_generic=True, + ) + + majority_class = ( + sound_events.groupby("recording_id") + .apply( + lambda group: group["class_name"] # type: ignore + .value_counts() + .sort_values(ascending=False) + .index[0], + include_groups=False, # type: ignore + ) + .rename("class_name") + .to_frame() + .reset_index() + ) + + train, test = train_test_split( + majority_class["recording_id"], + stratify=majority_class["class_name"], + train_size=train_size, + random_state=random_state, + ) + + train_ids_set = set(train.values) # type: ignore + test_ids_set = set(test.values) # type: ignore + + extra = set(recordings["recording_id"]) - train_ids_set - test_ids_set + + if extra: + train_extra, test_extra = train_test_split( + list(extra), + train_size=train_size, + random_state=random_state, + ) + train_ids_set.update(train_extra) + test_ids_set.update(test_extra) + + train_dataset = [ + clip_annotation + for clip_annotation in dataset + if str(clip_annotation.clip.recording.uuid) in train_ids_set + ] + + test_dataset = [ + clip_annotation + for clip_annotation in dataset + if str(clip_annotation.clip.recording.uuid) in test_ids_set + ] + + return train_dataset, test_dataset diff --git a/src/batdetect2/data/summary.py b/src/batdetect2/data/summary.py new file mode 100644 index 0000000..713520b --- /dev/null +++ b/src/batdetect2/data/summary.py @@ -0,0 +1,192 @@ +import pandas as pd +from soundevent.geometry import compute_bounds + +from batdetect2.data.datasets import Dataset +from batdetect2.targets.types import TargetProtocol + +__all__ = [ + "extract_recordings_df", + "extract_sound_events_df", + "compute_class_summary", +] + + +def extract_recordings_df(dataset: Dataset) -> pd.DataFrame: + """Extract recording metadata into a pandas DataFrame. + + Parameters + ---------- + dataset : List[data.ClipAnnotation] + A list of clip annotations from which to extract recording information. + + Returns + ------- + pd.DataFrame + A DataFrame where each row corresponds to a recording, containing + metadata such as duration, path, sample rate, and other properties. + """ + recordings = [] + + for clip_annotation in dataset: + clip = clip_annotation.clip + recording = clip.recording + recordings.append( + { + "clip_annotation_id": str(clip_annotation.uuid), + "recording_id": str(recording.uuid), + "duration": clip.duration, + "filename": recording.path.name, + **recording.model_dump( + mode="json", + include={ + "samplerate", + "hash", + "path", + "date", + "time", + "latitude", + "longitude", + }, + ), + } + ) + + return pd.DataFrame(recordings) + + +def extract_sound_events_df( + dataset: Dataset, + targets: TargetProtocol, + exclude_non_target: bool = True, + exclude_generic: bool = True, +) -> pd.DataFrame: + """Extract sound event data into a pandas DataFrame. 
+ + This function iterates through all sound events in the provided dataset, + applies filtering and classification logic based on the `targets` + protocol, and compiles the results into a structured DataFrame. + + Parameters + ---------- + dataset : List[data.ClipAnnotation] + The dataset containing clip annotations with sound events. + targets : TargetProtocol + An object that provides methods to filter (`filter`) and classify + (`encode_class`) sound events. + exclude_non_target : bool, default=True + If True, sound events that do not pass the `targets.filter()` check + are excluded from the output. + exclude_generic : bool, default=True + If True, sound events that are classified with a `None` class name + by `targets.encode_class()` are excluded. + + Returns + ------- + pd.DataFrame + A DataFrame where each row represents a single sound event, including + its bounding box, class name, and other relevant attributes. + """ + sound_events = [] + + for clip_annotation in dataset: + for sound_event in clip_annotation.sound_events: + is_target = targets.filter(sound_event) + + if not is_target and exclude_non_target: + continue + + if sound_event.sound_event.geometry is None: + continue + + class_name = targets.encode_class(sound_event) + + if class_name is None and exclude_generic: + continue + + start_time, low_freq, end_time, high_freq = compute_bounds( + sound_event.sound_event.geometry + ) + + sound_events.append( + { + "clip_annotation_id": str(clip_annotation.uuid), + "sound_event_id": str(sound_event.uuid), + "recording_id": str( + sound_event.sound_event.recording.uuid + ), + "start_time": start_time, + "end_time": end_time, + "low_freq": low_freq, + "high_freq": high_freq, + "is_target": is_target, + "class_name": class_name, + } + ) + + return pd.DataFrame(sound_events) + + +def compute_class_summary( + dataset: Dataset, + targets: TargetProtocol, +) -> pd.DataFrame: + """Compute a summary of sound event statistics grouped by class. + + This function generates a high-level summary DataFrame that provides + key metrics for each class identified in the dataset. It calculates + the total number of calls, the number of unique recordings containing + each class, the total duration of those recordings, and the call rate. + + Parameters + ---------- + dataset : List[data.ClipAnnotation] + The dataset to be summarized. + targets : TargetProtocol + An object providing the classification logic for sound events. + + Returns + ------- + pd.DataFrame + A DataFrame indexed by class name, with columns for 'num calls', + 'num recordings', 'duration', and 'call_rate'. + """ + sound_events = extract_sound_events_df( + dataset, + targets, + exclude_generic=True, + exclude_non_target=True, + ) + recordings = extract_recordings_df(dataset) + + num_calls = ( + sound_events.groupby("class_name") + .size() + .sort_values(ascending=False) + .rename("num calls") + ) + num_recs = ( + sound_events.groupby("class_name")["clip_annotation_id"] + .nunique() + .sort_values(ascending=False) + .rename("num recordings") + ) + durations = ( + sound_events.groupby("class_name") + .apply( + lambda group: recordings[ + recordings["clip_annotation_id"].isin( + group["clip_annotation_id"] # type: ignore + ) + ]["duration"].sum(), + include_groups=False, # type: ignore + ) + .sort_values(ascending=False) + .rename("duration") + ) + return ( + num_calls.to_frame() + .join(num_recs) + .join(durations) + .sort_values("num calls", ascending=False) + .assign(call_rate=lambda df: df["num calls"] / df["duration"]) + )
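
Usage sketch (not part of the patch): the snippet below shows how the new summary and split helpers introduced in this commit are intended to fit together, mirroring the marimo notebook changes above. The dataset config path is a placeholder, and `targets` stands in for an object implementing TargetProtocol (in the notebook it is produced by `build_targets` from a targets config); adapt both to your own project layout.

from batdetect2.data import (
    load_dataset_config,
    load_dataset,
    extract_recordings_df,
    extract_sound_events_df,
    compute_class_summary,
)
from batdetect2.data.split import split_dataset_by_recordings

# Load an annotated dataset from its config (placeholder path).
config = load_dataset_config("conf/datasets/train/example.yaml")
dataset = load_dataset(config, base_dir=".")

# `targets` must provide `filter` and `encode_class` (TargetProtocol);
# build it with the project's targets module, e.g. `build_targets(...)`.
targets = ...

# Per-recording metadata and per-sound-event bounding boxes as DataFrames.
recordings_df = extract_recordings_df(dataset)
sound_events_df = extract_sound_events_df(dataset, targets)

# Class-level call counts, recording durations, and call rates.
print(compute_class_summary(dataset, targets))

# Stratified train/validation split on each recording's majority class.
train_dataset, val_dataset = split_dataset_by_recordings(
    dataset, targets, train_size=0.75, random_state=42
)
print(compute_class_summary(train_dataset, targets))
print(compute_class_summary(val_dataset, targets))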