Add dataset summary and split functions

mbsantiago 2025-08-11 01:35:09 +01:00
parent ef279bee5d
commit 374c62d7ab
4 changed files with 318 additions and 152 deletions

View File

@@ -12,8 +12,20 @@ def _():
 @app.cell
 def _():
-    from batdetect2.data import load_dataset_config, load_dataset
-    return load_dataset, load_dataset_config
+    from batdetect2.data import (
+        load_dataset_config,
+        load_dataset,
+        extract_recordings_df,
+        extract_sound_events_df,
+        compute_class_summary,
+    )
+    return (
+        compute_class_summary,
+        extract_recordings_df,
+        extract_sound_events_df,
+        load_dataset,
+        load_dataset_config,
+    )

 @app.cell
@@ -72,183 +84,50 @@ def _(build_targets, targets_config):
 def _():
-    import pandas as pd
-    from soundevent.geometry import compute_bounds
-    return compute_bounds, pd
+    return

 @app.cell
-def _(dataset, pd):
-    def get_recording_df(dataset):
-        recordings = []
-        for clip_annotation in dataset:
-            recordings.append(
-                {
-                    "recording_id": clip_annotation.clip.recording.uuid,
-                    "duration": clip_annotation.clip.duration,
-                    "clip_annotation_id": clip_annotation.uuid,
-                    "samplerate": clip_annotation.clip.recording.samplerate,
-                    "path": clip_annotation.clip.recording.path.name,
-                }
-            )
-        return pd.DataFrame(recordings)
-
-    recordings = get_recording_df(dataset)
+def _(dataset, extract_recordings_df):
+    recordings = extract_recordings_df(dataset)
     recordings
-    return (recordings,)
+    return

 @app.cell
-def _(compute_bounds, dataset, pd, targets):
-    def get_sound_event_df(dataset):
-        sound_events = []
-        for clip_annotation in dataset:
-            for sound_event in clip_annotation.sound_events:
-                if not targets.filter(sound_event):
-                    continue
-                if sound_event.sound_event.geometry is None:
-                    continue
-                class_name = targets.encode_class(sound_event)
-                if class_name is None:
-                    continue
-                start_time, low_freq, end_time, high_freq = compute_bounds(
-                    sound_event.sound_event.geometry
-                )
-                sound_events.append(
-                    {
-                        "clip_annotation_id": clip_annotation.uuid,
-                        "sound_event_id": sound_event.uuid,
-                        "class_name": class_name,
-                        "start_time": start_time,
-                        "end_time": end_time,
-                        "low_freq": low_freq,
-                        "high_freq": high_freq,
-                    }
-                )
-        return pd.DataFrame(sound_events)
-
-    sound_events = get_sound_event_df(dataset)
+def _(dataset, extract_sound_events_df, targets):
+    sound_events = extract_sound_events_df(dataset, targets)
     sound_events
-    return get_sound_event_df, sound_events
+    return

 @app.cell
-def _(recordings, sound_events):
-    def produce_summary(sound_events):
-        num_calls = (
-            sound_events.groupby("class_name")
-            .size()
-            .sort_values(ascending=False)
-            .rename("num calls")
-        )
-        num_recs = (
-            sound_events.groupby("class_name")["clip_annotation_id"]
-            .nunique()
-            .sort_values(ascending=False)
-            .rename("num recordings")
-        )
-        durations = (
-            sound_events.groupby("class_name")
-            .apply(
-                lambda group: recordings[
-                    recordings["clip_annotation_id"].isin(
-                        group["clip_annotation_id"]
-                    )
-                ]["duration"].sum(),
-                include_groups=False,
-            )
-            .sort_values(ascending=False)
-            .rename("duration")
-        )
-        return (
-            num_calls.to_frame()
-            .join(num_recs)
-            .join(durations)
-            .sort_values("num calls", ascending=False)
-            .assign(call_rate=lambda df: df["num calls"] / df["duration"])
-        )
-
-    produce_summary(sound_events)
-    return (produce_summary,)
-
-
-@app.cell
-def _(sound_events):
-    majority_class = (
-        sound_events.groupby("clip_annotation_id")
-        .apply(
-            lambda group: group["class_name"]
-            .value_counts()
-            .sort_values(ascending=False)
-            .index[0],
-            include_groups=False,
-        )
-        .rename("class_name")
-        .to_frame()
-        .reset_index()
-    )
-    return (majority_class,)
-
-
-@app.cell
-def _(majority_class):
-    majority_class
+def _(compute_class_summary, dataset, targets):
+    compute_class_summary(dataset, targets)
     return

 @app.cell
 def _():
-    from sklearn.model_selection import train_test_split
-    return (train_test_split,)
+    from batdetect2.data.split import split_dataset_by_recordings
+    return (split_dataset_by_recordings,)

 @app.cell
-def _(majority_class, train_test_split):
-    train, val = train_test_split(
-        majority_class["clip_annotation_id"],
-        stratify=majority_class["class_name"],
-    )
-    return train, val
-
-
-@app.cell
-def _(dataset, train, val):
-    train_dataset = [
-        clip_annotation
-        for clip_annotation in dataset
-        if clip_annotation.uuid in set(train.values)
-    ]
-    val_dataset = [
-        clip_annotation
-        for clip_annotation in dataset
-        if clip_annotation.uuid in set(val.values)
-    ]
+def _(dataset, split_dataset_by_recordings, targets):
+    train_dataset, val_dataset = split_dataset_by_recordings(
+        dataset, targets, random_state=42
+    )
     return train_dataset, val_dataset

 @app.cell
-def _(get_sound_event_df, produce_summary, train_dataset):
-    train_sound_events = get_sound_event_df(train_dataset)
-    train_summary = produce_summary(train_sound_events)
-    train_summary
+def _(compute_class_summary, targets, train_dataset):
+    compute_class_summary(train_dataset, targets)
     return

 @app.cell
-def _(get_sound_event_df, produce_summary, val_dataset):
-    val_sound_events = get_sound_event_df(val_dataset)
-    val_summary = produce_summary(val_sound_events)
-    val_summary
+def _(compute_class_summary, targets, val_dataset):
+    compute_class_summary(val_dataset, targets)
     return
@@ -291,6 +170,18 @@ def _(Path, data, io, val_dataset):
+def _(load_dataset, load_dataset_config):
+    config = load_dataset_config("../paper/conf/datasets/train/uk_tune.yaml")
+    rec = load_dataset(config, base_dir="../paper/")
+    return (rec,)
+
+
+@app.cell
+def _(rec):
+    dict(rec[0].sound_events[0].tags[0].term)
+    return
+
+
+@app.cell
+def _(compute_class_summary, rec, targets):
+    compute_class_summary(rec, targets)
+    return

batdetect2/data/__init__.py

@@ -11,6 +11,11 @@ from batdetect2.data.datasets import (
     load_dataset_config,
     load_dataset_from_config,
 )
+from batdetect2.data.summary import (
+    compute_class_summary,
+    extract_recordings_df,
+    extract_sound_events_df,
+)

 __all__ = [
     "AOEFAnnotations",
@@ -18,6 +23,9 @@ __all__ = [
     "BatDetect2FilesAnnotations",
     "BatDetect2MergedAnnotations",
     "DatasetConfig",
+    "compute_class_summary",
+    "extract_recordings_df",
+    "extract_sound_events_df",
     "load_annotated_dataset",
     "load_dataset",
     "load_dataset_config",

batdetect2/data/split.py (new file)

@@ -0,0 +1,75 @@
+from typing import Optional, Tuple
+
+from sklearn.model_selection import train_test_split
+
+from batdetect2.data.datasets import Dataset
+from batdetect2.data.summary import (
+    extract_recordings_df,
+    extract_sound_events_df,
+)
+from batdetect2.targets.types import TargetProtocol
+
+
+def split_dataset_by_recordings(
+    dataset: Dataset,
+    targets: TargetProtocol,
+    train_size: float = 0.75,
+    random_state: Optional[int] = None,
+) -> Tuple[Dataset, Dataset]:
+    """Split a dataset into train and test sets at the recording level.
+
+    Recordings are stratified by the majority target class among their
+    sound events, so class proportions are roughly preserved in both
+    splits. Recordings with no target sound events are split separately
+    at the same ratio. All clips of a recording land in the same split.
+    """
+    recordings = extract_recordings_df(dataset)
+    sound_events = extract_sound_events_df(
+        dataset,
+        targets,
+        exclude_non_target=True,
+        exclude_generic=True,
+    )
+
+    majority_class = (
+        sound_events.groupby("recording_id")
+        .apply(
+            lambda group: group["class_name"]  # type: ignore
+            .value_counts()
+            .sort_values(ascending=False)
+            .index[0],
+            include_groups=False,  # type: ignore
+        )
+        .rename("class_name")
+        .to_frame()
+        .reset_index()
+    )
+
+    train, test = train_test_split(
+        majority_class["recording_id"],
+        stratify=majority_class["class_name"],
+        train_size=train_size,
+        random_state=random_state,
+    )
+
+    train_ids_set = set(train.values)  # type: ignore
+    test_ids_set = set(test.values)  # type: ignore
+
+    # Recordings absent from `majority_class` (no target sound events)
+    # are split separately at the same ratio. Sort them first so the
+    # result is deterministic when `random_state` is fixed, since set
+    # iteration order is not.
+    extra = set(recordings["recording_id"]) - train_ids_set - test_ids_set
+    if extra:
+        train_extra, test_extra = train_test_split(
+            sorted(extra),
+            train_size=train_size,
+            random_state=random_state,
+        )
+        train_ids_set.update(train_extra)
+        test_ids_set.update(test_extra)
+
+    train_dataset = [
+        clip_annotation
+        for clip_annotation in dataset
+        if str(clip_annotation.clip.recording.uuid) in train_ids_set
+    ]
+    test_dataset = [
+        clip_annotation
+        for clip_annotation in dataset
+        if str(clip_annotation.clip.recording.uuid) in test_ids_set
+    ]
+
+    return train_dataset, test_dataset
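
# Usage sketch (illustrative, not part of the committed file): assumes a
# loaded `dataset` and a configured `targets` implementing TargetProtocol,
# as in the notebook above.
train_clips, val_clips = split_dataset_by_recordings(
    dataset, targets, train_size=0.8, random_state=42
)
# The split happens at the recording level, so no recording contributes
# clips to both sides.
train_recs = {str(c.clip.recording.uuid) for c in train_clips}
val_recs = {str(c.clip.recording.uuid) for c in val_clips}
assert not train_recs & val_recs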

batdetect2/data/summary.py (new file)

@@ -0,0 +1,192 @@
+import pandas as pd
+from soundevent.geometry import compute_bounds
+
+from batdetect2.data.datasets import Dataset
+from batdetect2.targets.types import TargetProtocol
+
+__all__ = [
+    "extract_recordings_df",
+    "extract_sound_events_df",
+    "compute_class_summary",
+]
+
+
+def extract_recordings_df(dataset: Dataset) -> pd.DataFrame:
+    """Extract recording metadata into a pandas DataFrame.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        A list of clip annotations from which to extract recording
+        information.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame where each row corresponds to a recording, containing
+        metadata such as duration, path, sample rate, and other properties.
+    """
+    recordings = []
+    for clip_annotation in dataset:
+        clip = clip_annotation.clip
+        recording = clip.recording
+        recordings.append(
+            {
+                "clip_annotation_id": str(clip_annotation.uuid),
+                "recording_id": str(recording.uuid),
+                "duration": clip.duration,
+                "filename": recording.path.name,
+                **recording.model_dump(
+                    mode="json",
+                    include={
+                        "samplerate",
+                        "hash",
+                        "path",
+                        "date",
+                        "time",
+                        "latitude",
+                        "longitude",
+                    },
+                ),
+            }
+        )
+    return pd.DataFrame(recordings)
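
# Illustrative use (assumes `dataset` is an already-loaded Dataset): the
# flat frame makes corpus-level checks one-liners.
recs = extract_recordings_df(dataset)
total_hours = recs["duration"].sum() / 3600  # clip durations are in seconds
clips_per_rate = recs.groupby("samplerate").size()  # clips per sample rate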
+def extract_sound_events_df(
+    dataset: Dataset,
+    targets: TargetProtocol,
+    exclude_non_target: bool = True,
+    exclude_generic: bool = True,
+) -> pd.DataFrame:
+    """Extract sound event data into a pandas DataFrame.
+
+    This function iterates through all sound events in the provided
+    dataset, applies filtering and classification logic based on the
+    `targets` protocol, and compiles the results into a structured
+    DataFrame.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        The dataset containing clip annotations with sound events.
+    targets : TargetProtocol
+        An object that provides methods to filter (`filter`) and classify
+        (`encode_class`) sound events.
+    exclude_non_target : bool, default=True
+        If True, sound events that do not pass the `targets.filter()`
+        check are excluded from the output.
+    exclude_generic : bool, default=True
+        If True, sound events that are classified with a `None` class
+        name by `targets.encode_class()` are excluded.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame where each row represents a single sound event,
+        including its bounding box, class name, and other relevant
+        attributes.
+    """
+    sound_events = []
+    for clip_annotation in dataset:
+        for sound_event in clip_annotation.sound_events:
+            is_target = targets.filter(sound_event)
+            if not is_target and exclude_non_target:
+                continue
+
+            if sound_event.sound_event.geometry is None:
+                continue
+
+            class_name = targets.encode_class(sound_event)
+            if class_name is None and exclude_generic:
+                continue
+
+            start_time, low_freq, end_time, high_freq = compute_bounds(
+                sound_event.sound_event.geometry
+            )
+            sound_events.append(
+                {
+                    "clip_annotation_id": str(clip_annotation.uuid),
+                    "sound_event_id": str(sound_event.uuid),
+                    "recording_id": str(
+                        sound_event.sound_event.recording.uuid
+                    ),
+                    "start_time": start_time,
+                    "end_time": end_time,
+                    "low_freq": low_freq,
+                    "high_freq": high_freq,
+                    "is_target": is_target,
+                    "class_name": class_name,
+                }
+            )
+    return pd.DataFrame(sound_events)
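
# Illustrative use (assumes `dataset` and `targets` as above): disabling
# both exclusions keeps every annotated event with geometry, so the
# `is_target` and `class_name` columns can be used to audit what the
# default filters would drop.
events = extract_sound_events_df(
    dataset, targets, exclude_non_target=False, exclude_generic=False
)
non_target = events[~events["is_target"]]
generic = events[events["is_target"] & events["class_name"].isna()]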
+def compute_class_summary(
+    dataset: Dataset,
+    targets: TargetProtocol,
+) -> pd.DataFrame:
+    """Compute a summary of sound event statistics grouped by class.
+
+    This function generates a high-level summary DataFrame that provides
+    key metrics for each class identified in the dataset. It calculates
+    the total number of calls, the number of unique annotated clips
+    containing each class, the total duration of those clips, and the
+    call rate.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        The dataset to be summarized.
+    targets : TargetProtocol
+        An object providing the classification logic for sound events.
+
+    Returns
+    -------
+    pd.DataFrame
+        A DataFrame indexed by class name, with columns for 'num calls',
+        'num recordings', 'duration', and 'call_rate'.
+    """
+    sound_events = extract_sound_events_df(
+        dataset,
+        targets,
+        exclude_generic=True,
+        exclude_non_target=True,
+    )
+    recordings = extract_recordings_df(dataset)
+
+    num_calls = (
+        sound_events.groupby("class_name")
+        .size()
+        .sort_values(ascending=False)
+        .rename("num calls")
+    )
+
+    num_recs = (
+        sound_events.groupby("class_name")["clip_annotation_id"]
+        .nunique()
+        .sort_values(ascending=False)
+        .rename("num recordings")
+    )
+
+    durations = (
+        sound_events.groupby("class_name")
+        .apply(
+            lambda group: recordings[
+                recordings["clip_annotation_id"].isin(
+                    group["clip_annotation_id"]  # type: ignore
+                )
+            ]["duration"].sum(),
+            include_groups=False,  # type: ignore
+        )
+        .sort_values(ascending=False)
+        .rename("duration")
+    )
+
+    return (
+        num_calls.to_frame()
+        .join(num_recs)
+        .join(durations)
+        .sort_values("num calls", ascending=False)
+        .assign(call_rate=lambda df: df["num calls"] / df["duration"])
+    )
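
# Illustrative use (assumes `dataset` and `targets` as above): `duration`
# sums the durations of the clips containing each class, so `call_rate`
# is calls per second of audio in which the class occurs. Sorting by
# 'num calls' ascending surfaces the sparsest classes.
summary = compute_class_summary(dataset, targets)
rarest = summary.sort_values("num calls").head()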