Mirror of https://github.com/macaodha/batdetect2.git, synced 2026-01-10 17:19:34 +01:00
Add dataset summary and split functions

commit 374c62d7ab (parent ef279bee5d)
@@ -12,8 +12,20 @@ def _():
 @app.cell
 def _():
-    from batdetect2.data import load_dataset_config, load_dataset
-    return load_dataset, load_dataset_config
+    from batdetect2.data import (
+        load_dataset_config,
+        load_dataset,
+        extract_recordings_df,
+        extract_sound_events_df,
+        compute_class_summary,
+    )
+    return (
+        compute_class_summary,
+        extract_recordings_df,
+        extract_sound_events_df,
+        load_dataset,
+        load_dataset_config,
+    )


 @app.cell
@@ -72,183 +84,50 @@ def _(build_targets, targets_config):
 def _():
-    import pandas as pd
-    from soundevent.geometry import compute_bounds
-    return compute_bounds, pd
+    return


 @app.cell
-def _(dataset, pd):
-    def get_recording_df(dataset):
-        recordings = []
-
-        for clip_annotation in dataset:
-            recordings.append(
-                {
-                    "recording_id": clip_annotation.clip.recording.uuid,
-                    "duration": clip_annotation.clip.duration,
-                    "clip_annotation_id": clip_annotation.uuid,
-                    "samplerate": clip_annotation.clip.recording.samplerate,
-                    "path": clip_annotation.clip.recording.path.name,
-                }
-            )
-
-        return pd.DataFrame(recordings)
-
-
-    recordings = get_recording_df(dataset)
+def _(dataset, extract_recordings_df):
+    recordings = extract_recordings_df(dataset)
     recordings
-    return (recordings,)
+    return


 @app.cell
-def _(compute_bounds, dataset, pd, targets):
-    def get_sound_event_df(dataset):
-        sound_events = []
-
-        for clip_annotation in dataset:
-            for sound_event in clip_annotation.sound_events:
-                if not targets.filter(sound_event):
-                    continue
-
-                if sound_event.sound_event.geometry is None:
-                    continue
-
-                class_name = targets.encode_class(sound_event)
-
-                if class_name is None:
-                    continue
-
-                start_time, low_freq, end_time, high_freq = compute_bounds(
-                    sound_event.sound_event.geometry
-                )
-
-                sound_events.append(
-                    {
-                        "clip_annotation_id": clip_annotation.uuid,
-                        "sound_event_id": sound_event.uuid,
-                        "class_name": class_name,
-                        "start_time": start_time,
-                        "end_time": end_time,
-                        "low_freq": low_freq,
-                        "high_freq": high_freq,
-                    }
-                )
-
-        return pd.DataFrame(sound_events)
-
-
-    sound_events = get_sound_event_df(dataset)
+def _(dataset, extract_sound_events_df, targets):
+    sound_events = extract_sound_events_df(dataset, targets)
     sound_events
-    return get_sound_event_df, sound_events
+    return


 @app.cell
-def _(recordings, sound_events):
-    def produce_summary(sound_events):
-        num_calls = (
-            sound_events.groupby("class_name")
-            .size()
-            .sort_values(ascending=False)
-            .rename("num calls")
-        )
-        num_recs = (
-            sound_events.groupby("class_name")["clip_annotation_id"]
-            .nunique()
-            .sort_values(ascending=False)
-            .rename("num recordings")
-        )
-        durations = (
-            sound_events.groupby("class_name")
-            .apply(
-                lambda group: recordings[
-                    recordings["clip_annotation_id"].isin(
-                        group["clip_annotation_id"]
-                    )
-                ]["duration"].sum(),
-                include_groups=False,
-            )
-            .sort_values(ascending=False)
-            .rename("duration")
-        )
-        return (
-            num_calls.to_frame()
-            .join(num_recs)
-            .join(durations)
-            .sort_values("num calls", ascending=False)
-            .assign(call_rate=lambda df: df["num calls"] / df["duration"])
-        )
-
-
-    produce_summary(sound_events)
-    return (produce_summary,)
-
-
-@app.cell
-def _(sound_events):
-    majority_class = (
-        sound_events.groupby("clip_annotation_id")
-        .apply(
-            lambda group: group["class_name"]
-            .value_counts()
-            .sort_values(ascending=False)
-            .index[0],
-            include_groups=False,
-        )
-        .rename("class_name")
-        .to_frame()
-        .reset_index()
-    )
-    return (majority_class,)
-
-
-@app.cell
-def _(majority_class):
-    majority_class
+def _(compute_class_summary, dataset, targets):
+    compute_class_summary(dataset, targets)
     return


 @app.cell
 def _():
-    from sklearn.model_selection import train_test_split
-    return (train_test_split,)
+    from batdetect2.data.split import split_dataset_by_recordings
+    return (split_dataset_by_recordings,)


 @app.cell
-def _(majority_class, train_test_split):
-    train, val = train_test_split(
-        majority_class["clip_annotation_id"],
-        stratify=majority_class["class_name"],
-    )
-    return train, val
-
-
-@app.cell
-def _(dataset, train, val):
-    train_dataset = [
-        clip_annotation
-        for clip_annotation in dataset
-        if clip_annotation.uuid in set(train.values)
-    ]
-    val_dataset = [
-        clip_annotation
-        for clip_annotation in dataset
-        if clip_annotation.uuid in set(val.values)
-    ]
+def _(dataset, split_dataset_by_recordings, targets):
+    train_dataset, val_dataset = split_dataset_by_recordings(dataset, targets, random_state=42)
     return train_dataset, val_dataset


 @app.cell
-def _(get_sound_event_df, produce_summary, train_dataset):
-    train_sound_events = get_sound_event_df(train_dataset)
-    train_summary = produce_summary(train_sound_events)
-    train_summary
+def _(compute_class_summary, targets, train_dataset):
+    compute_class_summary(train_dataset, targets)
     return


 @app.cell
-def _(get_sound_event_df, produce_summary, val_dataset):
-    val_sound_events = get_sound_event_df(val_dataset)
-    val_summary = produce_summary(val_sound_events)
-    val_summary
+def _(compute_class_summary, targets, val_dataset):
+    compute_class_summary(val_dataset, targets)
     return
@@ -291,6 +170,18 @@ def _(Path, data, io, val_dataset):
 def _(load_dataset, load_dataset_config):
     config = load_dataset_config("../paper/conf/datasets/train/uk_tune.yaml")
     rec = load_dataset(config, base_dir="../paper/")
     return (rec,)


+@app.cell
+def _(rec):
+    dict(rec[0].sound_events[0].tags[0].term)
+    return
+
+
+@app.cell
+def _(compute_class_summary, rec, targets):
+    compute_class_summary(rec, targets)
+    return
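Taken together, the notebook hunks replace the ad-hoc pandas helpers with the new reusable functions. Below is a minimal sketch of the resulting flow outside marimo, assuming a `targets` object built from the targets config as in the notebook's earlier cells (paths are the illustrative ones used above):

from batdetect2.data import (
    load_dataset_config,
    load_dataset,
    compute_class_summary,
)
from batdetect2.data.split import split_dataset_by_recordings

# Load an annotated dataset from a config file.
config = load_dataset_config("../paper/conf/datasets/train/uk_tune.yaml")
dataset = load_dataset(config, base_dir="../paper/")

# Per-class call counts, recording counts, durations, and call rates.
summary = compute_class_summary(dataset, targets)

# Recording-level stratified split, then summarize each partition.
train_dataset, val_dataset = split_dataset_by_recordings(
    dataset, targets, random_state=42
)
compute_class_summary(train_dataset, targets)
compute_class_summary(val_dataset, targets)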
@@ -11,6 +11,11 @@ from batdetect2.data.datasets import (
     load_dataset_config,
     load_dataset_from_config,
 )
+from batdetect2.data.summary import (
+    compute_class_summary,
+    extract_recordings_df,
+    extract_sound_events_df,
+)

 __all__ = [
     "AOEFAnnotations",
@@ -18,6 +23,9 @@ __all__ = [
     "BatDetect2FilesAnnotations",
     "BatDetect2MergedAnnotations",
     "DatasetConfig",
+    "compute_class_summary",
+    "extract_recordings_df",
+    "extract_sound_events_df",
     "load_annotated_dataset",
     "load_dataset",
     "load_dataset_config",
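Because the package `__init__` hunks above re-export the summary helpers, both import paths resolve to the same objects. A quick sanity check (illustrative, not part of the commit):

from batdetect2.data import compute_class_summary as exported
from batdetect2.data.summary import compute_class_summary as direct

# The package-level name is a re-export of the module-level one.
assert exported is direct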
src/batdetect2/data/split.py (new file, 75 lines)
@@ -0,0 +1,75 @@
from typing import Optional, Tuple

from sklearn.model_selection import train_test_split

from batdetect2.data.datasets import Dataset
from batdetect2.data.summary import (
    extract_recordings_df,
    extract_sound_events_df,
)
from batdetect2.targets.types import TargetProtocol


def split_dataset_by_recordings(
    dataset: Dataset,
    targets: TargetProtocol,
    train_size: float = 0.75,
    random_state: Optional[int] = None,
) -> Tuple[Dataset, Dataset]:
    recordings = extract_recordings_df(dataset)

    sound_events = extract_sound_events_df(
        dataset,
        targets,
        exclude_non_target=True,
        exclude_generic=True,
    )

    majority_class = (
        sound_events.groupby("recording_id")
        .apply(
            lambda group: group["class_name"]  # type: ignore
            .value_counts()
            .sort_values(ascending=False)
            .index[0],
            include_groups=False,  # type: ignore
        )
        .rename("class_name")
        .to_frame()
        .reset_index()
    )

    train, test = train_test_split(
        majority_class["recording_id"],
        stratify=majority_class["class_name"],
        train_size=train_size,
        random_state=random_state,
    )

    train_ids_set = set(train.values)  # type: ignore
    test_ids_set = set(test.values)  # type: ignore

    extra = set(recordings["recording_id"]) - train_ids_set - test_ids_set

    if extra:
        train_extra, test_extra = train_test_split(
            list(extra),
            train_size=train_size,
            random_state=random_state,
        )
        train_ids_set.update(train_extra)
        test_ids_set.update(test_extra)

    train_dataset = [
        clip_annotation
        for clip_annotation in dataset
        if str(clip_annotation.clip.recording.uuid) in train_ids_set
    ]

    test_dataset = [
        clip_annotation
        for clip_annotation in dataset
        if str(clip_annotation.clip.recording.uuid) in test_ids_set
    ]

    return train_dataset, test_dataset
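Two design points are worth noting: the split is stratified on each recording's majority class, so rare classes appear on both sides, and recordings with no target sound events are split separately (unstratified) rather than silently dropped. Because recordings, not clips, are the unit of splitting, a recording never contributes to both partitions. A quick leakage-check sketch, assuming `dataset` and `targets` as in the notebook above:

from batdetect2.data.split import split_dataset_by_recordings

train_dataset, test_dataset = split_dataset_by_recordings(
    dataset, targets, train_size=0.75, random_state=0
)

train_ids = {str(ca.clip.recording.uuid) for ca in train_dataset}
test_ids = {str(ca.clip.recording.uuid) for ca in test_dataset}

# A recording-level split puts every recording on exactly one side.
assert not train_ids & test_ids
assert len(train_dataset) + len(test_dataset) == len(dataset)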
src/batdetect2/data/summary.py (new file, 192 lines)
@@ -0,0 +1,192 @@
import pandas as pd
from soundevent.geometry import compute_bounds

from batdetect2.data.datasets import Dataset
from batdetect2.targets.types import TargetProtocol

__all__ = [
    "extract_recordings_df",
    "extract_sound_events_df",
    "compute_class_summary",
]


def extract_recordings_df(dataset: Dataset) -> pd.DataFrame:
    """Extract recording metadata into a pandas DataFrame.

    Parameters
    ----------
    dataset : List[data.ClipAnnotation]
        A list of clip annotations from which to extract recording information.

    Returns
    -------
    pd.DataFrame
        A DataFrame where each row corresponds to a recording, containing
        metadata such as duration, path, sample rate, and other properties.
    """
    recordings = []

    for clip_annotation in dataset:
        clip = clip_annotation.clip
        recording = clip.recording
        recordings.append(
            {
                "clip_annotation_id": str(clip_annotation.uuid),
                "recording_id": str(recording.uuid),
                "duration": clip.duration,
                "filename": recording.path.name,
                **recording.model_dump(
                    mode="json",
                    include={
                        "samplerate",
                        "hash",
                        "path",
                        "date",
                        "time",
                        "latitude",
                        "longitude",
                    },
                ),
            }
        )

    return pd.DataFrame(recordings)


def extract_sound_events_df(
    dataset: Dataset,
    targets: TargetProtocol,
    exclude_non_target: bool = True,
    exclude_generic: bool = True,
) -> pd.DataFrame:
    """Extract sound event data into a pandas DataFrame.

    This function iterates through all sound events in the provided dataset,
    applies filtering and classification logic based on the `targets`
    protocol, and compiles the results into a structured DataFrame.

    Parameters
    ----------
    dataset : List[data.ClipAnnotation]
        The dataset containing clip annotations with sound events.
    targets : TargetProtocol
        An object that provides methods to filter (`filter`) and classify
        (`encode_class`) sound events.
    exclude_non_target : bool, default=True
        If True, sound events that do not pass the `targets.filter()` check
        are excluded from the output.
    exclude_generic : bool, default=True
        If True, sound events that are classified with a `None` class name
        by `targets.encode_class()` are excluded.

    Returns
    -------
    pd.DataFrame
        A DataFrame where each row represents a single sound event, including
        its bounding box, class name, and other relevant attributes.
    """
    sound_events = []

    for clip_annotation in dataset:
        for sound_event in clip_annotation.sound_events:
            is_target = targets.filter(sound_event)

            if not is_target and exclude_non_target:
                continue

            if sound_event.sound_event.geometry is None:
                continue

            class_name = targets.encode_class(sound_event)

            if class_name is None and exclude_generic:
                continue

            start_time, low_freq, end_time, high_freq = compute_bounds(
                sound_event.sound_event.geometry
            )

            sound_events.append(
                {
                    "clip_annotation_id": str(clip_annotation.uuid),
                    "sound_event_id": str(sound_event.uuid),
                    "recording_id": str(
                        sound_event.sound_event.recording.uuid
                    ),
                    "start_time": start_time,
                    "end_time": end_time,
                    "low_freq": low_freq,
                    "high_freq": high_freq,
                    "is_target": is_target,
                    "class_name": class_name,
                }
            )

    return pd.DataFrame(sound_events)


def compute_class_summary(
    dataset: Dataset,
    targets: TargetProtocol,
) -> pd.DataFrame:
    """Compute a summary of sound event statistics grouped by class.

    This function generates a high-level summary DataFrame that provides
    key metrics for each class identified in the dataset. It calculates
    the total number of calls, the number of unique recordings containing
    each class, the total duration of those recordings, and the call rate.

    Parameters
    ----------
    dataset : List[data.ClipAnnotation]
        The dataset to be summarized.
    targets : TargetProtocol
        An object providing the classification logic for sound events.

    Returns
    -------
    pd.DataFrame
        A DataFrame indexed by class name, with columns for 'num calls',
        'num recordings', 'duration', and 'call_rate'.
    """
    sound_events = extract_sound_events_df(
        dataset,
        targets,
        exclude_generic=True,
        exclude_non_target=True,
    )
    recordings = extract_recordings_df(dataset)

    num_calls = (
        sound_events.groupby("class_name")
        .size()
        .sort_values(ascending=False)
        .rename("num calls")
    )
    num_recs = (
        sound_events.groupby("class_name")["clip_annotation_id"]
        .nunique()
        .sort_values(ascending=False)
        .rename("num recordings")
    )
    durations = (
        sound_events.groupby("class_name")
        .apply(
            lambda group: recordings[
                recordings["clip_annotation_id"].isin(
                    group["clip_annotation_id"]  # type: ignore
                )
            ]["duration"].sum(),
            include_groups=False,  # type: ignore
        )
        .sort_values(ascending=False)
        .rename("duration")
    )
    return (
        num_calls.to_frame()
        .join(num_recs)
        .join(durations)
        .sort_values("num calls", ascending=False)
        .assign(call_rate=lambda df: df["num calls"] / df["duration"])
    )
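A usage sketch for the exclusion flags, again assuming `dataset` and `targets` as before: with both flags disabled, the frame keeps non-target and generic events, which makes it easy to audit what the defaults drop.

from batdetect2.data.summary import (
    compute_class_summary,
    extract_sound_events_df,
)

# Keep everything, then count what the default filters would exclude.
events = extract_sound_events_df(
    dataset,
    targets,
    exclude_non_target=False,
    exclude_generic=False,
)
num_non_target = (~events["is_target"]).sum()
num_generic = events["class_name"].isna().sum()  # None becomes NaN in pandas
print(f"would drop {num_non_target} non-target, {num_generic} generic events")

# Per-class summary over target, classified events only.
print(compute_class_summary(dataset, targets))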