Compare commits

..

5 Commits

Author SHA1 Message Date
mbsantiago
5d92f3a00d Remove stale pylintrc 2026-03-19 14:50:47 +00:00
mbsantiago
26cbe00af2 Update image path 2026-03-19 14:47:40 +00:00
mbsantiago
00430c8426 Remove dvc stuff 2026-03-19 14:46:49 +00:00
mbsantiago
daf18b9c20 Change name of ims folder to assets. 2026-03-19 14:45:43 +00:00
mbsantiago
78ede31b8b Remove stale scripts 2026-03-19 14:44:05 +00:00
12 changed files with 4 additions and 840 deletions

3
.dvc/.gitignore vendored
View File

@ -1,3 +0,0 @@
/config.local
/tmp
/cache

View File

View File

@ -1,3 +0,0 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

3
.gitignore vendored
View File

@ -128,3 +128,6 @@ notebooks/tmp
/notebooks
/AGENTS.md
/scripts
# Assets
!assets/*

View File

@ -1,5 +0,0 @@
[TYPECHECK]
# List of members which are set dynamically and missed by Pylint inference
# system, and so shouldn't trigger E1101 when accessed.
generated-members=torch.*

View File

@ -1,5 +1,5 @@
# BatDetect2
<img style="display: block-inline;" width="64" height="64" src="ims/bat_icon.png"> Code for detecting and classifying bat echolocation calls in high frequency audio recordings.
<img style="display: inline-block;" width="64" height="64" src="assets/bat_icon.png"> Code for detecting and classifying bat echolocation calls in high frequency audio recordings.
## Getting started
### Python Environment

View File

Before

Width:  |  Height:  |  Size: 19 KiB

After

Width:  |  Height:  |  Size: 19 KiB

View File

@ -1,17 +0,0 @@
This directory contains some scripts for visualizing the raw data and model outputs.
`gen_spec_image.py`: saves the model predictions on a spectrogram of the input audio file.
e.g.
`python gen_spec_image.py ../example_data/audio/20170701_213954-MYOMYS-LR_0_0.5.wav ../models/Net2DFast_UK_same.pth.tar`
`gen_spec_video.py`: generates a video showing the model predictions for a file.
e.g.
`python gen_spec_video.py ../example_data/audio/20170701_213954-MYOMYS-LR_0_0.5.wav ../models/Net2DFast_UK_same.pth.tar`
`gen_dataset_summary_image.py`: generates an image displaying the mean spectrogram for each class in a specified dataset.
e.g.
`python gen_dataset_summary_image.py --ann_file PATH_TO_ANN/australia_TRAIN.json PATH_TO_AUDIO/audio/ ../plots/australia/`

View File

@ -1,96 +0,0 @@
"""
Loads a set of annotations corresponding to a dataset and saves an image which
is the mean spectrogram for each class.
"""
import argparse
import os
import matplotlib.pyplot as plt
import numpy as np
import viz_helpers as vz
import batdetect2.detector.parameters as parameters
import batdetect2.train.train_split as ts
import batdetect2.train.train_utils as tu
import batdetect2.utils.audio_utils as au
if __name__ == "__main__":
    # Command-line entry point: loads the annotations for one dataset, builds
    # per-annotation spectrogram crops, and saves a single image with the mean
    # spectrogram of every class.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "audio_path", type=str, help="Input directory for audio"
    )
    parser.add_argument(
        "op_dir",
        type=str,
        help="Output directory where the summary image is saved",
    )
    parser.add_argument(
        "--ann_file",
        type=str,
        help="Path to where single annotation json file is stored",
    )
    parser.add_argument(
        "--uk_split", type=str, default="", help="Set as: diff or same"
    )
    parser.add_argument(
        "--file_type",
        type=str,
        default="png",
        help="Type of image to save png or pdf",
    )
    args = vars(parser.parse_args())

    os.makedirs(args["op_dir"], exist_ok=True)

    # Start from the default parameters and override the ones that control
    # how the per-call spectrogram crops are generated.
    params = parameters.get_params(False)
    params["smooth_spec"] = False
    params["spec_width"] = 48
    params["norm_type"] = "log"  # log, pcen
    params["aud_pad"] = 0.005
    classes_to_ignore = params["classes_to_ignore"] + params["generic_class"]

    # load train annotations
    if args["uk_split"] == "":
        if args["ann_file"] is None:
            # The dataset name is derived from the annotation file name, so
            # fail with a usage message instead of a TypeError further down.
            parser.error("--ann_file is required when --uk_split is not set")
        print("\nLoading:", args["ann_file"], "\n")
        dataset_name = os.path.basename(args["ann_file"]).replace(".json", "")
        datasets = [
            tu.get_blank_dataset_dict(
                dataset_name, False, args["ann_file"], args["audio_path"]
            )
        ]
    else:
        # load uk data - special case
        print("\nLoading:", args["uk_split"], "\n")
        # should be uk_diff, or uk_same
        dataset_name = "uk_" + args["uk_split"]
        datasets, _ = ts.get_train_test_data(
            args["ann_file"],
            args["audio_path"],
            args["uk_split"],
            load_extra=False,
        )

    anns, class_names, _ = tu.load_set_of_anns(
        datasets, classes_to_ignore, params["events_of_interest"]
    )
    # Classes are plotted in the order in which they were loaded.
    class_names_order = range(len(class_names))

    x_train, y_train = vz.load_data(
        anns,
        params,
        class_names,
        smooth_spec=params["smooth_spec"],
        norm_type=params["norm_type"],
    )

    op_file_name = os.path.join(
        args["op_dir"], dataset_name + "." + args["file_type"]
    )
    vz.save_summary_image(
        x_train, y_train, class_names, params, op_file_name, class_names_order
    )
    print("\nImage saved to:", op_file_name)

View File

@ -1,211 +0,0 @@
"""
Visualize predictions on top of spectrogram.
Will save images with:
1) raw spectrogram
2) spectrogram with GT boxes
3) spectrogram with predicted boxes
"""
import argparse
import json
import os
import sys
import torch
import matplotlib.pyplot as plt
import numpy as np
import batdetect2.evaluate.evaluate_models as evlm
import batdetect2.utils.audio_utils as au
import batdetect2.utils.detector_utils as du
import batdetect2.utils.plot_utils as viz
def filter_anns(anns, start_time, stop_time, end_margin=0.02):
    """Return the annotations whose start time falls inside a time window.

    An annotation is kept when
    ``start_time <= ann["start_time"] < stop_time - end_margin``.

    Args:
        anns: Iterable of dicts, each with a numeric ``"start_time"`` entry.
        start_time: Inclusive lower bound of the window.
        stop_time: Upper bound of the window, before the margin is removed.
        end_margin: Amount subtracted from ``stop_time`` so that annotations
            starting right at the end of the window are dropped. Defaults to
            0.02, the value that was previously hard-coded.

    Returns:
        A new list with the matching annotations, in their original order.
    """
    latest_start = stop_time - end_margin
    return [
        ann
        for ann in anns
        if start_time <= ann["start_time"] < latest_start
    ]
if __name__ == "__main__":
    # Command-line entry point: runs a BatDetect model on one audio file and
    # saves spectrogram images of a cropped time window: a clean one, one with
    # the predictions and, when an annotation file is given and found, one
    # with the ground truth annotations.
    parser = argparse.ArgumentParser()
    parser.add_argument("audio_file", type=str, help="Path to audio file")
    parser.add_argument("model_path", type=str, help="Path to BatDetect model")
    parser.add_argument(
        "--ann_file", type=str, default="", help="Path to annotation file"
    )
    parser.add_argument(
        "--op_dir",
        type=str,
        default="plots/",
        help="Output directory for plots",
    )
    parser.add_argument(
        "--file_type",
        type=str,
        default="png",
        help="Type of image to save png or pdf",
    )
    parser.add_argument(
        "--title_text",
        type=str,
        default="",
        help="Text to add as title of plots",
    )
    parser.add_argument(
        "--detection_threshold",
        type=float,
        default=0.2,
        help="Threshold for output detections",
    )
    parser.add_argument(
        "--start_time",
        type=float,
        default=0.0,
        help="Start time for cropped file",
    )
    parser.add_argument(
        "--stop_time",
        type=float,
        default=0.5,
        help="End time for cropped file",
    )
    parser.add_argument(
        "--time_expansion_factor",
        type=int,
        default=1,
        help="Time expansion factor",
    )
    args_cmd = vars(parser.parse_args())
    # NOTE(review): --title_text is parsed but never read below; every
    # save_ann_spec call receives an empty string instead. Confirm whether
    # the option should be forwarded.

    # load the model
    bd_args = du.get_default_bd_args()
    model, params_bd = du.load_model(args_cmd["model_path"])
    bd_args["detection_threshold"] = args_cmd["detection_threshold"]
    bd_args["time_expansion_factor"] = args_cmd["time_expansion_factor"]

    # load the annotation if it exists
    gt_present = False
    if args_cmd["ann_file"] != "":
        if os.path.isfile(args_cmd["ann_file"]):
            with open(args_cmd["ann_file"]) as da:
                gt_anns = json.load(da)
            gt_anns = filter_anns(
                gt_anns["annotation"],
                args_cmd["start_time"],
                args_cmd["stop_time"],
            )
            gt_present = True
        else:
            print("Annotation file not found: ", args_cmd["ann_file"])

    # load the audio file
    if not os.path.isfile(args_cmd["audio_file"]):
        print("Audio file not found: ", args_cmd["audio_file"])
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)

    # load audio and crop
    print("\nProcessing: " + os.path.basename(args_cmd["audio_file"]))
    print("\nOutput directory: " + args_cmd["op_dir"])
    sampling_rate, audio = au.load_audio(
        args_cmd["audio_file"],
        # The parser defines "time_expansion_factor"; there is no "time_exp"
        # entry in args_cmd, so the old lookup always raised KeyError.
        args_cmd["time_expansion_factor"],
        params_bd["target_samp_rate"],
        params_bd["scale_raw_audio"],
    )
    st_samp = int(sampling_rate * args_cmd["start_time"])
    en_samp = int(sampling_rate * args_cmd["stop_time"])
    if en_samp > audio.shape[0]:
        # Zero-pad so the crop always reaches the requested stop time.
        audio = np.hstack(
            (audio, np.zeros(en_samp - audio.shape[0], dtype=audio.dtype))
        )
    audio = audio[st_samp:en_samp]

    duration = audio.shape[0] / sampling_rate
    print("File duration: {} seconds".format(duration))

    # create spec for viz
    spec, _ = au.generate_spectrogram(
        audio, sampling_rate, params_bd, True, False
    )

    # Command-line overrides in bd_args take precedence over model params.
    run_config = {
        **params_bd,
        **bd_args,
    }

    # run model and filter detections so only keep ones in relevant time range
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = du.process_file(
        args_cmd["audio_file"], model, run_config, device
    )
    pred_anns = filter_anns(
        results["pred_dict"]["annotation"],
        args_cmd["start_time"],
        args_cmd["stop_time"],
    )
    print(len(pred_anns), "Detections")

    # save output
    os.makedirs(args_cmd["op_dir"], exist_ok=True)

    # create output file names; splitext copes with extensions of any length
    file_stem = os.path.splitext(os.path.basename(args_cmd["audio_file"]))[0]
    op_path_clean = os.path.join(
        args_cmd["op_dir"], file_stem + "_clean." + args_cmd["file_type"]
    )
    op_path_pred = os.path.join(
        args_cmd["op_dir"], file_stem + "_pred." + args_cmd["file_type"]
    )

    # create and save images
    viz.save_ann_spec(
        op_path_clean,
        spec,
        params_bd["min_freq"],
        params_bd["max_freq"],
        duration,
        args_cmd["start_time"],
        "",
        None,
    )
    viz.save_ann_spec(
        op_path_pred,
        spec,
        params_bd["min_freq"],
        params_bd["max_freq"],
        duration,
        args_cmd["start_time"],
        "",
        pred_anns,
    )
    if gt_present:
        op_path_gt = os.path.join(
            args_cmd["op_dir"], file_stem + "_gt." + args_cmd["file_type"]
        )
        viz.save_ann_spec(
            op_path_gt,
            spec,
            params_bd["min_freq"],
            params_bd["max_freq"],
            duration,
            args_cmd["start_time"],
            "",
            gt_anns,
        )

View File

@ -1,278 +0,0 @@
"""
This script takes an audio file as input, runs the detector, and makes a video output
Notes:
It needs ffmpeg installed to make the videos
Sometimes conda can override the default ffmpeg path; set this to use the system one.
Check which one is being used with `which ffmpeg`. If it is the conda version, it can throw an error.
Best to use system one - see ffmpeg_path.
"""
import argparse
import os
import shutil
import sys
import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.io import wavfile
import batdetect2.detector.parameters as parameters
import batdetect2.utils.audio_utils as au
import batdetect2.utils.detector_utils as du
import batdetect2.utils.plot_utils as viz
if __name__ == "__main__":
    # Command-line entry point: crops the first part of an audio file,
    # optionally runs the detector on it, renders one image per video frame
    # with a moving cursor, and combines the frames with the audio into a
    # video using ffmpeg.

    # Only needed to launch ffmpeg from this script entry point.
    import subprocess

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "audio_file", type=str, help="Path to input audio file"
    )
    parser.add_argument(
        "model_path", type=str, help="Path to trained BatDetect model"
    )
    parser.add_argument(
        "--op_dir",
        type=str,
        default="generated_vids/",
        help="Path to output directory",
    )
    parser.add_argument(
        "--no_detector", action="store_true", help="Do not run detector"
    )
    parser.add_argument(
        "--plot_class_names_off",
        action="store_true",
        help="Do not plot class names",
    )
    parser.add_argument(
        "--disable_axis", action="store_true", help="Do not plot axis"
    )
    parser.add_argument(
        "--detection_threshold",
        type=float,
        default=0.2,
        help="Cut-off probability for detector",
    )
    parser.add_argument(
        "--time_expansion_factor",
        type=int,
        default=1,
        dest="time_expansion_factor",
        help="The time expansion factor used for all files (default is 1)",
    )
    args_cmd = vars(parser.parse_args())

    # file of interest
    audio_file = args_cmd["audio_file"]
    op_dir = args_cmd["op_dir"]
    op_str = "_output"
    ffmpeg_path = "/usr/bin/"

    # Exit with a non-zero status so callers can detect missing inputs.
    if not os.path.isfile(audio_file):
        print("Audio file not found: ", audio_file)
        sys.exit(1)

    if not os.path.isfile(args_cmd["model_path"]):
        print("Model not found: ", args_cmd["model_path"])
        sys.exit(1)

    start_time = 0.0
    duration = 0.5
    reveal_boxes = True  # makes the boxes appear one at a time
    fps = 24
    dpi = 100

    # The temporary frame directory lives inside op_dir, so creating it also
    # creates op_dir itself.
    op_dir_tmp = os.path.join(op_dir, "op_tmp_vids", "")
    os.makedirs(op_dir_tmp, exist_ok=True)

    # Common prefix of every output file; splitext copes with extensions of
    # any length.
    op_prefix = os.path.join(
        op_dir, os.path.splitext(os.path.basename(audio_file))[0] + op_str
    )

    params = parameters.get_params(False)
    args = du.get_default_bd_args()
    args["time_expansion_factor"] = args_cmd["time_expansion_factor"]
    args["detection_threshold"] = args_cmd["detection_threshold"]

    # load audio file
    print("\nProcessing: " + os.path.basename(audio_file))
    print("\nOutput directory: " + op_dir)
    sampling_rate, audio = au.load_audio(
        audio_file, args["time_expansion_factor"], params["target_samp_rate"]
    )
    audio = audio[
        int(sampling_rate * start_time) : int(
            sampling_rate * start_time + sampling_rate * duration
        )
    ]
    # Keep the unpadded crop: this is what gets written as the audio track.
    audio_orig = audio.copy()
    audio = au.pad_audio(
        audio,
        sampling_rate,
        params["fft_win_length"],
        params["fft_overlap"],
        params["resize_factor"],
        params["spec_divide_factor"],
    )

    # generate spectrogram
    spec, _ = au.generate_spectrogram(audio, sampling_rate, params, True)
    max_val = spec.max() * 1.1

    if not args_cmd["no_detector"]:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(" Loading model and running detector on entire file ...")
        model, det_params = du.load_model(args_cmd["model_path"])
        det_params["detection_threshold"] = args["detection_threshold"]

        # Command-line overrides in args take precedence over model params.
        run_config = {
            **det_params,
            **args,
        }
        results = du.process_file(
            audio_file,
            model,
            run_config,
            device,
        )

        print(" Processing detections and plotting ...")
        # Keep only detections that lie fully inside the cropped window.
        detections = [
            bb
            for bb in results["pred_dict"]["annotation"]
            if bb["start_time"] >= start_time
            and bb["end_time"] < start_time + duration
        ]

        # plot boxes
        fig = plt.figure(
            1, figsize=(spec.shape[1] / dpi, spec.shape[0] / dpi), dpi=dpi
        )
        # From here on the duration is the one implied by the (padded)
        # spectrogram width rather than the nominal crop length.
        duration = au.x_coords_to_time(
            spec.shape[1],
            sampling_rate,
            params["fft_win_length"],
            params["fft_overlap"],
        )
        viz.create_box_image(
            spec,
            fig,
            detections,
            start_time,
            start_time + duration,
            duration,
            params,
            max_val,
            plot_class_names=not args_cmd["plot_class_names_off"],
        )
        op_im_file_boxes = op_prefix + "_boxes.png"
        fig.savefig(op_im_file_boxes, dpi=dpi)
        plt.close(1)
        spec_with_boxes = plt.imread(op_im_file_boxes)

    print(" Saving audio file ...")
    # Files that are not already time expanded are written at a tenth of the
    # sampling rate; the frame count below is scaled by the same factor of 10.
    if args["time_expansion_factor"] == 1:
        sampling_rate_op = int(sampling_rate / 10.0)
    else:
        sampling_rate_op = sampling_rate
    op_audio_file = op_prefix + ".wav"
    wavfile.write(op_audio_file, sampling_rate_op, audio_orig)

    print(" Saving image ...")
    op_im_file = op_prefix + ".png"
    plt.imsave(op_im_file, spec, vmin=0, vmax=max_val, cmap="plasma")
    # Read the colour-mapped image back so frames can be composed from it.
    spec_blank = plt.imread(op_im_file)

    # create figure
    freq_scale = 1000  # turn Hz to kHz
    min_freq = params["min_freq"] // freq_scale
    max_freq = params["max_freq"] // freq_scale
    y_extent = [0, duration, min_freq, max_freq]

    print(" Saving video frames ...")
    # save images that will be combined into video
    # will either plot with or without boxes
    cursor_cols = np.linspace(0, spec.shape[1] - 1, int(fps * duration * 10))
    for ii, col in enumerate(cursor_cols):
        cursor = int(col)
        frame_file = op_dir_tmp + str(ii).zfill(4) + ".png"

        if args_cmd["no_detector"]:
            # Plain spectrogram with a cursor column drawn at the max value.
            spec_op = spec.copy()
            if ii > 0:
                spec_op[:, cursor] = max_val
            plt.imsave(
                frame_file,
                spec_op,
                vmin=0,
                vmax=max_val,
                cmap="plasma",
            )
            continue

        spec_op = spec_with_boxes.copy()
        if ii > 0:
            spec_op[:, cursor, :] = 1.0
            if reveal_boxes:
                # Everything to the right of the cursor is shown without
                # boxes, so boxes appear as the cursor passes them.
                spec_op[:, cursor + 1 :, :] = spec_blank[:, cursor + 1 :, :]
        elif reveal_boxes:
            # First frame: nothing has been revealed yet.
            spec_op = spec_blank

        if not args_cmd["disable_axis"]:
            plt.close("all")
            fig = plt.figure(
                ii,
                figsize=(
                    1.2 * (spec_op.shape[1] / dpi),
                    1.5 * (spec_op.shape[0] / dpi),
                ),
                dpi=dpi,
            )
            plt.xlabel("Time - seconds")
            plt.ylabel("Frequency - kHz")
            plt.imshow(
                spec_op,
                vmin=0,
                vmax=1.0,
                cmap="plasma",
                extent=y_extent,
                aspect="auto",
            )
            plt.tight_layout()
            fig.savefig(frame_file, dpi=dpi)
        else:
            plt.imsave(
                frame_file,
                spec_op,
                vmin=0,
                vmax=1.0,
                cmap="plasma",
            )

    print(" Creating video ...")
    op_vid_file = op_prefix + ".avi"
    # Pass the arguments as a list (no shell) so that paths containing spaces
    # or shell metacharacters are handed to ffmpeg unchanged.
    ffmpeg_cmd = [
        os.path.join(ffmpeg_path, "ffmpeg"),
        "-hide_banner",
        "-loglevel",
        "panic",
        "-y",
        "-r",
        str(fps),
        "-f",
        "image2",
        "-s",
        "{}x{}".format(spec.shape[1], spec.shape[0]),
        "-i",
        op_dir_tmp + "%04d.png",
        "-i",
        op_audio_file,
        "-vcodec",
        "libx264",
        "-crf",
        "25",
        "-pix_fmt",
        "yuv420p",
        "-acodec",
        "copy",
        op_vid_file,
    ]
    try:
        completed = subprocess.run(ffmpeg_cmd, check=False)
    except OSError as err:
        # e.g. ffmpeg is not installed at ffmpeg_path
        print(" Could not run ffmpeg: ", err)
    else:
        if completed.returncode != 0:
            # ffmpeg output is silenced by "-loglevel panic", so report the
            # failure here.
            print(" ffmpeg exited with status: ", completed.returncode)

    print(" Deleting temporary files ...")
    if os.path.isdir(op_dir_tmp):
        shutil.rmtree(op_dir_tmp)

View File

@ -1,226 +0,0 @@
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
from scipy import ndimage
sys.path.append(os.path.join(".."))
import batdetect2.utils.audio_utils as au
def generate_spectrogram_data(
    audio, sampling_rate, params, norm_type="log", smooth_spec=False
):
    """Compute a cropped and normalised magnitude spectrogram.

    Args:
        audio: Audio samples, passed straight to ``au.gen_mag_spectrogram``.
        sampling_rate: Sampling rate of ``audio``.
        params: Dict providing ``"max_freq"``, ``"min_freq"``,
            ``"fft_win_length"`` and ``"fft_overlap"``.
        norm_type: ``"log"`` for log scaling, ``"pcen"`` to apply
            ``au.pcen``; any other value leaves the magnitudes untouched.
        smooth_spec: When true, apply a Gaussian filter with sigma 1.

    Returns:
        The spectrogram array after cropping, normalisation and optional
        smoothing.
    """
    win_length = params["fft_win_length"]
    # Row counts corresponding to the frequency limits.
    top_bin = round(params["max_freq"] * win_length)
    bottom_bin = round(params["min_freq"] * win_length)

    # create spectrogram - numpy
    spec = au.gen_mag_spectrogram(
        audio, sampling_rate, win_length, params["fft_overlap"]
    )

    # Prepend zero rows when the spectrogram has fewer than top_bin rows.
    missing_rows = top_bin - spec.shape[0]
    if missing_rows > 0:
        zero_rows = np.zeros((missing_rows, spec.shape[1]), dtype=np.float32)
        spec = np.vstack((zero_rows, spec))

    # Keep the last top_bin rows, minus the final bottom_bin rows.
    spec = spec[-top_bin : spec.shape[0] - bottom_bin, :]

    if norm_type == "log":
        window = np.hanning(int(win_length * sampling_rate))
        window_energy = (np.abs(window) ** 2).sum()
        log_scaling = 2.0 * (1.0 / sampling_rate) * (1.0 / window_energy)
        spec = np.log(1.0 + log_scaling * spec).astype(np.float32)
    elif norm_type == "pcen":
        spec = au.pcen(spec, sampling_rate)

    if not smooth_spec:
        return spec
    return ndimage.gaussian_filter(spec, 1)
def load_data(
    anns,
    params,
    class_names,
    smooth_spec=False,
    norm_type="log",
    extract_bg=False,
):
    """Build one fixed-width spectrogram crop per annotated call.

    Annotations whose class is listed in ``params["classes_to_ignore"]`` or
    is missing from ``class_names`` are skipped.

    Args:
        anns: Iterable of per-file dicts with ``"file_path"``, ``"time_exp"``
            and ``"annotation"`` (a list of dicts with ``"class"``,
            ``"start_time"``, ``"end_time"``, ``"low_freq"`` and
            ``"high_freq"``).
        params: Parameter dict; see the keys read below.
        class_names: List of class names; labels are indices into this list.
        smooth_spec: Forwarded to ``generate_spectrogram_data``.
        norm_type: Forwarded to ``generate_spectrogram_data``.
        extract_bg: Accepted for backward compatibility; not used here.

    Returns:
        A tuple ``(specs, labels)``: the crops stacked along a new first
        axis, and an array with the class index of each crop.

    Note:
        ``low_freq`` / ``high_freq`` of the kept annotations are clipped to
        the ``min_freq`` / ``max_freq`` limits in place, i.e. the dicts in
        ``anns`` are modified.
    """
    specs = []
    labels = []

    for cur_file in anns:
        sampling_rate, audio_orig = au.load_audio(
            cur_file["file_path"],
            cur_file["time_exp"],
            params["target_samp_rate"],
            params["scale_raw_audio"],
        )

        for ann in cur_file["annotation"]:
            if (
                ann["class"] in params["classes_to_ignore"]
                or ann["class"] not in class_names
            ):
                continue

            # clip out of bounds
            # The clipped values are not read again in this function; the
            # in-place update is kept because callers may observe it.
            if ann["low_freq"] < params["min_freq"]:
                ann["low_freq"] = params["min_freq"]
            if ann["high_freq"] > params["max_freq"]:
                ann["high_freq"] = params["max_freq"]

            # load cropped audio
            pad_samps = int(sampling_rate * params["aud_pad"])
            start_samp_diff = int(sampling_rate * ann["start_time"]) - pad_samps
            start_samp = np.maximum(0, start_samp_diff)
            end_samp = np.minimum(
                audio_orig.shape[0],
                int(sampling_rate * ann["end_time"]) * 2 + pad_samps,
            )
            audio = audio_orig[start_samp:end_samp]
            if start_samp_diff < 0:
                # need to pad at start if the call is at the very beginning
                audio = np.hstack(
                    (np.zeros(-start_samp_diff, dtype=np.float32), audio)
                )

            # Number of samples spanned by spec_width spectrogram windows.
            nfft = int(params["fft_win_length"] * sampling_rate)
            noverlap = int(params["fft_overlap"] * nfft)
            max_samps = params["spec_width"] * (nfft - noverlap) + noverlap

            # Zero-pad or truncate so every crop has exactly max_samps samples.
            if max_samps > audio.shape[0]:
                audio = np.hstack(
                    (audio, np.zeros(max_samps - audio.shape[0]))
                )
            audio = audio[:max_samps].astype(np.float32)

            audio = au.pad_audio(
                audio,
                sampling_rate,
                params["fft_win_length"],
                params["fft_overlap"],
                params["resize_factor"],
                params["spec_divide_factor"],
            )

            # generate spectrogram
            spec = generate_spectrogram_data(
                audio, sampling_rate, params, norm_type, smooth_spec
            )[:, : params["spec_width"]]

            specs.append(spec[np.newaxis, ...])
            labels.append(ann["class"])

    labels = np.array([class_names.index(ll) for ll in labels])
    return np.vstack(specs), labels
def save_summary_image(
    specs,
    labels,
    species_names,
    params,
    op_file_name="plots/all_species.png",
    order=None,
):
    """Save a grid image showing the mean spectrogram of every class.

    Args:
        specs: Array of spectrograms indexed along the first axis.
        labels: Array with the class index of each entry in ``specs``.
        species_names: Class names; class ``ii`` is titled
            ``species_names[ii]``.
        params: Dict providing ``"min_freq"`` and ``"max_freq"``, used for
            the vertical extent of each panel (divided by 1000).
        op_file_name: Path of the image file to write.
        order: Optional sequence of class indices giving the order in which
            the classes are drawn. Defaults to ``0 .. len(species_names)-1``.

    Side effects:
        Writes ``op_file_name`` and closes all open matplotlib figures.
    """
    num_classes = len(species_names)

    # takes the mean for each class and plots it on a grid
    mean_specs = [
        specs[np.where(labels == ii)[0], :].mean(0)
        for ii in range(num_classes)
    ]

    # control the order in which classes are printed
    if order is None:
        order = np.arange(num_classes)

    max_cols = 6
    nrows = int(np.ceil(num_classes / max_cols))
    ncols = min(num_classes, max_cols)

    # squeeze=False always yields a 2-D array of axes. Without it a single
    # class produces a bare Axes object, which cannot be iterated over.
    fig, axes = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(ncols * 3.3, nrows * 6),
        gridspec_kw={"wspace": 0, "hspace": 0.2},
        squeeze=False,
    )
    spec_min_max = (
        0,
        mean_specs[0].shape[1],
        params["min_freq"] / 1000,
        params["max_freq"] / 1000,
    )

    # ravel() walks the grid row by row, left to right.
    for ii, panel in enumerate(axes.ravel()):
        if ii >= num_classes:
            # Unused cells in the last row are left blank.
            panel.axis("off")
            continue

        class_idx = order[ii]
        panel.imshow(
            mean_specs[class_idx],
            extent=spec_min_max,
            cmap="plasma",
            aspect="equal",
        )
        panel.grid(color="w", alpha=0.3, linewidth=0.3)
        panel.set_xticks([])
        panel.title.set_text(str(ii + 1) + " " + species_names[class_idx])
        panel.tick_params(axis="both", which="major", labelsize=7)

    fig.savefig(op_file_name)
    plt.close("all")