From 78ede31b8bec12245acb884c6310a0a6f8bc19f5 Mon Sep 17 00:00:00 2001
From: mbsantiago <santiago.mbal@gmail.com>
Date: Thu, 19 Mar 2026 14:44:05 +0000
Subject: [PATCH] Remove stale scripts

---
 scripts/README.md                    |  17 --
 scripts/gen_dataset_summary_image.py |  96 ---------
 scripts/gen_spec_image.py            | 211 --------------------
 scripts/gen_spec_video.py            | 278 ---------------------------
 scripts/viz_helpers.py               | 226 ----------------------
 5 files changed, 828 deletions(-)
 delete mode 100644 scripts/README.md
 delete mode 100644 scripts/gen_dataset_summary_image.py
 delete mode 100644 scripts/gen_spec_image.py
 delete mode 100644 scripts/gen_spec_video.py
 delete mode 100644 scripts/viz_helpers.py

diff --git a/scripts/README.md b/scripts/README.md
deleted file mode 100644
index bcc4692..0000000
--- a/scripts/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-This directory contains some scripts for visualizing the raw data and model outputs.
-
-
-`gen_spec_image.py`:  saves the model predictions on a spectrogram of the input audio file.   
-e.g.  
-`python gen_spec_image.py ../example_data/audio/20170701_213954-MYOMYS-LR_0_0.5.wav ../models/Net2DFast_UK_same.pth.tar`  
-
-
-`gen_spec_video.py`:  generates a video showing the model predictions for a file.
-e.g.   
-`python gen_spec_video.py ../example_data/audio/20170701_213954-MYOMYS-LR_0_0.5.wav ../models/Net2DFast_UK_same.pth.tar`  
-
-
-
-`gen_dataset_summary_image.py`:  generates an image displaying the mean spectrogram for each class in a specified dataset.  
-e.g.  
-`python gen_dataset_summary_image.py --ann_file PATH_TO_ANN/australia_TRAIN.json PATH_TO_AUDIO/audio/ ../plots/australia/`
diff --git a/scripts/gen_dataset_summary_image.py b/scripts/gen_dataset_summary_image.py
deleted file mode 100644
index 3e0a26b..0000000
--- a/scripts/gen_dataset_summary_image.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""
-Loads a set of annotations corresponding to a dataset and saves an image which
-is the mean spectrogram for each class.
-"""
-
-import argparse
-import os
-
-import matplotlib.pyplot as plt
-import numpy as np
-import viz_helpers as vz
-
-import batdetect2.detector.parameters as parameters
-import batdetect2.train.train_split as ts
-import batdetect2.train.train_utils as tu
-import batdetect2.utils.audio_utils as au
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "audio_path", type=str, help="Input directory for audio"
-    )
-    parser.add_argument(
-        "op_dir",
-        type=str,
-        help="Path to where single annotation json file is stored",
-    )
-    parser.add_argument(
-        "--ann_file",
-        type=str,
-        help="Path to where single annotation json file is stored",
-    )
-    parser.add_argument(
-        "--uk_split", type=str, default="", help="Set as: diff or same"
-    )
-    parser.add_argument(
-        "--file_type",
-        type=str,
-        default="png",
-        help="Type of image to save png or pdf",
-    )
-    args = vars(parser.parse_args())
-
-    if not os.path.isdir(args["op_dir"]):
-        os.makedirs(args["op_dir"])
-
-    params = parameters.get_params(False)
-    params["smooth_spec"] = False
-    params["spec_width"] = 48
-    params["norm_type"] = "log"  # log, pcen
-    params["aud_pad"] = 0.005
-    classes_to_ignore = params["classes_to_ignore"] + params["generic_class"]
-
-    # load train annotations
-    if args["uk_split"] == "":
-        print("\nLoading:", args["ann_file"], "\n")
-        dataset_name = os.path.basename(args["ann_file"]).replace(".json", "")
-        datasets = []
-        datasets.append(
-            tu.get_blank_dataset_dict(
-                dataset_name, False, args["ann_file"], args["audio_path"]
-            )
-        )
-    else:
-        # load uk data - special case
-        print("\nLoading:", args["uk_split"], "\n")
-        dataset_name = (
-            "uk_" + args["uk_split"]
-        )  # should be uk_diff, or uk_same
-        datasets, _ = ts.get_train_test_data(
-            args["ann_file"],
-            args["audio_path"],
-            args["uk_split"],
-            load_extra=False,
-        )
-
-    anns, class_names, _ = tu.load_set_of_anns(
-        datasets, classes_to_ignore, params["events_of_interest"]
-    )
-    class_names_order = range(len(class_names))
-
-    x_train, y_train = vz.load_data(
-        anns,
-        params,
-        class_names,
-        smooth_spec=params["smooth_spec"],
-        norm_type=params["norm_type"],
-    )
-
-    op_file_name = os.path.join(
-        args["op_dir"], dataset_name + "." + args["file_type"]
-    )
-    vz.save_summary_image(
-        x_train, y_train, class_names, params, op_file_name, class_names_order
-    )
-    print("\nImage saved to:", op_file_name)
diff --git a/scripts/gen_spec_image.py b/scripts/gen_spec_image.py
deleted file mode 100644
index 490cad3..0000000
--- a/scripts/gen_spec_image.py
+++ /dev/null
@@ -1,211 +0,0 @@
-"""
-Visualize predctions on top of spectrogram.
-
-Will save images with:
-1) raw spectrogram
-2) spectrogram with GT boxes
-3) spectrogram with predicted boxes
-"""
-
-import argparse
-import json
-import os
-import sys
-
-import torch
-import matplotlib.pyplot as plt
-import numpy as np
-
-import batdetect2.evaluate.evaluate_models as evlm
-import batdetect2.utils.audio_utils as au
-import batdetect2.utils.detector_utils as du
-import batdetect2.utils.plot_utils as viz
-
-
-def filter_anns(anns, start_time, stop_time):
-    anns_op = []
-    for aa in anns:
-        if (aa["start_time"] >= start_time) and (
-            aa["start_time"] < stop_time - 0.02
-        ):
-            anns_op.append(aa)
-    return anns_op
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("audio_file", type=str, help="Path to audio file")
-    parser.add_argument("model_path", type=str, help="Path to BatDetect model")
-    parser.add_argument(
-        "--ann_file", type=str, default="", help="Path to annotation file"
-    )
-    parser.add_argument(
-        "--op_dir",
-        type=str,
-        default="plots/",
-        help="Output directory for plots",
-    )
-    parser.add_argument(
-        "--file_type",
-        type=str,
-        default="png",
-        help="Type of image to save png or pdf",
-    )
-    parser.add_argument(
-        "--title_text",
-        type=str,
-        default="",
-        help="Text to add as title of plots",
-    )
-    parser.add_argument(
-        "--detection_threshold",
-        type=float,
-        default=0.2,
-        help="Threshold for output detections",
-    )
-    parser.add_argument(
-        "--start_time",
-        type=float,
-        default=0.0,
-        help="Start time for cropped file",
-    )
-    parser.add_argument(
-        "--stop_time",
-        type=float,
-        default=0.5,
-        help="End time for cropped file",
-    )
-    parser.add_argument(
-        "--time_expansion_factor",
-        type=int,
-        default=1,
-        help="Time expansion factor",
-    )
-
-    args_cmd = vars(parser.parse_args())
-
-    # load the model
-    bd_args = du.get_default_bd_args()
-    model, params_bd = du.load_model(args_cmd["model_path"])
-    bd_args["detection_threshold"] = args_cmd["detection_threshold"]
-    bd_args["time_expansion_factor"] = args_cmd["time_expansion_factor"]
-
-    # load the annotation if it exists
-    gt_present = False
-    if args_cmd["ann_file"] != "":
-        if os.path.isfile(args_cmd["ann_file"]):
-            with open(args_cmd["ann_file"]) as da:
-                gt_anns = json.load(da)
-            gt_anns = filter_anns(
-                gt_anns["annotation"],
-                args_cmd["start_time"],
-                args_cmd["stop_time"],
-            )
-            gt_present = True
-        else:
-            print("Annotation file not found: ", args_cmd["ann_file"])
-
-    # load the audio file
-    if not os.path.isfile(args_cmd["audio_file"]):
-        print("Audio file not found: ", args_cmd["audio_file"])
-        sys.exit()
-
-    # load audio and crop
-    print("\nProcessing: " + os.path.basename(args_cmd["audio_file"]))
-    print("\nOutput directory: " + args_cmd["op_dir"])
-    sampling_rate, audio = au.load_audio(
-        args_cmd["audio_file"],
-        args_cmd["time_exp"],
-        params_bd["target_samp_rate"],
-        params_bd["scale_raw_audio"],
-    )
-    st_samp = int(sampling_rate * args_cmd["start_time"])
-    en_samp = int(sampling_rate * args_cmd["stop_time"])
-    if en_samp > audio.shape[0]:
-        audio = np.hstack(
-            (audio, np.zeros((en_samp) - audio.shape[0], dtype=audio.dtype))
-        )
-    audio = audio[st_samp:en_samp]
-
-    duration = audio.shape[0] / sampling_rate
-    print("File duration: {} seconds".format(duration))
-
-    # create spec for viz
-    spec, _ = au.generate_spectrogram(
-        audio, sampling_rate, params_bd, True, False
-    )
-
-    run_config = {
-        **params_bd,
-        **bd_args,
-    }
-
-    # run model and filter detections so only keep ones in relevant time range
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    results = du.process_file(
-        args_cmd["audio_file"], model, run_config, device
-    )
-    pred_anns = filter_anns(
-        results["pred_dict"]["annotation"],
-        args_cmd["start_time"],
-        args_cmd["stop_time"],
-    )
-    print(len(pred_anns), "Detections")
-
-    # save output
-    if not os.path.isdir(args_cmd["op_dir"]):
-        os.makedirs(args_cmd["op_dir"])
-
-    # create output file names
-    op_path_clean = (
-        os.path.basename(args_cmd["audio_file"])[:-4]
-        + "_clean."
-        + args_cmd["file_type"]
-    )
-    op_path_clean = os.path.join(args_cmd["op_dir"], op_path_clean)
-    op_path_pred = (
-        os.path.basename(args_cmd["audio_file"])[:-4]
-        + "_pred."
-        + args_cmd["file_type"]
-    )
-    op_path_pred = os.path.join(args_cmd["op_dir"], op_path_pred)
-
-    # create and save iamges
-    viz.save_ann_spec(
-        op_path_clean,
-        spec,
-        params_bd["min_freq"],
-        params_bd["max_freq"],
-        duration,
-        args_cmd["start_time"],
-        "",
-        None,
-    )
-    viz.save_ann_spec(
-        op_path_pred,
-        spec,
-        params_bd["min_freq"],
-        params_bd["max_freq"],
-        duration,
-        args_cmd["start_time"],
-        "",
-        pred_anns,
-    )
-
-    if gt_present:
-        op_path_gt = (
-            os.path.basename(args_cmd["audio_file"])[:-4]
-            + "_gt."
-            + args_cmd["file_type"]
-        )
-        op_path_gt = os.path.join(args_cmd["op_dir"], op_path_gt)
-        viz.save_ann_spec(
-            op_path_gt,
-            spec,
-            params_bd["min_freq"],
-            params_bd["max_freq"],
-            duration,
-            args_cmd["start_time"],
-            "",
-            gt_anns,
-        )
diff --git a/scripts/gen_spec_video.py b/scripts/gen_spec_video.py
deleted file mode 100644
index 625ba1a..0000000
--- a/scripts/gen_spec_video.py
+++ /dev/null
@@ -1,278 +0,0 @@
-"""
-This script takes an audio file as input, runs the detector, and makes a video output
-
-Notes:
-    It needs ffmpeg installed to make the videos
-    Sometimes conda can overwrite the default ffmpeg path set this to use system one.
-    Check which one is being used with `which ffmpeg`. If conda version, can thow an error.
-    Best to use system one - see ffmpeg_path.
-"""
-
-import argparse
-import os
-import shutil
-import sys
-
-import matplotlib.pyplot as plt
-import numpy as np
-import torch
-from scipy.io import wavfile
-
-import batdetect2.detector.parameters as parameters
-import batdetect2.utils.audio_utils as au
-import batdetect2.utils.detector_utils as du
-import batdetect2.utils.plot_utils as viz
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "audio_file", type=str, help="Path to input audio file"
-    )
-    parser.add_argument(
-        "model_path", type=str, help="Path to trained BatDetect model"
-    )
-    parser.add_argument(
-        "--op_dir",
-        type=str,
-        default="generated_vids/",
-        help="Path to output directory",
-    )
-    parser.add_argument(
-        "--no_detector", action="store_true", help="Do not run detector"
-    )
-    parser.add_argument(
-        "--plot_class_names_off",
-        action="store_true",
-        help="Do not plot class names",
-    )
-    parser.add_argument(
-        "--disable_axis", action="store_true", help="Do not plot axis"
-    )
-    parser.add_argument(
-        "--detection_threshold",
-        type=float,
-        default=0.2,
-        help="Cut-off probability for detector",
-    )
-    parser.add_argument(
-        "--time_expansion_factor",
-        type=int,
-        default=1,
-        dest="time_expansion_factor",
-        help="The time expansion factor used for all files (default is 1)",
-    )
-    args_cmd = vars(parser.parse_args())
-
-    # file of interest
-    audio_file = args_cmd["audio_file"]
-    op_dir = args_cmd["op_dir"]
-    op_str = "_output"
-    ffmpeg_path = "/usr/bin/"
-
-    if not os.path.isfile(audio_file):
-        print("Audio file not found: ", audio_file)
-        sys.exit()
-
-    if not os.path.isfile(args_cmd["model_path"]):
-        print("Model not found: ", args_cmd["model_path"])
-        sys.exit()
-
-    start_time = 0.0
-    duration = 0.5
-    reveal_boxes = True  # makes the boxes appear one at a time
-    fps = 24
-    dpi = 100
-
-    op_dir_tmp = os.path.join(op_dir, "op_tmp_vids", "")
-    if not os.path.isdir(op_dir_tmp):
-        os.makedirs(op_dir_tmp)
-    if not os.path.isdir(op_dir):
-        os.makedirs(op_dir)
-
-    params = parameters.get_params(False)
-    args = du.get_default_bd_args()
-    args["time_expansion_factor"] = args_cmd["time_expansion_factor"]
-    args["detection_threshold"] = args_cmd["detection_threshold"]
-
-    # load audio file
-    print("\nProcessing: " + os.path.basename(audio_file))
-    print("\nOutput directory: " + op_dir)
-    sampling_rate, audio = au.load_audio(
-        audio_file, args["time_expansion_factor"], params["target_samp_rate"]
-    )
-    audio = audio[
-        int(sampling_rate * start_time) : int(
-            sampling_rate * start_time + sampling_rate * duration
-        )
-    ]
-    audio_orig = audio.copy()
-    audio = au.pad_audio(
-        audio,
-        sampling_rate,
-        params["fft_win_length"],
-        params["fft_overlap"],
-        params["resize_factor"],
-        params["spec_divide_factor"],
-    )
-
-    # generate spectrogram
-    spec, _ = au.generate_spectrogram(audio, sampling_rate, params, True)
-    max_val = spec.max() * 1.1
-
-    if not args_cmd["no_detector"]:
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-        print("  Loading model and running detector on entire file ...")
-        model, det_params = du.load_model(args_cmd["model_path"])
-        det_params["detection_threshold"] = args["detection_threshold"]
-
-        run_config = {
-            **det_params,
-            **args,
-        }
-        results = du.process_file(
-            audio_file,
-            model,
-            run_config,
-            device,
-        )
-
-        print("  Processing detections and plotting ...")
-        detections = []
-        for bb in results["pred_dict"]["annotation"]:
-            if (bb["start_time"] >= start_time) and (
-                bb["end_time"] < start_time + duration
-            ):
-                detections.append(bb)
-
-        # plot boxes
-        fig = plt.figure(
-            1, figsize=(spec.shape[1] / dpi, spec.shape[0] / dpi), dpi=dpi
-        )
-        duration = au.x_coords_to_time(
-            spec.shape[1],
-            sampling_rate,
-            params["fft_win_length"],
-            params["fft_overlap"],
-        )
-        viz.create_box_image(
-            spec,
-            fig,
-            detections,
-            start_time,
-            start_time + duration,
-            duration,
-            params,
-            max_val,
-            plot_class_names=not args_cmd["plot_class_names_off"],
-        )
-        op_im_file_boxes = os.path.join(
-            op_dir, os.path.basename(audio_file)[:-4] + op_str + "_boxes.png"
-        )
-        fig.savefig(op_im_file_boxes, dpi=dpi)
-        plt.close(1)
-        spec_with_boxes = plt.imread(op_im_file_boxes)
-
-    print("  Saving audio file ...")
-    if args["time_expansion_factor"] == 1:
-        sampling_rate_op = int(sampling_rate / 10.0)
-    else:
-        sampling_rate_op = sampling_rate
-    op_audio_file = os.path.join(
-        op_dir, os.path.basename(audio_file)[:-4] + op_str + ".wav"
-    )
-    wavfile.write(op_audio_file, sampling_rate_op, audio_orig)
-
-    print("  Saving image ...")
-    op_im_file = os.path.join(
-        op_dir, os.path.basename(audio_file)[:-4] + op_str + ".png"
-    )
-    plt.imsave(op_im_file, spec, vmin=0, vmax=max_val, cmap="plasma")
-    spec_blank = plt.imread(op_im_file)
-
-    # create figure
-    freq_scale = 1000  # turn Hz to kHz
-    min_freq = params["min_freq"] // freq_scale
-    max_freq = params["max_freq"] // freq_scale
-    y_extent = [0, duration, min_freq, max_freq]
-
-    print("  Saving video frames ...")
-    # save images that will be combined into video
-    # will either plot with or without boxes
-    for ii, col in enumerate(
-        np.linspace(0, spec.shape[1] - 1, int(fps * duration * 10))
-    ):
-        if not args_cmd["no_detector"]:
-            spec_op = spec_with_boxes.copy()
-            if ii > 0:
-                spec_op[:, int(col), :] = 1.0
-                if reveal_boxes:
-                    spec_op[:, int(col) + 1 :, :] = spec_blank[
-                        :, int(col) + 1 :, :
-                    ]
-            elif ii == 0 and reveal_boxes:
-                spec_op = spec_blank
-
-            if not args_cmd["disable_axis"]:
-                plt.close("all")
-                fig = plt.figure(
-                    ii,
-                    figsize=(
-                        1.2 * (spec_op.shape[1] / dpi),
-                        1.5 * (spec_op.shape[0] / dpi),
-                    ),
-                    dpi=dpi,
-                )
-                plt.xlabel("Time - seconds")
-                plt.ylabel("Frequency - kHz")
-                plt.imshow(
-                    spec_op,
-                    vmin=0,
-                    vmax=1.0,
-                    cmap="plasma",
-                    extent=y_extent,
-                    aspect="auto",
-                )
-                plt.tight_layout()
-                fig.savefig(op_dir_tmp + str(ii).zfill(4) + ".png", dpi=dpi)
-            else:
-                plt.imsave(
-                    op_dir_tmp + str(ii).zfill(4) + ".png",
-                    spec_op,
-                    vmin=0,
-                    vmax=1.0,
-                    cmap="plasma",
-                )
-        else:
-            spec_op = spec.copy()
-            if ii > 0:
-                spec_op[:, int(col)] = max_val
-            plt.imsave(
-                op_dir_tmp + str(ii).zfill(4) + ".png",
-                spec_op,
-                vmin=0,
-                vmax=max_val,
-                cmap="plasma",
-            )
-
-    print("  Creating video ...")
-    op_vid_file = os.path.join(
-        op_dir, os.path.basename(audio_file)[:-4] + op_str + ".avi"
-    )
-    ffmpeg_cmd = (
-        "ffmpeg -hide_banner -loglevel panic -y -r {} -f image2 -s {}x{} -i {}%04d.png -i {} -vcodec libx264 "
-        "-crf 25  -pix_fmt yuv420p -acodec copy {}".format(
-            fps,
-            spec.shape[1],
-            spec.shape[0],
-            op_dir_tmp,
-            op_audio_file,
-            op_vid_file,
-        )
-    )
-    ffmpeg_cmd = ffmpeg_path + ffmpeg_cmd
-    os.system(ffmpeg_cmd)
-
-    print("  Deleting temporary files ...")
-    if os.path.isdir(op_dir_tmp):
-        shutil.rmtree(op_dir_tmp)
diff --git a/scripts/viz_helpers.py b/scripts/viz_helpers.py
deleted file mode 100644
index 4d86283..0000000
--- a/scripts/viz_helpers.py
+++ /dev/null
@@ -1,226 +0,0 @@
-import os
-import sys
-
-import matplotlib.pyplot as plt
-import numpy as np
-from scipy import ndimage
-
-sys.path.append(os.path.join(".."))
-
-import batdetect2.utils.audio_utils as au
-
-
-def generate_spectrogram_data(
-    audio, sampling_rate, params, norm_type="log", smooth_spec=False
-):
-    max_freq = round(params["max_freq"] * params["fft_win_length"])
-    min_freq = round(params["min_freq"] * params["fft_win_length"])
-
-    # create spectrogram - numpy
-    spec = au.gen_mag_spectrogram(
-        audio, sampling_rate, params["fft_win_length"], params["fft_overlap"]
-    )
-    # spec = au.gen_mag_spectrogram_pt(audio, sampling_rate, params['fft_win_length'], params['fft_overlap']).numpy()
-    if spec.shape[0] < max_freq:
-        freq_pad = max_freq - spec.shape[0]
-        spec = np.vstack(
-            (np.zeros((freq_pad, spec.shape[1]), dtype=np.float32), spec)
-        )
-    spec = spec[-max_freq : spec.shape[0] - min_freq, :]
-
-    if norm_type == "log":
-        log_scaling = (
-            2.0
-            * (1.0 / sampling_rate)
-            * (
-                1.0
-                / (
-                    np.abs(
-                        np.hanning(
-                            int(params["fft_win_length"] * sampling_rate)
-                        )
-                    )
-                    ** 2
-                ).sum()
-            )
-        )
-        ##log_scaling = 0.01
-        spec = np.log(1.0 + log_scaling * spec).astype(np.float32)
-    elif norm_type == "pcen":
-        spec = au.pcen(spec, sampling_rate)
-    else:
-        pass
-
-    if smooth_spec:
-        spec = ndimage.gaussian_filter(spec, 1)
-
-    return spec
-
-
-def load_data(
-    anns,
-    params,
-    class_names,
-    smooth_spec=False,
-    norm_type="log",
-    extract_bg=False,
-):
-    specs = []
-    labels = []
-    coords = []
-    audios = []
-    sampling_rates = []
-    file_names = []
-    for cur_file in anns:
-        sampling_rate, audio_orig = au.load_audio(
-            cur_file["file_path"],
-            cur_file["time_exp"],
-            params["target_samp_rate"],
-            params["scale_raw_audio"],
-        )
-
-        for ann in cur_file["annotation"]:
-            if (
-                ann["class"] not in params["classes_to_ignore"]
-                and ann["class"] in class_names
-            ):
-                # clip out of bounds
-                if ann["low_freq"] < params["min_freq"]:
-                    ann["low_freq"] = params["min_freq"]
-                if ann["high_freq"] > params["max_freq"]:
-                    ann["high_freq"] = params["max_freq"]
-
-                # load cropped audio
-                start_samp_diff = int(sampling_rate * ann["start_time"]) - int(
-                    sampling_rate * params["aud_pad"]
-                )
-                start_samp = np.maximum(0, start_samp_diff)
-                end_samp = np.minimum(
-                    audio_orig.shape[0],
-                    int(sampling_rate * ann["end_time"]) * 2
-                    + int(sampling_rate * params["aud_pad"]),
-                )
-                audio = audio_orig[start_samp:end_samp]
-                if start_samp_diff < 0:
-                    # need to pad at start if the call is at the very begining
-                    audio = np.hstack(
-                        (np.zeros(-start_samp_diff, dtype=np.float32), audio)
-                    )
-
-                nfft = int(params["fft_win_length"] * sampling_rate)
-                noverlap = int(params["fft_overlap"] * nfft)
-                max_samps = params["spec_width"] * (nfft - noverlap) + noverlap
-
-                if max_samps > audio.shape[0]:
-                    audio = np.hstack(
-                        (audio, np.zeros(max_samps - audio.shape[0]))
-                    )
-                audio = audio[:max_samps].astype(np.float32)
-
-                audio = au.pad_audio(
-                    audio,
-                    sampling_rate,
-                    params["fft_win_length"],
-                    params["fft_overlap"],
-                    params["resize_factor"],
-                    params["spec_divide_factor"],
-                )
-
-                # generate spectrogram
-                spec = generate_spectrogram_data(
-                    audio, sampling_rate, params, norm_type, smooth_spec
-                )[:, : params["spec_width"]]
-
-                specs.append(spec[np.newaxis, ...])
-                labels.append(ann["class"])
-
-                audios.append(audio)
-                sampling_rates.append(sampling_rate)
-                file_names.append(cur_file["file_path"])
-
-                # position in crop
-                x1 = int(
-                    au.time_to_x_coords(
-                        np.array(params["aud_pad"]),
-                        sampling_rate,
-                        params["fft_win_length"],
-                        params["fft_overlap"],
-                    )
-                )
-                y1 = (ann["low_freq"] - params["min_freq"]) * params[
-                    "fft_win_length"
-                ]
-                coords.append((y1, x1))
-
-    _, file_ids = np.unique(file_names, return_inverse=True)
-    labels = np.array([class_names.index(ll) for ll in labels])
-
-    # return np.vstack(specs), labels, coords, audios, sampling_rates, file_ids, file_names
-    return np.vstack(specs), labels
-
-
-def save_summary_image(
-    specs,
-    labels,
-    species_names,
-    params,
-    op_file_name="plots/all_species.png",
-    order=None,
-):
-    # takes the mean for each class and plots it on a grid
-    mean_specs = []
-    max_band = []
-    for ii in range(len(species_names)):
-        inds = np.where(labels == ii)[0]
-        mu = specs[inds, :].mean(0)
-        max_band.append(np.argmax(mu.sum(1)))
-        mean_specs.append(mu)
-
-    # control the order in which classes are printed
-    if order is None:
-        order = np.arange(len(species_names))
-
-    max_cols = 6
-    nrows = int(np.ceil(len(species_names) / max_cols))
-    ncols = np.minimum(len(species_names), max_cols)
-
-    fig, ax = plt.subplots(
-        nrows=nrows,
-        ncols=ncols,
-        figsize=(ncols * 3.3, nrows * 6),
-        gridspec_kw={"wspace": 0, "hspace": 0.2},
-    )
-    spec_min_max = (
-        0,
-        mean_specs[0].shape[1],
-        params["min_freq"] / 1000,
-        params["max_freq"] / 1000,
-    )
-    ii = 0
-    for row in ax:
-        if type(row) != np.ndarray:
-            row = np.array([row])
-
-        for col in row:
-            if ii >= len(species_names):
-                col.axis("off")
-            else:
-                inds = np.where(labels == order[ii])[0]
-                col.imshow(
-                    mean_specs[order[ii]],
-                    extent=spec_min_max,
-                    cmap="plasma",
-                    aspect="equal",
-                )
-                col.grid(color="w", alpha=0.3, linewidth=0.3)
-                col.set_xticks([])
-                col.title.set_text(
-                    str(ii + 1) + " " + species_names[order[ii]]
-                )
-                col.tick_params(axis="both", which="major", labelsize=7)
-                ii += 1
-
-    # plt.tight_layout()
-    # plt.show()
-    plt.savefig(op_file_name)
-    plt.close("all")