# batdetect2/bat_detect/train/train_utils.py

import glob
import json
import os
import random

import numpy as np


def write_notes_file(file_name, text):
    """Append ``text`` plus a trailing newline to the file ``file_name``.

    The file is opened in append mode, so existing content is kept and the
    file is created when it does not exist yet.
    """
    with open(file_name, "a") as notes_file:
        line = text + "\n"
        notes_file.write(line)


def get_blank_dataset_dict(dataset_name, is_test, ann_path, wav_path):
    """Return a dataset description dictionary built from the arguments.

    The result holds the four arguments under keys of the same name, plus
    an ``"is_binary"`` entry that is always ``False``.
    """
    return dict(
        dataset_name=dataset_name,
        is_test=is_test,
        is_binary=False,
        ann_path=ann_path,
        wav_path=wav_path,
    )


def get_short_class_names(class_names, str_len=3):
    """Abbreviate every class name by truncating each of its words.

    Each name is split on single spaces, every piece is cut to at most
    ``str_len`` characters, and the pieces are joined with spaces again.
    Returns a new list in the same order as ``class_names``.
    """
    return [
        " ".join(word[:str_len] for word in name.split(" "))
        for name in class_names
    ]


def remove_dupes(data_train, data_test):
    """Drop training items whose ``"id"`` also occurs in the test set.

    Args:
        data_train: iterable of dicts, each with an ``"id"`` entry.
        data_test: iterable of dicts, each with an ``"id"`` entry.

    Returns:
        A new list with the items of ``data_train`` (original order kept)
        whose id is not present in ``data_test``. Duplicates inside
        ``data_train`` itself are not touched.

    Prints the number of removed items when at least one was removed.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole list of test ids for every training item.
    test_ids = {item["id"] for item in data_test}
    data_train_prune = [
        item for item in data_train if item["id"] not in test_ids
    ]

    num_removed = len(data_train) - len(data_train_prune)
    if num_removed != 0:
        print(num_removed, "items removed from train set")
    return data_train_prune


def get_genus_mapping(class_names):
    """Map every class name to the index of its genus.

    The genus is taken to be the text before the first space of a class
    name. Returns a pair ``(genus_names, genus_mapping)``: the sorted unique
    genus names, and for each entry of ``class_names`` the index of its
    genus within ``genus_names``. Both are plain Python lists.
    """
    first_words = [name.split(" ")[0] for name in class_names]
    unique_genera, inverse_index = np.unique(first_words, return_inverse=True)
    return unique_genera.tolist(), inverse_index.tolist()


def standardize_low_freq(data, class_of_interest):
    """Set the low frequency of one class to its mean over the dataset.

    Addresses the issue of highly variable low frequency annotations, which
    often happens for constant frequency calls. For every annotation whose
    ``"class"`` equals ``class_of_interest`` the ``"low_freq"`` value is
    replaced by the mean low frequency of that class. The ``"high_freq"``
    value is left alone unless it would end up below the new low frequency,
    in which case it is replaced by the mean high frequency of the class.

    Args:
        data: iterable of dicts, each holding a list of annotation dicts
            under ``"annotation"``. The annotation dicts are modified in
            place.
        class_of_interest: class name whose annotations are standardized.

    Returns:
        The same ``data`` object that was passed in.

    Raises:
        AssertionError: if no annotation of ``class_of_interest`` exists, or
            if the mean low frequency is not below the mean high frequency.
    """
    matching = [
        ann
        for item in data
        for ann in item["annotation"]
        if ann["class"] == class_of_interest
    ]

    # Explicit raises (rather than ``assert`` statements) keep these checks
    # active when Python runs with -O; the exception type is kept so that
    # existing callers see the same failure as before.
    if not matching:
        raise AssertionError(
            "no annotations found for class: " + str(class_of_interest)
        )

    low_mean = np.mean([ann["low_freq"] for ann in matching])
    high_mean = np.mean([ann["high_freq"] for ann in matching])
    if not low_mean < high_mean:
        raise AssertionError(
            "mean low frequency must be below mean high frequency"
        )

    print("\nStandardizing low and high frequency for:")
    print(class_of_interest)
    print("low:  ", round(low_mean, 2))
    print("high: ", round(high_mean, 2))

    # Only the low frequency is always overwritten. The high frequency is
    # touched only when it would fall below the new low frequency; since
    # low_mean < high_mean, the result is always a valid frequency range.
    for ann in matching:
        ann["low_freq"] = low_mean
        if ann["high_freq"] < low_mean:
            ann["high_freq"] = high_mean

    return data


def load_set_of_anns(
    data,
    classes_to_ignore=None,
    events_of_interest=None,
    convert_to_genus=False,
    verbose=True,
    list_of_anns=False,
    filter_issues=False,
    name_replace=False,
):
    """Load annotations, filter and rename them, and compute class counts.

    Args:
        data: when ``list_of_anns`` is false, an iterable of dataset dicts
            with ``"ann_path"`` and ``"wav_path"`` entries, each loaded with
            ``load_anns``. When ``list_of_anns`` is true, a single such dict
            whose ``"ann_path"`` is passed to ``load_anns_from_path``.
        classes_to_ignore: class names left out of the class statistics.
            ``None`` (the default) is treated as an empty list. Note that
            this argument also selects the return shape, see below.
        events_of_interest: if not ``None``, only annotations whose
            ``"event"`` is contained in it are kept.
        convert_to_genus: if true, each class name is reduced to the text
            before its first space.
        verbose: if true, print a table of class names and counts.
        list_of_anns: selects how ``data`` is interpreted (see ``data``).
        filter_issues: if true, drop files whose ``"issues"`` entry is not
            ``False``.
        name_replace: if a dict, maps input class names to output class
            names; any other value disables renaming.

    Returns:
        If ``classes_to_ignore`` is empty: the list of annotation dicts.
        Otherwise a tuple ``(anns, class_names, class_inv_freq)`` where
        ``class_names`` are the sorted unique class names that are not
        ignored and ``class_inv_freq`` holds, for each of them,
        ``total_count / (num_classes * class_count)``.
    """
    if classes_to_ignore is None:
        classes_to_ignore = []

    # load the annotations
    anns = []
    if list_of_anns:
        # path to list of individual json files
        anns.extend(load_anns_from_path(data["ann_path"], data["wav_path"]))
    else:
        # dictionary of datasets
        for dd in data:
            anns.extend(load_anns(dd["ann_path"], dd["wav_path"]))

    # discard unannotated files
    anns = [aa for aa in anns if aa["annotated"] is True]

    # filter files that have annotation issues - if the input is a dictionary
    # of datasets, this will likely have already been done
    if filter_issues:
        anns = [aa for aa in anns if aa["issues"] is False]

    # remove stray whitespace around class names
    for ann in anns:
        for aa in ann["annotation"]:
            aa["class"] = aa["class"].strip()

    # only keep the specified events - i.e. types of calls
    if events_of_interest is not None:
        for ann in anns:
            ann["annotation"] = [
                aa
                for aa in ann["annotation"]
                if aa["event"] in events_of_interest
            ]

    # change class names using the input name -> output name mapping
    if isinstance(name_replace, dict):
        for ann in anns:
            for aa in ann["annotation"]:
                if aa["class"] in name_replace:
                    aa["class"] = name_replace[aa["class"]]

    # convert everything to genus name
    if convert_to_genus:
        for ann in anns:
            for aa in ann["annotation"]:
                aa["class"] = aa["class"].split(" ")[0]

    # gather the class names that are not ignored and count them
    class_names_all = [
        aa["class"]
        for ann in anns
        for aa in ann["annotation"]
        if aa["class"] not in classes_to_ignore
    ]
    class_names, class_cnts = np.unique(class_names_all, return_counts=True)
    class_inv_freq = class_cnts.sum() / (
        len(class_names) * class_cnts.astype(np.float32)
    )

    if verbose:
        print("Class count:")
        # default=0 keeps the summary from failing when there are no classes
        str_len = max((len(cc) for cc in class_names), default=0) + 5
        for idx, (name, cnt) in enumerate(zip(class_names, class_cnts)):
            print(str(idx).ljust(5) + name.ljust(str_len) + str(cnt))

    # NOTE: the return shape depends on classes_to_ignore; callers rely on
    # this, so it is kept as is.
    if len(classes_to_ignore) == 0:
        return anns
    else:
        return anns, class_names.tolist(), class_inv_freq.tolist()


def load_anns(ann_file_name, raw_audio_dir):
    """Read a JSON file of annotation entries and attach audio paths.

    The file is expected to contain a sequence of dict-like entries that
    each have an ``"id"``. Every entry gets a ``"file_path"`` equal to
    ``raw_audio_dir`` directly followed by its id (plain concatenation, no
    separator is inserted). Returns the loaded entries.
    """
    with open(ann_file_name) as ann_file:
        entries = json.loads(ann_file.read())

    for entry in entries:
        entry["file_path"] = raw_audio_dir + entry["id"]

    return entries


def load_anns_from_path(ann_file_dir, raw_audio_dir):
    """Load every ``*.json`` annotation file found under a path prefix.

    Args:
        ann_file_dir: prefix that ``"*.json"`` is appended to directly, so
            it should end with a path separator to refer to a directory.
        raw_audio_dir: prefix that each annotation's ``"id"`` is appended
            to (plain concatenation) to form its ``"file_path"``.

    Returns:
        A list with one loaded annotation dict per file, ordered by file
        name.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the order of the loaded annotations reproducible.
    files = sorted(glob.glob(ann_file_dir + "*.json"))

    anns = []
    for ff in files:
        with open(ff) as da:
            ann = json.load(da)
        ann["file_path"] = raw_audio_dir + ann["id"]
        anns.append(ann)

    return anns


class AverageMeter:
    """Track the latest value and the running weighted average of a series.

    Attributes set by ``reset`` and ``update``:
        val: the most recent value passed to ``update``.
        sum: the accumulated ``val * n`` over all updates.
        count: the accumulated ``n`` over all updates.
        avg: ``sum / count`` after the latest update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set all tracked statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        running_mean = self.sum / self.count
        self.avg = running_mean