added code

This commit is contained in:
macaodha 2022-12-09 19:03:24 +00:00
parent 6f5b60404a
commit 0945c1caeb
34 changed files with 4927 additions and 0 deletions

View File

@ -0,0 +1,54 @@
# BatDetect2
<img align="left" width="64" height="64" src="ims/bat_icon.png">
Code for detecting and classifying bat echolocation calls in high frequency audio recordings.
### Getting started
1) Install the Anaconda Python 3.9 distribution for your operating system from [here](https://www.continuum.io/downloads).
2) Download this code from the repository (by clicking on the green button on top right) and unzip it.
3) Create a new environment and install the required packages:
`conda create -y --name batdetect python==3.9`
`conda activate batdetect`
`conda install --file requirements.txt`
### Try the model on Colab
TODO
### Running the model on your own data
After following the above steps you can run the model on your own data by opening the command line where the code is located and typing:
`python run_batdetect.py AUDIO_DIR ANN_DIR DETECTION_THRESHOLD`
`AUDIO_DIR` is the path on your computer to the files of interest.
`ANN_DIR` is the path on your computer where the detailed predictions will be saved. The model will output both `.csv` and `.json` results for each audio file.
`DETECTION_THRESHOLD` is a number between 0 and 1 specifying the cut-off threshold applied to the calls. A smaller number will result in more calls detected, but with the chance of introducing more mistakes:
`python run_batdetect.py example_data/audio/ example_data/anns/ 0.3`
There are also optional arguments, e.g. you can request that the model outputs features (i.e. call parameters) such as duration, max_frequency, etc. by setting the flag `--spec_features`. These will be saved as `*_spec_features.csv` files:
`python run_batdetect.py example_data/audio/ example_data/anns/ 0.3 --spec_features`
You can also specify which model to use by setting the `--model_path` argument. If not specified, it will default to using a model trained on UK data.
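e.g. to run a specific checkpoint (here `PATH_TO_MODEL/model.pth.tar` is a placeholder for your own model file):
`python run_batdetect.py example_data/audio/ example_data/anns/ 0.3 --model_path PATH_TO_MODEL/model.pth.tar`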
### Requirements
The code has been tested using Python 3.9 with the package versions listed in `requirements.txt`.
### FAQ
For more information please consult our [FAQ](faq.md).
### Reference
If you find our work useful in your research please consider citing our paper:
```
@article{batdetect2_2022,
author = {TODO},
title = {TODO},
  journal = {TODO},
year = {2022}
}
```
### Acknowledgements
TODO

0
bat_detect/__init__.py Normal file
View File

@ -0,0 +1,84 @@
import numpy as np
def convert_int_to_freq(spec_ind, spec_height, min_freq, max_freq):
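# converts a row index in the spectrogram (row 0 = top = max_freq) into a frequency in Hz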
spec_ind = spec_height-spec_ind
return round((spec_ind / float(spec_height)) * (max_freq - min_freq) + min_freq, 2)
def extract_spec_slices(spec, pred_nms, params):
"""
Extracts slices from the spectrogram based on the detected call locations.
"""
x_pos = pred_nms['x_pos']
y_pos = pred_nms['y_pos']
bb_width = pred_nms['bb_width']
bb_height = pred_nms['bb_height']
slices = []
# add 20% padding either side of call
pad = bb_width*0.2
x_pos_pad = x_pos - pad
bb_width_pad = bb_width + 2*pad
for ff in range(len(pred_nms['det_probs'])):
x_start = int(np.maximum(0, x_pos_pad[ff]))
x_end = int(np.minimum(spec.shape[1]-1, np.round(x_pos_pad[ff] + bb_width_pad[ff])))
slices.append(spec[:, x_start:x_end].astype(np.float16))
return slices
def get_feature_names():
feature_names = ['duration', 'low_freq_bb', 'high_freq_bb', 'bandwidth',
'max_power_bb', 'max_power', 'max_power_first',
'max_power_second', 'call_interval']
return feature_names
def get_feats(spec, pred_nms, params):
"""
Extracts features from spectrogram based on detected call locations.
Consider re-extracting the spectrogram for this to get better temporal resolution.
For more possible features check out:
https://github.com/YvesBas/Tadarida-D/blob/master/Manual_Tadarida-D.odt
"""
x_pos = pred_nms['x_pos']
y_pos = pred_nms['y_pos']
bb_width = pred_nms['bb_width']
bb_height = pred_nms['bb_height']
feature_names = get_feature_names()
num_detections = len(pred_nms['det_probs'])
features = np.ones((num_detections, len(feature_names)), dtype=np.float32)*-1
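# one row per detection; columns are ordered as in get_feature_names(), and -1 marks features that could not be computed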
for ff in range(num_detections):
x_start = int(np.maximum(0, x_pos[ff]))
x_end = int(np.minimum(spec.shape[1]-1, np.round(x_pos[ff] + bb_width[ff])))
# y_low is the lowest frequency, but it has the larger row index because row 0 is at the top of the array
y_low = int(np.minimum(spec.shape[0]-1, y_pos[ff]))
y_high = int(np.maximum(0, np.round(y_pos[ff] - bb_height[ff])))
spec_slice = spec[:, x_start:x_end]
if spec_slice.shape[1] > 1:
features[ff, 0] = round(pred_nms['end_times'][ff] - pred_nms['start_times'][ff], 5)
features[ff, 1] = int(pred_nms['low_freqs'][ff])
features[ff, 2] = int(pred_nms['high_freqs'][ff])
features[ff, 3] = int(pred_nms['high_freqs'][ff] - pred_nms['low_freqs'][ff])
features[ff, 4] = int(convert_int_to_freq(y_high+spec_slice[y_high:y_low, :].sum(1).argmax(),
spec.shape[0], params['min_freq'], params['max_freq']))
features[ff, 5] = int(convert_int_to_freq(spec_slice.sum(1).argmax(),
spec.shape[0], params['min_freq'], params['max_freq']))
hlf_val = spec_slice.shape[1]//2
features[ff, 6] = int(convert_int_to_freq(spec_slice[:, :hlf_val].sum(1).argmax(),
spec.shape[0], params['min_freq'], params['max_freq']))
features[ff, 7] = int(convert_int_to_freq(spec_slice[:, hlf_val:].sum(1).argmax(),
spec.shape[0], params['min_freq'], params['max_freq']))
if ff > 0:
features[ff, 8] = round(pred_nms['start_times'][ff] - pred_nms['start_times'][ff-1], 5)
return features

View File

@ -0,0 +1,93 @@
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import math
class SelfAttention(nn.Module):
def __init__(self, ip_dim, att_dim):
super(SelfAttention, self).__init__()
# Note: does not encode position information (absolute or relative)
self.temperature = 1.0
self.att_dim = att_dim
self.key_fun = nn.Linear(ip_dim, att_dim)
self.val_fun = nn.Linear(ip_dim, att_dim)
self.que_fun = nn.Linear(ip_dim, att_dim)
self.pro_fun = nn.Linear(att_dim, ip_dim)
def forward(self, x):
x = x.squeeze(2).permute(0,2,1)
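# the projections below are applied manually via each layer's weight/bias; equivalent to self.key_fun(x), self.que_fun(x), self.val_fun(x)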
kk = torch.matmul(x, self.key_fun.weight.T) + self.key_fun.bias.unsqueeze(0).unsqueeze(0)
qq = torch.matmul(x, self.que_fun.weight.T) + self.que_fun.bias.unsqueeze(0).unsqueeze(0)
vv = torch.matmul(x, self.val_fun.weight.T) + self.val_fun.bias.unsqueeze(0).unsqueeze(0)
kk_qq = torch.bmm(kk, qq.permute(0,2,1)) / (self.temperature*self.att_dim)
att_weights = F.softmax(kk_qq, 1) # each col of each attention matrix sums to 1
att = torch.bmm(vv.permute(0,2,1), att_weights)
op = torch.matmul(att.permute(0,2,1), self.pro_fun.weight.T) + self.pro_fun.bias.unsqueeze(0).unsqueeze(0)
op = op.permute(0,2,1).unsqueeze(2)
return op
class ConvBlockDownCoordF(nn.Module):
def __init__(self, in_chn, out_chn, ip_height, k_size=3, pad_size=1, stride=1):
super(ConvBlockDownCoordF, self).__init__()
self.coords = nn.Parameter(torch.linspace(-1, 1, ip_height)[None, None, ..., None], requires_grad=False)
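# fixed channel encoding each row's normalised frequency position; concatenated to the input in forward() (CoordConv style)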
self.conv = nn.Conv2d(in_chn+1, out_chn, kernel_size=k_size, padding=pad_size, stride=stride)
self.conv_bn = nn.BatchNorm2d(out_chn)
def forward(self, x):
freq_info = self.coords.repeat(x.shape[0],1,1,x.shape[3])
x = torch.cat((x, freq_info), 1)
x = F.max_pool2d(self.conv(x), 2, 2)
x = F.relu(self.conv_bn(x), inplace=True)
return x
class ConvBlockDownStandard(nn.Module):
def __init__(self, in_chn, out_chn, ip_height=None, k_size=3, pad_size=1, stride=1):
super(ConvBlockDownStandard, self).__init__()
self.conv = nn.Conv2d(in_chn, out_chn, kernel_size=k_size, padding=pad_size, stride=stride)
self.conv_bn = nn.BatchNorm2d(out_chn)
def forward(self, x):
x = F.max_pool2d(self.conv(x), 2, 2)
x = F.relu(self.conv_bn(x), inplace=True)
return x
class ConvBlockUpF(nn.Module):
def __init__(self, in_chn, out_chn, ip_height, k_size=3, pad_size=1, up_mode='bilinear', up_scale=(2,2)):
super(ConvBlockUpF, self).__init__()
self.up_scale = up_scale
self.up_mode = up_mode
self.coords = nn.Parameter(torch.linspace(-1, 1, ip_height*up_scale[0])[None, None, ..., None], requires_grad=False)
self.conv = nn.Conv2d(in_chn+1, out_chn, kernel_size=k_size, padding=pad_size)
self.conv_bn = nn.BatchNorm2d(out_chn)
def forward(self, x):
op = F.interpolate(x, size=(x.shape[-2]*self.up_scale[0], x.shape[-1]*self.up_scale[1]), mode=self.up_mode, align_corners=False)
freq_info = self.coords.repeat(op.shape[0],1,1,op.shape[3])
op = torch.cat((op, freq_info), 1)
op = self.conv(op)
op = F.relu(self.conv_bn(op), inplace=True)
return op
class ConvBlockUpStandard(nn.Module):
def __init__(self, in_chn, out_chn, ip_height=None, k_size=3, pad_size=1, up_mode='bilinear', up_scale=(2,2)):
super(ConvBlockUpStandard, self).__init__()
self.up_scale = up_scale
self.up_mode = up_mode
self.conv = nn.Conv2d(in_chn, out_chn, kernel_size=k_size, padding=pad_size)
self.conv_bn = nn.BatchNorm2d(out_chn)
def forward(self, x):
op = F.interpolate(x, size=(x.shape[-2]*self.up_scale[0], x.shape[-1]*self.up_scale[1]), mode=self.up_mode, align_corners=False)
op = self.conv(op)
op = F.relu(self.conv_bn(op), inplace=True)
return op

View File

@ -0,0 +1,218 @@
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
from .model_helpers import *
import torchvision
import torch.fft
from torch import nn
class Net2DFast(nn.Module):
def __init__(self, num_filts, num_classes=0, emb_dim=0, ip_height=128, resize_factor=0.5):
super(Net2DFast, self).__init__()
self.num_classes = num_classes
self.emb_dim = emb_dim
self.num_filts = num_filts
self.resize_factor = resize_factor
self.ip_height_rs = ip_height
self.bneck_height = self.ip_height_rs//32
# encoder
self.conv_dn_0 = ConvBlockDownCoordF(1, num_filts//4, self.ip_height_rs, k_size=3, pad_size=1, stride=1)
self.conv_dn_1 = ConvBlockDownCoordF(num_filts//4, num_filts//2, self.ip_height_rs//2, k_size=3, pad_size=1, stride=1)
self.conv_dn_2 = ConvBlockDownCoordF(num_filts//2, num_filts, self.ip_height_rs//4, k_size=3, pad_size=1, stride=1)
self.conv_dn_3 = nn.Conv2d(num_filts, num_filts*2, 3, padding=1)
self.conv_dn_3_bn = nn.BatchNorm2d(num_filts*2)
# bottleneck
self.conv_1d = nn.Conv2d(num_filts*2, num_filts*2, (self.ip_height_rs//8,1), padding=0)
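# kernel spans the full (downsampled) frequency axis, collapsing the height to 1 so there is one feature vector per time step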
self.conv_1d_bn = nn.BatchNorm2d(num_filts*2)
self.att = SelfAttention(num_filts*2, num_filts*2)
# decoder
self.conv_up_2 = ConvBlockUpF(num_filts*2, num_filts//2, self.ip_height_rs//8)
self.conv_up_3 = ConvBlockUpF(num_filts//2, num_filts//4, self.ip_height_rs//4)
self.conv_up_4 = ConvBlockUpF(num_filts//4, num_filts//4, self.ip_height_rs//2)
# output
# +1 to include background class for class output
self.conv_op = nn.Conv2d(num_filts//4, num_filts//4, kernel_size=3, padding=1)
self.conv_op_bn = nn.BatchNorm2d(num_filts//4)
self.conv_size_op = nn.Conv2d(num_filts//4, 2, kernel_size=1, padding=0)
self.conv_classes_op = nn.Conv2d(num_filts//4, self.num_classes+1, kernel_size=1, padding=0)
if self.emb_dim > 0:
self.conv_emb = nn.Conv2d(num_filts, self.emb_dim, kernel_size=1, padding=0)
def forward(self, ip, return_feats=False):
# encoder
x1 = self.conv_dn_0(ip)
x2 = self.conv_dn_1(x1)
x3 = self.conv_dn_2(x2)
x3 = F.relu(self.conv_dn_3_bn(self.conv_dn_3(x3)), inplace=True)
# bottleneck
x = F.relu(self.conv_1d_bn(self.conv_1d(x3)), inplace=True)
x = self.att(x)
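# broadcast the height-1 bottleneck features back across the frequency axis to match the encoder feature map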
x = x.repeat([1,1,self.bneck_height*4,1])
# decoder
x = self.conv_up_2(x+x3)
x = self.conv_up_3(x+x2)
x = self.conv_up_4(x+x1)
# output
x = F.relu(self.conv_op_bn(self.conv_op(x)), inplace=True)
cls = self.conv_classes_op(x)
comb = torch.softmax(cls, 1)
op = {}
op['pred_det'] = comb[:,:-1, :, :].sum(1).unsqueeze(1)
op['pred_size'] = F.relu(self.conv_size_op(x), inplace=True)
op['pred_class'] = comb
op['pred_class_un_norm'] = cls
if self.emb_dim > 0:
op['pred_emb'] = self.conv_emb(x)
if return_feats:
op['features'] = x
return op
class Net2DFastNoAttn(nn.Module):
def __init__(self, num_filts, num_classes=0, emb_dim=0, ip_height=128, resize_factor=0.5):
super(Net2DFastNoAttn, self).__init__()
self.num_classes = num_classes
self.emb_dim = emb_dim
self.num_filts = num_filts
self.resize_factor = resize_factor
self.ip_height_rs = ip_height
self.bneck_height = self.ip_height_rs//32
self.conv_dn_0 = ConvBlockDownCoordF(1, num_filts//4, self.ip_height_rs, k_size=3, pad_size=1, stride=1)
self.conv_dn_1 = ConvBlockDownCoordF(num_filts//4, num_filts//2, self.ip_height_rs//2, k_size=3, pad_size=1, stride=1)
self.conv_dn_2 = ConvBlockDownCoordF(num_filts//2, num_filts, self.ip_height_rs//4, k_size=3, pad_size=1, stride=1)
self.conv_dn_3 = nn.Conv2d(num_filts, num_filts*2, 3, padding=1)
self.conv_dn_3_bn = nn.BatchNorm2d(num_filts*2)
self.conv_1d = nn.Conv2d(num_filts*2, num_filts*2, (self.ip_height_rs//8,1), padding=0)
self.conv_1d_bn = nn.BatchNorm2d(num_filts*2)
self.conv_up_2 = ConvBlockUpF(num_filts*2, num_filts//2, self.ip_height_rs//8)
self.conv_up_3 = ConvBlockUpF(num_filts//2, num_filts//4, self.ip_height_rs//4)
self.conv_up_4 = ConvBlockUpF(num_filts//4, num_filts//4, self.ip_height_rs//2)
# output
# +1 to include background class for class output
self.conv_op = nn.Conv2d(num_filts//4, num_filts//4, kernel_size=3, padding=1)
self.conv_op_bn = nn.BatchNorm2d(num_filts//4)
self.conv_size_op = nn.Conv2d(num_filts//4, 2, kernel_size=1, padding=0)
self.conv_classes_op = nn.Conv2d(num_filts//4, self.num_classes+1, kernel_size=1, padding=0)
if self.emb_dim > 0:
self.conv_emb = nn.Conv2d(num_filts, self.emb_dim, kernel_size=1, padding=0)
def forward(self, ip, return_feats=False):
x1 = self.conv_dn_0(ip)
x2 = self.conv_dn_1(x1)
x3 = self.conv_dn_2(x2)
x3 = F.relu(self.conv_dn_3_bn(self.conv_dn_3(x3)), inplace=True)
x = F.relu(self.conv_1d_bn(self.conv_1d(x3)), inplace=True)
x = x.repeat([1,1,self.bneck_height*4,1])
x = self.conv_up_2(x+x3)
x = self.conv_up_3(x+x2)
x = self.conv_up_4(x+x1)
x = F.relu(self.conv_op_bn(self.conv_op(x)), inplace=True)
cls = self.conv_classes_op(x)
comb = torch.softmax(cls, 1)
op = {}
op['pred_det'] = comb[:,:-1, :, :].sum(1).unsqueeze(1)
op['pred_size'] = F.relu(self.conv_size_op(x), inplace=True)
op['pred_class'] = comb
op['pred_class_un_norm'] = cls
if self.emb_dim > 0:
op['pred_emb'] = self.conv_emb(x)
if return_feats:
op['features'] = x
return op
class Net2DFastNoCoordConv(nn.Module):
def __init__(self, num_filts, num_classes=0, emb_dim=0, ip_height=128, resize_factor=0.5):
super(Net2DFastNoCoordConv, self).__init__()
self.num_classes = num_classes
self.emb_dim = emb_dim
self.num_filts = num_filts
self.resize_factor = resize_factor
self.ip_height_rs = ip_height
self.bneck_height = self.ip_height_rs//32
self.conv_dn_0 = ConvBlockDownStandard(1, num_filts//4, self.ip_height_rs, k_size=3, pad_size=1, stride=1)
self.conv_dn_1 = ConvBlockDownStandard(num_filts//4, num_filts//2, self.ip_height_rs//2, k_size=3, pad_size=1, stride=1)
self.conv_dn_2 = ConvBlockDownStandard(num_filts//2, num_filts, self.ip_height_rs//4, k_size=3, pad_size=1, stride=1)
self.conv_dn_3 = nn.Conv2d(num_filts, num_filts*2, 3, padding=1)
self.conv_dn_3_bn = nn.BatchNorm2d(num_filts*2)
self.conv_1d = nn.Conv2d(num_filts*2, num_filts*2, (self.ip_height_rs//8,1), padding=0)
self.conv_1d_bn = nn.BatchNorm2d(num_filts*2)
self.att = SelfAttention(num_filts*2, num_filts*2)
self.conv_up_2 = ConvBlockUpStandard(num_filts*2, num_filts//2, self.ip_height_rs//8)
self.conv_up_3 = ConvBlockUpStandard(num_filts//2, num_filts//4, self.ip_height_rs//4)
self.conv_up_4 = ConvBlockUpStandard(num_filts//4, num_filts//4, self.ip_height_rs//2)
# output
# +1 to include background class for class output
self.conv_op = nn.Conv2d(num_filts//4, num_filts//4, kernel_size=3, padding=1)
self.conv_op_bn = nn.BatchNorm2d(num_filts//4)
self.conv_size_op = nn.Conv2d(num_filts//4, 2, kernel_size=1, padding=0)
self.conv_classes_op = nn.Conv2d(num_filts//4, self.num_classes+1, kernel_size=1, padding=0)
if self.emb_dim > 0:
self.conv_emb = nn.Conv2d(num_filts, self.emb_dim, kernel_size=1, padding=0)
def forward(self, ip, return_feats=False):
x1 = self.conv_dn_0(ip)
x2 = self.conv_dn_1(x1)
x3 = self.conv_dn_2(x2)
x3 = F.relu(self.conv_dn_3_bn(self.conv_dn_3(x3)), inplace=True)
x = F.relu(self.conv_1d_bn(self.conv_1d(x3)), inplace=True)
x = self.att(x)
x = x.repeat([1,1,self.bneck_height*4,1])
x = self.conv_up_2(x+x3)
x = self.conv_up_3(x+x2)
x = self.conv_up_4(x+x1)
x = F.relu(self.conv_op_bn(self.conv_op(x)), inplace=True)
cls = self.conv_classes_op(x)
comb = torch.softmax(cls, 1)
op = {}
op['pred_det'] = comb[:,:-1, :, :].sum(1).unsqueeze(1)
op['pred_size'] = F.relu(self.conv_size_op(x), inplace=True)
op['pred_class'] = comb
op['pred_class_un_norm'] = cls
if self.emb_dim > 0:
op['pred_emb'] = self.conv_emb(x)
if return_feats:
op['features'] = x
return op

View File

@ -0,0 +1,108 @@
import numpy as np
import os
import datetime
def mk_dir(path):
if not os.path.isdir(path):
os.makedirs(path)
def get_params(make_dirs=False, exps_dir='../../experiments/'):
params = {}
params['model_name'] = 'Net2DFast' # Net2DFast, Net2DSkip, Net2DSimple, Net2DSkipDS, Net2DRN
params['num_filters'] = 128
now_str = datetime.datetime.now().strftime("%Y_%m_%d__%H_%M_%S")
model_name = now_str + '.pth.tar'
params['experiment'] = os.path.join(exps_dir, now_str, '')
params['model_file_name'] = os.path.join(params['experiment'], model_name)
params['op_im_dir'] = os.path.join(params['experiment'], 'op_ims', '')
params['op_im_dir_test'] = os.path.join(params['experiment'], 'op_ims_test', '')
#params['notes'] = '' # can save notes about an experiment here
# spec parameters
params['target_samp_rate'] = 256000 # resamples all audio so that it is at this rate
params['fft_win_length'] = 512 / 256000.0 # in seconds, duration of the STFT window
params['fft_overlap'] = 0.75 # stft window overlap
params['max_freq'] = 120000 # in Hz, everything above this will be discarded
params['min_freq'] = 10000 # in Hz, everything below this will be discarded
params['resize_factor'] = 0.5 # factor by which the spectrogram is resized before being passed to the network
params['spec_height'] = 256 # units are number of frequency bins (before resizing is performed)
params['spec_train_width'] = 512 # units are number of time steps (before resizing is performed)
params['spec_divide_factor'] = 32 # spectrogram should be divisible by this amount in width and height
# spec processing params
params['denoise_spec_avg'] = True # removes the mean for each frequency band
params['scale_raw_audio'] = False # scales the raw audio to [-1, 1]
params['max_scale_spec'] = False # scales the spectrogram so that it is max 1
params['spec_scale'] = 'pcen' # 'log', 'pcen', 'none'
# detection params
params['detection_overlap'] = 0.01 # a prediction has to be within this many seconds of a GT call to count as a detection
params['ignore_start_end'] = 0.01 # GT calls that start within this many seconds of the start/end of the file are ignored
params['detection_threshold'] = 0.01 # the smaller this is the better the recall will be
params['nms_kernel_size'] = 9
params['nms_top_k_per_sec'] = 200 # keep top K highest predictions per second of audio
params['target_sigma'] = 2.0
# augmentation params
params['aug_prob'] = 0.20 # augmentations will be performed with this probability
params['augment_at_train'] = True
params['augment_at_train_combine'] = True
params['echo_max_delay'] = 0.005 # simulate echo by adding copy of raw audio
params['stretch_squeeze_delta'] = 0.04 # stretch or squeeze spec
params['mask_max_time_perc'] = 0.05 # max mask size - here percentage, not ideal
params['mask_max_freq_perc'] = 0.10 # max mask size - here percentage, not ideal
params['spec_amp_scaling'] = 2.0 # multiply the "volume" by 0:X times current amount
params['aug_sampling_rates'] = [220500, 256000, 300000, 312500, 384000, 441000, 500000]
# loss params
params['train_loss'] = 'focal' # mse or focal
params['det_loss_weight'] = 1.0 # weight for the detection part of the loss
params['size_loss_weight'] = 0.1 # weight for the bbox size loss
params['class_loss_weight'] = 2.0 # weight for the classification loss
params['individual_loss_weight'] = 0.0 # not used
if params['individual_loss_weight'] == 0.0:
params['emb_dim'] = 0 # number of dimensions used for individual id embedding
else:
params['emb_dim'] = 3
# train params
params['lr'] = 0.001
params['batch_size'] = 8
params['num_workers'] = 4
params['num_epochs'] = 200
params['num_eval_epochs'] = 5 # run evaluation every X epochs
params['device'] = 'cuda'
params['save_test_image_during_train'] = False
params['save_test_image_after_train'] = True
params['convert_to_genus'] = False
params['genus_mapping'] = []
params['class_names'] = []
params['classes_to_ignore'] = ['', ' ', 'Unknown', 'Not Bat']
params['generic_class'] = ['Bat']
params['events_of_interest'] = ['Echolocation'] # will ignore all other types of events e.g. social calls
# the classes in this list are standardized during training so that the same low and high freq are used
params['standardize_classs_names'] = []
# create directories
if make_dirs:
print('Model name : ' + params['model_name'])
print('Model file : ' + params['model_file_name'])
print('Experiment : ' + params['experiment'])
mk_dir(params['experiment'])
if params['save_test_image_during_train']:
mk_dir(params['op_im_dir'])
if params['save_test_image_after_train']:
mk_dir(params['op_im_dir_test'])
mk_dir(os.path.dirname(params['model_file_name']))
return params

View File

@ -0,0 +1,100 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
def x_coords_to_time(x_pos, sampling_rate, fft_win_length, fft_overlap):
nfft = int(fft_win_length*sampling_rate)
noverlap = int(fft_overlap*nfft)
return ((x_pos*(nfft - noverlap)) + noverlap) / sampling_rate
#return (1.0 - fft_overlap) * fft_win_length * (x_pos + 0.5) # 0.5 is for center of temporal window
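# e.g. with the default fft_win_length (512/256000 s), fft_overlap (0.75) and a 256 kHz sampling rate: nfft = 512, noverlap = 384, so each x step corresponds to 128 samples (0.5 ms)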
def overall_class_pred(det_prob, class_prob):
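# weight the per-class probabilities by the detection probabilities and renormalise to give a single class distribution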
weighted_pred = (class_prob*det_prob).sum(1)
return weighted_pred / weighted_pred.sum()
def run_nms(outputs, params, sampling_rate):
pred_det = outputs['pred_det'] # probability of box
pred_size = outputs['pred_size'] # box size
pred_det_nms = non_max_suppression(pred_det, params['nms_kernel_size'])
freq_rescale = (params['max_freq'] - params['min_freq']) /pred_det.shape[-2]
# NOTE there will be small differences depending on which sampling rate is chosen
# as we are choosing the same sampling rate for the entire batch
duration = x_coords_to_time(pred_det.shape[-1], sampling_rate[0].item(),
params['fft_win_length'], params['fft_overlap'])
top_k = int(duration * params['nms_top_k_per_sec'])
scores, y_pos, x_pos = get_topk_scores(pred_det_nms, top_k)
# loop over batch to save outputs
preds = []
feats = []
for ii in range(pred_det_nms.shape[0]):
# get valid indices
inds_ord = torch.argsort(x_pos[ii, :])
valid_inds = scores[ii, inds_ord] > params['detection_threshold']
valid_inds = inds_ord[valid_inds]
# create result dictionary
pred = {}
pred['det_probs'] = scores[ii, valid_inds]
pred['x_pos'] = x_pos[ii, valid_inds]
pred['y_pos'] = y_pos[ii, valid_inds]
pred['bb_width'] = pred_size[ii, 0, pred['y_pos'], pred['x_pos']]
pred['bb_height'] = pred_size[ii, 1, pred['y_pos'], pred['x_pos']]
pred['start_times'] = x_coords_to_time(pred['x_pos'].float() / params['resize_factor'],
sampling_rate[ii].item(), params['fft_win_length'], params['fft_overlap'])
pred['end_times'] = x_coords_to_time((pred['x_pos'].float()+pred['bb_width']) / params['resize_factor'],
sampling_rate[ii].item(), params['fft_win_length'], params['fft_overlap'])
pred['low_freqs'] = (pred_size[ii].shape[1] - pred['y_pos'].float())*freq_rescale + params['min_freq']
pred['high_freqs'] = pred['low_freqs'] + pred['bb_height']*freq_rescale
# extract the per class votes
if 'pred_class' in outputs:
pred['class_probs'] = outputs['pred_class'][ii, :, y_pos[ii, valid_inds], x_pos[ii, valid_inds]]
# extract the model features
if 'features' in outputs:
feat = outputs['features'][ii, :, y_pos[ii, valid_inds], x_pos[ii, valid_inds]].transpose(0, 1)
feat = feat.cpu().numpy().astype(np.float32)
feats.append(feat)
# convert to numpy
for kk in pred.keys():
pred[kk] = pred[kk].cpu().numpy().astype(np.float32)
preds.append(pred)
return preds, feats
def non_max_suppression(heat, kernel_size):
# kernel can be an int or list/tuple
if type(kernel_size) is int:
kernel_size_h = kernel_size
kernel_size_w = kernel_size
else:
kernel_size_h, kernel_size_w = kernel_size
pad_h = (kernel_size_h - 1) // 2
pad_w = (kernel_size_w - 1) // 2
hmax = nn.functional.max_pool2d(heat, (kernel_size_h, kernel_size_w), stride=1, padding=(pad_h, pad_w))
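# a location is kept only if it equals the maximum within its local window, i.e. it is a local peak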
keep = (hmax == heat).float()
return heat * keep
def get_topk_scores(scores, K):
# expects input of size: batch x 1 x height x width
batch, _, height, width = scores.size()
topk_scores, topk_inds = torch.topk(scores.view(batch, -1), K)
topk_inds = topk_inds % (height * width)
topk_ys = (topk_inds // width).long()
topk_xs = (topk_inds % width).long()
return topk_scores, topk_ys, topk_xs

View File

@ -0,0 +1,581 @@
"""
Evaluates trained model on test set and generates plots.
"""
import numpy as np
import sys
import os
import copy
import json
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import argparse
sys.path.append('../../')
import bat_detect.utils.detector_utils as du
import bat_detect.train.train_utils as tu
import bat_detect.detector.parameters as parameters
import bat_detect.train.evaluate as evl
import bat_detect.utils.plot_utils as pu
def get_blank_annotation(ip_str):
res = {}
res['class_name'] = ''
res['duration'] = -1
res['id'] = ''# fileName
res['issues'] = False
res['notes'] = ip_str
res['time_exp'] = 1
res['annotated'] = False
res['annotation'] = []
ann = {}
ann['class'] = ''
ann['event'] = 'Echolocation'
ann['individual'] = -1
ann['start_time'] = -1
ann['end_time'] = -1
ann['low_freq'] = -1
ann['high_freq'] = -1
ann['confidence'] = -1
return copy.deepcopy(res), copy.deepcopy(ann)
def get_default_bd_args():
args = {}
args['detection_threshold'] = 0.001
args['time_expansion_factor'] = 1
args['audio_dir'] = ''
args['ann_dir'] = ''
args['spec_slices'] = False
args['chunk_size'] = 3
args['spec_features'] = False
args['cnn_features'] = False
args['quiet'] = True
args['save_preds_if_empty'] = True
args['ann_dir'] = os.path.join(args['ann_dir'], '')
return args
def create_genus_mapping(gt_test, preds, class_names):
# rolls the per class predictions and ground truth back up to genus level
class_names_genus, cls_to_genus = np.unique([cc.split(' ')[0] for cc in class_names], return_inverse=True)
genus_to_cls_map = [np.where(np.array(cls_to_genus) == cc)[0] for cc in range(len(class_names_genus))]
gt_test_g = []
for gg in gt_test:
gg_g = copy.deepcopy(gg)
inds = np.where(gg_g['class_ids']!=-1)[0]
gg_g['class_ids'][inds] = cls_to_genus[gg_g['class_ids'][inds]]
gt_test_g.append(gg_g)
# note, entries can be greater than one as we are summing across the respective classes
preds_g = []
for pp in preds:
pp_g = copy.deepcopy(pp)
pp_g['class_probs'] = np.zeros((len(class_names_genus), pp_g['class_probs'].shape[1]), dtype=np.float32)
for cc, inds in enumerate(genus_to_cls_map):
pp_g['class_probs'][cc, :] = pp['class_probs'][inds, :].sum(0)
preds_g.append(pp_g)
return class_names_genus, preds_g, gt_test_g
def load_tadarida_pred(ip_dir, dataset, file_of_interest):
res, ann = get_blank_annotation('Generated by Tadarida')
# create the annotations in the correct format
da_c = pd.read_csv(ip_dir + dataset + '/' + file_of_interest.replace('.wav', '.ta').replace('.WAV', '.ta'), sep='\t')
res_c = copy.deepcopy(res)
res_c['id'] = file_of_interest
res_c['dataset'] = dataset
res_c['feats'] = da_c.iloc[:, 6:].values.astype(np.float32)
if da_c.shape[0] > 0:
res_c['class_name'] = ''
res_c['class_prob'] = 0.0
for aa in range(da_c.shape[0]):
ann_c = copy.deepcopy(ann)
ann_c['class'] = 'Not Bat' # will assign to class later
ann_c['start_time'] = np.round(da_c.iloc[aa]['StTime']/1000.0 ,5)
ann_c['end_time'] = np.round((da_c.iloc[aa]['StTime'] + da_c.iloc[aa]['Dur'])/1000.0, 5)
ann_c['low_freq'] = np.round(da_c.iloc[aa]['Fmin'] * 1000.0, 2)
ann_c['high_freq'] = np.round(da_c.iloc[aa]['Fmax'] * 1000.0, 2)
ann_c['det_prob'] = 0.0
res_c['annotation'].append(ann_c)
return res_c
def load_sonobat_meta(ip_dir, datasets, region_classifier, class_names, only_accepted_species=True):
sp_dict = {}
for ss in class_names:
sp_key = ss.split(' ')[0][:3] + ss.split(' ')[1][:3]
sp_dict[sp_key] = ss
sp_dict['x'] = '' # not bat
sp_dict['Bat'] = 'Bat'
sonobat_meta = {}
for tt in datasets:
dataset = tt['dataset_name']
sb_ip_dir = ip_dir + dataset + '/' + region_classifier + '/'
# load the call level predictions
ip_file_p = sb_ip_dir + dataset + '_Parameters_v4.5.0.txt'
#ip_file_p = sb_ip_dir + 'audio_SonoBatch_v30.0 beta.txt'
da = pd.read_csv(ip_file_p, sep='\t')
# load the file level predictions
ip_file_b = sb_ip_dir + dataset + '_SonoBatch_v4.5.0.txt'
#ip_file_b = sb_ip_dir + 'audio_CumulativeParameters_v30.0 beta.txt'
with open(ip_file_b) as f:
lines = f.readlines()
lines = [x.strip() for x in lines]
del lines[0]
file_res = {}
for ll in lines:
# note this does not seem to parse the file very well
ll_data = ll.split('\t')
# there are sometimes many different species names per file
if only_accepted_species:
# only choosing "SppAccp"
ind = 4
else:
# choosing ""~Spp" if "SppAccp" does not exist
if ll_data[4] != 'x':
ind = 4 # choosing "SppAccp", along with "Prob" here
else:
ind = 8 # choosing "~Spp", along with "~Prob" here
sp_name_1 = sp_dict[ll_data[ind]]
prob_1 = ll_data[ind+1]
if prob_1 == 'x':
prob_1 = 0.0
file_res[ll_data[1]] = {'id':ll_data[1], 'species_1':sp_name_1, 'prob_1':prob_1}
sonobat_meta[dataset] = {}
sonobat_meta[dataset]['file_res'] = file_res
sonobat_meta[dataset]['call_info'] = da
return sonobat_meta
def load_sonobat_preds(dataset, id, sb_meta, set_class_name=None):
# create the annotations in the correct format
res, ann = get_blank_annotation('Generated by Sonobat')
res_c = copy.deepcopy(res)
res_c['id'] = id
res_c['dataset'] = dataset
da = sb_meta[dataset]['call_info']
da_c = da[da['Filename'] == id]
file_res = sb_meta[dataset]['file_res']
res_c['feats'] = np.zeros((0,0))
if da_c.shape[0] > 0:
res_c['class_name'] = file_res[id]['species_1']
res_c['class_prob'] = file_res[id]['prob_1']
res_c['feats'] = da_c.iloc[:, 3:105].values.astype(np.float32)
for aa in range(da_c.shape[0]):
ann_c = copy.deepcopy(ann)
if set_class_name is None:
ann_c['class'] = file_res[id]['species_1']
else:
ann_c['class'] = set_class_name
ann_c['start_time'] = np.round(da_c.iloc[aa]['TimeInFile'] / 1000.0 ,5)
ann_c['end_time'] = np.round(ann_c['start_time'] + da_c.iloc[aa]['CallDuration']/1000.0, 5)
ann_c['low_freq'] = np.round(da_c.iloc[aa]['LowFreq'] * 1000.0, 2)
ann_c['high_freq'] = np.round(da_c.iloc[aa]['HiFreq'] * 1000.0, 2)
ann_c['det_prob'] = np.round(da_c.iloc[aa]['Quality'], 3)
res_c['annotation'].append(ann_c)
return res_c
def bb_overlap(bb_g_in, bb_p_in):
freq_scale = 10000000.0 # ensure that both axes are roughly in the same range
bb_g = [bb_g_in['start_time'], bb_g_in['low_freq']/freq_scale, bb_g_in['end_time'], bb_g_in['high_freq']/freq_scale]
bb_p = [bb_p_in['start_time'], bb_p_in['low_freq']/freq_scale, bb_p_in['end_time'], bb_p_in['high_freq']/freq_scale]
xA = max(bb_g[0], bb_p[0])
yA = max(bb_g[1], bb_p[1])
xB = min(bb_g[2], bb_p[2])
yB = min(bb_g[3], bb_p[3])
# compute the area of intersection rectangle
inter_area = abs(max((xB - xA, 0.0)) * max((yB - yA), 0.0))
if inter_area == 0:
iou = 0.0
else:
# compute the area of both
bb_area_g = abs((bb_g[2] - bb_g[0]) * (bb_g[3] - bb_g[1]))
bb_area_p = abs((bb_p[2] - bb_p[0]) * (bb_p[3] - bb_p[1]))
iou = inter_area / float(bb_area_g + bb_area_p - inter_area)
return iou
def assign_to_gt(gt, pred, iou_thresh):
# this will edit pred in place
num_preds = len(pred['annotation'])
num_gts = len(gt['annotation'])
if num_preds > 0 and num_gts > 0:
iou_m = np.zeros((num_preds, num_gts))
for ii in range(num_preds):
for jj in range(num_gts):
iou_m[ii, jj] = bb_overlap(gt['annotation'][jj], pred['annotation'][ii])
# greedily assign detections to ground truths
# needs to be greater than some threshold and we cannot assign GT
# to more than one detection
# TODO could try to do an optimal assignment
for jj in range(num_gts):
max_iou = np.argmax(iou_m[:, jj])
if iou_m[max_iou, jj] > iou_thresh:
pred['annotation'][max_iou]['class'] = gt['annotation'][jj]['class']
iou_m[max_iou, :] = -1.0
return pred
def parse_data(data, class_names, non_event_classes, is_pred=False):
class_names_all = class_names + non_event_classes
data['class_names'] = np.array([aa['class'] for aa in data['annotation']])
data['start_times'] = np.array([aa['start_time'] for aa in data['annotation']])
data['end_times'] = np.array([aa['end_time'] for aa in data['annotation']])
data['high_freqs'] = np.array([float(aa['high_freq']) for aa in data['annotation']])
data['low_freqs'] = np.array([float(aa['low_freq']) for aa in data['annotation']])
if is_pred:
# when loading predictions
data['det_probs'] = np.array([float(aa['det_prob']) for aa in data['annotation']])
data['class_probs'] = np.zeros((len(class_names)+1, len(data['annotation'])))
data['class_ids'] = np.array([class_names_all.index(aa['class']) for aa in data['annotation']]).astype(np.int32)
else:
# when loading ground truth
# if the class label is not in the set of interest then set to -1
labels = []
for aa in data['annotation']:
if aa['class'] in class_names:
labels.append(class_names_all.index(aa['class']))
else:
labels.append(-1)
data['class_ids'] = np.array(labels).astype(np.int32)
return data
def load_gt_data(datasets, events_of_interest, class_names, classes_to_ignore):
gt_data = []
for dd in datasets:
print('\n' + dd['dataset_name'])
gt_dataset = tu.load_set_of_anns([dd], events_of_interest=events_of_interest, verbose=True)
gt_dataset = [parse_data(gg, class_names, classes_to_ignore, False) for gg in gt_dataset]
for gt in gt_dataset:
gt['dataset_name'] = dd['dataset_name']
gt_data.extend(gt_dataset)
return gt_data
def train_rf_model(x_train, y_train, num_classes, seed=2001):
# TODO search for the best hyper parameters on val set
# Currently only training on the species and 'not bat' - exclude 'generic_class' which is last
# alternative would be to first have a "bat" vs "not bat" classifier, and then a species classifier?
x_train = np.vstack(x_train)
y_train = np.hstack(y_train)
inds = np.where(y_train < num_classes)[0]
x_train = x_train[inds, :]
y_train = y_train[inds]
un_train_class = np.unique(y_train)
clf = RandomForestClassifier(random_state=seed, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_train)
tr_acc = (y_pred==y_train).mean()
#print('Train acc', round(tr_acc*100, 2))
return clf, un_train_class
def eval_rf_model(clf, pred, un_train_class, num_classes):
# stores the prediction in place
if pred['feats'].shape[0] > 0:
pred['class_probs'] = np.zeros((num_classes, pred['feats'].shape[0]))
pred['class_probs'][un_train_class, :] = clf.predict_proba(pred['feats']).T
pred['det_probs'] = pred['class_probs'][:-1, :].sum(0)
else:
pred['class_probs'] = np.zeros((num_classes, 0))
pred['det_probs'] = np.zeros(0)
return pred
def save_summary_to_json(op_dir, mod_name, results):
op = {}
op['avg_prec'] = round(results['avg_prec'], 3)
op['avg_prec_class'] = round(results['avg_prec_class'], 3)
op['top_class'] = round(results['top_class']['avg_prec'], 3)
op['file_acc'] = round(results['file_acc'], 3)
op['model'] = mod_name
op['per_class'] = {}
for cc in results['class_pr']:
op['per_class'][cc['name']] = cc['avg_prec']
op_file_name = os.path.join(op_dir, mod_name + '_results.json')
with open(op_file_name, 'w') as da:
json.dump(op, da, indent=2)
def print_results(model_name, mod_str, results, op_dir, class_names, file_type, title_text=''):
print('\nResults - ' + model_name)
print('avg_prec ', round(results['avg_prec'], 3))
print('avg_prec_class', round(results['avg_prec_class'], 3))
print('top_class ', round(results['top_class']['avg_prec'], 3))
print('file_acc ', round(results['file_acc'], 3))
print('\nSaving ' + model_name + ' results to: ' + op_dir)
save_summary_to_json(op_dir, mod_str, results)
pu.plot_pr_curve(op_dir, mod_str+'_test_all_det', mod_str+'_test_all_det', results, file_type, title_text + 'Detection PR')
pu.plot_pr_curve(op_dir, mod_str+'_test_all_top_class', mod_str+'_test_all_top_class', results['top_class'], file_type, title_text + 'Top Class')
pu.plot_pr_curve_class(op_dir, mod_str+'_test_all_class', mod_str+'_test_all_class', results, file_type, title_text + 'Per-Class PR')
pu.plot_confusion_matrix(op_dir, mod_str+'_confusion', results['gt_valid_file'], results['pred_valid_file'],
results['file_acc'], class_names, True, file_type, title_text + 'Confusion Matrix')
def add_root_path_back(data_sets, ann_path, wav_path):
for dd in data_sets:
dd['ann_path'] = os.path.join(ann_path, dd['ann_path'])
dd['wav_path'] = os.path.join(wav_path, dd['wav_path'])
return data_sets
def check_classes_in_train(gt_list, class_names):
num_gt_total = np.sum([gg['start_times'].shape[0] for gg in gt_list])
num_with_no_class = 0
for gt in gt_list:
for cc in gt['class_names']:
if cc not in class_names:
num_with_no_class += 1
return num_with_no_class
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('op_dir', type=str, default='plots/results_compare/',
help='Output directory for plots')
parser.add_argument('data_dir', type=str,
help='Path to root of datasets')
parser.add_argument('ann_dir', type=str,
help='Path to extracted annotations')
parser.add_argument('bd_model_path', type=str,
help='Path to BatDetect model')
parser.add_argument('--test_file', type=str, default='',
help='Path to json file used for evaluation.')
parser.add_argument('--sb_ip_dir', type=str, default='',
help='Path to sonobat predictions')
parser.add_argument('--sb_region_classifier', type=str, default='south',
help='Name of the Sonobat region classifier')
parser.add_argument('--td_ip_dir', type=str, default='',
help='Path to tadarida_D predictions')
parser.add_argument('--iou_thresh', type=float, default=0.01,
help='IOU threshold for assigning predictions to ground truth')
parser.add_argument('--file_type', type=str, default='png',
help='Type of image to save - png or pdf')
parser.add_argument('--title_text', type=str, default='',
help='Text to add as title of plots')
parser.add_argument('--rand_seed', type=int, default=2001,
help='Random seed')
args = vars(parser.parse_args())
np.random.seed(args['rand_seed'])
if not os.path.isdir(args['op_dir']):
os.makedirs(args['op_dir'])
# load the model
params_eval = parameters.get_params(False)
_, params_bd = du.load_model(args['bd_model_path'])
class_names = params_bd['class_names']
num_classes = len(class_names) + 1 # num classes plus background class
classes_to_ignore = ['Not Bat', 'Bat', 'Unknown']
events_of_interest = ['Echolocation']
# load test data
if args['test_file'] == '':
# load the test files of interest from the trained model
test_sets = add_root_path_back(params_bd['test_sets'], args['ann_dir'], args['data_dir'])
test_sets = [dd for dd in test_sets if not dd['is_binary']] # exclude bat/not datasets
else:
# user specified annotation file to evaluate
test_dict = {}
test_dict['dataset_name'] = args['test_file'].replace('.json', '')
test_dict['is_test'] = True
test_dict['is_binary'] = True
test_dict['ann_path'] = os.path.join(args['ann_dir'], args['test_file'])
test_dict['wav_path'] = args['data_dir']
test_sets = [test_dict]
# load the gt for the test set
gt_test = load_gt_data(test_sets, events_of_interest, class_names, classes_to_ignore)
total_num_calls = np.sum([gg['start_times'].shape[0] for gg in gt_test])
print('\nTotal number of test files:', len(gt_test))
print('Total number of test calls:', np.sum([gg['start_times'].shape[0] for gg in gt_test]))
# check if test contains classes not in the train set
num_with_no_class = check_classes_in_train(gt_test, class_names)
if total_num_calls == num_with_no_class:
print('Error: none of the test set classes are present in the train set.')
assert False
# only need the train data if evaluating Sonobat or Tadarida
if args['sb_ip_dir'] != '' or args['td_ip_dir'] != '':
train_sets = add_root_path_back(params_bd['train_sets'], args['ann_dir'], args['data_dir'])
train_sets = [dd for dd in train_sets if not dd['is_binary']] # exclude bat/not datasets
gt_train = load_gt_data(train_sets, events_of_interest, class_names, classes_to_ignore)
#
# evaluate Sonobat by training random forest classifier
#
# NOTE: Sonobat may only make predictions for a subset of the files
#
if args['sb_ip_dir'] != '':
sb_meta = load_sonobat_meta(args['sb_ip_dir'], train_sets + test_sets, args['sb_region_classifier'], class_names)
preds_sb = []
keep_inds_sb = []
for ii, gt in enumerate(gt_test):
sb_pred = load_sonobat_preds(gt['dataset_name'], gt['id'], sb_meta)
if sb_pred['class_name'] != '':
sb_pred = parse_data(sb_pred, class_names, classes_to_ignore, True)
sb_pred['class_probs'][sb_pred['class_ids'], np.arange(sb_pred['class_probs'].shape[1])] = sb_pred['det_probs']
preds_sb.append(sb_pred)
keep_inds_sb.append(ii)
results_sb = evl.evaluate_predictions([gt_test[ii] for ii in keep_inds_sb], preds_sb, class_names,
params_eval['detection_overlap'], params_eval['ignore_start_end'])
print_results('Sonobat', 'sb', results_sb, args['op_dir'], class_names,
args['file_type'], args['title_text'] + ' - Species - ')
print('Only reporting results for', len(keep_inds_sb), 'files, out of', len(gt_test))
# train our own random forest on sonobat features
x_train = []
y_train = []
for gt in gt_train:
pred = load_sonobat_preds(gt['dataset_name'], gt['id'], sb_meta, 'Not Bat')
if len(pred['annotation']) > 0:
# compute detection overlap with ground truth to determine which are the TP detections
assign_to_gt(gt, pred, args['iou_thresh'])
pred = parse_data(pred, class_names, classes_to_ignore, True)
x_train.append(pred['feats'])
y_train.append(pred['class_ids'])
# train random forest on Sonobat features
clf_sb, un_train_class = train_rf_model(x_train, y_train, num_classes, args['rand_seed'])
# run the model on the test set
preds_sb_rf = []
for gt in gt_test:
pred = load_sonobat_preds(gt['dataset_name'], gt['id'], sb_meta, 'Not Bat')
pred = parse_data(pred, class_names, classes_to_ignore, True)
pred = eval_rf_model(clf_sb, pred, un_train_class, num_classes)
preds_sb_rf.append(pred)
results_sb_rf = evl.evaluate_predictions(gt_test, preds_sb_rf, class_names,
params_eval['detection_overlap'], params_eval['ignore_start_end'])
print_results('Sonobat RF', 'sb_rf', results_sb_rf, args['op_dir'], class_names,
args['file_type'], args['title_text'] + ' - Species - ')
print('\n\nWARNING\nThis is evaluating on the full test set, but there are only detections for a subset of files\n\n')
#
# evaluate Tadarida-D by training random forest classifier
#
if args['td_ip_dir'] != '':
x_train = []
y_train = []
for gt in gt_train:
pred = load_tadarida_pred(args['td_ip_dir'], gt['dataset_name'], gt['id'])
# compute detection overlap with ground truth to determine which are the TP detections
assign_to_gt(gt, pred, args['iou_thresh'])
pred = parse_data(pred, class_names, classes_to_ignore, True)
x_train.append(pred['feats'])
y_train.append(pred['class_ids'])
# train random forest on Tadarida-D predictions
clf_td, un_train_class = train_rf_model(x_train, y_train, num_classes, args['rand_seed'])
# run the model on the test set
preds_td = []
for gt in gt_test:
pred = load_tadarida_pred(args['td_ip_dir'], gt['dataset_name'], gt['id'])
pred = parse_data(pred, class_names, classes_to_ignore, True)
pred = eval_rf_model(clf_td, pred, un_train_class, num_classes)
preds_td.append(pred)
results_td = evl.evaluate_predictions(gt_test, preds_td, class_names,
params_eval['detection_overlap'], params_eval['ignore_start_end'])
print_results('Tadarida', 'td_rf', results_td, args['op_dir'], class_names,
args['file_type'], args['title_text'] + ' - Species - ')
#
# evaluate BatDetect
#
if args['bd_model_path'] != '':
# load model
bd_args = get_default_bd_args()
model, params_bd = du.load_model(args['bd_model_path'])
# check if the class names are the same
if params_bd['class_names'] != class_names:
print('Warning: Class names are not the same as the trained model')
assert False
preds_bd = []
for ii, gg in enumerate(gt_test):
pred = du.process_file(gg['file_path'], model, params_bd, bd_args, return_raw_preds=True)
preds_bd.append(pred)
results_bd = evl.evaluate_predictions(gt_test, preds_bd, class_names,
params_eval['detection_overlap'], params_eval['ignore_start_end'])
print_results('BatDetect', 'bd', results_bd, args['op_dir'],
class_names, args['file_type'], args['title_text'] + ' - Species - ')
# evaluate genus level
class_names_genus, preds_bd_g, gt_test_g = create_genus_mapping(gt_test, preds_bd, class_names)
results_bd_genus = evl.evaluate_predictions(gt_test_g, preds_bd_g, class_names_genus,
params_eval['detection_overlap'], params_eval['ignore_start_end'])
print_results('BatDetect Genus', 'bd_genus', results_bd_genus, args['op_dir'],
class_names_genus, args['file_type'], args['title_text'] + ' - Genus - ')

View File

@ -0,0 +1,33 @@
# Evaluating model
This script evaluates a trained model and outputs several plots summarizing the performance. It is used as follows:
`python evaluate_models.py path_to_store_images/ path_to_audio_files/ path_to_annotation_file/ path_to_trained_model/`
e.g.
`python evaluate_models.py ../../plots/results_compare_yuc/ /data1/bat_data/data/yucatan/audio/ /data1/bat_data/annotations/anns_finetune/ ../../experiments/2021_12_17__15_58_43/2021_12_17__15_58_43.pth.tar`
By default this will just evaluate the set of test files that are already specified in the model at training time. However, you can also specify a single set of annotations to evaluate using the `--test_file` flag. These must be stored in one annotation file, containing a list of the individual files.
e.g.
`python evaluate_models.py ../../plots/results_compare_yuc/ /data1/bat_data/data/yucatan/audio/ /data1/bat_data/annotations/anns_finetune/ ../../experiments/2021_12_17__15_58_43/2021_12_17__15_58_43.pth.tar --test_file yucatan_TEST.json`
You can also specify if the plots are saved as a .png or .pdf using `--file_type` and you can set title text for a plot using `--title_text`, e.g. `--file_type pdf --title_text "My Dataset Name"`
### Comparing to Tadarida-D
It is also possible to compare to Tadarida-D. For Tadarida-D the following steps are performed:
- Matches Tadarida's detections to manually annotated calls
- Trains a RandomForest classifier using Tadarida call features
- Evaluates the classifier on a held-out set
Uses precomputed binaries for Tadarida-D from:
`https://github.com/YvesBas/Tadarida-D/archive/master.zip`
Tadarida-D needs to be run with the following arguments:
`./TadaridaD -t 4 -x 1 ip_dir/`
- `-t 4` means 4 threads
- `-x 1` means a time expansion factor of 1
This will generate a folder called `txt` which contains a corresponding `.ta` file for each input audio file. Example usage is as follows:
`python evaluate_models.py ../../plots/results_compare_yuc/ /data1/bat_data/data/yucatan/audio/ /data1/bat_data/annotations/anns_finetune/ ../../experiments/2021_12_17__15_58_43/2021_12_17__15_58_43.pth.tar --td_ip_dir /data1/bat_data/baselines/tadarida_D/`

View File

@ -0,0 +1,183 @@
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
import json
import argparse
import glob
import sys
sys.path.append(os.path.join('..', '..'))
import bat_detect.train.train_model as tm
import bat_detect.train.audio_dataloader as adl
import bat_detect.train.evaluate as evl
import bat_detect.train.train_utils as tu
import bat_detect.train.losses as losses
import bat_detect.detector.parameters as parameters
import bat_detect.detector.models as models
import bat_detect.detector.post_process as pp
import bat_detect.utils.plot_utils as pu
import bat_detect.utils.detector_utils as du
if __name__ == "__main__":
info_str = '\nBatDetect - Finetune Model\n'
print(info_str)
parser = argparse.ArgumentParser()
parser.add_argument('audio_path', type=str, help='Input directory for audio')
parser.add_argument('train_ann_path', type=str,
help='Path to where train annotation file is stored')
parser.add_argument('test_ann_path', type=str,
help='Path to where test annotation file is stored')
parser.add_argument('model_path', type=str,
help='Path to pretrained model')
parser.add_argument('--op_model_name', type=str, default='',
help='Path and name for finetuned model')
parser.add_argument('--num_epochs', type=int, default=200, dest='num_epochs',
help='Number of finetuning epochs')
parser.add_argument('--finetune_only_last_layer', action='store_true',
help='Only train final layers')
parser.add_argument('--train_from_scratch', action='store_true',
help='Do not use pretrained weights')
parser.add_argument('--do_not_save_images', action='store_false',
help='Do not save images at the end of training')
parser.add_argument('--notes', type=str, default='',
help='Notes to save in text file')
args = vars(parser.parse_args())
params = parameters.get_params(True, '../../experiments/')
if torch.cuda.is_available():
params['device'] = 'cuda'
else:
params['device'] = 'cpu'
print('\nNote, this will be a lot faster if you use a computer with a GPU.\n')
print('\nAudio directory: ' + args['audio_path'])
print('Train file: ' + args['train_ann_path'])
print('Test file: ' + args['test_ann_path'])
print('Loading model: ' + args['model_path'])
dataset_name = os.path.basename(args['train_ann_path']).replace('.json', '').replace('_TRAIN', '')
if args['train_from_scratch']:
print('\nTraining model from scratch i.e. not using pretrained weights')
model, params_train = du.load_model(args['model_path'], False)
else:
model, params_train = du.load_model(args['model_path'], True)
model.to(params['device'])
params['num_epochs'] = args['num_epochs']
if args['op_model_name'] != '':
params['model_file_name'] = args['op_model_name']
classes_to_ignore = params['classes_to_ignore']+params['generic_class']
# save notes file
params['notes'] = args['notes']
if args['notes'] != '':
tu.write_notes_file(params['experiment'] + 'notes.txt', args['notes'])
# load train annotations
train_sets = []
train_sets.append(tu.get_blank_dataset_dict(dataset_name, False, args['train_ann_path'], args['audio_path']))
params['train_sets'] = [tu.get_blank_dataset_dict(dataset_name, False, os.path.basename(args['train_ann_path']), args['audio_path'])]
print('\nTrain set:')
data_train, params['class_names'], params['class_inv_freq'] = \
tu.load_set_of_anns(train_sets, classes_to_ignore, params['events_of_interest'])
print('Number of files', len(data_train))
params['genus_names'], params['genus_mapping'] = tu.get_genus_mapping(params['class_names'])
params['class_names_short'] = tu.get_short_class_names(params['class_names'])
# load test annotations
test_sets = []
test_sets.append(tu.get_blank_dataset_dict(dataset_name, True, args['test_ann_path'], args['audio_path']))
params['test_sets'] = [tu.get_blank_dataset_dict(dataset_name, True, os.path.basename(args['test_ann_path']), args['audio_path'])]
print('\nTest set:')
data_test, _, _ = tu.load_set_of_anns(test_sets, classes_to_ignore, params['events_of_interest'])
print('Number of files', len(data_test))
# train loader
train_dataset = adl.AudioLoader(data_train, params, is_train=True)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=params['batch_size'],
shuffle=True, num_workers=params['num_workers'], pin_memory=True)
# test loader - batch size of one because of variable file length
test_dataset = adl.AudioLoader(data_test, params, is_train=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
shuffle=False, num_workers=params['num_workers'], pin_memory=True)
inputs_train = next(iter(train_loader))
params['ip_height'] = inputs_train['spec'].shape[2]
print('\ntrain batch size :', inputs_train['spec'].shape)
assert(params_train['model_name'] == 'Net2DFast')
print('\n\nSOME hyperparams need to be the same as the loaded model (e.g. FFT) - currently they are getting overwritten.\n\n')
# set the number of output classes
num_filts = model.conv_classes_op.in_channels
k_size = model.conv_classes_op.kernel_size
pad = model.conv_classes_op.padding
model.conv_classes_op = torch.nn.Conv2d(num_filts, len(params['class_names'])+1, kernel_size=k_size, padding=pad)
model.conv_classes_op.to(params['device'])
if args['finetune_only_last_layer']:
print('\nOnly finetuning the final layers.\n')
train_layers_i = ['conv_classes', 'conv_classes_op', 'conv_size', 'conv_size_op']
train_layers = [tt + '.weight' for tt in train_layers_i] + [tt + '.bias' for tt in train_layers_i]
for name, param in model.named_parameters():
if name in train_layers:
param.requires_grad = True
else:
param.requires_grad = False
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
scheduler = CosineAnnealingLR(optimizer, params['num_epochs'] * len(train_loader))
if params['train_loss'] == 'mse':
det_criterion = losses.mse_loss
elif params['train_loss'] == 'focal':
det_criterion = losses.focal_loss
# plotting
train_plt_ls = pu.LossPlotter(params['experiment'] + 'train_loss.png', params['num_epochs']+1,
['train_loss'], None, None, ['epoch', 'train_loss'], logy=True)
test_plt_ls = pu.LossPlotter(params['experiment'] + 'test_loss.png', params['num_epochs']+1,
['test_loss'], None, None, ['epoch', 'test_loss'], logy=True)
test_plt = pu.LossPlotter(params['experiment'] + 'test.png', params['num_epochs']+1,
['avg_prec', 'rec_at_x', 'avg_prec_class', 'file_acc', 'top_class'], [0,1], None, ['epoch', ''])
test_plt_class = pu.LossPlotter(params['experiment'] + 'test_avg_prec.png', params['num_epochs']+1,
params['class_names_short'], [0,1], params['class_names_short'], ['epoch', 'avg_prec'])
# main train loop
for epoch in range(0, params['num_epochs']+1):
train_loss = tm.train(model, epoch, train_loader, det_criterion, optimizer, scheduler, params)
train_plt_ls.update_and_save(epoch, [train_loss['train_loss']])
if epoch % params['num_eval_epochs'] == 0:
# detection accuracy on test set
test_res, test_loss = tm.test(model, epoch, test_loader, det_criterion, params)
test_plt_ls.update_and_save(epoch, [test_loss['test_loss']])
test_plt.update_and_save(epoch, [test_res['avg_prec'], test_res['rec_at_x'],
test_res['avg_prec_class'], test_res['file_acc'], test_res['top_class']['avg_prec']])
test_plt_class.update_and_save(epoch, [rs['avg_prec'] for rs in test_res['class_pr']])
pu.plot_pr_curve_class(params['experiment'] , 'test_pr', 'test_pr', test_res)
# save finetuned model
print('saving model to: ' + params['model_file_name'])
op_state = {'epoch': epoch + 1,
'state_dict': model.state_dict(),
'params' : params}
torch.save(op_state, params['model_file_name'])
# save an image with associated prediction for each batch in the test set
if not args['do_not_save_images']:
tm.save_images_batch(model, test_loader, params)

View File

@ -0,0 +1,151 @@
import numpy as np
import argparse
import os
import json
import sys
sys.path.append(os.path.join('..', '..'))
import bat_detect.train.train_utils as tu
def print_dataset_stats(data, split_name, classes_to_ignore):
print('\nSplit:', split_name)
print('Num files:', len(data))
class_cnts = {}
for dd in data:
for aa in dd['annotation']:
if aa['class'] not in classes_to_ignore:
if aa['class'] in class_cnts:
class_cnts[aa['class']] += 1
else:
class_cnts[aa['class']] = 1
if len(class_cnts) == 0:
class_names = []
else:
class_names = np.sort([*class_cnts]).tolist()
print('Class count:')
str_len = np.max([len(cc) for cc in class_names] + [0]) + 5
for ii, cc in enumerate(class_names):
print(str(ii).ljust(5) + cc.ljust(str_len) + str(class_cnts[cc]))
return class_names
def load_file_names(file_name):
if os.path.isfile(file_name):
with open(file_name) as da:
files = [line.rstrip() for line in da.readlines()]
for ff in files:
if ff.lower()[-3:] != 'wav':
print('Error: Filenames need to end in .wav - ', ff)
assert(False)
else:
print('Error: Input file not found - ', file_name)
assert(False)
return files
if __name__ == "__main__":
info_str = '\nBatDetect - Prepare Data for Finetuning\n'
print(info_str)
parser = argparse.ArgumentParser()
parser.add_argument('dataset_name', type=str, help='Name to call your dataset')
parser.add_argument('audio_dir', type=str, help='Input directory for audio')
parser.add_argument('ann_dir', type=str, help='Input directory for where the audio annotations are stored')
parser.add_argument('op_dir', type=str, help='Path where the train and test splits will be stored')
parser.add_argument('--percent_val', type=float, default=0.20,
help='Hold out this much data for validation. Should be number between 0 and 1')
parser.add_argument('--rand_seed', type=int, default=2001,
help='Random seed used for creating the validation split')
parser.add_argument('--train_file', type=str, default='',
help='Text file where each line is a wav file in train split')
parser.add_argument('--test_file', type=str, default='',
help='Text file where each line is a wav file in test split')
parser.add_argument('--input_class_names', type=str, default='',
help='Specify names of classes that you want to change. Separate with ";"')
parser.add_argument('--output_class_names', type=str, default='',
help='New class names to use instead. One to one mapping with "--input_class_names". \
Separate with ";"')
args = vars(parser.parse_args())
np.random.seed(args['rand_seed'])
classes_to_ignore = ['', ' ', 'Unknown', 'Not Bat']
generic_class = ['Bat']
events_of_interest = ['Echolocation']
if args['input_class_names'] != '' and args['output_class_names'] != '':
# change the names of the classes
ip_names = args['input_class_names'].split(';')
op_names = args['output_class_names'].split(';')
name_dict = dict(zip(ip_names, op_names))
else:
name_dict = False
# load annotations
data_all, _, _ = tu.load_set_of_anns({'ann_path': args['ann_dir'], 'wav_path': args['audio_dir']},
classes_to_ignore, events_of_interest, False, False,
list_of_anns=True, filter_issues=True, name_replace=name_dict)
print('Dataset name: ' + args['dataset_name'])
print('Audio directory: ' + args['audio_dir'])
print('Annotation directory: ' + args['ann_dir'])
    print('Output directory: ' + args['op_dir'])
print('Num annotated files: ' + str(len(data_all)))
if args['train_file'] != '' and args['test_file'] != '':
        # user has specified the train / test split
train_files = load_file_names(args['train_file'])
test_files = load_file_names(args['test_file'])
file_names_all = [dd['id'] for dd in data_all]
train_inds = [file_names_all.index(ff) for ff in train_files if ff in file_names_all]
test_inds = [file_names_all.index(ff) for ff in test_files if ff in file_names_all]
else:
# split the data into train and test at the file level
num_exs = len(data_all)
test_inds = np.random.choice(np.arange(num_exs), int(num_exs*args['percent_val']), replace=False)
test_inds = np.sort(test_inds)
train_inds = np.setdiff1d(np.arange(num_exs), test_inds)
data_train = [data_all[ii] for ii in train_inds]
data_test = [data_all[ii] for ii in test_inds]
if not os.path.isdir(args['op_dir']):
os.makedirs(args['op_dir'])
op_name = os.path.join(args['op_dir'], args['dataset_name'])
op_name_train = op_name + '_TRAIN.json'
op_name_test = op_name + '_TEST.json'
class_un_train = print_dataset_stats(data_train, 'Train', classes_to_ignore)
class_un_test = print_dataset_stats(data_test, 'Test', classes_to_ignore)
if len(data_train) > 0 and len(data_test) > 0:
if class_un_train != class_un_test:
print('\nError: some classes are not in both the training and test sets.\
\nTry a different random seed "--rand_seed".')
assert False
print('\n')
if len(data_train) == 0:
print('No train annotations to save')
else:
print('Saving: ', op_name_train)
with open(op_name_train, 'w') as da:
json.dump(data_train, da, indent=2)
if len(data_test) == 0:
print('No test annotations to save')
else:
print('Saving: ', op_name_test)
with open(op_name_test, 'w') as da:
json.dump(data_test, da, indent=2)


@ -0,0 +1,36 @@
# Finetuning the BatDetect2 model on your own data
1. Annotate your data using the annotation GUI.
2. Run `prep_data_finetune.py` to create a training and validation split for your data.
3. Run `finetune_model.py` to finetune a model on your data.
## 1. Annotate calls of interest in audio data
Use the annotation GUI to do this. Annotating will result in a directory of audio files and a directory of annotation files, where each audio file has a corresponding `.json` annotation file.
Make sure to annotate all instances of a bat call.
If you are unsure of the species, just label the call as `Bat`.
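For reference, here is a minimal sketch of what one annotation file contains. The field names follow the ones read by `prep_data_finetune.py` and the training dataloader; the values and the file naming are purely illustrative, and the annotation GUI produces the real files for you.
```python
import json

# Hypothetical example of a single annotation file - field names follow the ones
# read by the data preparation and training code, values are illustrative only.
example_ann = {
    "id": "recording_0001.wav",   # name of the corresponding audio file
    "annotated": True,            # file has been exhaustively annotated
    "issues": False,              # no problems flagged during annotation
    "time_exp": 1,                # time expansion factor of the recording
    "class_name": "",             # optional file level label
    "annotation": [
        {"start_time": 0.51, "end_time": 0.55,    # seconds
         "low_freq": 42000, "high_freq": 78000,   # Hz (illustrative values)
         "class": "Bat", "event": "Echolocation", "individual": 0}
    ],
}

with open("recording_0001.wav.json", "w") as fp:
    json.dump(example_ann, fp, indent=2)
```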
## 2. Split data into train and test
* Run `prep_data_finetune.py` to split the data into train and test sets. This will produce two separate annotation files, one for train and one for test.
Example usage:
`python prep_data_finetune.py dataset_name path_to_audio/audio/ path_to_annotations/anns/ path_to_output_anns/`
This will raise an error if the resulting train and test splits do not contain the same set of species. If this happens, try a different random seed, e.g. `--rand_seed 123456`.
* You can also load the split from text files, where each line is the name of a `.wav` file (a short sketch of this format is shown after this list), e.g.
`python prep_data_finetune.py yucatan /data1/bat_data/data/yucatan/audio/ /data1/bat_data/data/yucatan/anns/ /data1/bat_data/annotations/anns_finetune/ --train_file path_to_file/yucatan_train_split.txt --test_file path_to_file/yucatan_test_split.txt`
* You can also replace class names. Separate the names with semicolons, with no spaces between them, e.g.
`python prep_data_finetune.py brazil_data /data1/bat_data/data/brazil_data/audio/ /data1/bat_data/data/brazil_data/anns/ /data1/bat_data/annotations/anns_finetune/ --input_class_names "Histiotus;Molossidae;Lasiurus;Myotis;Rhogeesa;Vespertilionidae" --output_class_names "Group One;Group One;Group One;Group Two;Group Two;Group Three"`
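For illustration, a split file is just plain text with one `.wav` file name per line (this is the format checked by `load_file_names` in `prep_data_finetune.py`); the file and recording names below are hypothetical:
```python
# Hypothetical example: write train and test split files, one .wav name per line.
train_files = ["recording_0001.wav", "recording_0002.wav"]
test_files = ["recording_0003.wav"]
with open("yucatan_train_split.txt", "w") as fp:
    fp.write("\n".join(train_files) + "\n")
with open("yucatan_test_split.txt", "w") as fp:
    fp.write("\n".join(test_files) + "\n")
```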
## 3. Finetune the model
Example usage:
`python finetune_model.py path_to_audio/audio/ path_to_train/TRAIN.json path_to_train/TEST.json ../../models/model_to_finetune.pth.tar`
## Additional notes
* For the first step it is better to cut the files into audio clips of less than 5 seconds and to annotate them exhaustively (i.e. all bat calls should be annotated); a minimal clip-cutting sketch is shown after this list.
* You can train the model for longer by setting the `--num_epochs` flag to a larger number, e.g. `--num_epochs 400`. The default is 200.
* If you do not want to finetune the model, but instead want to train it from scratch, you can set the `--train_from_scratch` flag.
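Below is a minimal sketch of cutting a long recording into roughly 5 second clips before annotation. It assumes the `soundfile` package is installed; the helper name and file paths are hypothetical:
```python
import soundfile as sf

# Hypothetical helper: split a long recording into ~5 second clips prior to annotation.
def cut_into_clips(in_wav, out_prefix, clip_len_secs=5.0):
    audio, sr = sf.read(in_wav)
    samples_per_clip = int(clip_len_secs * sr)
    for ii, start in enumerate(range(0, len(audio), samples_per_clip)):
        clip = audio[start:start + samples_per_clip]
        sf.write('{}_{:04d}.wav'.format(out_prefix, ii), clip, sr)

# Example (hypothetical paths):
# cut_into_clips('long_recording.wav', 'clips/long_recording')
```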


@ -0,0 +1,407 @@
import torch
import random
import numpy as np
import copy
import librosa
import torch.nn.functional as F
import torchaudio
import os
import sys
sys.path.append(os.path.join('..', '..'))
import bat_detect.utils.audio_utils as au
def generate_gt_heatmaps(spec_op_shape, sampling_rate, ann, params):
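    # build the detection, bounding box size, and per-class heatmaps used as training targets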
# spec may be resized on input into the network
num_classes = len(params['class_names'])
op_height = spec_op_shape[0]
op_width = spec_op_shape[1]
freq_per_bin = (params['max_freq'] - params['min_freq']) / op_height
# start and end times
x_pos_start = au.time_to_x_coords(ann['start_times'], sampling_rate,
params['fft_win_length'], params['fft_overlap'])
x_pos_start = (params['resize_factor']*x_pos_start).astype(np.int)
x_pos_end = au.time_to_x_coords(ann['end_times'], sampling_rate,
params['fft_win_length'], params['fft_overlap'])
x_pos_end = (params['resize_factor']*x_pos_end).astype(np.int)
# location on y axis i.e. frequency
y_pos_low = (ann['low_freqs'] - params['min_freq']) / freq_per_bin
y_pos_low = (op_height - y_pos_low).astype(np.int)
y_pos_high = (ann['high_freqs'] - params['min_freq']) / freq_per_bin
y_pos_high = (op_height - y_pos_high).astype(np.int)
bb_widths = x_pos_end - x_pos_start
bb_heights = (y_pos_low - y_pos_high)
valid_inds = np.where((x_pos_start >= 0) & (x_pos_start < op_width) &
(y_pos_low >= 0) & (y_pos_low < (op_height-1)))[0]
ann_aug = {}
ann_aug['x_inds'] = x_pos_start[valid_inds]
ann_aug['y_inds'] = y_pos_low[valid_inds]
keys = ['start_times', 'end_times', 'high_freqs', 'low_freqs', 'class_ids', 'individual_ids']
for kk in keys:
ann_aug[kk] = ann[kk][valid_inds]
# if the number of calls is only 1, then it is unique
# TODO would be better if we found these unique calls at the merging stage
if len(ann_aug['individual_ids']) == 1:
ann_aug['individual_ids'][0] = 0
y_2d_det = np.zeros((1, op_height, op_width), dtype=np.float32)
y_2d_size = np.zeros((2, op_height, op_width), dtype=np.float32)
# num classes and "background" class
y_2d_classes = np.zeros((num_classes+1, op_height, op_width), dtype=np.float32)
# create 2D ground truth heatmaps
for ii in valid_inds:
draw_gaussian(y_2d_det[0,:], (x_pos_start[ii], y_pos_low[ii]), params['target_sigma'])
#draw_gaussian(y_2d_det[0,:], (x_pos_start[ii], y_pos_low[ii]), params['target_sigma'], params['target_sigma']*2)
y_2d_size[0, y_pos_low[ii], x_pos_start[ii]] = bb_widths[ii]
y_2d_size[1, y_pos_low[ii], x_pos_start[ii]] = bb_heights[ii]
cls_id = ann['class_ids'][ii]
if cls_id > -1:
draw_gaussian(y_2d_classes[cls_id, :], (x_pos_start[ii], y_pos_low[ii]), params['target_sigma'])
#draw_gaussian(y_2d_classes[cls_id, :], (x_pos_start[ii], y_pos_low[ii]), params['target_sigma'], params['target_sigma']*2)
    # be careful as this will have a 1.0 in places where we have an event but don't know the gt class
# this will be masked in training anyway
y_2d_classes[num_classes, :] = 1.0 - y_2d_classes.sum(0)
y_2d_classes = y_2d_classes / y_2d_classes.sum(0)[np.newaxis, ...]
y_2d_classes[np.isnan(y_2d_classes)] = 0.0
return y_2d_det, y_2d_size, y_2d_classes, ann_aug
def draw_gaussian(heatmap, center, sigmax, sigmay=None):
# center is (x, y)
# this edits the heatmap inplace
if sigmay is None:
sigmay = sigmax
tmp_size = np.maximum(sigmax, sigmay) * 3
mu_x = int(center[0] + 0.5)
mu_y = int(center[1] + 0.5)
w, h = heatmap.shape[0], heatmap.shape[1]
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
if ul[0] >= h or ul[1] >= w or br[0] < 0 or br[1] < 0:
return False
size = 2 * tmp_size + 1
x = np.arange(0, size, 1, np.float32)
y = x[:, np.newaxis]
x0 = y0 = size // 2
#g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))
g = np.exp(- ((x - x0) ** 2)/(2 * sigmax ** 2) - ((y - y0) ** 2)/(2 * sigmay ** 2))
g_x = max(0, -ul[0]), min(br[0], h) - ul[0]
g_y = max(0, -ul[1]), min(br[1], w) - ul[1]
img_x = max(0, ul[0]), min(br[0], h)
img_y = max(0, ul[1]), min(br[1], w)
heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum(
heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]],
g[g_y[0]:g_y[1], g_x[0]:g_x[1]])
return True
def pad_aray(ip_array, pad_size):
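    # pad a 1D array with -1s so that every example in a batch has the same length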
return np.hstack((ip_array, np.ones(pad_size, dtype=np.int)*-1))
def warp_spec_aug(spec, ann, return_spec_for_viz, params):
# This is messy
    # Augment spectrogram by randomly stretching and squeezing it
# NOTE this also changes the start and stop time in place
# not taking care of spec for viz
if return_spec_for_viz:
assert False
delta = params['stretch_squeeze_delta']
op_size = (spec.shape[1], spec.shape[2])
resize_fract_r = np.random.rand()*delta*2 - delta + 1.0
resize_amt = int(spec.shape[2]*resize_fract_r)
if resize_amt >= spec.shape[2]:
spec_r = torch.cat((spec, torch.zeros((1, spec.shape[1], resize_amt-spec.shape[2]), dtype=spec.dtype)), 2)
else:
spec_r = spec[:, :, :resize_amt]
spec = F.interpolate(spec_r.unsqueeze(0), size=op_size, mode='bilinear', align_corners=False).squeeze(0)
ann['start_times'] *= (1.0/resize_fract_r)
ann['end_times'] *= (1.0/resize_fract_r)
return spec
def mask_time_aug(spec, params):
# Mask out a random block of time - repeat up to 3 times
    # SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
fm = torchaudio.transforms.TimeMasking(int(spec.shape[1]*params['mask_max_time_perc']))
for ii in range(np.random.randint(1, 4)):
spec = fm(spec)
return spec
def mask_freq_aug(spec, params):
    # Mask out a random frequency range - repeat up to 3 times
# SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
fm = torchaudio.transforms.FrequencyMasking(int(spec.shape[1]*params['mask_max_freq_perc']))
for ii in range(np.random.randint(1, 4)):
spec = fm(spec)
return spec
def scale_vol_aug(spec, params):
return spec * np.random.random()*params['spec_amp_scaling']
def echo_aug(audio, sampling_rate, params):
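    # simulate an echo by adding a randomly scaled, delayed copy of the signal to itself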
sample_offset = int(params['echo_max_delay']*np.random.random()*sampling_rate) + 1
audio[:-sample_offset] += np.random.random()*audio[sample_offset:]
return audio
def resample_aug(audio, sampling_rate, params):
sampling_rate_old = sampling_rate
sampling_rate = np.random.choice(params['aug_sampling_rates'])
audio = librosa.resample(audio, sampling_rate_old, sampling_rate, res_type='polyphase')
audio = au.pad_audio(audio, sampling_rate, params['fft_win_length'],
params['fft_overlap'], params['resize_factor'],
params['spec_divide_factor'], params['spec_train_width'])
duration = audio.shape[0] / float(sampling_rate)
return audio, sampling_rate, duration
def resample_audio(num_samples, sampling_rate, audio2, sampling_rate2):
if sampling_rate != sampling_rate2:
audio2 = librosa.resample(audio2, sampling_rate2, sampling_rate, res_type='polyphase')
sampling_rate2 = sampling_rate
if audio2.shape[0] < num_samples:
audio2 = np.hstack((audio2, np.zeros((num_samples-audio2.shape[0]), dtype=audio2.dtype)))
elif audio2.shape[0] > num_samples:
audio2 = audio2[:num_samples]
return audio2, sampling_rate2
def combine_audio_aug(audio, sampling_rate, ann, audio2, sampling_rate2, ann2):
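    # blend two recordings together with a random weight and merge their annotations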
# resample so they are the same
audio2, sampling_rate2 = resample_audio(audio.shape[0], sampling_rate, audio2, sampling_rate2)
# # set mean and std to be the same
# audio2 = (audio2 - audio2.mean())
# audio2 = (audio2/audio2.std())*audio.std()
# audio2 = audio2 + audio.mean()
if ann['annotated'] and (ann2['annotated']) and \
(sampling_rate2 == sampling_rate) and (audio.shape[0] == audio2.shape[0]):
comb_weight = 0.3 + np.random.random()*0.4
audio = comb_weight*audio + (1-comb_weight)*audio2
inds = np.argsort(np.hstack((ann['start_times'], ann2['start_times'])))
for kk in ann.keys():
# when combining calls from different files, assume they come from different individuals
if kk == 'individual_ids':
if (ann[kk]>-1).sum() > 0:
ann2[kk][ann2[kk]>-1] += np.max(ann[kk][ann[kk]>-1]) + 1
if (kk != 'class_id_file') and (kk != 'annotated'):
ann[kk] = np.hstack((ann[kk], ann2[kk]))[inds]
return audio, ann
class AudioLoader(torch.utils.data.Dataset):
def __init__(self, data_anns_ip, params, dataset_name=None, is_train=False):
self.data_anns = []
self.is_train = is_train
self.params = params
self.return_spec_for_viz = False
for ii in range(len(data_anns_ip)):
dd = copy.deepcopy(data_anns_ip[ii])
            # filter out unused annotations here
            filtered_annotations = []
            for aa in dd['annotation']:
if 'individual' in aa.keys():
aa['individual'] = int(aa['individual'])
# if only one call labeled it has to be from the same individual
if len(dd['annotation']) == 1:
aa['individual'] = 0
# convert class name into class label
if aa['class'] in self.params['class_names']:
aa['class_id'] = self.params['class_names'].index(aa['class'])
else:
aa['class_id'] = -1
if aa['class'] not in self.params['classes_to_ignore']:
filtered_annotations.append(aa)
dd['annotation'] = filtered_annotations
dd['start_times'] = np.array([aa['start_time'] for aa in dd['annotation']])
dd['end_times'] = np.array([aa['end_time'] for aa in dd['annotation']])
dd['high_freqs'] = np.array([float(aa['high_freq']) for aa in dd['annotation']])
dd['low_freqs'] = np.array([float(aa['low_freq']) for aa in dd['annotation']])
dd['class_ids'] = np.array([aa['class_id'] for aa in dd['annotation']]).astype(np.int)
dd['individual_ids'] = np.array([aa['individual'] for aa in dd['annotation']]).astype(np.int)
# file level class name
dd['class_id_file'] = -1
if 'class_name' in dd.keys():
if dd['class_name'] in self.params['class_names']:
dd['class_id_file'] = self.params['class_names'].index(dd['class_name'])
self.data_anns.append(dd)
ann_cnt = [len(aa['annotation']) for aa in self.data_anns]
self.max_num_anns = 2*np.max(ann_cnt) # x2 because we may be combining files during training
print('\n')
if dataset_name is not None:
print('Dataset : ' + dataset_name)
if self.is_train:
print('Split type : train')
else:
print('Split type : test')
print('Num files : ' + str(len(self.data_anns)))
print('Num calls : ' + str(np.sum(ann_cnt)))
def get_file_and_anns(self, index=None):
# if no file specified, choose random one
        if index is None:
index = np.random.randint(0, len(self.data_anns))
audio_file = self.data_anns[index]['file_path']
sampling_rate, audio_raw = au.load_audio_file(audio_file, self.data_anns[index]['time_exp'],
self.params['target_samp_rate'], self.params['scale_raw_audio'])
# copy annotation
ann = {}
ann['annotated'] = self.data_anns[index]['annotated']
ann['class_id_file'] = self.data_anns[index]['class_id_file']
keys = ['start_times', 'end_times', 'high_freqs', 'low_freqs', 'class_ids', 'individual_ids']
for kk in keys:
ann[kk] = self.data_anns[index][kk].copy()
# if train then grab a random crop
if self.is_train:
nfft = int(self.params['fft_win_length']*sampling_rate)
noverlap = int(self.params['fft_overlap']*nfft)
length_samples = self.params['spec_train_width']*(nfft - noverlap) + noverlap
if audio_raw.shape[0] - length_samples > 0:
sample_crop = np.random.randint(audio_raw.shape[0] - length_samples)
else:
sample_crop = 0
audio_raw = audio_raw[sample_crop:sample_crop+length_samples]
ann['start_times'] = ann['start_times'] - sample_crop/float(sampling_rate)
ann['end_times'] = ann['end_times'] - sample_crop/float(sampling_rate)
# pad audio
if self.is_train:
op_spec_target_size = self.params['spec_train_width']
else:
op_spec_target_size = None
audio_raw = au.pad_audio(audio_raw, sampling_rate, self.params['fft_win_length'],
self.params['fft_overlap'], self.params['resize_factor'],
self.params['spec_divide_factor'], op_spec_target_size)
duration = audio_raw.shape[0] / float(sampling_rate)
# sort based on time
inds = np.argsort(ann['start_times'])
for kk in ann.keys():
if (kk != 'class_id_file') and (kk != 'annotated'):
ann[kk] = ann[kk][inds]
return audio_raw, sampling_rate, duration, ann
def __getitem__(self, index):
# load audio file
audio, sampling_rate, duration, ann = self.get_file_and_anns(index)
# augment on raw audio
if self.is_train and self.params['augment_at_train']:
# augment - combine with random audio file
if self.params['augment_at_train_combine'] and np.random.random() < self.params['aug_prob']:
audio2, sampling_rate2, duration2, ann2 = self.get_file_and_anns()
audio, ann = combine_audio_aug(audio, sampling_rate, ann, audio2, sampling_rate2, ann2)
# simulate echo by adding delayed copy of the file
if np.random.random() < self.params['aug_prob']:
audio = echo_aug(audio, sampling_rate, self.params)
# resample the audio
#if np.random.random() < self.params['aug_prob']:
# audio, sampling_rate, duration = resample_aug(audio, sampling_rate, self.params)
# create spectrogram
spec, spec_for_viz = au.generate_spectrogram(audio, sampling_rate, self.params, self.return_spec_for_viz)
rsf = self.params['resize_factor']
spec_op_shape = (int(self.params['spec_height']*rsf), int(spec.shape[1]*rsf))
# resize the spec
spec = torch.from_numpy(spec).unsqueeze(0).unsqueeze(0)
spec = F.interpolate(spec, size=spec_op_shape, mode='bilinear', align_corners=False).squeeze(0)
# augment spectrogram
if self.is_train and self.params['augment_at_train']:
if np.random.random() < self.params['aug_prob']:
spec = scale_vol_aug(spec, self.params)
if np.random.random() < self.params['aug_prob']:
spec = warp_spec_aug(spec, ann, self.return_spec_for_viz, self.params)
if np.random.random() < self.params['aug_prob']:
spec = mask_time_aug(spec, self.params)
if np.random.random() < self.params['aug_prob']:
spec = mask_freq_aug(spec, self.params)
outputs = {}
outputs['spec'] = spec
if self.return_spec_for_viz:
outputs['spec_for_viz'] = torch.from_numpy(spec_for_viz).unsqueeze(0)
# create ground truth heatmaps
outputs['y_2d_det'], outputs['y_2d_size'], outputs['y_2d_classes'], ann_aug =\
generate_gt_heatmaps(spec_op_shape, sampling_rate, ann, self.params)
# hack to get around requirement that all vectors are the same length in
# the output batch
pad_size = self.max_num_anns-len(ann_aug['individual_ids'])
outputs['is_valid'] = pad_aray(np.ones(len(ann_aug['individual_ids'])), pad_size)
keys = ['class_ids', 'individual_ids', 'x_inds', 'y_inds',
'start_times', 'end_times', 'low_freqs', 'high_freqs']
for kk in keys:
outputs[kk] = pad_aray(ann_aug[kk], pad_size)
# convert to pytorch
for kk in outputs.keys():
if type(outputs[kk]) != torch.Tensor:
outputs[kk] = torch.from_numpy(outputs[kk])
# scalars
outputs['class_id_file'] = ann['class_id_file']
outputs['annotated'] = ann['annotated']
outputs['duration'] = duration
outputs['sampling_rate'] = sampling_rate
outputs['file_id'] = index
return outputs
def __len__(self):
return len(self.data_anns)

bat_detect/train/evaluate.py Executable file

@ -0,0 +1,333 @@
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, balanced_accuracy_score
def compute_error_auc(op_str, gt, pred, prob):
# classification error
pred_int = (pred > prob).astype(np.int)
class_acc = (pred_int == gt).mean() * 100.0
# ROC - area under curve
fpr, tpr, thresholds = roc_curve(gt, pred)
roc_auc = auc(fpr, tpr)
print(op_str + ", class acc = {:.3f}, ROC AUC = {:.3f}".format(class_acc, roc_auc))
#return class_acc, roc_auc
def calc_average_precision(recall, precision):
precision[np.isnan(precision)] = 0
recall[np.isnan(recall)] = 0
# pascal 12 way
mprec = np.hstack((0, precision, 0))
mrec = np.hstack((0, recall, 1))
for ii in range(mprec.shape[0]-2, -1,-1):
mprec[ii] = np.maximum(mprec[ii], mprec[ii+1])
inds = np.where(np.not_equal(mrec[1:], mrec[:-1]))[0]+1
ave_prec = ((mrec[inds] - mrec[inds-1])*mprec[inds]).sum()
return float(ave_prec)
def calc_recall_at_x(recall, precision, x=0.95):
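    # returns the largest recall achieved while the precision is still above x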
precision[np.isnan(precision)] = 0
recall[np.isnan(recall)] = 0
inds = np.where(precision[::-1]>x)[0]
if len(inds) > 0:
return float(recall[::-1][inds[0]])
else:
return 0.0
def compute_affinity_1d(pred_box, gt_boxes, threshold):
# first entry is start time
score = np.abs(pred_box[0] - gt_boxes[:, 0])
valid_detection = np.min(score) <= threshold
return valid_detection, np.argmin(score)
def compute_pre_rec(gts, preds, eval_mode, class_of_interest, num_classes, threshold, ignore_start_end):
"""
Computes precision and recall. Assumes that each file has been exhaustively
annotated. Will not count predicted detection with a start time that is within
ignore_start_end miliseconds of the start or end of the file.
eval_mode == 'detection'
Returns overall detection results (not per class)
eval_mode == 'per_class'
Filters ground truth based on class of interest. This will ignore predictions
assigned to gt with unknown class.
eval_mode = 'top_class'
Turns the problem into a binary one and selects the top predicted class
for each predicted detection
"""
# get predictions and put in array
pred_boxes = []
confidence = []
pred_class = []
file_ids = []
for pid, pp in enumerate(preds):
# filter predicted calls that are too near the start or end of the file
file_dur = gts[pid]['duration']
valid_inds = (pp['start_times'] >= ignore_start_end) & (pp['start_times'] <= (file_dur - ignore_start_end))
pred_boxes.append(np.vstack((pp['start_times'][valid_inds], pp['end_times'][valid_inds],
pp['low_freqs'][valid_inds], pp['high_freqs'][valid_inds])).T)
if eval_mode == 'detection':
# overall detection
confidence.append(pp['det_probs'][valid_inds])
elif eval_mode == 'per_class':
# per class
confidence.append(pp['class_probs'].T[valid_inds, class_of_interest])
elif eval_mode == 'top_class':
# per class - note that sometimes 'class_probs' can be num_classes+1 in size
top_class = np.argmax(pp['class_probs'].T[valid_inds, :num_classes], 1)
confidence.append(pp['class_probs'].T[valid_inds, top_class])
pred_class.append(top_class)
        # be careful, assuming the order in the list is the same as the GT
file_ids.append([pid]*valid_inds.sum())
confidence = np.hstack(confidence)
file_ids = np.hstack(file_ids).astype(np.int)
pred_boxes = np.vstack(pred_boxes)
if len(pred_class) > 0:
pred_class = np.hstack(pred_class)
# extract relevant ground truth boxes
gt_boxes = []
gt_assigned = []
gt_class = []
gt_generic_class = []
num_positives = 0
for gg in gts:
# filter ground truth calls that are too near the start or end of the file
file_dur = gg['duration']
valid_inds = (gg['start_times'] >= ignore_start_end) & (gg['start_times'] <= (file_dur - ignore_start_end))
# note, files with the incorrect duration will cause a problem
if (gg['start_times'] > file_dur).sum() > 0:
print('Error: file duration incorrect for', gg['id'])
assert(False)
boxes = np.vstack((gg['start_times'][valid_inds], gg['end_times'][valid_inds],
gg['low_freqs'][valid_inds], gg['high_freqs'][valid_inds])).T
gen_class = gg['class_ids'][valid_inds] == -1
class_ids = gg['class_ids'][valid_inds]
# keep track of the number of relevant ground truth calls
if eval_mode == 'detection':
# all valid ones
num_positives += len(gg['start_times'][valid_inds])
elif eval_mode == 'per_class':
# all valid ones with class of interest
num_positives += (gg['class_ids'][valid_inds] == class_of_interest).sum()
elif eval_mode == 'top_class':
# all valid ones with non generic class
num_positives += (gg['class_ids'][valid_inds] > -1).sum()
# find relevant classes (i.e. class_of_interest) and events without known class (i.e. generic class, -1)
if eval_mode == 'per_class':
class_inds = (class_ids == class_of_interest) | (class_ids == -1)
boxes = boxes[class_inds, :]
gen_class = gen_class[class_inds]
class_ids = class_ids[class_inds]
gt_assigned.append(np.zeros(boxes.shape[0]))
gt_boxes.append(boxes)
gt_generic_class.append(gen_class)
gt_class.append(class_ids)
# loop through detections and keep track of those that have been assigned
true_pos = np.zeros(confidence.shape[0])
    valid_inds = np.ones(confidence.shape[0]) == 1 # initialize to True
sorted_inds = np.argsort(confidence)[::-1] # sort high to low
for ii, ind in enumerate(sorted_inds):
gt_id = file_ids[ind]
valid_det = False
if gt_boxes[gt_id].shape[0] > 0:
# compute overlap
valid_det, det_ind = compute_affinity_1d(pred_boxes[ind], gt_boxes[gt_id],
threshold)
# valid detection that has not already been assigned
if valid_det and (gt_assigned[gt_id][det_ind] == 0):
count_as_true_pos = True
if eval_mode == 'top_class' and (gt_class[gt_id][det_ind] != pred_class[ind]):
# needs to be the same class
count_as_true_pos = False
if count_as_true_pos:
true_pos[ii] = 1
gt_assigned[gt_id][det_ind] = 1
# if event is generic class (i.e. gt_generic_class[gt_id][det_ind] is True)
# and eval_mode != 'detection', then ignore it
if gt_generic_class[gt_id][det_ind]:
if eval_mode == 'per_class' or eval_mode == 'top_class':
valid_inds[ii] = False
# store threshold values - used for plotting
conf_sorted = np.sort(confidence)[::-1][valid_inds]
thresholds = np.linspace(0.1, 0.9, 9)
thresholds_inds = np.zeros(len(thresholds), dtype=np.int)
for ii, tt in enumerate(thresholds):
thresholds_inds[ii] = np.argmin(conf_sorted > tt)
thresholds_inds[thresholds_inds==0] = -1
# compute precision and recall
true_pos = true_pos[valid_inds]
false_pos_c = np.cumsum(1-true_pos)
true_pos_c = np.cumsum(true_pos)
recall = true_pos_c / num_positives
precision = true_pos_c / np.maximum(true_pos_c + false_pos_c, np.finfo(np.float64).eps)
results = {}
results['recall'] = recall
results['precision'] = precision
results['num_gt'] = num_positives
results['thresholds'] = thresholds
results['thresholds_inds'] = thresholds_inds
if num_positives == 0:
results['avg_prec'] = np.nan
results['rec_at_x'] = np.nan
else:
results['avg_prec'] = np.round(calc_average_precision(recall, precision), 5)
results['rec_at_x'] = np.round(calc_recall_at_x(recall, precision), 5)
return results
def compute_file_accuracy_simple(gts, preds, num_classes):
"""
Evaluates the prediction accuracy at a file level.
Does not include files that have more than one class (or the generic class).
Simply chooses the class per file that has the highest probability overall.
"""
gt_valid = []
pred_valid = []
for ii in range(len(gts)):
gt_class = np.unique(gts[ii]['class_ids'])
if len(gt_class) == 1 and gt_class[0] != -1:
gt_valid.append(gt_class[0])
pred = preds[ii]['class_probs'][:num_classes, :].T
pred_valid.append(np.argmax(pred.mean(0)))
acc = (np.array(gt_valid) == np.array(pred_valid)).mean()
res = {}
res['num_valid_files'] = len(gt_valid)
res['num_total_files'] = len(gts)
res['gt_valid_file'] = gt_valid
res['pred_valid_file'] = pred_valid
res['file_acc'] = np.round(acc, 5)
return res
def compute_file_accuracy(gts, preds, num_classes):
"""
Evaluates the prediction accuracy at a file level.
Does not include files that have more than one class (or the unknown class).
Tries several different detection thresholds and picks the best one.
"""
# compute min and max scoring range - then threshold
min_val = 0
mins = [pp['class_probs'].min() for pp in preds if pp['class_probs'].shape[1] > 0]
if len(mins) > 0:
min_val = np.min(mins)
max_val = 1.0
maxes = [pp['class_probs'].max() for pp in preds if pp['class_probs'].shape[1] > 0]
if len(maxes) > 0:
max_val = np.max(maxes)
thresh = np.linspace(min_val, max_val, 11)[:10]
# loop over the files and store the accuracy at different prediction thresholds
# only include gt files that have one valid species
gt_valid = []
pred_valid_all = []
for ii in range(len(gts)):
gt_class = np.unique(gts[ii]['class_ids'])
if len(gt_class) == 1 and gt_class[0] != -1:
gt_valid.append(gt_class[0])
pred = preds[ii]['class_probs'][:num_classes, :].T
p_class = np.zeros(len(thresh))
for tt in range(len(thresh)):
p_class[tt] = (pred*(pred>=thresh[tt])).sum(0).argmax()
pred_valid_all.append(p_class)
# pick the result corresponding to the overall best threshold
pred_valid_all = np.vstack(pred_valid_all)
acc_per_thresh = (np.array(gt_valid)[..., np.newaxis] == pred_valid_all).mean(0)
best_thresh = np.argmax(acc_per_thresh)
best_acc = acc_per_thresh[best_thresh]
pred_valid = pred_valid_all[:, best_thresh].astype(np.int).tolist()
res = {}
res['num_valid_files'] = len(gt_valid)
res['num_total_files'] = len(gts)
res['gt_valid_file'] = gt_valid
res['pred_valid_file'] = pred_valid
res['file_acc'] = np.round(best_acc, 5)
return res
def evaluate_predictions(gts, preds, class_names, detection_overlap, ignore_start_end=0.0):
"""
Computes metrics derived from the precision and recall.
Assumes that gts and preds are both lists of the same lengths, with ground
truth and predictions contained within.
Returns the overall detection results, and per class results
"""
assert(len(gts) == len(preds))
num_classes = len(class_names)
# evaluate detection on its own i.e. ignoring class
det_results = compute_pre_rec(gts, preds, 'detection', None, num_classes, detection_overlap, ignore_start_end)
top_class = compute_pre_rec(gts, preds, 'top_class', None, num_classes, detection_overlap, ignore_start_end)
det_results['top_class'] = top_class
# per class evaluation
det_results['class_pr'] = []
for cc in range(num_classes):
res = compute_pre_rec(gts, preds, 'per_class', cc, num_classes, detection_overlap, ignore_start_end)
res['name'] = class_names[cc]
det_results['class_pr'].append(res)
# ignores classes that are not present in the test set
det_results['avg_prec_class'] = np.mean([rs['avg_prec'] for rs in det_results['class_pr'] if rs['num_gt'] > 0])
det_results['avg_prec_class'] = np.round(det_results['avg_prec_class'], 5)
# file level evaluation
res_file = compute_file_accuracy(gts, preds, num_classes)
det_results.update(res_file)
return det_results


@ -0,0 +1,56 @@
import torch
import torch.nn.functional as F
def bbox_size_loss(pred_size, gt_size):
"""
Bounding box size loss. Only compute loss where there is a bounding box.
"""
gt_size_mask = (gt_size > 0).float()
return (F.l1_loss(pred_size*gt_size_mask, gt_size, reduction='sum') / (gt_size_mask.sum() + 1e-5))
def focal_loss(pred, gt, weights=None, valid_mask=None):
"""
Focal loss adapted from CornerNet: Detecting Objects as Paired Keypoints
pred (batch x c x h x w)
gt (batch x c x h x w)
"""
eps = 1e-5
beta = 4
alpha = 2
pos_inds = gt.eq(1).float()
neg_inds = gt.lt(1).float()
pos_loss = torch.log(pred + eps) * torch.pow(1 - pred, alpha) * pos_inds
neg_loss = torch.log(1 - pred + eps) * torch.pow(pred, alpha) * torch.pow(1 - gt, beta) * neg_inds
if weights is not None:
pos_loss = pos_loss*weights
#neg_loss = neg_loss*weights
if valid_mask is not None:
pos_loss = pos_loss*valid_mask
neg_loss = neg_loss*valid_mask
pos_loss = pos_loss.sum()
neg_loss = neg_loss.sum()
num_pos = pos_inds.float().sum()
if num_pos == 0:
loss = -neg_loss
else:
loss = -(pos_loss + neg_loss) / num_pos
return loss
def mse_loss(pred, gt, weights=None, valid_mask=None):
"""
Mean squared error loss.
"""
if valid_mask is None:
op = ((gt-pred)**2).mean()
else:
op = (valid_mask*((gt-pred)**2)).sum() / valid_mask.sum()
return op


@ -0,0 +1,16 @@
## How to train a model from scratch
`python train_model.py data_dir annotation_dir` e.g.
`python train_model.py /data1/bat_data/data/ /data1/bat_data/annotations/anns/`
## Training on your own data
You can use the finetuning scripts to finetune a model that was trained on an existing dataset; follow the instructions in the `../finetune/` directory.
Alternatively, you can train from scratch. First, you will need to create your own annotation file (as in the finetune example), and then edit `train_split.py` to add your new dataset and specify which combination of files you want to train on; a sketch of a dataset entry is shown below.
Note: if training from scratch and you want to include the existing data, you may need to set all of its class names to the generic class name (`Bat`) so that the existing species are not added to your model but are instead only used to help with the bat/not-bat task.
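For example, adding a new dataset to one of the splits in `train_split.py` amounts to appending another dataset dictionary; the dataset name and paths below are hypothetical:
```python
# Hypothetical entry appended inside e.g. split_same() in train_split.py.
# 'is_binary' marks a bat / not-bat only dataset (i.e. no species labels).
train_sets.append({'dataset_name': 'my_new_dataset',
                   'is_test': False,
                   'is_binary': False,
                   'ann_path': ann_dir + 'my_new_dataset_TRAIN.json',
                   'wav_path': wav_dir + 'my_new_dataset/audio/'})
```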
## Additional notes
Blank files with no bats in them are also useful; just make sure the annotation files mark them as having been fully annotated (i.e. `annotated=True`), so that the absence of calls is treated as a true negative rather than as missing labels (unannotated files are discarded when the annotations are loaded).
Training will be slow without a GPU.


@ -0,0 +1,356 @@
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import torch.nn.functional as F
from torch.optim.lr_scheduler import CosineAnnealingLR
import json
import argparse
import sys
sys.path.append(os.path.join('..', '..'))
import bat_detect.detector.parameters as parameters
import bat_detect.detector.models as models
import bat_detect.detector.post_process as pp
import bat_detect.utils.plot_utils as pu
import bat_detect.train.audio_dataloader as adl
import bat_detect.train.evaluate as evl
import bat_detect.train.train_utils as tu
import bat_detect.train.train_split as ts
import bat_detect.train.losses as losses
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
def save_images_batch(model, data_loader, params):
print('\nsaving images ...')
is_train_state = data_loader.dataset.is_train
data_loader.dataset.is_train = False
data_loader.dataset.return_spec_for_viz = True
model.eval()
ind = 0 # first image in each batch
with torch.no_grad():
for batch_idx, inputs in enumerate(data_loader):
data = inputs['spec'].to(params['device'])
outputs = model(data)
spec_viz = inputs['spec_for_viz'].data.cpu().numpy()
orig_index = inputs['file_id'][ind]
plot_title = data_loader.dataset.data_anns[orig_index]['id']
op_file_name = params['op_im_dir_test'] + data_loader.dataset.data_anns[orig_index]['id'] + '.jpg'
save_image(spec_viz, outputs, ind, inputs, params, op_file_name, plot_title)
data_loader.dataset.is_train = is_train_state
data_loader.dataset.return_spec_for_viz = False
def save_image(spec_viz, outputs, ind, inputs, params, op_file_name, plot_title):
pred_nms, _ = pp.run_nms(outputs, params, inputs['sampling_rate'].float())
pred_hm = outputs['pred_det'][ind, 0, :].data.cpu().numpy()
spec_viz = spec_viz[ind, 0, :]
gt = parse_gt_data(inputs)[ind]
sampling_rate = inputs['sampling_rate'][ind].item()
duration = inputs['duration'][ind].item()
pu.plot_spec(spec_viz, sampling_rate, duration, gt, pred_nms[ind],
params, plot_title, op_file_name, pred_hm, plot_boxes=True, fixed_aspect=False)
def loss_fun(outputs, gt_det, gt_size, gt_class, det_criterion, params, class_inv_freq):
# detection loss
loss = params['det_loss_weight']*det_criterion(outputs['pred_det'], gt_det)
# bounding box size loss
loss += params['size_loss_weight']*losses.bbox_size_loss(outputs['pred_size'], gt_size)
# classification loss
valid_mask = (gt_class[:, :-1, :, :].sum(1) > 0).float().unsqueeze(1)
p_class = outputs['pred_class'][:, :-1, :]
loss += params['class_loss_weight']*det_criterion(p_class, gt_class[:, :-1, :], valid_mask=valid_mask)
return loss
def train(model, epoch, data_loader, det_criterion, optimizer, scheduler, params):
model.train()
train_loss = tu.AverageMeter()
class_inv_freq = torch.from_numpy(np.array(params['class_inv_freq'], dtype=np.float32)).to(params['device'])
class_inv_freq = class_inv_freq.unsqueeze(0).unsqueeze(2).unsqueeze(2)
print('\nEpoch', epoch)
for batch_idx, inputs in enumerate(data_loader):
data = inputs['spec'].to(params['device'])
gt_det = inputs['y_2d_det'].to(params['device'])
gt_size = inputs['y_2d_size'].to(params['device'])
gt_class = inputs['y_2d_classes'].to(params['device'])
optimizer.zero_grad()
outputs = model(data)
loss = loss_fun(outputs, gt_det, gt_size, gt_class, det_criterion, params, class_inv_freq)
train_loss.update(loss.item(), data.shape[0])
loss.backward()
optimizer.step()
scheduler.step()
if batch_idx % 50 == 0 and batch_idx != 0:
print('[{}/{}]\tLoss: {:.4f}'.format(
batch_idx * len(data), len(data_loader.dataset), train_loss.avg))
print('Train loss : {:.4f}'.format(train_loss.avg))
res = {}
res['train_loss'] = float(train_loss.avg)
return res
def test(model, epoch, data_loader, det_criterion, params):
model.eval()
predictions = []
ground_truths = []
test_loss = tu.AverageMeter()
class_inv_freq = torch.from_numpy(np.array(params['class_inv_freq'], dtype=np.float32)).to(params['device'])
class_inv_freq = class_inv_freq.unsqueeze(0).unsqueeze(2).unsqueeze(2)
with torch.no_grad():
for batch_idx, inputs in enumerate(data_loader):
data = inputs['spec'].to(params['device'])
gt_det = inputs['y_2d_det'].to(params['device'])
gt_size = inputs['y_2d_size'].to(params['device'])
gt_class = inputs['y_2d_classes'].to(params['device'])
outputs = model(data)
            # if the model needs a fixed size input, run this
# data = torch.cat(torch.split(data, int(params['spec_train_width']*params['resize_factor']), 3), 0)
# outputs = model(data)
# for kk in ['pred_det', 'pred_size', 'pred_class']:
# outputs[kk] = torch.cat([oo for oo in outputs[kk]], 2).unsqueeze(0)
if params['save_test_image_during_train'] and batch_idx == 0:
# for visualization - save the first prediction
ind = 0
orig_index = inputs['file_id'][ind]
plot_title = data_loader.dataset.data_anns[orig_index]['id']
op_file_name = params['op_im_dir'] + str(orig_index.item()).zfill(4) + '_' + str(epoch).zfill(4) + '_pred.jpg'
save_image(data, outputs, ind, inputs, params, op_file_name, plot_title)
loss = loss_fun(outputs, gt_det, gt_size, gt_class, det_criterion, params, class_inv_freq)
test_loss.update(loss.item(), data.shape[0])
# do NMS
pred_nms, _ = pp.run_nms(outputs, params, inputs['sampling_rate'].float())
predictions.extend(pred_nms)
ground_truths.extend(parse_gt_data(inputs))
res_det = evl.evaluate_predictions(ground_truths, predictions, params['class_names'],
params['detection_overlap'], params['ignore_start_end'])
print('\nTest loss : {:.4f}'.format(test_loss.avg))
print('Rec at 0.95 (det) : {:.4f}'.format(res_det['rec_at_x']))
print('Avg prec (cls) : {:.4f}'.format(res_det['avg_prec']))
print('File acc (cls) : {:.2f} - for {} out of {}'.format(res_det['file_acc'],
res_det['num_valid_files'], res_det['num_total_files']))
print('Cls Avg prec (cls) : {:.4f}'.format(res_det['avg_prec_class']))
print('\nPer class average precision')
str_len = np.max([len(rs['name']) for rs in res_det['class_pr']]) + 5
for cc, rs in enumerate(res_det['class_pr']):
if rs['num_gt'] > 0:
print(str(cc).ljust(5) + rs['name'].ljust(str_len) + '{:.4f}'.format(rs['avg_prec']))
res = {}
res['test_loss'] = float(test_loss.avg)
return res_det, res
def parse_gt_data(inputs):
# reads the torch arrays into a dictionary of numpy arrays, taking care to
# remove padding data i.e. not valid ones
keys = ['start_times', 'end_times', 'low_freqs', 'high_freqs', 'class_ids', 'individual_ids']
batch_data = []
for ind in range(inputs['start_times'].shape[0]):
is_valid = inputs['is_valid'][ind]==1
gt = {}
for kk in keys:
gt[kk] = inputs[kk][ind][is_valid].numpy().astype(np.float32)
gt['duration'] = inputs['duration'][ind].item()
gt['file_id'] = inputs['file_id'][ind].item()
gt['class_id_file'] = inputs['class_id_file'][ind].item()
batch_data.append(gt)
return batch_data
def select_model(params):
num_classes = len(params['class_names'])
if params['model_name'] == 'Net2DFast':
model = models.Net2DFast(params['num_filters'], num_classes=num_classes,
emb_dim=params['emb_dim'], ip_height=params['ip_height'],
resize_factor=params['resize_factor'])
elif params['model_name'] == 'Net2DFastNoAttn':
model = models.Net2DFastNoAttn(params['num_filters'], num_classes=num_classes,
emb_dim=params['emb_dim'], ip_height=params['ip_height'],
resize_factor=params['resize_factor'])
elif params['model_name'] == 'Net2DFastNoCoordConv':
model = models.Net2DFastNoCoordConv(params['num_filters'], num_classes=num_classes,
emb_dim=params['emb_dim'], ip_height=params['ip_height'],
resize_factor=params['resize_factor'])
    else:
        raise ValueError('No valid network specified')
    return model
if __name__ == "__main__":
plt.close('all')
params = parameters.get_params(True)
if torch.cuda.is_available():
params['device'] = 'cuda'
else:
params['device'] = 'cpu'
    # setup arg parser and populate it with existing parameters - will not work with lists
parser = argparse.ArgumentParser()
parser.add_argument('data_dir', type=str,
help='Path to root of datasets')
parser.add_argument('ann_dir', type=str,
help='Path to extracted annotations')
parser.add_argument('--train_split', type=str, default='diff', # diff, same
help='Which train split to use')
parser.add_argument('--notes', type=str, default='',
help='Notes to save in text file')
parser.add_argument('--do_not_save_images', action='store_false',
help='Do not save images at the end of training')
parser.add_argument('--standardize_classs_names_ip', type=str,
default='Rhinolophus ferrumequinum;Rhinolophus hipposideros',
help='Will set low and high frequency the same for these classes. Separate names with ";"')
for key, val in params.items():
parser.add_argument('--'+key, type=type(val), default=val)
params = vars(parser.parse_args())
# save notes file
if params['notes'] != '':
tu.write_notes_file(params['experiment'] + 'notes.txt', params['notes'])
# load the training and test meta data - there are different splits defined
train_sets, test_sets = ts.get_train_test_data(params['ann_dir'], params['data_dir'], params['train_split'])
train_sets_no_path, test_sets_no_path = ts.get_train_test_data('', '', params['train_split'])
# keep track of what we have trained on
params['train_sets'] = train_sets_no_path
params['test_sets'] = test_sets_no_path
# load train annotations - merge them all together
print('\nTraining on:')
for tt in train_sets:
print(tt['ann_path'])
classes_to_ignore = params['classes_to_ignore']+params['generic_class']
data_train, params['class_names'], params['class_inv_freq'] = \
tu.load_set_of_anns(train_sets, classes_to_ignore, params['events_of_interest'], params['convert_to_genus'])
params['genus_names'], params['genus_mapping'] = tu.get_genus_mapping(params['class_names'])
params['class_names_short'] = tu.get_short_class_names(params['class_names'])
# standardize the low and high frequency value for specified classes
params['standardize_classs_names'] = params['standardize_classs_names_ip'].split(';')
for cc in params['standardize_classs_names']:
if cc in params['class_names']:
data_train = tu.standardize_low_freq(data_train, cc)
else:
print(cc, 'not found')
# train loader
train_dataset = adl.AudioLoader(data_train, params, is_train=True)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=params['batch_size'],
shuffle=True, num_workers=params['num_workers'], pin_memory=True)
# test set
print('\nTesting on:')
for tt in test_sets:
print(tt['ann_path'])
data_test, _, _ = tu.load_set_of_anns(test_sets, classes_to_ignore, params['events_of_interest'], params['convert_to_genus'])
data_train = tu.remove_dupes(data_train, data_test)
test_dataset = adl.AudioLoader(data_test, params, is_train=False)
# batch size of 1 because of variable file length
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
shuffle=False, num_workers=params['num_workers'], pin_memory=True)
inputs_train = next(iter(train_loader))
# TODO remove params['ip_height'], this is just legacy
params['ip_height'] = int(params['spec_height']*params['resize_factor'])
print('\ntrain batch spec size :', inputs_train['spec'].shape)
print('class target size :', inputs_train['y_2d_classes'].shape)
# select network
model = select_model(params)
model = model.to(params['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
#optimizer = torch.optim.SGD(model.parameters(), lr=params['lr'], momentum=0.9)
scheduler = CosineAnnealingLR(optimizer, params['num_epochs'] * len(train_loader))
if params['train_loss'] == 'mse':
det_criterion = losses.mse_loss
elif params['train_loss'] == 'focal':
det_criterion = losses.focal_loss
# save parameters to file
with open(params['experiment'] + 'params.json', 'w') as da:
json.dump(params, da, indent=2, sort_keys=True)
# plotting
train_plt_ls = pu.LossPlotter(params['experiment'] + 'train_loss.png', params['num_epochs']+1,
['train_loss'], None, None, ['epoch', 'train_loss'], logy=True)
test_plt_ls = pu.LossPlotter(params['experiment'] + 'test_loss.png', params['num_epochs']+1,
['test_loss'], None, None, ['epoch', 'test_loss'], logy=True)
test_plt = pu.LossPlotter(params['experiment'] + 'test.png', params['num_epochs']+1,
['avg_prec', 'rec_at_x', 'avg_prec_class', 'file_acc', 'top_class'], [0,1], None, ['epoch', ''])
test_plt_class = pu.LossPlotter(params['experiment'] + 'test_avg_prec.png', params['num_epochs']+1,
params['class_names_short'], [0,1], params['class_names_short'], ['epoch', 'avg_prec'])
#
# main train loop
for epoch in range(0, params['num_epochs']+1):
train_loss = train(model, epoch, train_loader, det_criterion, optimizer, scheduler, params)
train_plt_ls.update_and_save(epoch, [train_loss['train_loss']])
if epoch % params['num_eval_epochs'] == 0:
# detection accuracy on test set
test_res, test_loss = test(model, epoch, test_loader, det_criterion, params)
test_plt_ls.update_and_save(epoch, [test_loss['test_loss']])
test_plt.update_and_save(epoch, [test_res['avg_prec'], test_res['rec_at_x'],
test_res['avg_prec_class'], test_res['file_acc'], test_res['top_class']['avg_prec']])
test_plt_class.update_and_save(epoch, [rs['avg_prec'] for rs in test_res['class_pr']])
pu.plot_pr_curve_class(params['experiment'] , 'test_pr', 'test_pr', test_res)
# save trained model
print('saving model to: ' + params['model_file_name'])
op_state = {'epoch': epoch + 1,
'state_dict': model.state_dict(),
#'optimizer' : optimizer.state_dict(),
'params' : params}
torch.save(op_state, params['model_file_name'])
# save an image with associated prediction for each batch in the test set
    if not params['do_not_save_images']:
save_images_batch(model, test_loader, params)


@ -0,0 +1,231 @@
"""
Run scripts/extract_anns.py to generate these json files.
"""
def get_train_test_data(ann_dir, wav_dir, split_name, load_extra=True):
if split_name == 'diff':
train_sets, test_sets = split_diff(ann_dir, wav_dir, load_extra)
elif split_name == 'same':
train_sets, test_sets = split_same(ann_dir, wav_dir, load_extra)
else:
print('Split not defined')
assert False
return train_sets, test_sets
def split_diff(ann_dir, wav_dir, load_extra=True):
train_sets = []
if load_extra:
train_sets.append({'dataset_name': 'BatDetective',
'is_test': False,
'is_binary': True, # just a bat / not bat dataset ie no classes
'ann_path': ann_dir + 'train_set_bulgaria_batdetective_with_bbs.json',
'wav_path': wav_dir + 'bat_detective/audio/'})
train_sets.append({'dataset_name': 'bat_logger_qeop_empty',
'is_test': False,
'is_binary': True,
'ann_path': ann_dir + 'bat_logger_qeop_empty.json',
'wav_path': wav_dir + 'bat_logger_qeop_empty/audio/'})
train_sets.append({'dataset_name': 'bat_logger_2016_empty',
'is_test': False,
'is_binary': True,
'ann_path': ann_dir + 'train_set_bat_logger_2016_empty.json',
'wav_path': wav_dir + 'bat_logger_2016/audio/'})
# train_sets.append({'dataset_name': 'brazil_data_binary',
# 'is_test': False,
# 'ann_path': ann_dir + 'brazil_data_binary.json',
# 'wav_path': wav_dir + 'brazil_data/audio/'})
train_sets.append({'dataset_name': 'echobank',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'Echobank_train_expert.json',
'wav_path': wav_dir + 'echobank/audio/'})
train_sets.append({'dataset_name': 'sn_scot_nor',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'sn_scot_nor_0.5_expert.json',
'wav_path': wav_dir + 'sn_scot_nor/audio/'})
train_sets.append({'dataset_name': 'BCT_1_sec',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'BCT_1_sec_train_expert.json',
'wav_path': wav_dir + 'BCT_1_sec/audio/'})
train_sets.append({'dataset_name': 'bcireland',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'bcireland_expert.json',
'wav_path': wav_dir + 'bcireland/audio/'})
train_sets.append({'dataset_name': 'rhinolophus_steve_BCT',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'rhinolophus_steve_BCT_expert.json',
'wav_path': wav_dir + 'rhinolophus_steve_BCT/audio/'})
test_sets = []
test_sets.append({'dataset_name': 'bat_data_martyn_2018',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2018_1_sec_train_expert.json',
'wav_path': wav_dir + 'bat_data_martyn_2018/audio/'})
test_sets.append({'dataset_name': 'bat_data_martyn_2018_test',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2018_1_sec_test_expert.json',
'wav_path': wav_dir + 'bat_data_martyn_2018_test/audio/'})
test_sets.append({'dataset_name': 'bat_data_martyn_2019',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2019_1_sec_train_expert.json',
'wav_path': wav_dir + 'bat_data_martyn_2019/audio/'})
test_sets.append({'dataset_name': 'bat_data_martyn_2019_test',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2019_1_sec_test_expert.json',
'wav_path': wav_dir + 'bat_data_martyn_2019_test/audio/'})
return train_sets, test_sets
def split_same(ann_dir, wav_dir, load_extra=True):
train_sets = []
if load_extra:
train_sets.append({'dataset_name': 'BatDetective',
'is_test': False,
'is_binary': True,
'ann_path': ann_dir + 'train_set_bulgaria_batdetective_with_bbs.json',
'wav_path': wav_dir + 'bat_detective/audio/'})
train_sets.append({'dataset_name': 'bat_logger_qeop_empty',
'is_test': False,
'is_binary': True,
'ann_path': ann_dir + 'bat_logger_qeop_empty.json',
'wav_path': wav_dir + 'bat_logger_qeop_empty/audio/'})
train_sets.append({'dataset_name': 'bat_logger_2016_empty',
'is_test': False,
'is_binary': True,
'ann_path': ann_dir + 'train_set_bat_logger_2016_empty.json',
'wav_path': wav_dir + 'bat_logger_2016/audio/'})
# train_sets.append({'dataset_name': 'brazil_data_binary',
# 'is_test': False,
# 'ann_path': ann_dir + 'brazil_data_binary.json',
# 'wav_path': wav_dir + 'brazil_data/audio/'})
train_sets.append({'dataset_name': 'echobank',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'Echobank_train_expert_TRAIN.json',
'wav_path': wav_dir + 'echobank/audio/'})
train_sets.append({'dataset_name': 'sn_scot_nor',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'sn_scot_nor_0.5_expert_TRAIN.json',
'wav_path': wav_dir + 'sn_scot_nor/audio/'})
train_sets.append({'dataset_name': 'BCT_1_sec',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'BCT_1_sec_train_expert_TRAIN.json',
'wav_path': wav_dir + 'BCT_1_sec/audio/'})
train_sets.append({'dataset_name': 'bcireland',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'bcireland_expert_TRAIN.json',
'wav_path': wav_dir + 'bcireland/audio/'})
train_sets.append({'dataset_name': 'rhinolophus_steve_BCT',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'rhinolophus_steve_BCT_expert_TRAIN.json',
'wav_path': wav_dir + 'rhinolophus_steve_BCT/audio/'})
train_sets.append({'dataset_name': 'bat_data_martyn_2018',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2018_1_sec_train_expert_TRAIN.json',
'wav_path': wav_dir + 'bat_data_martyn_2018/audio/'})
train_sets.append({'dataset_name': 'bat_data_martyn_2018_test',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2018_1_sec_test_expert_TRAIN.json',
'wav_path': wav_dir + 'bat_data_martyn_2018_test/audio/'})
train_sets.append({'dataset_name': 'bat_data_martyn_2019',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2019_1_sec_train_expert_TRAIN.json',
'wav_path': wav_dir + 'bat_data_martyn_2019/audio/'})
train_sets.append({'dataset_name': 'bat_data_martyn_2019_test',
'is_test': False,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2019_1_sec_test_expert_TRAIN.json',
'wav_path': wav_dir + 'bat_data_martyn_2019_test/audio/'})
# train_sets.append({'dataset_name': 'bat_data_martyn_2021_train',
# 'is_test': False,
# 'is_binary': False,
# 'ann_path': ann_dir + 'bat_data_martyn_2021_TRAIN.json',
# 'wav_path': wav_dir + 'bat_data_martyn_2021/audio/'})
# train_sets.append({'dataset_name': 'volunteers_2021_train',
# 'is_test': False,
# 'is_binary': False,
# 'ann_path': ann_dir + 'volunteers_2021_TRAIN.json',
# 'wav_path': wav_dir + 'volunteers_2021/audio/'})
test_sets = []
test_sets.append({'dataset_name': 'echobank',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'Echobank_train_expert_TEST.json',
'wav_path': wav_dir + 'echobank/audio/'})
test_sets.append({'dataset_name': 'sn_scot_nor',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'sn_scot_nor_0.5_expert_TEST.json',
'wav_path': wav_dir + 'sn_scot_nor/audio/'})
test_sets.append({'dataset_name': 'BCT_1_sec',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BCT_1_sec_train_expert_TEST.json',
'wav_path': wav_dir + 'BCT_1_sec/audio/'})
test_sets.append({'dataset_name': 'bcireland',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'bcireland_expert_TEST.json',
'wav_path': wav_dir + 'bcireland/audio/'})
test_sets.append({'dataset_name': 'rhinolophus_steve_BCT',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'rhinolophus_steve_BCT_expert_TEST.json',
'wav_path': wav_dir + 'rhinolophus_steve_BCT/audio/'})
test_sets.append({'dataset_name': 'bat_data_martyn_2018',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2018_1_sec_train_expert_TEST.json',
'wav_path': wav_dir + 'bat_data_martyn_2018/audio/'})
test_sets.append({'dataset_name': 'bat_data_martyn_2018_test',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2018_1_sec_test_expert_TEST.json',
'wav_path': wav_dir + 'bat_data_martyn_2018_test/audio/'})
test_sets.append({'dataset_name': 'bat_data_martyn_2019',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2019_1_sec_train_expert_TEST.json',
'wav_path': wav_dir + 'bat_data_martyn_2019/audio/'})
test_sets.append({'dataset_name': 'bat_data_martyn_2019_test',
'is_test': True,
'is_binary': False,
'ann_path': ann_dir + 'BritishBatCalls_MartynCooke_2019_1_sec_test_expert_TEST.json',
'wav_path': wav_dir + 'bat_data_martyn_2019_test/audio/'})
# test_sets.append({'dataset_name': 'bat_data_martyn_2021_test',
# 'is_test': True,
# 'is_binary': False,
# 'ann_path': ann_dir + 'bat_data_martyn_2021_TEST.json',
# 'wav_path': wav_dir + 'bat_data_martyn_2021/audio/'})
# test_sets.append({'dataset_name': 'volunteers_2021_test',
# 'is_test': True,
# 'is_binary': False,
# 'ann_path': ann_dir + 'volunteers_2021_TEST.json',
# 'wav_path': wav_dir + 'volunteers_2021/audio/'})
return train_sets, test_sets


@ -0,0 +1,185 @@
import numpy as np
import random
import os
import glob
import json
def write_notes_file(file_name, text):
with open(file_name, 'a') as da:
da.write(text + '\n')
def get_blank_dataset_dict(dataset_name, is_test, ann_path, wav_path):
ddict = {'dataset_name': dataset_name, 'is_test': is_test, 'is_binary': False,
'ann_path': ann_path, 'wav_path': wav_path}
return ddict
def get_short_class_names(class_names, str_len=3):
class_names_short = []
for cc in class_names:
class_names_short.append(' '.join([sp[:str_len] for sp in cc.split(' ')]))
return class_names_short
def remove_dupes(data_train, data_test):
test_ids = [dd['id'] for dd in data_test]
data_train_prune = []
for aa in data_train:
if aa['id'] not in test_ids:
data_train_prune.append(aa)
diff = len(data_train) - len(data_train_prune)
if diff != 0:
print(diff, 'items removed from train set')
return data_train_prune
def get_genus_mapping(class_names):
genus_names, genus_mapping = np.unique([cc.split(' ')[0] for cc in class_names], return_inverse=True)
return genus_names.tolist(), genus_mapping.tolist()
def standardize_low_freq(data, class_of_interest):
# address the issue of highly variable low frequency annotations
# this often happens for constant frequency calls
# for the class of interest, set the low and high freq to the dataset mean
low_freqs = []
high_freqs = []
for dd in data:
for aa in dd['annotation']:
if aa['class'] == class_of_interest:
low_freqs.append(aa['low_freq'])
high_freqs.append(aa['high_freq'])
low_mean = np.mean(low_freqs)
high_mean = np.mean(high_freqs)
assert(low_mean < high_mean)
print('\nStandardizing low and high frequency for:')
print(class_of_interest)
print('low: ', round(low_mean, 2))
print('high: ', round(high_mean, 2))
# only set the low freq, high stays the same
# assumes that low_mean < high_mean
for dd in data:
for aa in dd['annotation']:
if aa['class'] == class_of_interest:
aa['low_freq'] = low_mean
if aa['high_freq'] < low_mean:
aa['high_freq'] = high_mean
return data
def load_set_of_anns(data, classes_to_ignore=[], events_of_interest=None,
convert_to_genus=False, verbose=True, list_of_anns=False,
filter_issues=False, name_replace=False):
# load the annotations
anns = []
if list_of_anns:
# path to list of individual json files
anns.extend(load_anns_from_path(data['ann_path'], data['wav_path']))
else:
# dictionary of datasets
for dd in data:
anns.extend(load_anns(dd['ann_path'], dd['wav_path']))
# discard unannotated files
anns = [aa for aa in anns if aa['annotated'] is True]
# filter files that have annotation issues - if the input is a dictionary of
# datasets, this will likely have already been done
if filter_issues:
anns = [aa for aa in anns if aa['issues'] is False]
# check for some basic formatting errors with class names
for ann in anns:
for aa in ann['annotation']:
aa['class'] = aa['class'].strip()
# only load specified events - i.e. types of calls
if events_of_interest is not None:
for ann in anns:
filtered_events = []
for aa in ann['annotation']:
if aa['event'] in events_of_interest:
filtered_events.append(aa)
ann['annotation'] = filtered_events
# change class names
# replace_names will be a dictionary mapping input name to output
if type(name_replace) is dict:
for ann in anns:
for aa in ann['annotation']:
if aa['class'] in name_replace:
aa['class'] = name_replace[aa['class']]
# convert everything to genus name
if convert_to_genus:
for ann in anns:
for aa in ann['annotation']:
aa['class'] = aa['class'].split(' ')[0]
# get unique class names
class_names_all = []
for ann in anns:
for aa in ann['annotation']:
if aa['class'] not in classes_to_ignore:
class_names_all.append(aa['class'])
class_names, class_cnts = np.unique(class_names_all, return_counts=True)
class_inv_freq = (class_cnts.sum() / (len(class_names) * class_cnts.astype(np.float32)))
if verbose:
print('Class count:')
str_len = np.max([len(cc) for cc in class_names]) + 5
for cc in range(len(class_names)):
print(str(cc).ljust(5) + class_names[cc].ljust(str_len) + str(class_cnts[cc]))
if len(classes_to_ignore) == 0:
return anns
else:
return anns, class_names.tolist(), class_inv_freq.tolist()
def load_anns(ann_file_name, raw_audio_dir):
with open(ann_file_name) as da:
anns = json.load(da)
for aa in anns:
aa['file_path'] = raw_audio_dir + aa['id']
return anns
def load_anns_from_path(ann_file_dir, raw_audio_dir):
files = glob.glob(ann_file_dir + '*.json')
anns = []
for ff in files:
with open(ff) as da:
ann = json.load(da)
ann['file_path'] = raw_audio_dir + ann['id']
anns.append(ann)
return anns
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
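As a rough illustration of how the dataset helpers above fit together (a hedged sketch: the paths, dataset name, and the class to ignore are placeholders, not values from the repository):

```python
# hedged usage sketch - paths and class names below are placeholders
train_sets = [get_blank_dataset_dict('my_dataset', is_test=False,
                                     ann_path='/path/to/anns/my_dataset.json',
                                     wav_path='/path/to/audio/')]
anns, class_names, class_inv_freq = load_set_of_anns(
    train_sets, classes_to_ignore=['Unknown'],
    events_of_interest=['Echolocation'], filter_issues=True)
print(len(anns), 'annotated files,', len(class_names), 'classes')
```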



@@ -0,0 +1,164 @@
import numpy as np
from . import wavfile
import warnings
import torch
import librosa
def time_to_x_coords(time_in_file, sampling_rate, fft_win_length, fft_overlap):
nfft = np.floor(fft_win_length*sampling_rate) # int() uses floor
noverlap = np.floor(fft_overlap*nfft)
return (time_in_file*sampling_rate-noverlap) / (nfft - noverlap)
# NOTE this is also defined in post_process
def x_coords_to_time(x_pos, sampling_rate, fft_win_length, fft_overlap):
nfft = np.floor(fft_win_length*sampling_rate)
noverlap = np.floor(fft_overlap*nfft)
return ((x_pos*(nfft - noverlap)) + noverlap) / sampling_rate
#return (1.0 - fft_overlap) * fft_win_length * (x_pos + 0.5) # 0.5 is for center of temporal window
def generate_spectrogram(audio, sampling_rate, params, return_spec_for_viz=False, check_spec_size=True):
# generate spectrogram
spec = gen_mag_spectrogram(audio, sampling_rate, params['fft_win_length'], params['fft_overlap'])
# crop to min/max freq
max_freq = round(params['max_freq']*params['fft_win_length'])
min_freq = round(params['min_freq']*params['fft_win_length'])
if spec.shape[0] < max_freq:
freq_pad = max_freq - spec.shape[0]
spec = np.vstack((np.zeros((freq_pad, spec.shape[1]), dtype=spec.dtype), spec))
spec_cropped = spec[-max_freq:spec.shape[0]-min_freq, :]
if params['spec_scale'] == 'log':
log_scaling = 2.0 * (1.0 / sampling_rate) * (1.0/(np.abs(np.hanning(int(params['fft_win_length']*sampling_rate)))**2).sum())
#log_scaling = (1.0 / sampling_rate)*0.1
#log_scaling = (1.0 / sampling_rate)*10e4
spec = np.log1p(log_scaling*spec_cropped)
elif params['spec_scale'] == 'pcen':
spec = pcen(spec_cropped, sampling_rate)
elif params['spec_scale'] == 'none':
pass
if params['denoise_spec_avg']:
spec = spec - np.mean(spec, 1)[:, np.newaxis]
spec.clip(min=0, out=spec)
if params['max_scale_spec']:
spec = spec / (spec.max() + 10e-6)
# needs to be divisible by specific factor - if not it should have been padded
#if check_spec_size:
#assert((int(spec.shape[0]*params['resize_factor']) % params['spec_divide_factor']) == 0)
#assert((int(spec.shape[1]*params['resize_factor']) % params['spec_divide_factor']) == 0)
# for visualization purposes - use log scaled spectrogram
if return_spec_for_viz:
log_scaling = 2.0 * (1.0 / sampling_rate) * (1.0/(np.abs(np.hanning(int(params['fft_win_length']*sampling_rate)))**2).sum())
spec_for_viz = np.log1p(log_scaling*spec_cropped).astype(np.float32)
else:
spec_for_viz = None
return spec, spec_for_viz
def load_audio_file(audio_file, time_exp_fact, target_samp_rate, scale=False):
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=wavfile.WavFileWarning)
#sampling_rate, audio_raw = wavfile.read(audio_file)
audio_raw, sampling_rate = librosa.load(audio_file, sr=None)
if len(audio_raw.shape) > 1:
raise Exception('Currently does not handle stereo files')
sampling_rate = sampling_rate * time_exp_fact
# resample - need to do this after correcting for time expansion
sampling_rate_old = sampling_rate
sampling_rate = target_samp_rate
audio_raw = librosa.resample(audio_raw, sampling_rate_old, sampling_rate, res_type='polyphase')
# convert to float32 and scale
audio_raw = audio_raw.astype(np.float32)
if scale:
audio_raw = audio_raw - audio_raw.mean()
audio_raw = audio_raw / (np.abs(audio_raw).max() + 10e-6)
return sampling_rate, audio_raw
def pad_audio(audio_raw, fs, ms, overlap_perc, resize_factor, divide_factor, fixed_width=None):
# Adds zeros to the end of the raw data so that the generated spectrogram
# will be evenly divisible by `divide_factor`
# Also deals with very short audio clips and fixed_width during training
# This code could be clearer, clean up
nfft = int(ms*fs)
noverlap = int(overlap_perc*nfft)
step = nfft - noverlap
min_size = int(divide_factor*(1.0/resize_factor))
spec_width = ((audio_raw.shape[0]-noverlap)//step)
spec_width_rs = spec_width * resize_factor
if fixed_width is not None and spec_width < fixed_width:
# too small
# used during training to ensure all the batches are the same size
diff = fixed_width*step + noverlap - audio_raw.shape[0]
audio_raw = np.hstack((audio_raw, np.zeros(diff, dtype=audio_raw.dtype)))
elif fixed_width is not None and spec_width > fixed_width:
# too big
# used during training to ensure all the batches are the same size
diff = fixed_width*step + noverlap - audio_raw.shape[0]
audio_raw = audio_raw[:diff]
elif spec_width_rs < min_size or (np.floor(spec_width_rs) % divide_factor) != 0:
# need to be at least min_size
div_amt = np.ceil(spec_width_rs / float(divide_factor))
div_amt = np.maximum(1, div_amt)
target_size = int(div_amt*divide_factor*(1.0/resize_factor))
diff = target_size*step + noverlap - audio_raw.shape[0]
audio_raw = np.hstack((audio_raw, np.zeros(diff, dtype=audio_raw.dtype)))
return audio_raw
def gen_mag_spectrogram(x, fs, ms, overlap_perc):
# Computes magnitude spectrogram by specifying time.
x = x.astype(np.float32)
nfft = int(ms*fs)
noverlap = int(overlap_perc*nfft)
# window data
step = nfft - noverlap
# compute spec
spec, _ = librosa.core.spectrum._spectrogram(x, power=1, n_fft=nfft, hop_length=step, center=False)
# remove DC component and flip vertical orientation
spec = np.flipud(spec[1:, :])
return spec.astype(np.float32)
def gen_mag_spectrogram_pt(x, fs, ms, overlap_perc):
nfft = int(ms*fs)
nstep = round((1.0-overlap_perc)*nfft)
han_win = torch.hann_window(nfft, periodic=False).to(x.device)
complex_spec = torch.stft(x, nfft, nstep, window=han_win, center=False)
spec = complex_spec.pow(2.0).sum(-1)
# remove DC component and flip vertically
spec = torch.flipud(spec[0, 1:,:])
return spec
def pcen(spec_cropped, sampling_rate):
# TODO should be passing hop_length too i.e. step
spec = librosa.pcen(spec_cropped * (2**31), sr=sampling_rate/10).astype(np.float32)
return spec
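A quick end-to-end sketch of the functions above (hedged: the parameter values are illustrative only, the real defaults come from the model/run configuration, and `example.wav` is a placeholder file name):

```python
# hedged usage sketch - illustrative parameter values and a placeholder file
params = {'fft_win_length': 512 / 256000.0,  # window length in seconds
          'fft_overlap': 0.75,
          'min_freq': 10000, 'max_freq': 120000,
          'spec_scale': 'log',
          'denoise_spec_avg': True,
          'max_scale_spec': False}

sampling_rate, audio = load_audio_file('example.wav', time_exp_fact=1,
                                        target_samp_rate=256000)
spec, _ = generate_spectrogram(audio, sampling_rate, params)
print(spec.shape)  # (freq bins, time bins)
```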


@@ -0,0 +1,269 @@
import torch
import torch.nn.functional as F
import os
import numpy as np
import pandas as pd
import json
import sys
from bat_detect.detector import models
import bat_detect.detector.compute_features as feats
import bat_detect.detector.post_process as pp
import bat_detect.utils.audio_utils as au
def get_audio_files(ip_dir):
matches = []
for root, dirnames, filenames in os.walk(ip_dir):
for filename in filenames:
if filename.lower().endswith('.wav'):
matches.append(os.path.join(root, filename))
return matches
def load_model(model_path, load_weights=True):
# load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if os.path.isfile(model_path):
net_params = torch.load(model_path, map_location=device)
else:
print('Error: model not found.')
sys.exit(1)
params = net_params['params']
params['device'] = device
if params['model_name'] == 'Net2DFast':
model = models.Net2DFast(params['num_filters'], num_classes=len(params['class_names']),
emb_dim=params['emb_dim'], ip_height=params['ip_height'],
resize_factor=params['resize_factor'])
elif params['model_name'] == 'Net2DFastNoAttn':
model = models.Net2DFastNoAttn(params['num_filters'], num_classes=len(params['class_names']),
emb_dim=params['emb_dim'], ip_height=params['ip_height'],
resize_factor=params['resize_factor'])
elif params['model_name'] == 'Net2DFastNoCoordConv':
model = models.Net2DFastNoCoordConv(params['num_filters'], num_classes=len(params['class_names']),
emb_dim=params['emb_dim'], ip_height=params['ip_height'],
resize_factor=params['resize_factor'])
else:
print('Error: unknown model.')
sys.exit(1)
if load_weights:
model.load_state_dict(net_params['state_dict'])
model = model.to(params['device'])
model.eval()
return model, params
def merge_results(predictions, spec_feats, cnn_feats, spec_slices):
predictions_m = {}
num_preds = np.sum([len(pp['det_probs']) for pp in predictions])
if num_preds > 0:
for kk in predictions[0].keys():
predictions_m[kk] = np.hstack([pp[kk] for pp in predictions if pp['det_probs'].shape[0] > 0])
else:
# hack for the case where there are no detected calls, as we need some of the key names in the dict
predictions_m = predictions[0]
if len(spec_feats) > 0:
spec_feats = np.vstack(spec_feats)
if len(cnn_feats) > 0:
cnn_feats = np.vstack(cnn_feats)
return predictions_m, spec_feats, cnn_feats, spec_slices
def convert_results(file_id, time_exp, duration, params, predictions, spec_feats, cnn_feats, spec_slices):
# create a single dictionary - this is the format used by the annotation tool
pred_dict = {}
pred_dict['id'] = file_id
pred_dict['annotated'] = False
pred_dict['issues'] = False
pred_dict['notes'] = 'Automatically generated.'
pred_dict['time_exp'] = time_exp
pred_dict['duration'] = round(duration, 4)
pred_dict['annotation'] = []
class_prob_best = predictions['class_probs'].max(0)
class_ind_best = predictions['class_probs'].argmax(0)
class_overall = pp.overall_class_pred(predictions['det_probs'], predictions['class_probs'])
pred_dict['class_name'] = params['class_names'][np.argmax(class_overall)]
for ii in range(predictions['det_probs'].shape[0]):
res = {}
res['start_time'] = round(float(predictions['start_times'][ii]), 4)
res['end_time'] = round(float(predictions['end_times'][ii]), 4)
res['low_freq'] = int(predictions['low_freqs'][ii])
res['high_freq'] = int(predictions['high_freqs'][ii])
res['class'] = str(params['class_names'][int(class_ind_best[ii])])
res['class_prob'] = round(float(class_prob_best[ii]), 3)
res['det_prob'] = round(float(predictions['det_probs'][ii]), 3)
res['individual'] = '-1'
res['event'] = 'Echolocation'
pred_dict['annotation'].append(res)
# combine into final results dictionary
results = {}
results['pred_dict'] = pred_dict
if len(spec_feats) > 0:
results['spec_feats'] = spec_feats
results['spec_feat_names'] = feats.get_feature_names()
if len(cnn_feats) > 0:
results['cnn_feats'] = cnn_feats
results['cnn_feat_names'] = [str(ii) for ii in range(cnn_feats.shape[1])]
if len(spec_slices) > 0:
results['spec_slices'] = spec_slices
return results
def save_results_to_file(results, op_path):
# make directory if it does not exist
if not os.path.isdir(os.path.dirname(op_path)):
os.makedirs(os.path.dirname(op_path))
# save csv file - if there are predictions
result_list = [res for res in results['pred_dict']['annotation']]
df = pd.DataFrame(result_list)
df['file_name'] = [results['pred_dict']['id']]*len(result_list)
df.index.name = 'id'
if 'class_prob' in df.columns:
df = df[['det_prob', 'start_time', 'end_time', 'high_freq',
'low_freq', 'class', 'class_prob']]
df.to_csv(op_path + '.csv', sep=',')
# save features
if 'spec_feats' in results.keys():
df = pd.DataFrame(results['spec_feats'], columns=results['spec_feat_names'])
df.to_csv(op_path + '_spec_features.csv', sep=',', index=False, float_format='%.5f')
if 'cnn_feats' in results.keys():
df = pd.DataFrame(results['cnn_feats'], columns=results['cnn_feat_names'])
df.to_csv(op_path + '_cnn_features.csv', sep=',', index=False, float_format='%.5f')
# save json file
with open(op_path + '.json', 'w') as da:
json.dump(results['pred_dict'], da, indent=2, sort_keys=True)
def compute_spectrogram(audio, sampling_rate, params, return_np=False):
# pad audio so it is evenly divisible by downsampling factors
duration = audio.shape[0] / float(sampling_rate)
audio = au.pad_audio(audio, sampling_rate, params['fft_win_length'],
params['fft_overlap'], params['resize_factor'],
params['spec_divide_factor'])
# generate spectrogram
spec, _ = au.generate_spectrogram(audio, sampling_rate, params)
# convert to pytorch
spec = torch.from_numpy(spec).to(params['device'])
spec = spec.unsqueeze(0).unsqueeze(0)
# resize the spec
rs = params['resize_factor']
spec_op_shape = (int(params['spec_height']*rs), int(spec.shape[-1]*rs))
spec = F.interpolate(spec, size=spec_op_shape, mode='bilinear', align_corners=False)
if return_np:
spec_np = spec[0,0,:].cpu().data.numpy()
else:
spec_np = None
return duration, spec, spec_np
def process_file(audio_file, model, params, args, time_exp=None, top_n=5, return_raw_preds=False):
# store temporary results here
predictions = []
spec_feats = []
cnn_feats = []
spec_slices = []
# get time expansion factor
if time_exp is None:
time_exp = args['time_expansion_factor']
params['detection_threshold'] = args['detection_threshold']
# load audio file
sampling_rate, audio_full = au.load_audio_file(audio_file, time_exp,
params['target_samp_rate'], params['scale_raw_audio'])
duration_full = audio_full.shape[0] / float(sampling_rate)
return_np_spec = args['spec_features'] or args['spec_slices']
# loop through larger file and split into chunks
# TODO fix so that it overlaps correctly and takes care of duplicate detections at borders
num_chunks = int(np.ceil(duration_full/args['chunk_size']))
for chunk_id in range(num_chunks):
# chunk
chunk_time = args['chunk_size']*chunk_id
chunk_length = int(sampling_rate*args['chunk_size'])
start_sample = chunk_id*chunk_length
end_sample = np.minimum((chunk_id+1)*chunk_length, audio_full.shape[0])
audio = audio_full[start_sample:end_sample]
# load audio file and compute spectrogram
duration, spec, spec_np = compute_spectrogram(audio, sampling_rate, params, return_np_spec)
# evaluate model
with torch.no_grad():
outputs = model(spec, return_feats=args['cnn_features'])
# run non-max suppression
pred_nms, features = pp.run_nms(outputs, params, np.array([float(sampling_rate)]))
pred_nms = pred_nms[0]
pred_nms['start_times'] += chunk_time
pred_nms['end_times'] += chunk_time
# if we have a background class
if pred_nms['class_probs'].shape[0] > len(params['class_names']):
pred_nms['class_probs'] = pred_nms['class_probs'][:-1, :]
predictions.append(pred_nms)
# extract features - if there are any calls detected
if (pred_nms['det_probs'].shape[0] > 0):
if args['spec_features']:
spec_feats.append(feats.get_feats(spec_np, pred_nms, params))
if args['cnn_features']:
cnn_feats.append(features[0])
if args['spec_slices']:
spec_slices.extend(feats.extract_spec_slices(spec_np, pred_nms, params))
# convert the predictions into output dictionary
file_id = os.path.basename(audio_file)
predictions, spec_feats, cnn_feats, spec_slices =\
merge_results(predictions, spec_feats, cnn_feats, spec_slices)
results = convert_results(file_id, time_exp, duration_full, params,
predictions, spec_feats, cnn_feats, spec_slices)
# summarize results
if not args['quiet']:
num_detections = len(results['pred_dict']['annotation'])
print('{}'.format(num_detections) + ' call(s) detected above the threshold.')
# print results for top n classes
if not args['quiet'] and (num_detections > 0):
class_overall = pp.overall_class_pred(predictions['det_probs'], predictions['class_probs'])
print('species name'.ljust(30) + 'probability present')
for cc in np.argsort(class_overall)[::-1][:top_n]:
print(params['class_names'][cc].ljust(30) + str(round(class_overall[cc], 3)))
if return_raw_preds:
return predictions
else:
return results


@@ -0,0 +1,371 @@
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.metrics import confusion_matrix
from matplotlib import patches
from matplotlib.collections import PatchCollection
from . import audio_utils as au
def create_box_image(spec, fig, detections_ip, start_time, end_time, duration, params, max_val, hide_axis=True, plot_class_names=False):
# filter detections
stop_time = start_time + duration
detections = []
for bb in detections_ip:
if (bb['start_time'] >= start_time) and (bb['start_time'] < stop_time-0.02): #(bb['end_time'] < end_time):
detections.append(bb)
# create figure
freq_scale = 1000 # turn Hz to kHz
min_freq = params['min_freq']//freq_scale
max_freq = params['max_freq']//freq_scale
y_extent = [0, duration, min_freq, max_freq]
if hide_axis:
ax = plt.Axes(fig, [0., 0., 1., 1.])
ax.set_axis_off()
fig.add_axes(ax)
else:
ax = plt.gca()
plt.imshow(spec, aspect='auto', cmap='plasma', extent=y_extent, vmin=0, vmax=max_val)
boxes = plot_bounding_box_patch_ann(detections, freq_scale, start_time)
ax.add_collection(PatchCollection(boxes, match_original=True))
plt.grid(False)
if plot_class_names:
for ii, bb in enumerate(boxes):
txt = ' '.join([sp[:3] for sp in detections_ip[ii]['class'].split(' ')])
font_info = {'color': 'white', 'size': 10, 'weight': 'bold', 'alpha': bb.get_alpha()}
y_pos = bb.get_xy()[1] + bb.get_height()
if y_pos > (max_freq - 10):
y_pos = max_freq - 10
plt.gca().text(bb.get_xy()[0], y_pos, txt, fontdict=font_info)
def save_ann_spec(op_path, spec, min_freq, max_freq, duration, start_time, title_text='', anns=None):
# create figure and plot boxes
freq_scale = 1000 # turn Hz to kHz
min_freq = min_freq//freq_scale
max_freq = max_freq//freq_scale
y_extent = [0, duration, min_freq, max_freq]
plt.close('all')
fig = plt.figure(0, figsize=(spec.shape[1]/100, spec.shape[0]/100), dpi=100)
plt.imshow(spec, aspect='auto', cmap='plasma', extent=y_extent, vmin=0, vmax=spec.max()*1.1)
plt.ylabel('Freq - kHz')
plt.xlabel('Time - secs')
if title_text != '':
plt.title(title_text)
plt.tight_layout()
if anns is not None:
# drawing bounding boxes and class names
boxes = plot_bounding_box_patch_ann(anns, freq_scale, start_time)
plt.gca().add_collection(PatchCollection(boxes, match_original=True))
for ii, bb in enumerate(boxes):
txt = ' '.join([sp[:3] for sp in anns[ii]['class'].split(' ')])
font_info = {'color': 'white', 'size': 10, 'weight': 'bold', 'alpha': bb.get_alpha()}
y_pos = bb.get_xy()[1] + bb.get_height()
if y_pos > (max_freq - 10):
y_pos = max_freq - 10
plt.gca().text(bb.get_xy()[0], y_pos, txt, fontdict=font_info)
print('Saving figure to:', op_path)
plt.savefig(op_path)
def plot_pts(fig_id, feats, class_names, colors, marker_size=4.0, plot_legend=False):
plt.figure(fig_id)
un_class, labels = np.unique(class_names, return_inverse=True)
un_labels = np.unique(labels)
if un_labels.shape[0] > len(colors):
colors = [plt.cm.jet(float(ii)/un_labels.shape[0]) for ii in un_labels]
for ii, u in enumerate(un_labels):
inds = np.where(labels==u)[0]
plt.scatter(feats[inds, 0], feats[inds, 1], c=colors[ii], label=str(un_class[ii]), s=marker_size)
if plot_legend:
plt.legend()
plt.xticks([])
plt.yticks([])
plt.title('downsampled features')
def plot_bounding_box_patch(pred, freq_scale, ecolor='w'):
patch_collect = []
for bb in range(len(pred['start_times'])):
xx = pred['start_times'][bb]
ww = pred['end_times'][bb] - pred['start_times'][bb]
yy = pred['low_freqs'][bb] / freq_scale
hh = (pred['high_freqs'][bb] - pred['low_freqs'][bb]) / freq_scale
if 'det_probs' in pred.keys():
alpha_val = pred['det_probs'][bb]
else:
alpha_val = 1.0
patch_collect.append(patches.Rectangle((xx, yy), ww, hh, linewidth=1,
edgecolor=ecolor, facecolor='none', alpha=alpha_val))
return patch_collect
def plot_bounding_box_patch_ann(anns, freq_scale, start_time):
patch_collect = []
for aa in range(len(anns)):
xx = anns[aa]['start_time'] - start_time
ww = anns[aa]['end_time'] - anns[aa]['start_time']
yy = anns[aa]['low_freq'] / freq_scale
hh = (anns[aa]['high_freq'] - anns[aa]['low_freq']) / freq_scale
if 'det_prob' in anns[aa]:
alpha = anns[aa]['det_prob']
else:
alpha = 1.0
patch_collect.append(patches.Rectangle((xx,yy), ww, hh, linewidth=1,
edgecolor='w', facecolor='none', alpha=alpha))
return patch_collect
def plot_spec(spec, sampling_rate, duration, gt, pred, params, plot_title,
op_file_name, pred_2d_hm, plot_boxes=True, fixed_aspect=True):
if fixed_aspect:
# output image will be this width irrespective of the duration of the audio file
width = 12
else:
width = 12*duration
fig = plt.figure(1, figsize=(width, 8))
ax0 = plt.axes([0.05, 0.65, 0.9, 0.30]) # l b w h
ax1 = plt.axes([0.05, 0.33, 0.9, 0.30])
ax2 = plt.axes([0.05, 0.01, 0.9, 0.30])
freq_scale = 1000 # turn Hz to kHz
#duration = au.x_coords_to_time(spec.shape[1], sampling_rate, params['fft_win_length'], params['fft_overlap'])
y_extent = [0, duration, params['min_freq']//freq_scale, params['max_freq']//freq_scale]
# plot gt boxes
ax0.imshow(spec, aspect='auto', cmap='plasma', extent=y_extent)
ax0.xaxis.set_ticklabels([])
font_info = {'color': 'white', 'size': 12, 'weight': 'bold'}
ax0.text(0, params['min_freq']//freq_scale, 'Ground Truth', fontdict=font_info)
plt.grid(False)
if plot_boxes:
boxes = plot_bounding_box_patch(gt, freq_scale)
ax0.add_collection(PatchCollection(boxes, match_original=True))
for ii, bb in enumerate(boxes):
class_id = int(gt['class_ids'][ii])
if class_id < 0:
txt = params['generic_class'][0]
else:
txt = params['class_names_short'][class_id]
font_info = {'color': 'white', 'size': 10, 'weight': 'bold', 'alpha': bb.get_alpha()}
y_pos = bb.get_xy()[1] + bb.get_height()
ax0.text(bb.get_xy()[0], y_pos, txt, fontdict=font_info)
# plot predicted boxes
ax1.imshow(spec, aspect='auto', cmap='plasma', extent=y_extent)
ax1.xaxis.set_ticklabels([])
font_info = {'color': 'white', 'size': 12, 'weight': 'bold'}
ax1.text(0, params['min_freq']//freq_scale, 'Prediction', fontdict=font_info)
plt.grid(False)
if plot_boxes:
boxes = plot_bounding_box_patch(pred, freq_scale)
ax1.add_collection(PatchCollection(boxes, match_original=True))
for ii, bb in enumerate(boxes):
if pred['class_probs'].shape[0] > len(params['class_names_short']):
class_id = pred['class_probs'][:-1, ii].argmax()
else:
class_id = pred['class_probs'][:, ii].argmax()
txt = params['class_names_short'][class_id]
font_info = {'color': 'white', 'size': 10, 'weight': 'bold', 'alpha': bb.get_alpha()}
y_pos = bb.get_xy()[1] + bb.get_height()
ax1.text(bb.get_xy()[0], y_pos, txt, fontdict=font_info)
# plot 2D heatmap
if pred_2d_hm is not None:
min_val = 0.0 if pred_2d_hm.min() > 0.0 else pred_2d_hm.min()
max_val = 1.0 if pred_2d_hm.max() < 1.0 else pred_2d_hm.max()
ax2.imshow(pred_2d_hm, aspect='auto', cmap='plasma', extent=y_extent, clim=[min_val, max_val])
#ax2.xaxis.set_ticklabels([])
font_info = {'color': 'white', 'size': 12, 'weight': 'bold'}
ax2.text(0, params['min_freq']//freq_scale, 'Heatmap', fontdict=font_info)
plt.grid(False)
plt.suptitle(plot_title)
if op_file_name is not None:
fig.savefig(op_file_name)
plt.close(1)
def plot_pr_curve(op_dir, plt_title, file_name, results, file_type='png', title_text=''):
precision = results['precision']
recall = results['recall']
avg_prec = results['avg_prec']
plt.figure(0, figsize=(10,8))
plt.plot(recall, precision)
plt.ylabel('Precision', fontsize=20)
plt.xlabel('Recall', fontsize=20)
if title_text != '':
plt.title(title_text, fontdict={'fontsize': 28})
else:
plt.title(plt_title + ' {:.3f}\n'.format(avg_prec))
plt.xlim(0,1.02)
plt.ylim(0,1.02)
plt.grid(True)
plt.tight_layout()
plt.savefig(op_dir + file_name + '.' + file_type)
plt.close(0)
def plot_pr_curve_class(op_dir, plt_title, file_name, results, file_type='png', title_text=''):
plt.figure(0, figsize=(10,8))
plt.ylabel('Precision', fontsize=20)
plt.xlabel('Recall', fontsize=20)
plt.xlim(0,1.02)
plt.ylim(0,1.02)
plt.grid(True)
linestyles = ['-', ':', '--']
markers = ['o', 'v', '>', '^', '<', 's', 'P', 'X', '*']
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# plot the PR curves
for ii, rr in enumerate(results['class_pr']):
class_name = ' '.join([sp[:3] for sp in rr['name'].split(' ')])
cur_color = colors[int(ii%10)]
plt.plot(rr['recall'], rr['precision'], label=class_name, color=cur_color,
linestyle=linestyles[int(ii//10)], lw=2.5)
#print(class_name)
# plot the location of the confidence threshold values
for jj, tt in enumerate(rr['thresholds']):
ind = rr['thresholds_inds'][jj]
if ind > -1:
plt.plot(rr['recall'][ind], rr['precision'][ind], markers[jj],
color=cur_color, ms=10)
#print(np.round(tt,2), np.round(rr['recall'][ind],3), np.round(rr['precision'][ind],3))
if title_text != '':
plt.title(title_text, fontdict={'fontsize': 28})
else:
plt.title(plt_title + ' {:.3f}\n'.format(results['avg_prec_class']))
plt.legend(loc='lower left', prop={'size': 14})
plt.tight_layout()
plt.savefig(op_dir + file_name + '.' + file_type)
plt.close(0)
def plot_confusion_matrix(op_dir, op_file, gt, pred, file_acc, class_names_long, verbose=False, file_type='png', title_text=''):
# shorten the class names for plotting
class_names = []
for cc in class_names_long:
class_name_sm = ''.join([cc_sm[:3] + ' ' for cc_sm in cc.split(' ')])[:-1]
class_names.append(class_name_sm)
num_classes = len(class_names)
cm = confusion_matrix(gt, pred, labels=np.arange(num_classes)).astype(np.float32)
cm_norm = cm.sum(1)
valid_inds = np.where(cm_norm > 0)[0]
cm[valid_inds, :] = cm[valid_inds, :] / cm_norm[valid_inds][..., np.newaxis]
cm[np.where(cm_norm == 0)[0], :] = np.nan
if verbose:
print('Per class accuracy:')
str_len = np.max([len(cc) for cc in class_names_long]) + 5
accs = np.diag(cm)
for ii, cc in enumerate(class_names_long):
if np.isnan(accs[ii]):
print(str(ii).ljust(5) + cc.ljust(str_len))
else:
print(str(ii).ljust(5) + cc.ljust(str_len) + '{:.2f}'.format(accs[ii]*100))
plt.figure(0, figsize=(10,8))
plt.imshow(cm, vmin=0, vmax=1, cmap='plasma')
plt.colorbar()
plt.xticks(np.arange(cm.shape[1]), class_names, rotation='vertical')
plt.yticks(np.arange(cm.shape[0]), class_names)
plt.xlabel('Predicted', fontsize=20)
plt.ylabel('Ground Truth', fontsize=20)
if title_text != '':
plt.title(title_text, fontdict={'fontsize': 28})
else:
plt.title(op_file + ' {:.3f}\n'.format(file_acc))
plt.tight_layout()
plt.savefig(op_dir + op_file + '.' + file_type)
plt.close('all')
class LossPlotter(object):
def __init__(self, op_file_name, duration, labels, ylim, class_names, axis_labels=None, logy=False):
self.reset()
self.op_file_name = op_file_name
self.duration = duration # length of x axis
self.labels = labels
self.ylim = ylim
self.class_names = class_names
self.axis_labels = axis_labels
self.logy = logy
def reset(self):
self.epochs = []
self.vals = []
def update_and_save(self, epoch, val, gt=None, pred=None):
self.epochs.append(epoch)
self.vals.append(val)
self.save_plot()
self.save_json()
if gt is not None:
self.save_confusion_matrix(gt, pred)
def save_plot(self):
linestyles = ['-', ':', '--']
plt.figure(0, figsize=(8,5))
for ii in range(len(self.vals[0])):
l_vals = [vv[ii] for vv in self.vals]
plt.plot(self.epochs, l_vals, label=self.labels[ii], linestyle=linestyles[int(ii//10)])
plt.xlim(0, np.maximum(self.duration, len(self.vals)))
if self.ylim is not None:
plt.ylim(self.ylim[0], self.ylim[1])
if self.axis_labels is not None:
plt.xlabel(self.axis_labels[0])
plt.ylabel(self.axis_labels[1])
if self.logy:
plt.gca().set_yscale('log')
plt.grid(True)
plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left', borderaxespad=0.0)
plt.tight_layout()
plt.savefig(self.op_file_name)
plt.close(0)
def save_json(self):
data = {}
data['epochs'] = self.epochs
for ii in range(len(self.vals[0])):
data[self.labels[ii]] = [round(vv[ii],4) for vv in self.vals]
with open(self.op_file_name[:-4] + '.json', 'w') as da:
json.dump(data, da, indent=2)
def save_confusion_matrix(self, gt, pred):
plt.figure(0)
cm = confusion_matrix(gt, pred, labels=np.arange(len(self.class_names))).astype(np.float32)
cm_norm = cm.sum(1)
valid_inds = np.where(cm_norm > 0)[0]
cm[valid_inds, :] = cm[valid_inds, :] / cm_norm[valid_inds][..., np.newaxis]
plt.imshow(cm, vmin=0, vmax=1, cmap='plasma')
plt.colorbar()
plt.xticks(np.arange(cm.shape[1]), self.class_names, rotation='vertical')
plt.yticks(np.arange(cm.shape[0]), self.class_names)
plt.xlabel('Predicted')
plt.ylabel('Ground Truth')
plt.tight_layout()
plt.savefig(self.op_file_name[:-4] + '_cm.png')
plt.close(0)
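A small hedged sketch of how `LossPlotter` might be driven from a training loop (the file name, duration, and loss values are made up for illustration):

```python
# hedged usage sketch - illustrative values only
lp = LossPlotter('train_loss.png', duration=100, labels=['train', 'test'],
                 ylim=None, class_names=[])
for epoch in range(3):
    train_loss = 1.0 / (epoch + 1)
    test_loss = 1.2 / (epoch + 1)
    lp.update_and_save(epoch, [train_loss, test_loss])
# writes train_loss.png and train_loss.json after every update
```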


@@ -0,0 +1,158 @@
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches
from sklearn.svm import LinearSVC
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
colors = ['#e6194B', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
'#42d4f4', '#f032e6', '#bfef45', '#fabebe', '#469990', '#e6beff',
'#9A6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
'#000075', '#a9a9a9']
class InteractivePlotter:
def __init__(self, feats_ds, feats, spec_slices, call_info, freq_lims, allow_training):
"""
Plots 2D low dimensional features on the left and corresponding spectrograms on
the right.
"""
self.feats_ds = feats_ds
self.feats = feats
self.clf = None
self.spec_slices = spec_slices
self.call_info = call_info
#_, self.labels = np.unique([cc['class'] for cc in call_info], return_inverse=True)
self.labels = np.zeros(len(call_info), dtype=int)
self.annotated = np.zeros(self.labels.shape[0], dtype=int) # can populate this with 1's where we have labels
self.labels_cols = [colors[self.labels[ii]] for ii in range(len(self.labels))]
self.freq_lims = freq_lims
self.allow_training = allow_training
self.pt_size = 5.0
self.spec_pad = 0.2 # this much padding has been applied to the spec slices
self.fig_width = 12
self.fig_height = 8
self.current_id = 0
max_ind = np.argmax([ss.shape[1] for ss in self.spec_slices])
self.max_width = self.spec_slices[max_ind].shape[1]
self.blank_spec = np.zeros((self.spec_slices[0].shape[0], self.max_width))
def plot(self, fig_id):
self.fig, self.ax = plt.subplots(nrows=1, ncols=2, num=fig_id, figsize=(self.fig_width, self.fig_height),
gridspec_kw={'width_ratios': [2, 1]})
plt.tight_layout()
# plot 2D t-SNE features
self.low_dim_plt = self.ax[0].scatter(self.feats_ds[:, 0], self.feats_ds[:, 1],
c=self.labels_cols, s=self.pt_size, picker=5)
self.ax[0].set_title('TSNE of Call Features')
self.ax[0].set_xticks([])
self.ax[0].set_yticks([])
# plot clip from spectrogram
spec_min_max = (0, self.blank_spec.shape[1], self.freq_lims[0], self.freq_lims[1])
self.ax[1].imshow(self.blank_spec, extent=spec_min_max, cmap='plasma', aspect='auto')
self.spec_im = self.ax[1].get_images()[0]
self.ax[1].set_title('Spectrogram')
self.ax[1].grid(color='w', linewidth=0.5)
self.ax[1].set_xticks([])
self.ax[1].set_ylabel('kHz')
bbox_orig = patches.Rectangle((0,0),0,0, edgecolor='w', linewidth=0, fill=False)
self.ax[1].add_patch(bbox_orig)
self.annot = self.ax[0].annotate('', xy=(0,0), xytext=(20,20),textcoords='offset points',
bbox=dict(boxstyle='round', fc='w'), arrowprops=dict(arrowstyle='->'))
self.annot.set_visible(False)
self.fig.canvas.mpl_connect('motion_notify_event', self.mouse_hover)
self.fig.canvas.mpl_connect('key_press_event', self.key_press)
def mouse_hover(self, event):
vis = self.annot.get_visible()
if event.inaxes == self.ax[0]:
cont, ind = self.low_dim_plt.contains(event)
if cont:
self.current_id = ind['ind'][0]
# copy spec into full window - probably a better way of doing this
new_spec = self.blank_spec.copy()
w_diff = (self.blank_spec.shape[1] - self.spec_slices[self.current_id].shape[1])//2
new_spec[:, w_diff:self.spec_slices[self.current_id].shape[1]+w_diff] = self.spec_slices[self.current_id]
self.spec_im.set_data(new_spec)
self.spec_im.set_clim(vmin=0, vmax=new_spec.max())
# draw bounding box around call
self.ax[1].patches[0].remove()
spec_width_orig = self.spec_slices[self.current_id].shape[1]/(1.0+2.0*self.spec_pad)
xx = w_diff + self.spec_pad*spec_width_orig
ww = spec_width_orig
yy = self.call_info[self.current_id]['low_freq']/1000
hh = (self.call_info[self.current_id]['high_freq']-self.call_info[self.current_id]['low_freq'])/1000
bbox = patches.Rectangle((xx,yy),ww,hh, edgecolor='r', linewidth=0.5, fill=False)
self.ax[1].add_patch(bbox)
# update annotation arrow
pos = self.low_dim_plt.get_offsets()[self.current_id]
self.annot.xy = pos
self.annot.set_visible(True)
# write call info
info_str = self.call_info[self.current_id]['file_name'] + ', time=' \
+ str(round(self.call_info[self.current_id]['start_time'],3)) \
+ ', prob=' + str(round(self.call_info[self.current_id]['det_prob'],3))
self.ax[0].set_xlabel(info_str)
# redraw
self.fig.canvas.draw_idle()
def key_press(self, event):
if event.key.isdigit():
self.labels_cols[self.current_id] = colors[int(event.key)]
self.labels[self.current_id] = int(event.key)
self.annotated[self.current_id] = 1
elif event.key == 'enter' and self.allow_training:
self.train_classifier()
elif event.key == 'x' and self.allow_training:
self.get_classifier_params()
self.ax[0].scatter(self.feats_ds[:, 0], self.feats_ds[:, 1],
c=self.labels_cols, s=self.pt_size)
self.fig.canvas.draw_idle()
def train_classifier(self):
# TODO maybe it's better to classify in 2D space - but then can't be linear ...
inds = np.where(self.annotated == 1)[0]
labs_un, labs_inds = np.unique(self.labels[inds], return_inverse=True)
if labs_un.shape[0] > 1: # needs at least 2 classes
self.clf = LinearSVC(C=1.0, penalty='l2', loss='squared_hinge', tol=0.0001,
intercept_scaling=1.0, max_iter=2000)
self.clf.fit(self.feats[inds, :], self.labels[inds])
# update labels
inds_unlab = np.where(self.annotated == 0)[0]
self.labels[inds_unlab] = self.clf.predict(self.feats[inds_unlab])
for ii in inds_unlab:
self.labels_cols[ii] = colors[self.labels[ii]]
else:
print('Not enough data - please label more classes.')
def get_classifier_params(self):
res = {}
if self.clf is None:
print('Model not trained!')
else:
res['weights'] = self.clf.coef_.astype(np.float32)
res['biases'] = self.clf.intercept_.astype(np.float32)
return res

bat_detect/utils/wavfile.py Normal file

@@ -0,0 +1,291 @@
"""
Module to read / write wav files using numpy arrays
Functions
---------
`read`: Return the sample rate (in samples/sec) and data from a WAV file.
`write`: Write a numpy array as a WAV file.
"""
from __future__ import division, print_function, absolute_import
import sys
import numpy
import struct
import warnings
import os
class WavFileWarning(UserWarning):
pass
_big_endian = False
WAVE_FORMAT_PCM = 0x0001
WAVE_FORMAT_IEEE_FLOAT = 0x0003
WAVE_FORMAT_EXTENSIBLE = 0xfffe
KNOWN_WAVE_FORMATS = (WAVE_FORMAT_PCM, WAVE_FORMAT_IEEE_FLOAT)
# assumes file pointer is immediately
# after the 'fmt ' id
def _read_fmt_chunk(fid):
if _big_endian:
fmt = '>'
else:
fmt = '<'
res = struct.unpack(fmt+'iHHIIHH',fid.read(20))
size, comp, noc, rate, sbytes, ba, bits = res
if comp not in KNOWN_WAVE_FORMATS or size > 16:
comp = WAVE_FORMAT_PCM
warnings.warn("Unknown wave file format", WavFileWarning)
if size > 16:
fid.read(size - 16)
return size, comp, noc, rate, sbytes, ba, bits
# assumes file pointer is immediately
# after the 'data' id
def _read_data_chunk(fid, comp, noc, bits, mmap=False):
if _big_endian:
fmt = '>i'
else:
fmt = '<i'
size = struct.unpack(fmt,fid.read(4))[0]
bytes = bits//8
if bits == 8:
dtype = 'u1'
else:
if _big_endian:
dtype = '>'
else:
dtype = '<'
if comp == 1:
dtype += 'i%d' % bytes
else:
dtype += 'f%d' % bytes
if not mmap:
data = numpy.fromstring(fid.read(size), dtype=dtype)
else:
start = fid.tell()
data = numpy.memmap(fid, dtype=dtype, mode='c', offset=start,
shape=(size//bytes,))
fid.seek(start + size)
if noc > 1:
data = data.reshape(-1,noc)
return data
def _skip_unknown_chunk(fid):
if _big_endian:
fmt = '>i'
else:
fmt = '<i'
data = fid.read(4)
size = struct.unpack(fmt, data)[0]
fid.seek(size, 1)
def _read_riff_chunk(fid):
global _big_endian
str1 = fid.read(4)
if str1 == b'RIFX':
_big_endian = True
elif str1 != b'RIFF':
raise ValueError("Not a WAV file.")
if _big_endian:
fmt = '>I'
else:
fmt = '<I'
fsize = struct.unpack(fmt, fid.read(4))[0] + 8
str2 = fid.read(4)
if (str2 != b'WAVE'):
raise ValueError("Not a WAV file.")
if str1 == b'RIFX':
_big_endian = True
return fsize
# open a wave-file
def read(filename, mmap=False):
"""
Return the sample rate (in samples/sec) and data from a WAV file
Parameters
----------
filename : string or open file handle
Input wav file.
mmap : bool, optional
Whether to read data as memory mapped.
Only to be used on real files (Default: False)
.. versionadded:: 0.12.0
Returns
-------
rate : int
Sample rate of wav file
data : numpy array
Data read from wav file
Notes
-----
* The file can be an open file or a filename.
* The returned sample rate is a Python integer
* The data is returned as a numpy array with a
data-type determined from the file.
"""
if hasattr(filename,'read'):
fid = filename
mmap = False
else:
fid = open(filename, 'rb')
try:
# some files seem to have the size recorded in the header greater than
# the actual file size.
fid.seek(0, os.SEEK_END)
actual_size = fid.tell()
fid.seek(0)
fsize = _read_riff_chunk(fid)
# the fsize should be identical to the actual size, if not
# the header information is wrong and we need to correct it.
if fsize != actual_size:
fsize = actual_size
noc = 1
bits = 8
comp = WAVE_FORMAT_PCM
while (fid.tell() < fsize):
# read the next chunk
chunk_id = fid.read(4)
if chunk_id == b'fmt ':
size, comp, noc, rate, sbytes, ba, bits = _read_fmt_chunk(fid)
elif chunk_id == b'fact':
_skip_unknown_chunk(fid)
elif chunk_id == b'data':
data = _read_data_chunk(fid, comp, noc, bits, mmap=mmap)
elif chunk_id == b'LIST':
# Someday this could be handled properly but for now skip it
_skip_unknown_chunk(fid)
# OMA warning - I've commented out the following lines
# else:
# warnings.warn("Chunk (non-data) not understood, skipping it.", WavFileWarning)
# _skip_unknown_chunk(fid)
finally:
if not hasattr(filename,'read'):
fid.close()
else:
fid.seek(0)
return rate, data
# Write a wave-file
# sample rate, data
def write(filename, rate, data):
"""
Write a numpy array as a WAV file
Parameters
----------
filename : string or open file handle
Output wav file
rate : int
The sample rate (in samples/sec).
data : ndarray
A 1-D or 2-D numpy array of either integer or float data-type.
Notes
-----
* The file can be an open file or a filename.
* Writes a simple uncompressed WAV file.
* The bits-per-sample will be determined by the data-type.
* To write multiple-channels, use a 2-D array of shape
(Nsamples, Nchannels).
"""
if hasattr(filename, 'write'):
fid = filename
else:
fid = open(filename, 'wb')
try:
# kind of numeric data in the numpy array
dkind = data.dtype.kind
if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and data.dtype.itemsize == 1)):
raise ValueError("Unsupported data type '%s'" % data.dtype)
# wav header stuff
# http://soundfile.sapp.org/doc/WaveFormat/
fid.write(b'RIFF')
# placeholder for chunk size (updated later)
fid.write(b'\x00\x00\x00\x00')
fid.write(b'WAVE')
# fmt chunk
fid.write(b'fmt ')
if dkind == 'f':
# comp stands for compression. PCM = 1
comp = 3
else:
comp = 1
# determine number of channels
if data.ndim == 1:
noc = 1
else:
noc = data.shape[1]
bits = data.dtype.itemsize * 8
# number of bytes per second, at the specified sampling rate rate,
# bits per sample and number of channels (just needed for wav header)
sbytes = rate*(bits // 8)*noc
# number of bytes per sample
ba = noc * (bits // 8)
# https://docs.python.org/3/library/struct.html#struct-format-strings
# Write the data (16, comp, noc, etc) in the correct binary format
# for the wav header. the string format (first arg) specifies how many bytes for each
# value.
fid.write(struct.pack('<ihHIIHH', 16, comp, noc, rate, sbytes, ba, bits))
# data chunk: the word 'data' followed by the size followed by the actual data
fid.write(b'data')
fid.write(struct.pack('<i', data.nbytes))
if data.dtype.byteorder == '>' or (data.dtype.byteorder == '=' and sys.byteorder == 'big'):
data = data.byteswap()
_array_tofile(fid, data)
# Determine file size and place it in correct
# position at start of the file (replacing the 4 bytes of zeros)
size = fid.tell()
fid.seek(4)
fid.write(struct.pack('<i', size-8))
finally:
if not hasattr(filename,'write'):
fid.close()
else:
fid.seek(0)
if sys.version_info[0] >= 3:
def _array_tofile(fid, data):
# ravel gives a c-contiguous buffer
fid.write(data.ravel().view('b').data)
else:
def _array_tofile(fid, data):
fid.write(data.tostring())
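As a quick illustration of the `read` / `write` API above (a hedged sketch; the file name and test signal are arbitrary):

```python
# hedged usage sketch - write a short mono 16-bit wav and read it back
import numpy as np
from bat_detect.utils import wavfile

rate = 256000
t = np.arange(rate) / rate                  # one second of samples
tone = 0.1 * np.sin(2 * np.pi * 40000 * t)  # 40 kHz test tone
wavfile.write('tone.wav', rate, (tone * 32767).astype(np.int16))
rate_read, data = wavfile.read('tone.wav')
print(rate_read, data.dtype, data.shape)
```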

File diff suppressed because one or more lines are too long

config.py Normal file

@@ -0,0 +1,6 @@
"""
Configuration parameters
"""
MODEL_PATH = 'models/Net2DFast_UK_same.pth.tar'

faq.md Normal file

@@ -0,0 +1,65 @@
# BatDetect2 - FAQ
## Installation
#### Do I need to know Python to be able to use this?
No. To simply run the code on your own data you do not need any knowledge of Python. However, a small bit of familiarity with the terminal (i.e. command line) in Windows/Linux/OSX may make things easier.
#### Are there any plans for an R version?
Currently no. All the scripts export simple `.csv` files that can be read using any programming language of choice.
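For example, in Python the exported predictions can be loaded with `pandas` (a minimal sketch; the file name is a placeholder):

```python
# hedged sketch - load one of the exported .csv prediction files
import pandas as pd

df = pd.read_csv('example_data/anns/FILENAME.wav.csv')
# keep confident detections and count calls per predicted species
confident = df[df['det_prob'] > 0.5]
print(confident['class'].value_counts())
```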
#### How do I install the code?
The codebase has been tested under Windows 10, Ubuntu, and OSX. Read the instructions in the main readme to get started. If you are having problems getting it working and you feel like you have tried everything (e.g. confirming that your Anaconda Python distribution is correctly installed) feel free to open an issue on GitHub.
## Performance
#### The model does not work very well on my data?
Our model is based on a machine learning approach and as such if your data is very different from our training set it may not work as well. Feel free to use our annotation tools to label some of your own data and retrain the model. Even better, if you have large quantities of audio data with reliable species data that you are willing to share with the community please get in touch.
#### The model is incorrectly classifying insects/noise/... as bats?
Fine-tuning the model on your data can make a big difference. See previous answer.
#### The model fails to correctly detect feeding buzzes and social calls?
This is a limitation of our current training data. If you have such data or would be willing to label some for us please get in touch.
#### Calls that clearly belong to the same call sequence are being predicted as coming from different species?
Currently we do not do any sophisticated post processing on the results output by the model. We return a probability associated with each species for each call. You can use these per-call predictions to clean up noisy outputs across a sequence of calls, e.g. along the lines of the sketch below.
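One simple (hedged) way to do this is to pool the per-call probabilities from the exported `.csv` and assign a single species to the whole sequence (the file name and threshold are illustrative):

```python
# hedged sketch - sequence-level label from per-call predictions
import pandas as pd

df = pd.read_csv('example_data/anns/FILENAME.wav.csv')
df = df[df['det_prob'] > 0.3]  # ignore low confidence detections
if len(df) > 0:
    seq_label = df.groupby('class')['class_prob'].mean().idxmax()
    print('Most likely species for this sequence:', seq_label)
```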
#### The code works well but it is slow?
Try a different/faster computer. On a reasonably recent desktop it takes about 13 seconds (on the GPU) or 1.3 minutes (on the CPU) to process 7.5 minutes of audio. In general, we observe a factor of ~5-10 speed up using recent Nvidia GPUs compared to CPU only systems.
#### My audio files are very big and as a result the code is slow.
If your audio files are very long in duration (i.e. multiple minutes) it might be better to split them up into several smaller files. `sox` is a command line tool that can achieve this. It's easy to install on Ubuntu (e.g. `sudo apt-get install sox`) and is also available for Windows and OSX [here](http://sox.sourceforge.net/). To split up a file into 8 second chunks:
`sox INPUT_FILENAME.wav OUTPUT_FILENAME.wav trim 0 8 : newfile : restart`
This will result in a bunch of individual wav files appended with a number e.g. OUTPUT_FILENAME001.wav, OUTPUT_FILENAME002.wav, ... If you have time expanded files you might want to take the time expansion factor into account when splitting the files e.g. if the files are time expanded by 10 you should multiply the chunk length by 10. So to get 8 seconds in real time you would want to split the files into 8x10 second chunks.
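For example, for files that have been time expanded by a factor of 10, splitting into 80 second chunks gives 8 seconds of real time audio per output file:
`sox INPUT_FILENAME.wav OUTPUT_FILENAME.wav trim 0 80 : newfile : restart`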
## Training a new model
#### Can I train a model on my own bat data with different species?
Yes. You just need to provide annotations in the correct format.
#### Will this work for frequency-division or zero-crossing recordings?
No. The code assumes that we can convert the input audio into a spectrogram.
#### Will this code work for non-bat audio data e.g. insects or birds?
In principle yes, however you may need to change some of the training hyper-parameters to ignore high frequency information when you re-train. Please open an issue on GitHub if you have a specific request.
## Usage
#### Can I use the code for commercial purposes or incorporate raw source code or trained models into my commercial system?
No. This codebase is only for non-commercial use.

Binary file not shown.

models/readme.md Normal file

@@ -0,0 +1 @@
Trained models go here.

requirements.txt Normal file

@@ -0,0 +1,10 @@
librosa==0.8.1
matplotlib==3.3.4
numpy==1.20.2
pandas==1.3.4
scikit_learn==1.0.1
scipy==1.5.3
six==1.16.0
torch==1.9.0
torchaudio==0.9.0a0+33b2469
torchvision==0.10.0

run_batdetect.py Normal file

@@ -0,0 +1,71 @@
import os
import argparse
import config
import bat_detect.utils.detector_utils as du
def main(args):
print('Loading model: ' + args['model_path'])
model, params = du.load_model(args['model_path'])
print('\nInput directory: ' + args['audio_dir'])
files = du.get_audio_files(args['audio_dir'])
print('Number of audio files: {}'.format(len(files)))
print('\nSaving results to: ' + args['ann_dir'])
# process files
error_files = []
for ii, audio_file in enumerate(files):
print('\n' + str(ii).ljust(6) + os.path.basename(audio_file))
try:
results = du.process_file(audio_file, model, params, args)
if args['save_preds_if_empty'] or (len(results['pred_dict']['annotation']) > 0):
results_path = audio_file.replace(args['audio_dir'], args['ann_dir'])
du.save_results_to_file(results, results_path)
except:
error_files.append(audio_file)
print("Error processing file!")
print('\nResults saved to: ' + args['ann_dir'])
if len(error_files) > 0:
print('\nUnable to process the following files:')
for err in error_files:
print(' ' + err)
if __name__ == "__main__":
info_str = '\nBatDetect - Detection and Classification\n' + \
' Assumes audio files are mono, not stereo.\n' + \
' Spaces in the input paths will throw an error. Wrap in quotes "".\n' + \
' Input files should be short in duration e.g. < 1 minute.\n'
print(info_str)
parser = argparse.ArgumentParser()
parser.add_argument('audio_dir', type=str, help='Input directory for audio')
parser.add_argument('ann_dir', type=str, help='Output directory for where the predictions will be stored')
parser.add_argument('detection_threshold', type=float, help='Cut-off probability for detector e.g. 0.1')
parser.add_argument('--cnn_features', action='store_true', default=False, dest='cnn_features',
help='Extracts CNN call features')
parser.add_argument('--spec_features', action='store_true', default=False, dest='spec_features',
help='Extracts low level call features')
parser.add_argument('--time_expansion_factor', type=int, default=1, dest='time_expansion_factor',
help='The time expansion factor used for all files (default is 1)')
parser.add_argument('--quiet', action='store_true', default=False, dest='quiet',
help='Minimize output printing')
parser.add_argument('--save_preds_if_empty', action='store_true', default=False, dest='save_preds_if_empty',
help='Save empty annotation file if no detections made.')
parser.add_argument('--model_path', type=str, default='',
help='Path to trained BatDetect model')
args = vars(parser.parse_args())
args['spec_slices'] = False # used for visualization
args['chunk_size'] = 2 # files longer than this many seconds will be broken down into smaller chunks
args['ann_dir'] = os.path.join(args['ann_dir'], '')
if args['model_path'] == '':
args['model_path'] = os.path.join('models', os.path.basename(config.MODEL_PATH))
main(args)