batdetect2/scripts/gen_spec_video.py

"""
This script takes an audio file as input, runs the detector, and makes a video output

Notes:
    It needs ffmpeg installed to make the videos
    Conda can sometimes put its own ffmpeg ahead of the system one on the PATH.
    Check which one is being used with `which ffmpeg`. The conda version can throw an error,
    so it is best to use the system one - see ffmpeg_path.
"""

from scipy.io import wavfile
import os
import shutil
import matplotlib.pyplot as plt
import numpy as np
import argparse

import sys
sys.path.append(os.path.join('..'))
import bat_detect.detector.parameters as parameters
import bat_detect.utils.audio_utils as au
import bat_detect.utils.plot_utils as viz
import bat_detect.utils.detector_utils as du
import config


if __name__ == "__main__":
    # Command-line entry point. Takes a short clip from the start of an audio
    # file, optionally runs the detector on the file, writes the clip as a wav,
    # a spectrogram png and one png per video frame, then calls ffmpeg to
    # combine the frames and the wav into an .avi in the output directory.

    # Local import: only this entry point needs it (used to invoke ffmpeg).
    import subprocess

    parser = argparse.ArgumentParser()
    parser.add_argument('audio_file', type=str, help='Path to input audio file')
    parser.add_argument('model_path', type=str, help='Path to trained BatDetect model')
    parser.add_argument('--op_dir', type=str, default='generated_vids/', help='Path to output directory')
    parser.add_argument('--no_detector', action='store_true', help='Do not run detector')
    parser.add_argument('--plot_class_names_off', action='store_true', help='Do not plot class names')
    parser.add_argument('--disable_axis', action='store_true', help='Do not plot axis')
    parser.add_argument('--detection_threshold', type=float, default=0.2, help='Cut-off probability for detector')
    parser.add_argument('--time_expansion_factor', type=int, default=1, dest='time_expansion_factor',
                        help='The time expansion factor used for all files (default is 1)')
    args_cmd = vars(parser.parse_args())

    # file of interest
    audio_file = args_cmd['audio_file']
    model_path = args_cmd['model_path']
    op_dir = args_cmd['op_dir']
    op_str = '_output'
    ffmpeg_path = '/usr/bin/'  # directory containing the ffmpeg binary to use
    run_detector = not args_cmd['no_detector']

    # Bail out early, with a non-zero exit status, if an input is missing.
    if not os.path.isfile(audio_file):
        print('Audio file not found: ', audio_file)
        sys.exit(1)

    if not os.path.isfile(model_path):
        print('Model not found: ', model_path)
        sys.exit(1)

    start_time = 0.0
    duration = 0.5
    reveal_boxes = True  # makes the boxes appear one at a time
    fps = 24
    dpi = 100

    # Frames go into a temporary sub-directory of the output directory;
    # creating it also creates the output directory itself if needed.
    op_dir_tmp = os.path.join(op_dir, 'op_tmp_vids', '')
    if not os.path.isdir(op_dir_tmp):
        os.makedirs(op_dir_tmp)

    # Common prefix of every output file: <op_dir>/<audio name without extension>_output
    # splitext (rather than dropping the last 4 characters) copes with
    # extensions of any length.
    op_base = os.path.join(op_dir, os.path.splitext(os.path.basename(audio_file))[0] + op_str)

    params = parameters.get_params(False)
    args = du.get_default_bd_args()
    args['time_expansion_factor'] = args_cmd['time_expansion_factor']
    args['detection_threshold'] = args_cmd['detection_threshold']

    # load audio file
    print('\nProcessing: ' + os.path.basename(audio_file))
    print('\nOutput directory: ' + op_dir)
    sampling_rate, audio = au.load_audio_file(audio_file, args['time_expansion_factor'], params['target_samp_rate'])
    # keep only the samples in [start_time, start_time + duration)
    clip_start = int(sampling_rate*start_time)
    clip_end = int(sampling_rate*start_time + sampling_rate*duration)
    audio = audio[clip_start:clip_end]
    audio_orig = audio.copy()  # unpadded clip, written out as the video's audio track
    audio = au.pad_audio(audio, sampling_rate, params['fft_win_length'],
                         params['fft_overlap'], params['resize_factor'],
                         params['spec_divide_factor'])

    # generate spectrogram
    spec, _ = au.generate_spectrogram(audio, sampling_rate, params, True)
    max_val = spec.max()*1.1  # shared colour-scale upper limit, 10% above the peak

    if run_detector:
        print('  Loading model and running detector on entire file ...')
        model, det_params = du.load_model(model_path)
        det_params['detection_threshold'] = args['detection_threshold']
        results = du.process_file(audio_file, model, det_params, args)

        print('  Processing detections and plotting ...')
        # keep only the detections that lie within the clip
        clip_end_time = start_time + duration
        detections = [bb for bb in results['pred_dict']['annotation']
                      if bb['start_time'] >= start_time and bb['end_time'] < clip_end_time]

        # plot boxes
        fig = plt.figure(1, figsize=(spec.shape[1]/dpi, spec.shape[0]/dpi), dpi=dpi)
        # NOTE(review): from here on `duration` is the value derived from the
        # spectrogram width, but only on this detector path; with --no_detector
        # it stays at the nominal clip length - confirm this is intended.
        duration = au.x_coords_to_time(spec.shape[1], sampling_rate, params['fft_win_length'], params['fft_overlap'])
        viz.create_box_image(spec, fig, detections, start_time, start_time+duration, duration, params, max_val,
                             plot_class_names=not args_cmd['plot_class_names_off'])
        op_im_file_boxes = op_base + '_boxes.png'
        fig.savefig(op_im_file_boxes, dpi=dpi)
        plt.close(1)
        # read the rendered image back as an array so frames can be composed from it
        spec_with_boxes = plt.imread(op_im_file_boxes)

    print('  Saving audio file ...')
    # Without time expansion the wav is written at a tenth of the sampling
    # rate; otherwise the sampling rate is used as is.
    if args['time_expansion_factor'] == 1:
        sampling_rate_op = int(sampling_rate/10.0)
    else:
        sampling_rate_op = sampling_rate
    op_audio_file = op_base + '.wav'
    wavfile.write(op_audio_file, sampling_rate_op, audio_orig)

    print('  Saving image ...')
    op_im_file = op_base + '.png'
    plt.imsave(op_im_file, spec, vmin=0, vmax=max_val, cmap='plasma')
    spec_blank = plt.imread(op_im_file)  # colour-mapped spectrogram without boxes

    # create figure
    freq_scale = 1000  # turn Hz to kHz
    min_freq = params['min_freq']//freq_scale
    max_freq = params['max_freq']//freq_scale
    y_extent = [0, duration, min_freq, max_freq]

    try:
        print('  Saving video frames ...')
        # save images that will be combined into video
        # will either plot with or without boxes
        # fps*duration*10 frames: at `fps` the video lasts 10x the clip duration
        num_frames = int(fps*duration*10)
        for ii, col in enumerate(np.linspace(0, spec.shape[1]-1, num_frames)):
            cursor = int(col)  # spectrogram column of the playback cursor
            frame_file = op_dir_tmp + str(ii).zfill(4) + '.png'

            if run_detector:
                spec_op = spec_with_boxes.copy()
                if ii > 0:
                    # draw the cursor as a full-intensity column
                    spec_op[:, cursor, :] = 1.0
                    if reveal_boxes:
                        # everything right of the cursor comes from the box-free image
                        spec_op[:, cursor+1:, :] = spec_blank[:, cursor+1:, :]
                elif ii == 0 and reveal_boxes:
                    # first frame: nothing revealed yet (read-only use, so no copy)
                    spec_op = spec_blank

                if not args_cmd['disable_axis']:
                    plt.close('all')  # avoid accumulating one open figure per frame
                    fig = plt.figure(ii, figsize=(1.2*(spec_op.shape[1]/dpi), 1.5*(spec_op.shape[0]/dpi)), dpi=dpi)
                    plt.xlabel('Time - seconds')
                    plt.ylabel('Frequency - kHz')
                    plt.imshow(spec_op, vmin=0, vmax=1.0, cmap='plasma', extent=y_extent, aspect='auto')
                    plt.tight_layout()
                    fig.savefig(frame_file, dpi=dpi)
                else:
                    plt.imsave(frame_file, spec_op, vmin=0, vmax=1.0, cmap='plasma')
            else:
                spec_op = spec.copy()
                if ii > 0:
                    # cursor drawn at the top of the colour scale
                    spec_op[:, cursor] = max_val
                plt.imsave(frame_file, spec_op, vmin=0, vmax=max_val, cmap='plasma')

        print('  Creating video ...')
        op_vid_file = op_base + '.avi'
        # Argument list (no shell) so paths containing spaces or shell
        # metacharacters are passed to ffmpeg verbatim.
        ffmpeg_cmd = [os.path.join(ffmpeg_path, 'ffmpeg'),
                      '-hide_banner', '-loglevel', 'panic', '-y',
                      '-r', str(fps), '-f', 'image2',
                      '-s', '{}x{}'.format(spec.shape[1], spec.shape[0]),
                      '-i', op_dir_tmp + '%04d.png',
                      '-i', op_audio_file,
                      '-vcodec', 'libx264', '-crf', '25', '-pix_fmt', 'yuv420p',
                      '-acodec', 'copy', op_vid_file]
        try:
            ffmpeg_status = subprocess.call(ffmpeg_cmd)
        except OSError as err:
            # e.g. there is no ffmpeg binary in ffmpeg_path
            print('  Could not run ffmpeg: ', err)
        else:
            # '-loglevel panic' silences ffmpeg, so report failures here
            if ffmpeg_status != 0:
                print('  ffmpeg failed with exit status: ', ffmpeg_status)
    finally:
        # remove the frame images even if frame generation or ffmpeg failed
        print('  Deleting temporary files ...')
        if os.path.isdir(op_dir_tmp):
            shutil.rmtree(op_dir_tmp)