# batdetect2/example_data/config.yaml

# Target definitions: the named classes, a generic-class tag list, and the
# rules used to filter annotations.
targets:
  classes:
    # Four named classes. Each entry pairs a short `name` with a single tag
    # that specifies only a `value` (no explicit `key`).
    # NOTE(review): presumably the loader supplies a default tag key when
    # `key` is omitted -- confirm against the consumer's schema.
    classes:
      - name: myomys
        tags:
          - value: Myotis mystacinus
      - name: pippip
        tags:
          - value: Pipistrellus pipistrellus
      - name: eptser
        tags:
          - value: Eptesicus serotinus
      - name: rhifer
        tags:
          - value: Rhinolophus ferrumequinum
    # Tag list for the generic class: one tag with key `class`, value `Bat`.
    # NOTE(review): presumably used for detections that match none of the
    # named classes above -- confirm.
    generic_class:
      - key: class
        value: Bat

  filtering:
    # Two rules, written in this order.
    rules:
      # Rule 1: match_type `all` against the tag event=Echolocation.
      - match_type: all
        tags:
          - key: event
            value: Echolocation
      # Rule 2: match_type `exclude` against the tag class=Unknown.
      - match_type: exclude
        tags:
          - key: class
            value: Unknown

# Preprocessing settings, split into waveform-level (`audio`) and
# spectrogram-level (`spectrogram`) options.
preprocess:
  # Waveform-level options.
  audio:
    resample:
      samplerate: 256000
      method: poly
    scale: false
    center: true
    duration: null  # explicit null: no duration value is set

  # Spectrogram-level options.
  spectrogram:
    stft:
      window_duration: 0.002
      window_overlap: 0.75
      window_fn: hann
    # Same bounds as postprocess.min_freq / postprocess.max_freq.
    frequencies:
      min_freq: 10000
      max_freq: 120000
    pcen:
      time_constant: 0.4
      gain: 0.98
      bias: 2
      power: 0.5
    scale: amplitude
    size:
      # Same value as model.input_height.
      height: 128
      resize_factor: 0.5
    # Key spelling ("substraction") is kept exactly as written: keys are read
    # by the consumer, so renaming one here would change what it sees.
    spectral_mean_substraction: true
    peak_normalize: false

# Post-processing settings applied to model output.
# NOTE(review): min_freq / max_freq carry the same values as
# preprocess.spectrogram.frequencies (10000 / 120000); presumably they are
# meant to stay in sync -- confirm before changing one without the other.
postprocess:
  nms_kernel_size: 9
  detection_threshold: 0.01
  min_freq: 10000
  max_freq: 120000
  top_k_per_sec: 200

# Label settings; `sigma` is the only option set in this file.
# NOTE(review): presumably the spread used when rendering training targets
# -- confirm against the consumer.
labels:
  sigma: 3

# Model architecture: an encoder, a bottleneck, and a decoder, each described
# as data. The order of entries in every `layers` list is significant to
# anything that builds the network sequentially -- do not reorder.
model:
  # Same value as preprocess.spectrogram.size.height (128).
  input_height: 128
  in_channels: 1
  # Same value as the last decoder layer's out_channels (32).
  out_channels: 32
  encoder:
    # Channel widths as written: 32 -> 64 -> (128 -> 256 inside a LayerGroup).
    layers:
      - block_type: FreqCoordConvDown
        out_channels: 32
      - block_type: FreqCoordConvDown
        out_channels: 64
      # A LayerGroup nests its own `layers` list.
      - block_type: LayerGroup
        layers:
          - block_type: FreqCoordConvDown
            out_channels: 128
          - block_type: ConvBlock
            out_channels: 256
  bottleneck:
    # Same value as the last encoder layer's out_channels (256).
    channels: 256
    self_attention: true
  decoder:
    # Channel widths as written: 64 -> 32 -> (32 -> 32 inside a LayerGroup).
    layers:
      - block_type: FreqCoordConvUp
        out_channels: 64
      - block_type: FreqCoordConvUp
        out_channels: 32
      - block_type: LayerGroup
        layers:
          - block_type: FreqCoordConvUp
            out_channels: 32
          - block_type: ConvBlock
            out_channels: 32

# Training settings: optimisation hyper-parameters, loss weighting, logging,
# and the augmentation pipeline.
train:
  batch_size: 8
  learning_rate: 0.001
  # NOTE(review): presumably the period of a learning-rate schedule -- confirm
  # the unit (epochs vs. steps) against the trainer.
  t_max: 100
  # Three loss terms, each with its own weight; detection and classification
  # also carry identical focal parameters (beta 4, alpha 2).
  loss:
    detection:
      weight: 1.0
      focal:
        beta: 4
        alpha: 2
    classification:
      weight: 2.0
      focal:
        beta: 4
        alpha: 2
    size:
      weight: 0.1
  logger:
    logger_type: dvclive
  augmentations:
    # Six steps, every one configured with probability 0.2. They are listed
    # as a sequence, so their order is preserved when loaded.
    steps:
      - augmentation_type: mix_audio
        probability: 0.2
        min_weight: 0.3
        max_weight: 0.7
      - augmentation_type: add_echo
        probability: 0.2
        max_delay: 0.005
        min_weight: 0.0
        max_weight: 1.0
      - augmentation_type: scale_volume
        probability: 0.2
        min_scaling: 0.0
        max_scaling: 2.0
      - augmentation_type: warp
        probability: 0.2
        delta: 0.04
      # mask_time and mask_freq share the same option names; only max_perc
      # differs (0.05 vs 0.10).
      - augmentation_type: mask_time
        probability: 0.2
        max_perc: 0.05
        max_masks: 3
      - augmentation_type: mask_freq
        probability: 0.2
        max_perc: 0.10
        max_masks: 3