# Audio loading settings.
audio:
  # NOTE(review): presumably the working sample rate in Hz — confirm.
  samplerate: 256000
  resample:
    # Canonical lowercase boolean; same spelling as
    # train.train_loader.augmentations.enabled.
    enabled: true
    # NOTE(review): "poly" presumably selects polyphase resampling — confirm
    # against the loader's accepted method names.
    method: "poly"

# Spectrogram settings, grouped as STFT windowing, frequency band, output
# size and a list of named spectrogram transforms.
preprocess:
  stft:
    window_fn: hann
    # NOTE(review): presumably seconds; at audio.samplerate 256000 that would
    # be 512 samples per window — confirm the unit.
    window_duration: 0.002
    # NOTE(review): presumably the fraction of a window shared with the next
    # one — confirm.
    window_overlap: 0.75
  frequencies:
    # NOTE(review): presumably Hz, like audio.samplerate — confirm.
    min_freq: 10000
    max_freq: 120000
  size:
    # Same value as model.input_height.
    height: 128
    resize_factor: 0.5
  # The list order is kept exactly as it was; presumably transforms run in
  # the order listed — confirm before reordering entries.
  spectrogram_transforms:
    - name: pcen
      gain: 0.98
      bias: 2
      power: 0.5
      time_constant: 0.1
    # NOTE(review): "substraction" looks like a misspelling of "subtraction",
    # but the name is an identifier read by the consumer — do not correct it
    # here without checking which spelling the consumer accepts.
    - name: spectral_mean_substraction

# Post-processing settings: score threshold, NMS kernel size and a
# per-second cap, as named by the keys below.
postprocess:
  # NOTE(review): presumably the minimum score a detection needs to be
  # kept — confirm.
  detection_threshold: 0.01
  # NOTE(review): presumably the non-maximum-suppression window size, in
  # spectrogram bins/frames — confirm the unit.
  nms_kernel_size: 9
  # NOTE(review): presumably the maximum number of detections retained per
  # second of audio — confirm.
  top_k_per_sec: 200

# Network architecture: an encoder, a bottleneck and a decoder, each given
# as an ordered list of named layers.
model:
  # Same value as preprocess.size.height.
  input_height: 128
  in_channels: 1
  # Same value as the out_channels of the last decoder layer below.
  out_channels: 32
  encoder:
    # Listed out_channels grow 32 -> 64 -> 128 -> 256.
    layers:
      - name: FreqCoordConvDown
        out_channels: 32
      - name: FreqCoordConvDown
        out_channels: 64
      # A LayerGroup nests its own ordered `layers` list.
      - name: LayerGroup
        layers:
          - name: FreqCoordConvDown
            out_channels: 128
          - name: ConvBlock
            out_channels: 256
  bottleneck:
    # Same value as the last encoder out_channels (256).
    # NOTE(review): presumably these must agree — confirm before changing
    # either one.
    channels: 256
    layers:
      - name: SelfAttention
        attention_channels: 256
  decoder:
    # Listed out_channels go 64 -> 32 -> 32 -> 32.
    # NOTE(review): three *Up layers here mirror the three *Down layers in
    # the encoder; presumably the counts need to stay matched — confirm.
    layers:
      - name: FreqCoordConvUp
        out_channels: 64
      - name: FreqCoordConvUp
        out_channels: 32
      - name: LayerGroup
        layers:
          - name: FreqCoordConvUp
            out_channels: 32
          - name: ConvBlock
            out_channels: 32

# Training configuration: optimizer, label settings, trainer limits, data
# loaders (with augmentations), loss weights, logger and the validation
# tasks run during training.
train:
  optimizer:
    learning_rate: 0.001
    # NOTE(review): t_max (100) is larger than trainer.max_epochs (10). If
    # t_max is a learning-rate schedule period counted in epochs, the
    # schedule will not complete within a run — confirm this is intended.
    t_max: 100

  labels:
    # NOTE(review): presumably the spread used when rendering target
    # heatmaps; unit not visible here — confirm.
    sigma: 3

  trainer:
    max_epochs: 10
    # NOTE(review): with max_epochs: 10, an every-5-epochs check would run
    # validation twice per training run — confirm the trainer's semantics.
    check_val_every_n_epoch: 5

  train_loader:
    batch_size: 8
    num_workers: 2
    # Canonical lowercase boolean, consistent with augmentations.enabled
    # below.
    shuffle: true

    clipping_strategy:
      name: random_subclip
      # Same value as val_loader.clipping_strategy.chunk_size.
      # NOTE(review): presumably seconds — confirm.
      duration: 0.256

    augmentations:
      enabled: true
      # Every augmentation below is configured with probability 0.2.
      audio:
        - name: mix_audio
          probability: 0.2
          min_weight: 0.3
          max_weight: 0.7
        - name: add_echo
          probability: 0.2
          # NOTE(review): presumably seconds — confirm.
          max_delay: 0.005
          min_weight: 0.0
          max_weight: 1.0
      spectrogram:
        - name: scale_volume
          probability: 0.2
          min_scaling: 0.0
          max_scaling: 2.0
        - name: warp
          probability: 0.2
          delta: 0.04
        - name: mask_time
          probability: 0.2
          max_perc: 0.05
          max_masks: 3
        - name: mask_freq
          probability: 0.2
          max_perc: 0.10
          max_masks: 3

  val_loader:
    num_workers: 2
    clipping_strategy:
      name: whole_audio_padded
      # Same value as train_loader.clipping_strategy.duration.
      chunk_size: 0.256

  loss:
    # Component weights: detection 1.0, classification 2.0, size 0.1.
    # The two focal components share the same beta/alpha settings.
    detection:
      weight: 1.0
      focal:
        beta: 4
        alpha: 2
    classification:
      weight: 2.0
      focal:
        beta: 4
        alpha: 2
    size:
      weight: 0.1

  logger:
    name: csv

  validation:
    # Both task names also appear under the top-level evaluation.tasks
    # list; here only average_precision is listed for each.
    tasks:
      - name: sound_event_detection
        metrics:
          - name: average_precision
      - name: sound_event_classification
        metrics:
          - name: average_precision

# Evaluation configuration: five named tasks, each listing its `metrics`
# and `plots` by name.
evaluation:
  tasks:
    - name: sound_event_detection
      metrics:
        - name: average_precision
        - name: roc_auc
      plots:
        - name: pr_curve
        - name: score_distribution
        - name: example_detection
    - name: sound_event_classification
      metrics:
        - name: average_precision
        - name: roc_auc
      plots:
        - name: pr_curve
    # Unlike the other tasks, this one lists average_precision only (no
    # roc_auc).
    - name: top_class_detection
      metrics:
        - name: average_precision
      plots:
        - name: pr_curve
        - name: confusion_matrix
        - name: example_classification
    # The two clip_* tasks are the only ones that list a roc_curve plot.
    - name: clip_detection
      metrics:
        - name: average_precision
        - name: roc_auc
      plots:
        - name: pr_curve
        - name: roc_curve
        - name: score_distribution
    - name: clip_classification
      metrics:
        - name: average_precision
        - name: roc_auc
      plots:
        - name: pr_curve
        - name: roc_curve