From 30d3a2c92e7f8c42086d22947832ed6c083c7979 Mon Sep 17 00:00:00 2001
From: mbsantiago <santiago.mbal@gmail.com>
Date: Thu, 3 Apr 2025 16:46:43 +0100
Subject: [PATCH] Start work on expanding cli

---
 batdetect2/cli/__init__.py |   4 +
 batdetect2/cli/ascii.py    |  22 +++++
 batdetect2/cli/base.py     |   7 +-
 batdetect2/cli/data.py     |  40 ++++++++
 batdetect2/cli/train.py    | 188 +++++++++++++++++++++++++++++++++++++
 5 files changed, 256 insertions(+), 5 deletions(-)
 create mode 100644 batdetect2/cli/ascii.py
 create mode 100644 batdetect2/cli/data.py
 create mode 100644 batdetect2/cli/train.py

diff --git a/batdetect2/cli/__init__.py b/batdetect2/cli/__init__.py
index 9111b01..7b3e659 100644
--- a/batdetect2/cli/__init__.py
+++ b/batdetect2/cli/__init__.py
@@ -1,9 +1,13 @@
 from batdetect2.cli.base import cli
 from batdetect2.cli.compat import detect
+from batdetect2.cli.data import data
+from batdetect2.cli.train import train
 
 __all__ = [
     "cli",
     "detect",
+    "data",
+    "train",
 ]
 
 
diff --git a/batdetect2/cli/ascii.py b/batdetect2/cli/ascii.py
new file mode 100644
index 0000000..09c7a69
--- /dev/null
+++ b/batdetect2/cli/ascii.py
@@ -0,0 +1,22 @@
+BATDETECT_ASCII_ART = """                         .              
+             =#%:      .%%#             
+            :%%%:     .%%%%.            
+            %%%%.-===::%%%%*            
+           =%%%%+++++++%%%#.            
+    -:     .%%%#====+++#%%#     .-      
+  .+***= . =++. :   .=*+#%*=   :***.    
+  =+****+++==:%+#=+% *##%%%%*=##*#**-=  
+  ++***+**+=:  ##..  +##%%########**++  
+ .++*****#*+- :*:++  ##%#%%%%%####**++  
+ .++***+**++++-    :#%%%%%####*##***+=  
+ .+++***+#+++*########%%%##%#+*****++:  
+ .=++++++*+++##%##%%####%%##*:+****+=   
+  =++++++====*#%%#%###%%###- +***+++.   
+  .+*++++= =+==##########=  :****++.    
+   =++*+:.   .:=#####=     .++**++-     
+   .****:         .        -+**++=      
+    *###=                 .****==       
+    .#*#-                 **#*:         
+     -###                -*##.          
+      +*=                *#*            
+"""
diff --git a/batdetect2/cli/base.py b/batdetect2/cli/base.py
index fc379ee..6bc2c12 100644
--- a/batdetect2/cli/base.py
+++ b/batdetect2/cli/base.py
@@ -1,18 +1,14 @@
 """BatDetect2 command line interface."""
 
-import os
-
 import click
 
+# from batdetect2.cli.ascii import BATDETECT_ASCII_ART
 
 __all__ = [
     "cli",
 ]
 
 
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-
-
 INFO_STR = """
 BatDetect2 - Detection and Classification
     Assumes audio files are mono, not stereo.
@@ -25,3 +21,4 @@ BatDetect2 - Detection and Classification
 def cli():
     """BatDetect2 - Bat Call Detection and Classification."""
     click.echo(INFO_STR)
+    # click.echo(BATDETECT_ASCII_ART)
diff --git a/batdetect2/cli/data.py b/batdetect2/cli/data.py
new file mode 100644
index 0000000..185476d
--- /dev/null
+++ b/batdetect2/cli/data.py
@@ -0,0 +1,52 @@
+"""Dataset commands for the BatDetect2 command line interface."""
+
+from pathlib import Path
+from typing import Optional
+
+import click
+
+from batdetect2.cli.base import cli
+from batdetect2.data import load_dataset_from_config
+
+__all__ = ["data"]
+
+
+@cli.group()
+def data():
+    """Dataset-related commands."""
+
+
+@data.command()
+@click.argument(
+    "dataset_config",
+    type=click.Path(exists=True),
+)
+@click.option(
+    "--field",
+    type=str,
+    help="If the dataset info is in a nested field please specify here.",
+)
+@click.option(
+    "--base-dir",
+    type=click.Path(exists=True),
+    help="The base directory to which all recording and annotations paths are relative to.",
+)
+def summary(
+    dataset_config: Path,
+    field: Optional[str] = None,
+    base_dir: Optional[Path] = None,
+):
+    """Print the number of annotated clips in a dataset.
+
+    DATASET_CONFIG is the path to the dataset configuration file.
+    """
+    # click.Path passes plain strings unless told otherwise, so convert
+    # here to make the values match the Path annotations.
+    dataset_config = Path(dataset_config)
+    base_dir = Path(base_dir) if base_dir else Path.cwd()
+    dataset = load_dataset_from_config(
+        dataset_config,
+        field=field,
+        base_dir=base_dir,
+    )
+    click.echo(f"Number of annotated clips: {len(dataset.clip_annotations)}")
diff --git a/batdetect2/cli/train.py b/batdetect2/cli/train.py
new file mode 100644
index 0000000..79e1aef
--- /dev/null
+++ b/batdetect2/cli/train.py
@@ -0,0 +1,197 @@
+"""Training commands for the BatDetect2 command line interface."""
+
+from pathlib import Path
+from typing import Optional
+
+import click
+
+from batdetect2.cli.base import cli
+from batdetect2.data import load_dataset_from_config
+from batdetect2.preprocess import (
+    load_preprocessing_config,
+)
+from batdetect2.train import (
+    load_label_config,
+    load_target_config,
+    preprocess_annotations,
+)
+
+__all__ = ["train"]
+
+
+@cli.group()
+def train():
+    """Training-related commands."""
+
+
+@train.command()
+@click.argument(
+    "dataset_config",
+    type=click.Path(exists=True),
+)
+@click.argument(
+    "output",
+    type=click.Path(),
+)
+@click.option(
+    "--dataset-field",
+    type=str,
+    help=(
+        "Specifies the key to access the dataset information within the "
+        "dataset configuration file, if the information is nested inside a "
+        "dictionary. If the dataset information is at the top level of the "
+        "config file, you don't need to specify this."
+    ),
+)
+@click.option(
+    "--base-dir",
+    type=click.Path(exists=True),
+    help=(
+        "The main directory where your audio recordings and annotation "
+        "files are stored. This helps the program find your data, "
+        "especially if the paths in your dataset configuration file "
+        "are relative."
+    ),
+)
+@click.option(
+    "--preprocess-config",
+    type=click.Path(exists=True),
+    help=(
+        "Path to the preprocessing configuration file. This file tells "
+        "the program how to prepare your audio data before training, such "
+        "as resampling or applying filters."
+    ),
+)
+@click.option(
+    "--preprocess-config-field",
+    type=str,
+    help=(
+        "If the preprocessing settings are inside a nested dictionary "
+        "within the preprocessing configuration file, specify the key "
+        "here to access them. If the preprocessing settings are at the "
+        "top level, you don't need to specify this."
+    ),
+)
+@click.option(
+    "--label-config",
+    type=click.Path(exists=True),
+    help=(
+        "Path to the label generation configuration file. This file "
+        "contains settings for how to create labels from your "
+        "annotations, which the model uses to learn."
+    ),
+)
+@click.option(
+    "--label-config-field",
+    type=str,
+    help=(
+        "If the label generation settings are inside a nested dictionary "
+        "within the label configuration file, specify the key here. If "
+        "the settings are at the top level, leave this blank."
+    ),
+)
+@click.option(
+    "--target-config",
+    type=click.Path(exists=True),
+    help=(
+        "Path to the training target configuration file. This file "
+        "specifies what sounds the model should learn to predict."
+    ),
+)
+@click.option(
+    "--target-config-field",
+    type=str,
+    help=(
+        "If the target settings are inside a nested dictionary "
+        "within the target configuration file, specify the key here. "
+        "If the settings are at the top level, you don't need to specify this."
+    ),
+)
+@click.option(
+    "--force",
+    is_flag=True,
+    help=(
+        "If a preprocessed file already exists, this option tells the "
+        "program to overwrite it with the new preprocessed data. Use "
+        "this if you want to re-do the preprocessing even if the files "
+        "already exist."
+    ),
+)
+@click.option(
+    "--num-workers",
+    type=int,
+    help=(
+        "The maximum number of computer cores to use when processing "
+        "your audio data. Using more cores can speed up the preprocessing, "
+        "but don't use more than your computer has available. By default, "
+        "the program will use all available cores."
+    ),
+)
+def preprocess(
+    dataset_config: Path,
+    output: Path,
+    base_dir: Optional[Path] = None,
+    preprocess_config: Optional[Path] = None,
+    target_config: Optional[Path] = None,
+    label_config: Optional[Path] = None,
+    force: bool = False,
+    num_workers: Optional[int] = None,
+    target_config_field: Optional[str] = None,
+    preprocess_config_field: Optional[str] = None,
+    label_config_field: Optional[str] = None,
+    dataset_field: Optional[str] = None,
+):
+    """Preprocess the annotated clips of a dataset for training.
+
+    DATASET_CONFIG is the path to the dataset configuration file. OUTPUT is
+    the directory handed to the preprocessing step as its output directory;
+    it is created if it does not exist yet.
+    """
+    # click.Path passes plain strings unless told otherwise, so convert
+    # here to make the values match the Path annotations.
+    dataset_config = Path(dataset_config)
+    output = Path(output)
+    base_dir = Path(base_dir) if base_dir else Path.cwd()
+
+    # Each optional config is loaded only when its file was given; otherwise
+    # None is handed to preprocess_annotations.
+    preprocessing = None
+    if preprocess_config:
+        preprocessing = load_preprocessing_config(
+            Path(preprocess_config),
+            field=preprocess_config_field,
+        )
+
+    targets = None
+    if target_config:
+        targets = load_target_config(
+            Path(target_config),
+            field=target_config_field,
+        )
+
+    labels = None
+    if label_config:
+        labels = load_label_config(
+            Path(label_config),
+            field=label_config_field,
+        )
+
+    dataset = load_dataset_from_config(
+        dataset_config,
+        field=dataset_field,
+        base_dir=base_dir,
+    )
+
+    # exist_ok makes this safe when the directory is already there, without
+    # a separate existence check.
+    output.mkdir(parents=True, exist_ok=True)
+
+    preprocess_annotations(
+        dataset.clip_annotations,
+        output_dir=output,
+        replace=force,
+        preprocessing_config=preprocessing,
+        label_config=labels,
+        target_config=targets,
+        max_workers=num_workers,
+    )