Improve logging of train preprocessing

mbsantiago 2025-06-26 13:08:44 -06:00
parent 1384c549f7
commit 22f7d46f46
4 changed files with 47 additions and 2 deletions

View File

@@ -98,6 +98,8 @@ example-preprocess:
    --base-dir . \
    --dataset-field datasets.train \
    --config config.yaml \
+    --force \
+    -vv \
    config.yaml example_data/preprocessed

example-train:

View File

@@ -1,7 +1,9 @@
+import sys
from pathlib import Path
from typing import Optional

import click
+import yaml
from loguru import logger

from batdetect2.cli.base import cli
@@ -83,6 +85,12 @@ __all__ = ["preprocess"]
        "the program will use all available cores."
    ),
)
+@click.option(
+    "-v",
+    "--verbose",
+    count=True,
+    help="Increase verbosity. -v for INFO, -vv for DEBUG.",
+)
def preprocess(
    dataset_config: Path,
    output: Path,
@@ -92,7 +100,17 @@ def preprocess(
    force: bool = False,
    num_workers: Optional[int] = None,
    dataset_field: Optional[str] = None,
+    verbose: int = 0,
):
+    logger.remove()
+    if verbose == 0:
+        log_level = "WARNING"
+    elif verbose == 1:
+        log_level = "INFO"
+    else:
+        log_level = "DEBUG"
+    logger.add(sys.stderr, level=log_level)
+    logger.info("Starting preprocessing.")

    output = Path(output)
@@ -101,11 +119,20 @@ def preprocess(
    base_dir = base_dir or Path.cwd()
    logger.debug("Using base directory: {base_dir}", base_dir=base_dir)

+    if config:
+        logger.info(
+            "Loading preprocessing config from: {config}", config=config
+        )
+
    conf = (
        load_train_preprocessing_config(config, field=config_field)
        if config is not None
        else TrainPreprocessConfig()
    )

+    logger.debug(
+        "Preprocessing config:\n{conf}",
+        conf=yaml.dump(conf.model_dump()),
+    )
    dataset = load_dataset_from_config(
        dataset_config,
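Taken together, the CLI changes follow a common pattern: click's count=True turns repeated -v flags into an integer, logger.remove() drops loguru's default stderr sink, and logger.add() installs a new sink at the selected level. Below is a minimal self-contained sketch of that pattern; the demo command and its messages are illustrative, not part of this commit.

import sys

import click
from loguru import logger


@click.command()
@click.option("-v", "--verbose", count=True, help="-v for INFO, -vv for DEBUG.")
def demo(verbose: int):
    # Replace loguru's default sink (stderr at DEBUG level) with one at
    # the level implied by how many times -v was passed.
    logger.remove()
    log_level = {0: "WARNING", 1: "INFO"}.get(verbose, "DEBUG")
    logger.add(sys.stderr, level=log_level)

    logger.debug("Shown only with -vv")
    logger.info("Shown with -v or -vv")
    logger.warning("Always shown")


if __name__ == "__main__":
    demo()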

View File

@@ -38,7 +38,7 @@ class BaseConfig(BaseModel):
    Pydantic model configuration dictionary. Set to ignore extra fields.
    """

-    model_config = ConfigDict(extra="allow")
+    model_config = ConfigDict(extra="ignore")


T = TypeVar("T", bound=BaseModel)
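For context on the one-line change above: in Pydantic v2, extra="allow" keeps unknown input keys on the model, extra="ignore" silently drops them, and extra="forbid" raises a validation error. With "ignore", stale or misspelled keys in a config file no longer leak onto config objects, yet validation still succeeds. A small sketch of the three behaviors (the models and data are illustrative):

from pydantic import BaseModel, ConfigDict, ValidationError


class Ignoring(BaseModel):
    model_config = ConfigDict(extra="ignore")
    name: str


class Allowing(BaseModel):
    model_config = ConfigDict(extra="allow")
    name: str


class Forbidding(BaseModel):
    model_config = ConfigDict(extra="forbid")
    name: str


data = {"name": "train", "typo_field": 1}

print(Ignoring.model_validate(data))   # name='train'  (typo_field dropped)
print(Allowing.model_validate(data))   # name='train' typo_field=1
try:
    Forbidding.model_validate(data)
except ValidationError as err:
    print(err)  # reports an extra_forbidden error for typo_field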

View File

@@ -256,8 +256,17 @@ def preprocess_annotations(
    output_dir = Path(output_dir)

    if not output_dir.is_dir():
+        logger.info(
+            "Creating output directory: {output_dir}", output_dir=output_dir
+        )
        output_dir.mkdir(parents=True)

+    logger.info(
+        "Starting preprocessing of {num_annotations} annotations with {max_workers} workers.",
+        num_annotations=len(clip_annotations),
+        max_workers=max_workers or "all available",
+    )
+
    with Pool(max_workers) as pool:
        list(
            tqdm(
@@ -273,8 +282,10 @@
                    clip_annotations,
                ),
                total=len(clip_annotations),
+                desc="Preprocessing annotations",
            )
        )
+    logger.info("Finished preprocessing.")


def preprocess_single_annotation(
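The desc argument added in the hunk above labels the progress bar wrapped around the worker pool. The mapping call itself falls outside the hunk, so the following standalone sketch assumes a pool.imap call with functools.partial and a stand-in worker; it demonstrates only the Pool-plus-tqdm progress pattern, not the real preprocessing.

from functools import partial
from multiprocessing import Pool

from tqdm import tqdm


def square(scale: int, item: int) -> int:
    # Stand-in for preprocess_single_annotation.
    return scale * item * item


if __name__ == "__main__":
    items = list(range(100))
    with Pool(4) as pool:
        # list() drains the lazy imap iterator so tqdm advances once
        # per completed item.
        list(
            tqdm(
                pool.imap(partial(square, 2), items),
                total=len(items),
                desc="Preprocessing annotations",
            )
        )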
@@ -313,11 +324,15 @@
    path = output_dir / filename

    if path.is_file() and not replace:
+        logger.debug("Skipping existing file: {path}", path=path)
        return

    if path.is_file() and replace:
+        logger.debug("Removing existing file: {path}", path=path)
        path.unlink()

+    logger.debug("Processing annotation {uuid}", uuid=clip_annotation.uuid)
+
    try:
        sample = generate_train_example(
            clip_annotation,
@@ -326,8 +341,9 @@
        )
    except Exception as error:
        logger.error(
-            "Failed to process annotation: {uuid}. Error {error}",
+            "Failed to process annotation {uuid} to {path}. Error: {error}",
            uuid=clip_annotation.uuid,
+            path=path,
            error=error,
        )
        return
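The reworked error message also shows the logging style used throughout the commit: values are passed as keyword arguments for loguru's brace-style formatting rather than pre-formatted with f-strings. The worker's overall policy is to skip existing outputs unless replacement was requested, and to log (rather than raise) per-item failures so one bad annotation cannot abort the whole run. A self-contained sketch of that policy; the file layout, transform stand-in, and JSON output are illustrative, not the project's actual serialization:

import json
from pathlib import Path

from loguru import logger


def transform(item: str) -> dict:
    # Stand-in for generate_train_example; fails on one input to
    # exercise the error path.
    if item == "bad":
        raise ValueError("unparseable annotation")
    return {"name": item, "length": len(item)}


def process_one(item: str, output_dir: Path, replace: bool = False) -> None:
    path = output_dir / f"{item}.json"

    # Skip existing outputs unless replacement (--force) was requested.
    if path.is_file() and not replace:
        logger.debug("Skipping existing file: {path}", path=path)
        return

    if path.is_file() and replace:
        logger.debug("Removing existing file: {path}", path=path)
        path.unlink()

    try:
        sample = transform(item)
    except Exception as error:
        # Log and continue: one bad item must not abort the run.
        logger.error(
            "Failed to process {item} to {path}. Error: {error}",
            item=item,
            path=path,
            error=error,
        )
        return

    path.write_text(json.dumps(sample))


if __name__ == "__main__":
    out = Path("out")
    out.mkdir(exist_ok=True)
    for name in ["first", "bad", "second"]:
        process_one(name, out)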