Improve logging of train preprocessing

mbsantiago 2025-06-26 13:08:44 -06:00
parent 1384c549f7
commit 22f7d46f46
4 changed files with 47 additions and 2 deletions

View File

@@ -98,6 +98,8 @@ example-preprocess:
    --base-dir . \
    --dataset-field datasets.train \
    --config config.yaml \
+    --force \
+    -vv \
    config.yaml example_data/preprocessed

example-train:

View File

@@ -1,7 +1,9 @@
+import sys
from pathlib import Path
from typing import Optional

import click
+import yaml
from loguru import logger

from batdetect2.cli.base import cli
@@ -83,6 +85,12 @@ __all__ = ["preprocess"]
        "the program will use all available cores."
    ),
)
+@click.option(
+    "-v",
+    "--verbose",
+    count=True,
+    help="Increase verbosity. -v for INFO, -vv for DEBUG.",
+)
def preprocess(
    dataset_config: Path,
    output: Path,
@@ -92,7 +100,17 @@ def preprocess(
    force: bool = False,
    num_workers: Optional[int] = None,
    dataset_field: Optional[str] = None,
+    verbose: int = 0,
):
+    logger.remove()
+    if verbose == 0:
+        log_level = "WARNING"
+    elif verbose == 1:
+        log_level = "INFO"
+    else:
+        log_level = "DEBUG"
+    logger.add(sys.stderr, level=log_level)
+    logger.info("Starting preprocessing.")

    output = Path(output)
@@ -101,11 +119,20 @@ def preprocess(
    base_dir = base_dir or Path.cwd()
    logger.debug("Using base directory: {base_dir}", base_dir=base_dir)

+    if config:
+        logger.info(
+            "Loading preprocessing config from: {config}", config=config
+        )
+
    conf = (
        load_train_preprocessing_config(config, field=config_field)
        if config is not None
        else TrainPreprocessConfig()
    )

+    logger.debug(
+        "Preprocessing config:\n{conf}",
+        conf=yaml.dump(conf.model_dump()),
+    )
    dataset = load_dataset_from_config(
        dataset_config,
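Taken together, the CLI changes follow a common pattern: click's count=True turns repeated -v flags into an integer, logger.remove() drops loguru's default stderr sink, and logger.add() installs a new sink at the selected level. Below is a minimal self-contained sketch of that pattern; the demo command and its messages are illustrative, not part of this commit.

import sys

import click
from loguru import logger


@click.command()
@click.option("-v", "--verbose", count=True, help="-v for INFO, -vv for DEBUG.")
def demo(verbose: int):
    # Replace loguru's default sink (stderr at DEBUG level) with one at
    # the level implied by how many times -v was passed.
    logger.remove()
    log_level = {0: "WARNING", 1: "INFO"}.get(verbose, "DEBUG")
    logger.add(sys.stderr, level=log_level)

    logger.debug("Shown only with -vv")
    logger.info("Shown with -v or -vv")
    logger.warning("Always shown")


if __name__ == "__main__":
    demo()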

View File

@@ -38,7 +38,7 @@ class BaseConfig(BaseModel):
    Pydantic model configuration dictionary. Set to ignore extra fields.
    """

-    model_config = ConfigDict(extra="allow")
+    model_config = ConfigDict(extra="ignore")


T = TypeVar("T", bound=BaseModel)
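For context on the one-line change above: in Pydantic v2, extra="allow" keeps unknown input keys on the model, extra="ignore" silently drops them, and extra="forbid" raises a validation error. With "ignore", stale or misspelled keys in a config file no longer leak onto config objects, yet validation still succeeds. A small sketch of the three behaviors (the models and data are illustrative):

from pydantic import BaseModel, ConfigDict, ValidationError


class Ignoring(BaseModel):
    model_config = ConfigDict(extra="ignore")
    name: str


class Allowing(BaseModel):
    model_config = ConfigDict(extra="allow")
    name: str


class Forbidding(BaseModel):
    model_config = ConfigDict(extra="forbid")
    name: str


data = {"name": "train", "typo_field": 1}

print(Ignoring.model_validate(data))   # name='train'  (typo_field dropped)
print(Allowing.model_validate(data))   # name='train' typo_field=1
try:
    Forbidding.model_validate(data)
except ValidationError as err:
    print(err)  # reports an extra_forbidden error for typo_field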

View File

@@ -256,8 +256,17 @@ def preprocess_annotations(
    output_dir = Path(output_dir)

    if not output_dir.is_dir():
+        logger.info(
+            "Creating output directory: {output_dir}", output_dir=output_dir
+        )
        output_dir.mkdir(parents=True)

+    logger.info(
+        "Starting preprocessing of {num_annotations} annotations with {max_workers} workers.",
+        num_annotations=len(clip_annotations),
+        max_workers=max_workers or "all available",
+    )
+
    with Pool(max_workers) as pool:
        list(
            tqdm(
@@ -273,8 +282,10 @@
                    clip_annotations,
                ),
                total=len(clip_annotations),
+                desc="Preprocessing annotations",
            )
        )
+    logger.info("Finished preprocessing.")


def preprocess_single_annotation(
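The desc argument added in the hunk above labels the progress bar wrapped around the worker pool. The mapping call itself falls outside the hunk, so the following standalone sketch assumes a pool.imap call with functools.partial and a stand-in worker; it demonstrates only the Pool-plus-tqdm progress pattern, not the real preprocessing.

from functools import partial
from multiprocessing import Pool

from tqdm import tqdm


def square(scale: int, item: int) -> int:
    # Stand-in for preprocess_single_annotation.
    return scale * item * item


if __name__ == "__main__":
    items = list(range(100))
    with Pool(4) as pool:
        # list() drains the lazy imap iterator so tqdm advances once
        # per completed item.
        list(
            tqdm(
                pool.imap(partial(square, 2), items),
                total=len(items),
                desc="Preprocessing annotations",
            )
        )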
@@ -313,11 +324,15 @@
    path = output_dir / filename

    if path.is_file() and not replace:
+        logger.debug("Skipping existing file: {path}", path=path)
        return

    if path.is_file() and replace:
+        logger.debug("Removing existing file: {path}", path=path)
        path.unlink()

+    logger.debug("Processing annotation {uuid}", uuid=clip_annotation.uuid)
+
    try:
        sample = generate_train_example(
            clip_annotation,
@@ -326,8 +341,9 @@
        )
    except Exception as error:
        logger.error(
-            "Failed to process annotation: {uuid}. Error {error}",
+            "Failed to process annotation {uuid} to {path}. Error: {error}",
            uuid=clip_annotation.uuid,
+            path=path,
            error=error,
        )
        return
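The reworked error message also shows the logging style used throughout the commit: values are passed as keyword arguments for loguru's brace-style formatting rather than pre-formatted with f-strings. The worker's overall policy is to skip existing outputs unless replacement was requested, and to log (rather than raise) per-item failures so one bad annotation cannot abort the whole run. A self-contained sketch of that policy; the file layout, transform stand-in, and JSON output are illustrative, not the project's actual serialization:

import json
from pathlib import Path

from loguru import logger


def transform(item: str) -> dict:
    # Stand-in for generate_train_example; fails on one input to
    # exercise the error path.
    if item == "bad":
        raise ValueError("unparseable annotation")
    return {"name": item, "length": len(item)}


def process_one(item: str, output_dir: Path, replace: bool = False) -> None:
    path = output_dir / f"{item}.json"

    # Skip existing outputs unless replacement (--force) was requested.
    if path.is_file() and not replace:
        logger.debug("Skipping existing file: {path}", path=path)
        return

    if path.is_file() and replace:
        logger.debug("Removing existing file: {path}", path=path)
        path.unlink()

    try:
        sample = transform(item)
    except Exception as error:
        # Log and continue: one bad item must not abort the run.
        logger.error(
            "Failed to process {item} to {path}. Error: {error}",
            item=item,
            path=path,
            error=error,
        )
        return

    path.write_text(json.dumps(sample))


if __name__ == "__main__":
    out = Path("out")
    out.mkdir(exist_ok=True)
    for name in ["first", "bad", "second"]:
        process_one(name, out)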