Improve logging of train preprocessing

This commit is contained in:
mbsantiago 2025-06-26 13:08:44 -06:00
parent 1384c549f7
commit 22f7d46f46
4 changed files with 47 additions and 2 deletions

View File

@@ -98,6 +98,8 @@ example-preprocess:
--base-dir . \ --base-dir . \
--dataset-field datasets.train \ --dataset-field datasets.train \
--config config.yaml \ --config config.yaml \
--force \
-vv \
config.yaml example_data/preprocessed config.yaml example_data/preprocessed
example-train: example-train:

View File

@@ -1,7 +1,9 @@
import sys
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
import click import click
import yaml
from loguru import logger from loguru import logger
from batdetect2.cli.base import cli from batdetect2.cli.base import cli
@@ -83,6 +85,12 @@ __all__ = ["preprocess"]
"the program will use all available cores." "the program will use all available cores."
), ),
) )
@click.option(
"-v",
"--verbose",
count=True,
help="Increase verbosity. -v for INFO, -vv for DEBUG.",
)
def preprocess( def preprocess(
dataset_config: Path, dataset_config: Path,
output: Path, output: Path,
@@ -92,7 +100,17 @@ def preprocess(
force: bool = False, force: bool = False,
num_workers: Optional[int] = None, num_workers: Optional[int] = None,
dataset_field: Optional[str] = None, dataset_field: Optional[str] = None,
verbose: int = 0,
): ):
logger.remove()
if verbose == 0:
log_level = "WARNING"
elif verbose == 1:
log_level = "INFO"
else:
log_level = "DEBUG"
logger.add(sys.stderr, level=log_level)
logger.info("Starting preprocessing.") logger.info("Starting preprocessing.")
output = Path(output) output = Path(output)
@@ -101,11 +119,20 @@ def preprocess(
base_dir = base_dir or Path.cwd() base_dir = base_dir or Path.cwd()
logger.debug("Current working directory: {base_dir}", base_dir=base_dir) logger.debug("Current working directory: {base_dir}", base_dir=base_dir)
if config:
logger.info(
"Loading preprocessing config from: {config}", config=config
)
conf = ( conf = (
load_train_preprocessing_config(config, field=config_field) load_train_preprocessing_config(config, field=config_field)
if config is not None if config is not None
else TrainPreprocessConfig() else TrainPreprocessConfig()
) )
logger.debug(
"Preprocessing config:\n{conf}",
conf=yaml.dump(conf.model_dump()),
)
dataset = load_dataset_from_config( dataset = load_dataset_from_config(
dataset_config, dataset_config,

View File

@@ -38,7 +38,7 @@ class BaseConfig(BaseModel):
Pydantic model configuration dictionary. Set to forbid extra fields. Pydantic model configuration dictionary. Set to forbid extra fields.
""" """
model_config = ConfigDict(extra="allow") model_config = ConfigDict(extra="ignore")
T = TypeVar("T", bound=BaseModel) T = TypeVar("T", bound=BaseModel)

View File

@@ -256,8 +256,17 @@ def preprocess_annotations(
output_dir = Path(output_dir) output_dir = Path(output_dir)
if not output_dir.is_dir(): if not output_dir.is_dir():
logger.info(
"Creating output directory: {output_dir}", output_dir=output_dir
)
output_dir.mkdir(parents=True) output_dir.mkdir(parents=True)
logger.info(
"Starting preprocessing of {num_annotations} annotations with {max_workers} workers.",
num_annotations=len(clip_annotations),
max_workers=max_workers or "all available",
)
with Pool(max_workers) as pool: with Pool(max_workers) as pool:
list( list(
tqdm( tqdm(
@@ -273,8 +282,10 @@ def preprocess_annotations(
clip_annotations, clip_annotations,
), ),
total=len(clip_annotations), total=len(clip_annotations),
desc="Preprocessing annotations",
) )
) )
logger.info("Finished preprocessing.")
def preprocess_single_annotation( def preprocess_single_annotation(
@@ -313,11 +324,15 @@ def preprocess_single_annotation(
path = output_dir / filename path = output_dir / filename
if path.is_file() and not replace: if path.is_file() and not replace:
logger.debug("Skipping existing file: {path}", path=path)
return return
if path.is_file() and replace: if path.is_file() and replace:
logger.debug("Removing existing file: {path}", path=path)
path.unlink() path.unlink()
logger.debug("Processing annotation {uuid}", uuid=clip_annotation.uuid)
try: try:
sample = generate_train_example( sample = generate_train_example(
clip_annotation, clip_annotation,
@@ -326,8 +341,9 @@ def preprocess_single_annotation(
) )
except Exception as error: except Exception as error:
logger.error( logger.error(
"Failed to process annotation: {uuid}. Error {error}", "Failed to process annotation {uuid} to {path}. Error: {error}",
uuid=clip_annotation.uuid, uuid=clip_annotation.uuid,
path=path,
error=error, error=error,
) )
return return