Starting to add logging to preprocess

This commit is contained in:
mbsantiago 2025-04-24 00:20:30 +01:00
parent 7dd35d6e3e
commit 8a6ed3dec7
3 changed files with 48 additions and 5 deletions

View File

@ -2,6 +2,7 @@ from pathlib import Path
from typing import Optional from typing import Optional
import click import click
from loguru import logger
from batdetect2.cli.base import cli from batdetect2.cli.base import cli
from batdetect2.data import load_dataset_from_config from batdetect2.data import load_dataset_from_config
@ -123,7 +124,7 @@ def train(): ...
def preprocess( def preprocess(
dataset_config: Path, dataset_config: Path,
output: Path, output: Path,
target_config: Path, target_config: Optional[Path] = None,
base_dir: Optional[Path] = None, base_dir: Optional[Path] = None,
preprocess_config: Optional[Path] = None, preprocess_config: Optional[Path] = None,
label_config: Optional[Path] = None, label_config: Optional[Path] = None,
@ -134,8 +135,13 @@ def preprocess(
label_config_field: Optional[str] = None, label_config_field: Optional[str] = None,
dataset_field: Optional[str] = None, dataset_field: Optional[str] = None,
): ):
logger.info("Starting preprocessing.")
output = Path(output) output = Path(output)
logger.info("Will save outputs to {output}", output=output)
base_dir = base_dir or Path.cwd() base_dir = base_dir or Path.cwd()
logger.debug("Current working directory: {base_dir}", base_dir=base_dir)
preprocess = ( preprocess = (
load_preprocessing_config( load_preprocessing_config(
@ -146,9 +152,13 @@ def preprocess(
else None else None
) )
target = load_target_config( target = (
target_config, load_target_config(
field=target_config_field, target_config,
field=target_config_field,
)
if target_config
else None
) )
label = ( label = (
@ -166,13 +176,20 @@ def preprocess(
base_dir=base_dir, base_dir=base_dir,
) )
logger.info(
"Loaded {num_examples} annotated clips from the configured dataset",
num_examples=len(dataset),
)
targets = build_targets(config=target) targets = build_targets(config=target)
preprocessor = build_preprocessor(config=preprocess) preprocessor = build_preprocessor(config=preprocess)
labeller = build_clip_labeler(targets, config=label) labeller = build_clip_labeler(targets, config=label)
if not output.exists(): if not output.exists():
logger.debug("Creating directory {directory}", directory=output)
output.mkdir(parents=True) output.mkdir(parents=True)
logger.info("Will start preprocessing")
preprocess_annotations( preprocess_annotations(
dataset, dataset,
output_dir=output, output_dir=output,

View File

@ -29,6 +29,7 @@ import os
from pathlib import Path from pathlib import Path
from typing import Literal, Optional, Union from typing import Literal, Optional, Union
from loguru import logger
from pydantic import Field, ValidationError from pydantic import Field, ValidationError
from soundevent import data from soundevent import data
@ -177,6 +178,11 @@ def load_batdetect2_files_annotated_dataset(
path = base_dir / path path = base_dir / path
paths = list_file_annotations(path) paths = list_file_annotations(path)
logger.debug(
"Found {num_files} files in the annotations directory {path}",
num_files=len(paths),
path=path,
)
annotations = [] annotations = []
@ -184,6 +190,7 @@ def load_batdetect2_files_annotated_dataset(
try: try:
file_annotation = load_file_annotation(p) file_annotation = load_file_annotation(p)
except (FileNotFoundError, ValidationError): except (FileNotFoundError, ValidationError):
logger.warning("Could not load annotations in file {path}", path=p)
continue continue
if ( if (
@ -191,6 +198,10 @@ def load_batdetect2_files_annotated_dataset(
and dataset.filter.only_annotated and dataset.filter.only_annotated
and not file_annotation.annotated and not file_annotation.annotated
): ):
logger.debug(
"Annotation in file {path} omited: not annotated",
path=p,
)
continue continue
if ( if (
@ -198,6 +209,10 @@ def load_batdetect2_files_annotated_dataset(
and dataset.filter.exclude_issues and dataset.filter.exclude_issues
and file_annotation.issues and file_annotation.issues
): ):
logger.debug(
"Annotation in file {path} omited: has issues",
path=p,
)
continue continue
try: try:
@ -205,7 +220,12 @@ def load_batdetect2_files_annotated_dataset(
file_annotation, file_annotation,
audio_dir=audio_dir, audio_dir=audio_dir,
) )
except FileNotFoundError: except FileNotFoundError as err:
logger.warning(
"Did not find the audio related to the annotation file {path}. Error: {err}",
path=p,
err=err,
)
continue continue
annotations.append( annotations.append(

View File

@ -21,6 +21,7 @@ The core components are:
from pathlib import Path from pathlib import Path
from typing import Annotated, List, Optional from typing import Annotated, List, Optional
from loguru import logger
from pydantic import Field from pydantic import Field
from soundevent import data, io from soundevent import data, io
@ -115,6 +116,11 @@ def load_dataset(
clip_annotations = [] clip_annotations = []
for source in dataset.sources: for source in dataset.sources:
annotated_source = load_annotated_dataset(source, base_dir=base_dir) annotated_source = load_annotated_dataset(source, base_dir=base_dir)
logger.debug(
"Loaded {num_examples} from dataset source '{source_name}'",
num_examples=len(annotated_source.clip_annotations),
source_name=source.name,
)
clip_annotations.extend( clip_annotations.extend(
insert_source_tag(clip_annotation, source) insert_source_tag(clip_annotation, source)
for clip_annotation in annotated_source.clip_annotations for clip_annotation in annotated_source.clip_annotations