Starting to add logging to preprocess

This commit is contained in:
mbsantiago 2025-04-24 00:20:30 +01:00
parent 7dd35d6e3e
commit 8a6ed3dec7
3 changed files with 48 additions and 5 deletions

View File

@@ -2,6 +2,7 @@ from pathlib import Path
from typing import Optional
import click
from loguru import logger
from batdetect2.cli.base import cli
from batdetect2.data import load_dataset_from_config
@@ -123,7 +124,7 @@ def train(): ...
def preprocess(
dataset_config: Path,
output: Path,
target_config: Path,
target_config: Optional[Path] = None,
base_dir: Optional[Path] = None,
preprocess_config: Optional[Path] = None,
label_config: Optional[Path] = None,
@@ -134,8 +135,13 @@ def preprocess(
label_config_field: Optional[str] = None,
dataset_field: Optional[str] = None,
):
logger.info("Starting preprocessing.")
output = Path(output)
logger.info("Will save outputs to {output}", output=output)
base_dir = base_dir or Path.cwd()
logger.debug("Current working directory: {base_dir}", base_dir=base_dir)
preprocess = (
load_preprocessing_config(
@@ -146,10 +152,14 @@ def preprocess(
else None
)
target = load_target_config(
target = (
load_target_config(
target_config,
field=target_config_field,
)
if target_config
else None
)
label = (
load_label_config(
@@ -166,13 +176,20 @@ def preprocess(
base_dir=base_dir,
)
logger.info(
"Loaded {num_examples} annotated clips from the configured dataset",
num_examples=len(dataset),
)
targets = build_targets(config=target)
preprocessor = build_preprocessor(config=preprocess)
labeller = build_clip_labeler(targets, config=label)
if not output.exists():
logger.debug("Creating directory {directory}", directory=output)
output.mkdir(parents=True)
logger.info("Will start preprocessing")
preprocess_annotations(
dataset,
output_dir=output,

View File

@@ -29,6 +29,7 @@ import os
from pathlib import Path
from typing import Literal, Optional, Union
from loguru import logger
from pydantic import Field, ValidationError
from soundevent import data
@@ -177,6 +178,11 @@ def load_batdetect2_files_annotated_dataset(
path = base_dir / path
paths = list_file_annotations(path)
logger.debug(
"Found {num_files} files in the annotations directory {path}",
num_files=len(paths),
path=path,
)
annotations = []
@@ -184,6 +190,7 @@ def load_batdetect2_files_annotated_dataset(
try:
file_annotation = load_file_annotation(p)
except (FileNotFoundError, ValidationError):
logger.warning("Could not load annotations in file {path}", path=p)
continue
if (
@@ -191,6 +198,10 @@ def load_batdetect2_files_annotated_dataset(
and dataset.filter.only_annotated
and not file_annotation.annotated
):
logger.debug(
"Annotation in file {path} omited: not annotated",
path=p,
)
continue
if (
@@ -198,6 +209,10 @@ def load_batdetect2_files_annotated_dataset(
and dataset.filter.exclude_issues
and file_annotation.issues
):
logger.debug(
"Annotation in file {path} omited: has issues",
path=p,
)
continue
try:
@@ -205,7 +220,12 @@ def load_batdetect2_files_annotated_dataset(
file_annotation,
audio_dir=audio_dir,
)
except FileNotFoundError:
except FileNotFoundError as err:
logger.warning(
"Did not find the audio related to the annotation file {path}. Error: {err}",
path=p,
err=err,
)
continue
annotations.append(

View File

@@ -21,6 +21,7 @@ The core components are:
from pathlib import Path
from typing import Annotated, List, Optional
from loguru import logger
from pydantic import Field
from soundevent import data, io
@@ -115,6 +116,11 @@ def load_dataset(
clip_annotations = []
for source in dataset.sources:
annotated_source = load_annotated_dataset(source, base_dir=base_dir)
logger.debug(
"Loaded {num_examples} from dataset source '{source_name}'",
num_examples=len(annotated_source.clip_annotations),
source_name=source.name,
)
clip_annotations.extend(
insert_source_tag(clip_annotation, source)
for clip_annotation in annotated_source.clip_annotations