Create an encoder module
This commit is contained in:
parent
ffa4c2e5e9
commit
096d180ea3
batdetect2/models/encoder.py | 357 lines (new file)
@@ -0,0 +1,357 @@
"""Constructs the Encoder part of an Encoder-Decoder neural network.

This module defines the configuration structure (`EncoderConfig`) and provides
the `Encoder` class (an `nn.Module`) along with a factory function
(`build_encoder`) to create sequential encoders commonly used as the
downsampling path in architectures like U-Nets for spectrogram analysis.

The encoder is built by stacking configurable downscaling blocks. Two types
of downscaling blocks are supported, selectable via the configuration:

- `StandardConvDownBlock`: A basic Conv2d -> MaxPool2d -> BN -> ReLU block.
- `FreqCoordConvDownBlock`: A similar block that incorporates frequency
  coordinate information (CoordF) before the convolution to potentially aid
  spatial awareness along the frequency axis.

The `Encoder`'s `forward` method provides access to intermediate feature maps
from each stage, suitable for use as skip connections in a corresponding
Decoder. A separate `encode` method returns only the final output (bottleneck)
features.
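
Examples
--------
A minimal usage sketch. The spectrogram size below is illustrative, the layer
type strings are the `DownscalingLayer` enum values, and the exact shape of
each returned feature map depends on how the downscaling blocks implement
their (nominally 2x) spatial reduction:

>>> import torch
>>> from batdetect2.models.encoder import (
...     EncoderConfig,
...     EncoderLayer,
...     build_encoder,
... )
>>> config = EncoderConfig(
...     input_height=128,
...     input_channels=1,
...     layers=[
...         EncoderLayer(layer_type="ConvBlockDownStandard", channels=32),
...         EncoderLayer(layer_type="FreqCoordConvDownBlock", channels=64),
...     ],
... )
>>> encoder = build_encoder(config)
>>> spectrogram = torch.rand(1, 1, 128, 512)
>>> features = encoder(spectrogram)  # one feature map per downscaling stage
>>> len(features)
2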
"""

from enum import Enum
from typing import List

import torch
from pydantic import Field
from torch import nn

from batdetect2.configs import BaseConfig
from batdetect2.models.blocks import (
    FreqCoordConvDownBlock,
    StandardConvDownBlock,
)

__all__ = [
    "DownscalingLayer",
    "EncoderLayer",
    "EncoderConfig",
    "Encoder",
    "build_encoder",
]


class DownscalingLayer(str, Enum):
    """Enumeration of available downscaling layer types for the Encoder.

    Used in configuration to specify which block implementation to use at each
    stage of the encoder.

    Attributes
    ----------
    standard : str
        Identifier for the `StandardConvDownBlock`.
    coord : str
        Identifier for the `FreqCoordConvDownBlock` (incorporates frequency
        coords).
    """

    standard = "ConvBlockDownStandard"
    coord = "FreqCoordConvDownBlock"


class EncoderLayer(BaseConfig):
    """Configuration for a single layer within the Encoder sequence.

    Attributes
    ----------
    layer_type : DownscalingLayer
        Specifies the type of downscaling block to use for this layer
        (either 'standard' or 'coord').
    channels : int
        The number of output channels this layer should produce. Must be > 0.
    """

    layer_type: DownscalingLayer
    channels: int = Field(gt=0)


class EncoderConfig(BaseConfig):
    """Configuration for building the entire sequential Encoder.

    Attributes
    ----------
    input_height : int
        The expected height (number of frequency bins) of the input
        spectrogram tensor fed into the first layer of the encoder. Required
        for calculating intermediate heights, especially for CoordF layers.
        Must be > 0.
    layers : List[EncoderLayer]
        An ordered list defining the sequence of downscaling layers in the
        encoder. Each item specifies the layer type and its output channel
        count. The number of input channels for each layer is inferred from
        the previous layer's output channels (or `input_channels` for the
        first layer). Must contain at least one layer definition.
    input_channels : int, default=1
        The number of channels in the initial input tensor to the encoder
        (e.g., 1 for a standard single-channel spectrogram). Must be > 0.
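
    Examples
    --------
    A sketch of validating a configuration from plain data (e.g., parsed from
    a YAML file), assuming `BaseConfig` behaves like a standard pydantic (v2)
    model:

    >>> config = EncoderConfig.model_validate(
    ...     {
    ...         "input_height": 128,
    ...         "input_channels": 1,
    ...         "layers": [
    ...             {"layer_type": "ConvBlockDownStandard", "channels": 32},
    ...             {"layer_type": "FreqCoordConvDownBlock", "channels": 64},
    ...         ],
    ...     }
    ... )
    >>> [layer.channels for layer in config.layers]
    [32, 64]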
"""
|
||||||
|
|
||||||
|
input_height: int = Field(gt=0)
|
||||||
|
layers: List[EncoderLayer] = Field(min_length=1)
|
||||||
|
input_channels: int = Field(gt=0)


def build_downscaling_layer(
    in_channels: int,
    out_channels: int,
    input_height: int,
    layer_type: DownscalingLayer,
) -> tuple[nn.Module, int, int]:
    """Build a single downscaling layer based on configuration.

    Internal factory function used by `build_encoder`. Instantiates the
    appropriate downscaling block (`StandardConvDownBlock` or
    `FreqCoordConvDownBlock`) and returns it along with its expected output
    channel count and output height (assuming 2x spatial downsampling).

    Parameters
    ----------
    in_channels : int
        Number of input channels to the layer.
    out_channels : int
        Desired number of output channels from the layer.
    input_height : int
        Height of the input feature map to this layer.
    layer_type : DownscalingLayer
        The type of layer to build ('standard' or 'coord').

    Returns
    -------
    Tuple[nn.Module, int, int]
        A tuple containing:
        - The instantiated `nn.Module` layer.
        - The number of output channels (`out_channels`).
        - The expected output height (`input_height // 2`).

    Raises
    ------
    ValueError
        If `layer_type` is invalid.
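
    Examples
    --------
    A small sketch of the returned triple. The height is the *expected*
    post-block height, assuming the block performs 2x downsampling:

    >>> layer, channels, height = build_downscaling_layer(
    ...     in_channels=1,
    ...     out_channels=32,
    ...     input_height=128,
    ...     layer_type=DownscalingLayer.standard,
    ... )
    >>> channels, height
    (32, 64)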
"""
|
||||||
|
if layer_type == DownscalingLayer.standard:
|
||||||
|
return (
|
||||||
|
StandardConvDownBlock(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
),
|
||||||
|
out_channels,
|
||||||
|
input_height // 2,
|
||||||
|
)
|
||||||
|
|
||||||
|
if layer_type == DownscalingLayer.coord:
|
||||||
|
return (
|
||||||
|
FreqCoordConvDownBlock(
|
||||||
|
in_channels=in_channels,
|
||||||
|
out_channels=out_channels,
|
||||||
|
input_height=input_height,
|
||||||
|
),
|
||||||
|
out_channels,
|
||||||
|
input_height // 2,
|
||||||
|
)
|
||||||
|
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid downscaling layer type {layer_type}. "
|
||||||
|
f"Valid values: ConvBlockDownCoordF, ConvBlockDownStandard"
|
||||||
|
)


class Encoder(nn.Module):
    """Sequential Encoder module composed of configurable downscaling layers.

    Constructs the downsampling path of an encoder-decoder network by stacking
    multiple downscaling blocks.

    The `forward` method executes the sequence and returns the output feature
    map from *each* downscaling stage, facilitating the implementation of skip
    connections in U-Net-like architectures. The `encode` method returns only
    the final output tensor (bottleneck features).

    Attributes
    ----------
    input_channels : int
        Number of channels expected in the input tensor.
    input_height : int
        Height (frequency bins) expected in the input tensor.
    output_channels : int
        Number of channels in the final output tensor (bottleneck).
    layers : nn.ModuleList
        The sequence of instantiated downscaling layer modules.
    depth : int
        The number of downscaling layers in the encoder.
    """

    def __init__(
        self,
        output_channels: int,
        layers: List[nn.Module],
        input_height: int = 128,
        input_channels: int = 1,
    ):
        """Initialize the Encoder module.

        Note: This constructor is typically called internally by the
        `build_encoder` factory function, which prepares the `layers` list.

        Parameters
        ----------
        output_channels : int
            Number of channels produced by the final layer.
        layers : List[nn.Module]
            A list of pre-instantiated downscaling layer modules (e.g.,
            `StandardConvDownBlock` or `FreqCoordConvDownBlock`) in the
            desired sequence.
        input_height : int, default=128
            Expected height of the input tensor.
        input_channels : int, default=1
            Expected number of channels in the input tensor.
        """
        super().__init__()

        self.input_channels = input_channels
        self.input_height = input_height
        self.output_channels = output_channels

        self.layers = nn.ModuleList(layers)
        self.depth = len(self.layers)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        """Pass input through encoder layers, returning all intermediate outputs.

        This method is typically used when the Encoder is part of a U-Net or
        similar architecture requiring skip connections.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor, shape `(B, C_in, H_in, W)`, where `C_in` must match
            `self.input_channels` and `H_in` must match `self.input_height`.

        Returns
        -------
        List[torch.Tensor]
            A list containing the output tensors from *each* downscaling layer
            in the sequence. `outputs[0]` is the output of the first layer,
            `outputs[-1]` is the final output (bottleneck) of the encoder.

        Raises
        ------
        ValueError
            If input tensor channel count or height does not match expected
            values.
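
        Examples
        --------
        A sketch of the typical skip-connection pattern. Here `encoder` is
        assumed to have been created with `build_encoder` (see the module
        docstring) using `input_height=128` and `input_channels=1`:

        >>> x = torch.rand(1, 1, 128, 512)
        >>> features = encoder(x)
        >>> bottleneck = features[-1]
        >>> skips = features[:-1][::-1]  # reversed for use by a decoder
        >>> len(features) == encoder.depth
        True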
"""
|
||||||
|
if x.shape[1] != self.input_channels:
|
||||||
|
raise ValueError(
|
||||||
|
f"Input tensor has {x.shape[1]} channels, "
|
||||||
|
f"but encoder expects {self.input_channels}."
|
||||||
|
)
|
||||||
|
|
||||||
|
if x.shape[2] != self.input_height:
|
||||||
|
raise ValueError(
|
||||||
|
f"Input tensor height {x.shape[2]} does not match "
|
||||||
|
f"encoder expected input_height {self.input_height}."
|
||||||
|
)
|
||||||
|
|
||||||
|
outputs = []
|
||||||
|
|
||||||
|
for layer in self.layers:
|
||||||
|
x = layer(x)
|
||||||
|
outputs.append(x)
|
||||||
|
|
||||||
|
return outputs

    def encode(self, x: torch.Tensor) -> torch.Tensor:
        """Pass input through encoder layers, returning only the final output.

        This method provides access to the bottleneck features produced after
        the last downscaling layer.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor, shape `(B, C_in, H_in, W)`. Must match expected
            `input_channels` and `input_height`.

        Returns
        -------
        torch.Tensor
            The final output tensor (bottleneck features) from the last layer
            of the encoder. Shape `(B, C_out, H_out, W_out)`.

        Raises
        ------
        ValueError
            If input tensor channel count or height does not match expected
            values.
        """
        if x.shape[1] != self.input_channels:
            raise ValueError(
                f"Input tensor has {x.shape[1]} channels, "
                f"but encoder expects {self.input_channels}."
            )

        if x.shape[2] != self.input_height:
            raise ValueError(
                f"Input tensor height {x.shape[2]} does not match "
                f"encoder expected input_height {self.input_height}."
            )

        for layer in self.layers:
            x = layer(x)

        return x


def build_encoder(config: EncoderConfig) -> Encoder:
    """Factory function to build an Encoder instance from configuration.

    Constructs a sequential `Encoder` module based on the specifications in
    an `EncoderConfig` object. It iteratively builds the specified sequence of
    downscaling layers (`StandardConvDownBlock` or `FreqCoordConvDownBlock`),
    tracking the changing number of channels and feature map height.

    Parameters
    ----------
    config : EncoderConfig
        The configuration object detailing the encoder architecture, including
        input dimensions, layer types, and channel counts for each stage.

    Returns
    -------
    Encoder
        An initialized `Encoder` module.

    Raises
    ------
    ValueError
        If the layer configuration is invalid (e.g., unknown layer type).
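
    Examples
    --------
    A brief sketch showing how the built encoder reflects its configuration
    (see the module docstring for a full forward-pass example):

    >>> config = EncoderConfig(
    ...     input_height=128,
    ...     input_channels=1,
    ...     layers=[
    ...         EncoderLayer(layer_type="ConvBlockDownStandard", channels=32),
    ...         EncoderLayer(layer_type="ConvBlockDownStandard", channels=64),
    ...     ],
    ... )
    >>> encoder = build_encoder(config)
    >>> encoder.depth, encoder.output_channels
    (2, 64)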
"""
|
||||||
|
current_channels = config.input_channels
|
||||||
|
current_height = config.input_height
|
||||||
|
|
||||||
|
layers = []
|
||||||
|
|
||||||
|
for layer_config in config.layers:
|
||||||
|
layer, current_channels, current_height = build_downscaling_layer(
|
||||||
|
in_channels=current_channels,
|
||||||
|
out_channels=layer_config.channels,
|
||||||
|
input_height=current_height,
|
||||||
|
layer_type=layer_config.layer_type,
|
||||||
|
)
|
||||||
|
layers.append(layer)
|
||||||
|
|
||||||
|
return Encoder(
|
||||||
|
input_height=config.input_height,
|
||||||
|
layers=layers,
|
||||||
|
input_channels=config.input_channels,
|
||||||
|
output_channels=current_channels,
|
||||||
|
)