from typing import Type, Sequence, Union
import torch
import torch.nn as nn
# TODO: implement scaled dot-product attention (SDPA).
# ref: https://magentino.tistory.com/176
# ref: https://tutorials.pytorch.kr/intermediate/scaled_dot_product_attention_tutorial.html
# from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
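
# Reference sketch for the TODO above: plain scaled dot-product attention,
# softmax(Q @ K^T / sqrt(d_k)) @ V with an optional additive mask. This helper
# is only illustrative and is not wired into the classes below, which still
# rely on PyTorch's built-in attention layers.
def _sdpa_reference(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attn_mask: Union[torch.Tensor, None] = None,
) -> torch.Tensor:
    """Compute scaled dot-product attention for tensors shaped (..., seq_len, d_k)."""
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / (d_k ** 0.5)
    if attn_mask is not None:
        scores = scores + attn_mask  # additive mask, e.g. -inf at disallowed positions
    weights = torch.softmax(scores, dim=-1)
    return torch.matmul(weights, value)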
class BertBase(nn.Module):
"""
    BertBase is a specialized version of TransformerBase that adds a pooler for the [CLS] token.
    The pooler processes the first token ([CLS]) of the encoder output, as in the original BERT
    architecture. If a hidden dimension is provided at initialization, the pooler is applied;
    otherwise, only the encoder output is returned.
Parameters
----------
encoder_embedding : nn.Module
The embedding layer for the encoder input.
encoder : nn.Module
The encoder module responsible for transforming the input sequence.
    dim_hidden : int, optional
        The hidden dimension used by the pooler layer, accepted as a keyword argument (via ``**kwargs``).
        If provided, a pooler layer is applied to the [CLS] token (first token) of the encoder output.
        Otherwise, only the encoder output is returned.
    sep_token_id : int, optional
        The ID of the [SEP] token, used to identify sentence boundaries. The default is 102, the
        standard value for Hugging Face's BERT tokenizer. In BERT, the [SEP] token separates sentences
        or segments and may appear at most twice in the input; a ValueError is raised if more than two
        [SEP] tokens are found.
Notes
-----
- The default value for `sep_token_id` is 102, which corresponds to the [SEP] token in Hugging Face's pre-trained
BERT models. This token is used to separate sentences or indicate the end of a sentence. If you are using a different
tokenizer or model, you may need to adjust this value accordingly.
- If `input_sequence` is precomputed embeddings (dtype is float), the embedding layer is skipped, and
`position_ids` and `token_type_ids` are not generated, as these are already embedded.
Methods
-------
forward(input_sequence, src_mask=None, src_key_padding_mask=None)
Performs the forward pass. If a hidden dimension (dim_hidden) is provided, the pooler is applied to the
[CLS] token. Otherwise, it returns the encoder output as-is.
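    Examples
    --------
    An illustrative sketch: ``ToyEmbedding`` and ``ToyEncoder`` are hypothetical stand-in modules
    with the call signatures this class expects.

    >>> class ToyEmbedding(nn.Module):
    ...     def __init__(self, vocab_size=30522, dim=16):
    ...         super().__init__()
    ...         self.tok = nn.Embedding(vocab_size, dim)
    ...     def forward(self, input_ids, position_ids, token_type_ids):
    ...         return self.tok(input_ids)
    >>> class ToyEncoder(nn.Module):
    ...     def forward(self, x, src_mask=None, src_key_padding_mask=None):
    ...         return x
    >>> model = BertBase(ToyEmbedding(), ToyEncoder(), dim_hidden=16)
    >>> input_ids = torch.tensor([[101, 7592, 102]])  # [CLS] hello [SEP]
    >>> model(input_ids).shape
    torch.Size([1, 16])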
"""
def __init__(self, encoder_embedding: nn.Module, encoder: nn.Module, sep_token_id: int = 102, **kwargs):
super().__init__()
self.encoder_embedding = encoder_embedding
self.encoder = encoder
self.sep_token_id = sep_token_id
self.dim_hidden = kwargs.get("dim_hidden", None)
if self.dim_hidden:
self.pool_dense = nn.Linear(self.dim_hidden, self.dim_hidden)
self.pool_activation = nn.Tanh()
def forward(self, input_sequence: torch.Tensor, src_mask=None, src_key_padding_mask=None):
"""
Forward pass through the BERT model with an optional pooler.
If a hidden dimension is provided, the pooler is applied to the first token of the encoder output.
Otherwise, the encoder output is returned as-is.
Parameters
----------
input_sequence : torch.Tensor
The input tensor representing either input_ids (token IDs) or input embeddings.
If dtype is int, it is assumed to be token IDs (input_ids).
If dtype is float, it is assumed to be precomputed embeddings (inputs_embeds), and the embedding layer
is skipped. In this case, `position_ids` and `token_type_ids` are not generated.
        src_mask : torch.Tensor, optional
            Attention mask over positions in the encoder input. With PyTorch's transformer layers this
            has shape (seq_len, seq_len).
        src_key_padding_mask : torch.Tensor, optional
            Mask for padding tokens in the source sequence. Shape: (batch_size, seq_len).
Returns
-------
torch.Tensor
If dim_hidden is provided, returns the pooled output from the [CLS] token. Otherwise, returns the
encoder output for the entire sequence. Shape: (batch_size, dim_hidden) if pooled, or
(batch_size, seq_len, dim_hidden) if not.
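        Examples
        --------
        The choice between the two input kinds is purely dtype-based; a minimal illustration of the
        dtype check this method relies on:

        >>> torch.is_floating_point(torch.tensor([[101, 2023, 102]]))  # token IDs
        False
        >>> torch.is_floating_point(torch.randn(2, 4, 16))  # precomputed embeddings
        True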
"""
if torch.is_floating_point(input_sequence):
# input_sequence is precomputed embeddings (inputs_embeds), skip embedding layer
input_embeds = input_sequence
else:
# input_sequence is token IDs (input_ids), generate position_ids and token_type_ids
batch_size, seq_len = input_sequence.size()
            # Generate position_ids on the same device as the input: [batch_size, seq_len]
            position_ids = torch.arange(seq_len, dtype=torch.long, device=input_sequence.device)
            position_ids = position_ids.unsqueeze(0).repeat(batch_size, 1)
# Generate token_type_ids: [batch_size, seq_len]
            token_type_ids = self._generate_token_type_ids(input_sequence.tolist()).to(input_sequence.device)
# Pass input_ids, position_ids, and token_type_ids to embedding layer
input_embeds = self.encoder_embedding(input_sequence, position_ids, token_type_ids)
# Pass through the encoder
encoded_output = self.encoder(input_embeds, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
# Apply pooling if a hidden dimension is specified
if self.dim_hidden:
cls_tkn = encoded_output[:, 0] # Extract the first token ([CLS] token)
return self.pool_activation(self.pool_dense(cls_tkn))
return encoded_output
    def _generate_token_type_ids(self, input_sequences: Sequence[Sequence[int]]) -> torch.Tensor:
"""
Generate token_type_ids for each sequence in the batch based on the presence of [SEP] tokens.
Parameters
----------
input_sequences : list[list[int]]
The list of token ID sequences (batch of sequences) from which token_type_ids are generated.
Returns
-------
torch.Tensor
A tensor of token_type_ids where 0 represents the first sentence, and 1 represents the second sentence.
Raises
------
ValueError
If more than two [SEP] tokens are present in any sequence.
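        Examples
        --------
        A minimal sketch with Hugging Face-style IDs (101 = [CLS], 102 = [SEP]); ``nn.Identity()``
        stands in for the embedding and encoder modules, which are unused here.

        >>> base = BertBase(nn.Identity(), nn.Identity())
        >>> base._generate_token_type_ids([[101, 7592, 102, 2088, 102]])
        tensor([[0, 0, 0, 1, 1]])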
"""
token_type_ids_batch = []
for input_sequence in input_sequences:
token_type_ids = torch.zeros(len(input_sequence), dtype=torch.long)
sep_indices = [i for i, token_id in enumerate(input_sequence) if token_id == self.sep_token_id]
if len(sep_indices) > 2:
raise ValueError(f"Input sequence contains more than two [SEP] tokens: {len(sep_indices)} found.")
            if sep_indices:
                # Tokens after the first [SEP] belong to the second segment (token type 1)
                token_type_ids[sep_indices[0] + 1 :] = 1
token_type_ids_batch.append(token_type_ids)
# Stack the token_type_ids for each sequence in the batch to form a tensor of shape [batch_size, seq_len]
return torch.stack(token_type_ids_batch)
class CoderBase(nn.Module):
"""
Base class for both Encoder and Decoder that defines the core structure of the transformer layers.
Parameters
----------
d_model : int
The dimension of the embedding space (output size of each layer).
num_layers : int
The number of layers in the encoder/decoder.
    layer_factory : Type[nn.Module], optional
        The layer class used to build each layer. Defaults to `nn.TransformerEncoderLayer` for encoders
        and `nn.TransformerDecoderLayer` for decoders.
# norm_layer : str or nn.Module, optional
# The normalization layer to apply. Can be a string or custom `nn.Module`. Default is 'LayerNorm'.
dim_feedforward : int, optional
The dimension of the feedforward network. Default is 2048.
dropout : float, optional
Dropout rate for regularization. Default is 0.1.
num_heads : int, optional
Number of attention heads in multi-head attention. Default is 8.
    **factory_kwargs
        Additional keyword arguments passed to `layer_factory` when creating each layer.
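    Examples
    --------
    A minimal sketch using the default ``nn.TransformerEncoderLayer`` factory; ``batch_first=True``
    is simply forwarded through ``factory_kwargs``.

    >>> coder = CoderBase(d_model=32, num_layers=2, num_heads=4, batch_first=True)
    >>> len(coder.layers)
    2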
"""
def __init__(
self,
d_model: int,
num_layers: int,
layer_factory: Type[nn.Module] = nn.TransformerEncoderLayer,
# norm_layer: Union[str, nn.Module] = "LayerNorm",
dim_feedforward: int = 2048,
dropout: float = 0.1,
num_heads: int = 8,
**factory_kwargs,
):
super(CoderBase, self).__init__()
self.d_model = d_model
# Create layers using the provided layer factory
self.layers = nn.ModuleList(
[
layer_factory(
d_model=d_model, nhead=num_heads, dim_feedforward=dim_feedforward, dropout=dropout, **factory_kwargs
)
for _ in range(num_layers)
]
)
    def _get_norm_layer(self, norm_type):
        """
        Return the normalization layer corresponding to `norm_type` (a string name or an `nn.Module` instance).
        This helper backs the commented-out `norm_layer` option and is not called while that option stays disabled.
        """
if isinstance(norm_type, nn.Module):
return norm_type
elif norm_type == "LayerNorm":
return nn.LayerNorm(self.d_model)
elif norm_type == "BatchNorm1d":
return nn.BatchNorm1d(self.d_model)
elif norm_type == "InstanceNorm1d":
return nn.InstanceNorm1d(self.d_model)
elif norm_type == "GroupNorm":
return nn.GroupNorm(8, self.d_model)
else:
raise ValueError(f"Unsupported norm layer type: {norm_type}")
class EncoderBase(CoderBase):
"""
Encoder class that builds on CoderBase for encoding the input sequences.
Parameters
----------
d_model : int
The dimension of the embedding space (output size of each layer).
num_layers : int
The number of layers in the encoder.
    layer_factory : Type[nn.Module], optional
        The layer class used to build each encoder layer. Defaults to `nn.TransformerEncoderLayer`.
# norm_layer : str or nn.Module, optional
# The normalization layer to apply. Can be a string or custom `nn.Module`. Default is 'LayerNorm'.
dim_feedforward : int, optional
The dimension of the feedforward network. Default is 2048.
dropout : float, optional
Dropout rate for regularization. Default is 0.1.
num_heads : int, optional
Number of attention heads in multi-head attention. Default is 8.
    **factory_kwargs
        Additional keyword arguments passed to `layer_factory` when creating each layer.
"""
def __init__(
self,
d_model: int,
num_layers: int,
layer_factory: Type[nn.Module] = nn.TransformerEncoderLayer,
# norm_layer: Union[str, nn.Module] = "LayerNorm",
dim_feedforward: int = 2048,
dropout: float = 0.1,
num_heads: int = 8,
**factory_kwargs,
):
super(EncoderBase, self).__init__(
d_model, num_layers, layer_factory, dim_feedforward, dropout, num_heads, **factory_kwargs
)
def forward(self, src, src_mask=None, src_key_padding_mask=None):
"""
Forward pass through the encoder.
Parameters
----------
src : torch.Tensor
The input tensor representing the source sequence. Shape: (batch_size, seq_len, d_model).
src_mask : torch.Tensor, optional
A mask tensor to prevent attention to certain positions in the source sequence.
src_key_padding_mask : torch.Tensor, optional
A mask tensor to prevent attention to padding tokens in the source sequence.
Returns
-------
torch.Tensor
The encoded output of the source sequence. Shape: (batch_size, seq_len, d_model).
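        Examples
        --------
        A minimal sketch; ``batch_first=True`` is assumed to have been passed through
        ``factory_kwargs`` so that inputs are (batch_size, seq_len, d_model) as documented.

        >>> enc = EncoderBase(d_model=32, num_layers=2, num_heads=4, batch_first=True)
        >>> enc(torch.randn(2, 5, 32)).shape
        torch.Size([2, 5, 32])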
"""
# Pass the source sequence through each encoder layer
for layer in self.layers:
src = layer(src, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
return src
class DecoderBase(CoderBase):
"""
Decoder class that builds on CoderBase for decoding sequences based on the encoder's memory.
Parameters
----------
d_model : int
The dimension of the embedding space (output size of each layer).
num_layers : int
The number of layers in the decoder.
    layer_factory : Type[nn.Module], optional
        The layer class used to build each decoder layer. Defaults to `nn.TransformerDecoderLayer`.
    # norm_layer : str or nn.Module, optional
    #     The normalization layer to apply. Can be a string or custom `nn.Module`. Default is 'LayerNorm'.
dim_feedforward : int, optional
The dimension of the feedforward network. Default is 2048.
dropout : float, optional
Dropout rate for regularization. Default is 0.1.
num_heads : int, optional
Number of attention heads in multi-head attention. Default is 8.
    **factory_kwargs
        Additional keyword arguments passed to `layer_factory` when creating each layer.
"""
def __init__(
self,
d_model: int,
num_layers: int,
layer_factory: Type[nn.Module] = nn.TransformerDecoderLayer,
# norm_layer: Union[str, nn.Module] = "LayerNorm",
dim_feedforward: int = 2048,
dropout: float = 0.1,
num_heads: int = 8,
**factory_kwargs,
):
super(DecoderBase, self).__init__(
d_model, num_layers, layer_factory, dim_feedforward, dropout, num_heads, **factory_kwargs
)
def forward(
self,
tgt,
memory,
tgt_mask=None,
memory_mask=None,
tgt_key_padding_mask=None,
memory_key_padding_mask=None,
):
"""
Forward pass through the decoder.
Parameters
----------
tgt : torch.Tensor
The input tensor representing the target sequence. Shape: (batch_size, seq_len, d_model).
memory : torch.Tensor
The encoded memory output from the encoder. Shape: (batch_size, seq_len_src, d_model).
tgt_mask : torch.Tensor, optional
A mask tensor to prevent attention to certain positions in the target sequence.
memory_mask : torch.Tensor, optional
A mask tensor to prevent attention to certain positions in the memory sequence (from the encoder).
tgt_key_padding_mask : torch.Tensor, optional
A mask tensor to prevent attention to padding tokens in the target sequence.
memory_key_padding_mask : torch.Tensor, optional
A mask tensor to prevent attention to padding tokens in the memory sequence.
Returns
-------
torch.Tensor
The decoded output of the target sequence. Shape: (batch_size, seq_len_tgt, d_model).
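        Examples
        --------
        A minimal sketch; ``batch_first=True`` is assumed, and ``memory`` stands in for an encoder
        output with the same ``d_model``.

        >>> dec = DecoderBase(d_model=32, num_layers=2, num_heads=4, batch_first=True)
        >>> tgt, memory = torch.randn(2, 7, 32), torch.randn(2, 5, 32)
        >>> dec(tgt, memory).shape
        torch.Size([2, 7, 32])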
"""
# Pass the target sequence through each decoder layer
for layer in self.layers:
tgt = layer(
tgt,
memory,
tgt_mask=tgt_mask,
memory_mask=memory_mask,
tgt_key_padding_mask=tgt_key_padding_mask,
memory_key_padding_mask=memory_key_padding_mask,
)
return tgt
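

if __name__ == "__main__":
    # End-to-end sketch under stated assumptions: _DemoBertEmbedding is a hypothetical
    # module that sums token, position, and segment embeddings, matching the
    # (input_ids, position_ids, token_type_ids) call made by BertBase.forward.
    class _DemoBertEmbedding(nn.Module):
        def __init__(self, vocab_size: int = 30522, max_len: int = 512, d_model: int = 32):
            super().__init__()
            self.tok = nn.Embedding(vocab_size, d_model)
            self.pos = nn.Embedding(max_len, d_model)
            self.seg = nn.Embedding(2, d_model)

        def forward(self, input_ids, position_ids, token_type_ids):
            return self.tok(input_ids) + self.pos(position_ids) + self.seg(token_type_ids)

    encoder = EncoderBase(d_model=32, num_layers=2, num_heads=4, batch_first=True)
    model = BertBase(_DemoBertEmbedding(d_model=32), encoder, dim_hidden=32)
    input_ids = torch.tensor([[101, 7592, 2088, 102]])  # [CLS] hello world [SEP]
    print(model(input_ids).shape)  # expected: torch.Size([1, 32])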