Source code for zae_engine.nn_night.layers.position_encoding

import torch
import torch.nn as nn
import math


class SinusoidalPositionalEncoding(nn.Module):
    """
    Computes sinusoidal positional encoding as described in the Transformer paper [1]_.

    Parameters
    ----------
    d_model : int
        Dimension of the embedding space. Must be an even number.
    max_len : int, optional
        Maximum sequence length. Default is 512.

    Notes
    -----
    - This method was introduced in the original Transformer paper (Vaswani et al., 2017) [1]_.
    - Uses fixed sine and cosine functions of different frequencies to encode token positions.
    - Benefits: simple and efficient to compute, and captures absolute positional information effectively.
    - Drawbacks: does not capture relative positional information.

    References
    ----------
    .. [1] Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N.,
       Kaiser, Ł., & Polosukhin, I. (2017). Attention is All You Need. In Proceedings of the
       31st International Conference on Neural Information Processing Systems (NeurIPS 2017).
       Available at: https://arxiv.org/abs/1706.03762
       DOI: 10.48550/arXiv.1706.03762
    """

    def __init__(self, d_model, max_len=512):
        super(SinusoidalPositionalEncoding, self).__init__()
        assert d_model % 2 == 0, "d_model must be an even number for sinusoidal positional encoding."
        self.d_model = d_model
        self.max_len = max_len

    def _create_positional_encoding(self, positions):
        """
        Internal method to create positional encodings using sine and cosine functions.

        Parameters
        ----------
        positions : torch.Tensor
            Tensor of positions of shape (batch_size, seq_len, 1) to be encoded.

        Returns
        -------
        torch.Tensor
            Positional encoding tensor of shape (batch_size, seq_len, d_model).
        """
        # Frequencies 1 / 10000^(2i / d_model) for the even embedding indices.
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, device=positions.device).float() * -(math.log(10000.0) / self.d_model)
        )
        pos_enc = torch.zeros(positions.size(0), positions.size(1), self.d_model, device=positions.device)
        # Broadcasting: (batch, seq_len, 1) * (d_model / 2,) -> (batch, seq_len, d_model / 2).
        pos_enc[:, :, 0::2] = torch.sin(positions * div_term)
        pos_enc[:, :, 1::2] = torch.cos(positions * div_term)
        return pos_enc
    def forward(self, x, positions: torch.Tensor = None):
        """
        Apply sinusoidal positional encoding to the input tensor.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, seq_len, d_model).
        positions : torch.Tensor, optional
            Optional tensor of shape (batch_size, seq_len) specifying the positions
            (e.g., timestamps) for each element in the sequence. If not provided, the
            default positions (0 to seq_len - 1) are used.

        Returns
        -------
        torch.Tensor
            Tensor with added positional encoding.
        """
        batch_size, seq_len, _ = x.size()
        if positions is not None:
            assert positions.size(0) == batch_size, "Positions batch size must match input batch size."
            assert positions.size(1) == seq_len, "Positions sequence length must match input sequence length."
        else:
            # Default to absolute token indices 0 .. seq_len - 1 on the input's device.
            positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1).float()
        pos_enc = self._create_positional_encoding(positions.unsqueeze(-1))
        return x + pos_enc
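
# A minimal usage sketch for SinusoidalPositionalEncoding (illustrative only; the shapes and
# position values below are assumptions, not part of the module's documented behavior):
#
# >>> enc = SinusoidalPositionalEncoding(d_model=8, max_len=128)
# >>> x = torch.zeros(2, 5, 8)                       # (batch_size, seq_len, d_model)
# >>> enc(x).shape                                   # default positions 0 .. 4
# torch.Size([2, 5, 8])
# >>> ts = torch.tensor([[0.0, 1.5, 3.0, 4.5, 6.0]]).expand(2, -1)
# >>> enc(x, positions=ts).shape                     # explicit (e.g., timestamp) positions
# torch.Size([2, 5, 8])
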
class LearnablePositionalEncoding(nn.Module):
    """
    Implements learnable positional encoding where positional embeddings are learned during training.

    Parameters
    ----------
    d_model : int
        Dimension of the embedding space.
    max_len : int, optional
        Maximum sequence length. Default is 512.

    Notes
    -----
    - This method allows the model to learn optimal positional encodings during training.
    - Benefits: can adapt the positional encoding to the specific task.
    - Drawbacks: requires additional parameters and training time.

    References
    ----------
    - No specific reference; this approach is commonly used in various models, including BERT and GPT.
    """

    def __init__(self, d_model, max_len=512):
        super(LearnablePositionalEncoding, self).__init__()
        # One learnable d_model-dimensional embedding per position up to max_len.
        self.position_embeddings = nn.Parameter(torch.randn(max_len, d_model))
    def forward(self, x):
        """
        Apply learnable positional encoding to the input tensor.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, seq_len, d_model).

        Returns
        -------
        torch.Tensor
            Tensor with added positional encoding.
        """
        seq_len = x.size(1)
        # Slice the learned table to the current sequence length and broadcast it over the batch.
        pos_enc = self.position_embeddings[:seq_len].unsqueeze(0).expand(x.size(0), -1, -1)
        return x + pos_enc
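
# A minimal usage sketch for LearnablePositionalEncoding (illustrative only; the shapes below
# are assumptions). The positional table is an nn.Parameter, so it shows up in .parameters()
# and is updated by the optimizer along with the rest of the model:
#
# >>> enc = LearnablePositionalEncoding(d_model=8, max_len=128)
# >>> x = torch.zeros(2, 5, 8)
# >>> enc(x).shape
# torch.Size([2, 5, 8])
# >>> sum(p.numel() for p in enc.parameters())       # 128 positions * 8 dims
# 1024
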
class RotaryPositionalEncoding(nn.Module):
    """
    Implements rotary positional encoding as described in "RoFormer: Enhanced Transformer
    with Rotary Position Embedding".

    Parameters
    ----------
    d_model : int
        Dimension of the embedding space. Must be divisible by 2 for rotary encoding.
    """

    def __init__(self, d_model):
        super(RotaryPositionalEncoding, self).__init__()
        assert d_model % 2 == 0, "d_model should be divisible by 2 for rotary encoding."
        self.d_model = d_model
        # Register the inverse frequencies as a non-persistent buffer so they follow the
        # module across devices (e.g., .to("cuda")) without entering the state dict.
        inv_freq = 1.0 / (10000 ** (torch.arange(0, d_model, 2).float() / d_model))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
    def forward(self, x):
        """
        Apply rotary positional encoding to the input tensor.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, seq_len, d_model).

        Returns
        -------
        torch.Tensor
            Tensor with rotary positional encoding applied.
        """
        seq_len = x.size(1)

        # Build the per-position rotation angles: (seq_len, d_model / 2).
        positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
        sinusoidal_inp = torch.einsum("i,j->ij", positions, self.inv_freq)
        sin_enc = torch.sin(sinusoidal_inp)
        cos_enc = torch.cos(sinusoidal_inp)

        # Rotate each (even, odd) feature pair by its angle; the rotated halves are then
        # concatenated along the feature dimension.
        x1, x2 = x[..., ::2], x[..., 1::2]
        x = torch.cat([x1 * cos_enc - x2 * sin_enc, x1 * sin_enc + x2 * cos_enc], dim=-1)
        return x
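
# A minimal usage sketch for RotaryPositionalEncoding (illustrative only; the shapes below are
# assumptions). Because each (even, odd) feature pair is rotated rather than shifted, the
# per-token vector norm is preserved; note that this implementation concatenates the rotated
# halves instead of re-interleaving them, so downstream code should assume that layout:
#
# >>> enc = RotaryPositionalEncoding(d_model=8)
# >>> x = torch.randn(2, 5, 8)
# >>> out = enc(x)
# >>> out.shape
# torch.Size([2, 5, 8])
# >>> torch.allclose(out.norm(dim=-1), x.norm(dim=-1), atol=1e-5)
# True
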
class RelativePositionalEncoding(nn.Module):
    """
    Implements relative positional encoding that captures relative distances between tokens.

    Parameters
    ----------
    d_model : int
        Dimension of the embedding space.
    max_len : int, optional
        Maximum sequence length. Default is 512.

    Notes
    -----
    - This method is used in models like Transformer-XL and T5 [2]_ [3]_.
    - Benefits: handles long sequences and captures relative positions.
    - Drawbacks: may increase computational complexity.

    References
    ----------
    .. [2] Dai, Z., Yang, Z., Yang, Y., Carbonell, J., Le, Q. V., & Salakhutdinov, R. (2019).
       Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context. In Proceedings
       of the 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019).
       Available at: https://arxiv.org/abs/1901.02860
       DOI: 10.48550/arXiv.1901.02860
    .. [3] Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y.,
       Li, W., & Liu, P. J. (2020). Exploring the Limits of Transfer Learning with a Unified
       Text-to-Text Transformer. Journal of Machine Learning Research, 21(140), 1-67.
       Available at: https://arxiv.org/abs/1910.10683
       DOI: 10.48550/arXiv.1910.10683
    """

    def __init__(self, d_model, max_len=512):
        super(RelativePositionalEncoding, self).__init__()
        self.d_model = d_model
        self.max_len = max_len
        # Learnable table with one embedding per position offset up to max_len.
        self.relative_embeddings = nn.Parameter(torch.randn(max_len, d_model))
    def forward(self, x):
        """
        Apply relative positional encoding to the input tensor.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, seq_len, d_model).

        Returns
        -------
        torch.Tensor
            Tensor with added positional encoding.
        """
        seq_len = x.size(1)
        # Slice the embedding table to the current sequence length and broadcast it over the batch.
        pos_enc = self.relative_embeddings[:seq_len].unsqueeze(0).expand(x.size(0), -1, -1)
        return x + pos_enc
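
# A minimal usage sketch for RelativePositionalEncoding (illustrative only; the shapes below
# are assumptions). As implemented here, the learned table is indexed by token position and
# added to the input, so it is called in the same way as LearnablePositionalEncoding:
#
# >>> enc = RelativePositionalEncoding(d_model=8, max_len=128)
# >>> x = torch.zeros(2, 5, 8)
# >>> enc(x).shape
# torch.Size([2, 5, 8])
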
class AdaptivePositionalEncoding(nn.Module):
    """
    Implements adaptive positional encoding that adjusts the position encoding based on the
    input sequence length.

    Parameters
    ----------
    d_model : int
        Dimension of the embedding space.
    max_len : int, optional
        Maximum sequence length. Default is 512.

    Notes
    -----
    - This method adjusts the position encoding dynamically based on sequence length.
    - Benefits: flexible for handling sequences of varying lengths.
    - Drawbacks: requires additional handling for sequences with different lengths.

    References
    ----------
    - No specific reference; this approach is inspired by the need for adaptive positional
      encodings in models handling variable-length sequences.
    """

    def __init__(self, d_model, max_len=512):
        super(AdaptivePositionalEncoding, self).__init__()
        self.d_model = d_model
        self.max_len = max_len
        self.position_embeddings = nn.Parameter(torch.randn(max_len, d_model))
    def forward(self, x, **kwargs):
        """
        Apply adaptive positional encoding to the input tensor based on sequence length.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape (batch_size, seq_len, d_model).
        **kwargs : dict
            Additional keyword arguments, such as 'seq_lengths' (a tensor of shape
            (batch_size,)) for handling variable-length sequences.

        Returns
        -------
        torch.Tensor
            Tensor with added positional encoding. Positions beyond each sequence's length
            are left as zeros.
        """
        seq_lengths = kwargs.get("seq_lengths")
        if seq_lengths is None:
            # Without explicit lengths, treat every sequence as using the full seq_len.
            seq_lengths = torch.full((x.size(0),), x.size(1), dtype=torch.long)

        batch_size, seq_len, _ = x.size()
        output = torch.zeros_like(x)
        for i in range(batch_size):
            length = int(seq_lengths[i])
            # Add positional embeddings only to the valid (non-padded) part of each sequence.
            pos_enc = self.position_embeddings[:length]
            output[i, :length, :] = x[i, :length, :] + pos_enc
        return output
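
# A minimal usage sketch for AdaptivePositionalEncoding (illustrative only; the shapes and
# lengths below are assumptions). When seq_lengths is given, positions past each sequence's
# length receive neither the input nor an encoding, i.e. they come back zeroed:
#
# >>> enc = AdaptivePositionalEncoding(d_model=8, max_len=128)
# >>> x = torch.ones(2, 5, 8)
# >>> out = enc(x, seq_lengths=torch.tensor([5, 3]))
# >>> out.shape
# torch.Size([2, 5, 8])
# >>> out[1, 3:].abs().sum().item()                  # padded tail of sample 1 is all zeros
# 0.0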