Source code for zae_engine.models.foundations.word_embedding

import torch
import torch.nn as nn
import gensim.downloader as api
import numpy as np


class Word2VecEmbedding(nn.Module):
    """
    A PyTorch module that uses pre-trained Word2Vec embeddings from gensim.

    Attributes
    ----------
    embedding : nn.Embedding
        PyTorch embedding layer initialized with Word2Vec pre-trained weights.

    Methods
    -------
    forward(x)
        Passes input tensor through the embedding layer.
    """

    def __init__(self):
        """
        Initializes the Word2VecEmbedding class by loading the pre-trained
        Word2Vec model and copying its weights into a PyTorch nn.Embedding layer.
        """
        super(Word2VecEmbedding, self).__init__()
        self.embedding = self._load_word2vec()

    def _load_word2vec(self):
        """
        Loads the pre-trained Word2Vec model from gensim, creates a PyTorch
        nn.Embedding layer, and initializes it with the Word2Vec weights.

        Returns
        -------
        nn.Embedding
            An nn.Embedding layer whose weights are set to the pre-trained
            Word2Vec embeddings.
        """
        model = api.load("word2vec-google-news-300")
        vocab_size = len(model.key_to_index)
        embedding_dim = model.vector_size
        embedding = nn.Embedding(vocab_size, embedding_dim)
        weights = np.zeros((vocab_size, embedding_dim))
        # key_to_index preserves the gensim vocabulary order, so row i of the
        # weight matrix holds the vector for the word with index i.
        for i, word in enumerate(model.key_to_index):
            weights[i] = model[word]
        embedding.weight.data.copy_(torch.from_numpy(weights))
        return embedding
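    # Note: an equivalent, more direct construction (a sketch, relying on
    # gensim's KeyedVectors API) would reuse the model's weight matrix,
    # since the rows of `model.vectors` are already ordered by `key_to_index`:
    #
    #     embedding = nn.Embedding.from_pretrained(
    #         torch.from_numpy(model.vectors).float(), freeze=False
    #     )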
    def forward(self, x):
        """
        Passes the input tensor through the embedding layer.

        Parameters
        ----------
        x : torch.Tensor
            The input tensor containing indices of words.

        Returns
        -------
        torch.Tensor
            The output tensor with embeddings for the input indices.
        """
        return self.embedding(x)
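
# A minimal usage sketch (an illustration, not part of the module): word indices
# are looked up through gensim's key_to_index mapping, which matches the row
# order of the embedding weights built above. This assumes the model download
# succeeds; out-of-vocabulary tokens would raise a KeyError and need separate
# handling.
#
#     kv = api.load("word2vec-google-news-300")
#     embedder = Word2VecEmbedding()
#     ids = torch.tensor([kv.key_to_index[w] for w in ["king", "queen"]])
#     vectors = embedder(ids)  # shape: (2, 300)
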
class FastTextEmbedding(nn.Module):
    """
    A PyTorch module that uses pre-trained FastText embeddings from gensim.

    Attributes
    ----------
    embedding : nn.Embedding
        PyTorch embedding layer initialized with FastText pre-trained weights.

    Methods
    -------
    forward(x)
        Passes input tensor through the embedding layer.
    """

    def __init__(self):
        """
        Initializes the FastTextEmbedding class by loading the pre-trained
        FastText model and copying its weights into a PyTorch nn.Embedding layer.
        """
        super(FastTextEmbedding, self).__init__()
        self.embedding = self._load_fasttext()

    def _load_fasttext(self):
        """
        Loads the pre-trained FastText model from gensim, creates a PyTorch
        nn.Embedding layer, and initializes it with the FastText weights.

        Returns
        -------
        nn.Embedding
            An nn.Embedding layer whose weights are set to the pre-trained
            FastText embeddings.
        """
        model = api.load("fasttext-wiki-news-subwords-300")
        vocab_size = len(model.key_to_index)
        embedding_dim = model.vector_size
        embedding = nn.Embedding(vocab_size, embedding_dim)
        weights = np.zeros((vocab_size, embedding_dim))
        # As above, vocabulary order from key_to_index determines the row
        # index of each word's vector in the weight matrix.
        for i, word in enumerate(model.key_to_index):
            weights[i] = model[word]
        embedding.weight.data.copy_(torch.from_numpy(weights))
        return embedding
    def forward(self, x):
        """
        Passes the input tensor through the embedding layer.

        Parameters
        ----------
        x : torch.Tensor
            The input tensor containing indices of words.

        Returns
        -------
        torch.Tensor
            The output tensor with embeddings for the input indices.
        """
        return self.embedding(x)
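
# The same pattern works here (a sketch under the same assumptions as above):
#
#     kv = api.load("fasttext-wiki-news-subwords-300")
#     embedder = FastTextEmbedding()
#     ids = torch.tensor([kv.key_to_index[w] for w in ["apple", "banana"]])
#     vectors = embedder(ids)  # shape: (2, 300)
#
# Both classes leave the copied weights trainable; to keep the pre-trained
# vectors fixed during fine-tuning, one could set
# `embedder.embedding.weight.requires_grad = False` after construction.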