Source code for zae_engine.models.foundations.word_embedding

import torch
import torch.nn as nn
import gensim.downloader as api
import numpy as np


class Word2VecEmbedding(nn.Module):
    """
    A PyTorch module that uses pre-trained Word2Vec embeddings from gensim.

    Attributes
    ----------
    embedding : nn.Embedding
        PyTorch embedding layer initialized with Word2Vec pre-trained weights.

    Methods
    -------
    forward(x)
        Passes input tensor through the embedding layer.
    """

    def __init__(self):
        """
        Initializes the Word2VecEmbedding class by loading the pre-trained
        Word2Vec model and copying its weights into a PyTorch nn.Embedding layer.
        """
        super(Word2VecEmbedding, self).__init__()
        self.embedding = self._load_word2vec()

    def _load_word2vec(self):
        """
        Loads the pre-trained Word2Vec model from gensim, creates a PyTorch
        nn.Embedding layer, and initializes it with the Word2Vec weights.

        Returns
        -------
        nn.Embedding
            An nn.Embedding layer whose weights are set to the pre-trained
            Word2Vec embeddings.
        """
        model = api.load("word2vec-google-news-300")
        vocab_size = len(model.key_to_index)
        embedding_dim = model.vector_size
        embedding = nn.Embedding(vocab_size, embedding_dim)
        weights = np.zeros((vocab_size, embedding_dim))
        # key_to_index preserves the gensim vocabulary order, so row i of the
        # weight matrix holds the vector for the word with index i.
        for i, word in enumerate(model.key_to_index):
            weights[i] = model[word]
        embedding.weight.data.copy_(torch.from_numpy(weights))
        return embedding
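    # Note: an equivalent, more direct construction (a sketch, relying on
    # gensim's KeyedVectors API) would reuse the model's weight matrix,
    # since the rows of `model.vectors` are already ordered by `key_to_index`:
    #
    #     embedding = nn.Embedding.from_pretrained(
    #         torch.from_numpy(model.vectors).float(), freeze=False
    #     )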
    def forward(self, x):
        """
        Passes the input tensor through the embedding layer.

        Parameters
        ----------
        x : torch.Tensor
            The input tensor containing indices of words.

        Returns
        -------
        torch.Tensor
            The output tensor with embeddings for the input indices.
        """
        return self.embedding(x)
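
# A minimal usage sketch (an illustration, not part of the module): word indices
# are looked up through gensim's key_to_index mapping, which matches the row
# order of the embedding weights built above. This assumes the model download
# succeeds; out-of-vocabulary tokens would raise a KeyError and need separate
# handling.
#
#     kv = api.load("word2vec-google-news-300")
#     embedder = Word2VecEmbedding()
#     ids = torch.tensor([kv.key_to_index[w] for w in ["king", "queen"]])
#     vectors = embedder(ids)  # shape: (2, 300)
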
class FastTextEmbedding(nn.Module):
    """
    A PyTorch module that uses pre-trained FastText embeddings from gensim.

    Attributes
    ----------
    embedding : nn.Embedding
        PyTorch embedding layer initialized with FastText pre-trained weights.

    Methods
    -------
    forward(x)
        Passes input tensor through the embedding layer.
    """

    def __init__(self):
        """
        Initializes the FastTextEmbedding class by loading the pre-trained
        FastText model and copying its weights into a PyTorch nn.Embedding layer.
        """
        super(FastTextEmbedding, self).__init__()
        self.embedding = self._load_fasttext()

    def _load_fasttext(self):
        """
        Loads the pre-trained FastText model from gensim, creates a PyTorch
        nn.Embedding layer, and initializes it with the FastText weights.

        Returns
        -------
        nn.Embedding
            An nn.Embedding layer whose weights are set to the pre-trained
            FastText embeddings.
        """
        model = api.load("fasttext-wiki-news-subwords-300")
        vocab_size = len(model.key_to_index)
        embedding_dim = model.vector_size
        embedding = nn.Embedding(vocab_size, embedding_dim)
        weights = np.zeros((vocab_size, embedding_dim))
        # As above, vocabulary order from key_to_index determines the row
        # index of each word's vector in the weight matrix.
        for i, word in enumerate(model.key_to_index):
            weights[i] = model[word]
        embedding.weight.data.copy_(torch.from_numpy(weights))
        return embedding
    def forward(self, x):
        """
        Passes the input tensor through the embedding layer.

        Parameters
        ----------
        x : torch.Tensor
            The input tensor containing indices of words.

        Returns
        -------
        torch.Tensor
            The output tensor with embeddings for the input indices.
        """
        return self.embedding(x)
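
# The same pattern works here (a sketch under the same assumptions as above):
#
#     kv = api.load("fasttext-wiki-news-subwords-300")
#     embedder = FastTextEmbedding()
#     ids = torch.tensor([kv.key_to_index[w] for w in ["apple", "banana"]])
#     vectors = embedder(ids)  # shape: (2, 300)
#
# Both classes leave the copied weights trainable; to keep the pre-trained
# vectors fixed during fine-tuning, one could set
# `embedder.embedding.weight.requires_grad = False` after construction.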