Source code for fastNLP.embeddings.embedding

r"""
该模块中的Embedding主要用于随机初始化的embedding(更推荐使用 :class:`fastNLP.embeddings.StaticEmbedding` ),或按照预训练权重初始化Embedding。

"""

__all__ = [
    "Embedding",
    "TokenEmbedding"
]

from abc import abstractmethod

import torch
import torch.nn as nn

from .utils import get_embeddings


class Embedding(nn.Module):
    r"""
    Word embedding layer that supports several kinds of initialization. The vocabulary size is available
    as ``self.num_embedding`` and the embedding dimension as ``self.embedding_dim``.

    Example::

        >>> import numpy as np
        >>> from fastNLP.embeddings import Embedding
        >>> init_embed = (2000, 100)
        >>> embed = Embedding(init_embed)  # randomly initialize embeddings for 2000 words, 100 dims each
        >>> init_embed = np.zeros((2000, 100))
        >>> embed = Embedding(init_embed)  # initialize the Embedding with the values of a numpy.ndarray

    """
    def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None):
        r"""
        :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: either the size of the
            Embedding (a tuple(int, int) whose first int is the vocab_size and second int is the embed_dim),
            or a Tensor, nn.Embedding or numpy.ndarray whose values are used directly to initialize the Embedding.
        :param float word_dropout: probability of randomly replacing a word with unk_index. This gives the unk
            token enough training signal and also has a regularizing effect on the network. When set, unk_index
            must be provided as well.
        :param float dropout: dropout applied to the output of the Embedding.
        :param int unk_index: the index a word is replaced with when it is dropped. The unk_index of fastNLP's
            Vocabulary defaults to 1.
        """
        super(Embedding, self).__init__()

        self.embed = get_embeddings(init_embed)

        self.dropout = nn.Dropout(dropout)
        if not isinstance(self.embed, TokenEmbedding):
            if hasattr(self.embed, 'embed_size'):
                self._embed_size = self.embed.embed_size
            elif hasattr(self.embed, 'embedding_dim'):
                self._embed_size = self.embed.embedding_dim
            else:
                self._embed_size = self.embed.weight.size(1)
            if word_dropout > 0 and not isinstance(unk_index, int):
                raise ValueError("When drop word is set, you need to pass in the unk_index.")
        else:
            self._embed_size = self.embed.embed_size
            unk_index = self.embed.get_word_vocab().unknown_idx
        self.unk_index = unk_index
        self.word_dropout = word_dropout
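    # A minimal sketch (not part of the original module) of the init_embed types accepted above;
    # the sizes and tensors below are illustrative assumptions.
    #
    #     >>> import torch
    #     >>> import torch.nn as nn
    #     >>> from fastNLP.embeddings import Embedding
    #     >>> embed = Embedding((2000, 100))              # tuple -> random init: 2000 words, 100 dims each
    #     >>> embed = Embedding(torch.randn(2000, 100))   # FloatTensor used directly as the weights
    #     >>> embed = Embedding(nn.Embedding(2000, 100))  # wrap an existing nn.Embedding as-is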
    def forward(self, words):
        r"""
        :param torch.LongTensor words: [batch, seq_len]
        :return: torch.Tensor : [batch, seq_len, embed_dim]
        """
        if self.word_dropout > 0 and self.training:
            mask = torch.ones_like(words).float() * self.word_dropout
            mask = torch.bernoulli(mask).eq(1)  # the larger word_dropout is, the more positions become 1
            words = words.masked_fill(mask, self.unk_index)
        words = self.embed(words)
        return self.dropout(words)
    @property
    def num_embedding(self) -> int:
        if isinstance(self.embed, nn.Embedding):
            return self.embed.weight.size(0)
        else:
            return self.embed.num_embeddings

    def __len__(self):
        return len(self.embed)

    @property
    def embed_size(self) -> int:
        return self._embed_size

    @property
    def embedding_dim(self) -> int:
        return self._embed_size

    @property
    def requires_grad(self):
        r"""
        Whether the parameters of the Embedding may be optimized. True: all parameters may be optimized;
        False: no parameter may be optimized; None: some parameters may be optimized while others may not.

        :return:
        """
        if not isinstance(self.embed, TokenEmbedding):
            return self.embed.weight.requires_grad
        else:
            return self.embed.requires_grad

    @requires_grad.setter
    def requires_grad(self, value):
        if not isinstance(self.embed, TokenEmbedding):
            self.embed.weight.requires_grad = value
        else:
            self.embed.requires_grad = value

    @property
    def size(self):
        if isinstance(self.embed, TokenEmbedding):
            return self.embed.size
        else:
            return self.embed.weight.size()
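# A usage sketch (illustrative, not part of the original module): with word_dropout set, each input id is
# replaced with unk_index with probability 0.1 during training, and nn.Dropout is applied to the output
# vectors. The batch size, sequence length and indices below are assumptions.
#
#     >>> import torch
#     >>> from fastNLP.embeddings import Embedding
#     >>> embed = Embedding((2000, 100), word_dropout=0.1, dropout=0.5, unk_index=1)
#     >>> words = torch.randint(0, 2000, (4, 7))   # batch_size=4, seq_len=7
#     >>> embed(words).shape                       # torch.Size([4, 7, 100])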
class TokenEmbedding(nn.Module):
    r"""
    Base class for the various Embedding classes in fastNLP.

    """
    def __init__(self, vocab, word_dropout=0.0, dropout=0.0):
        super(TokenEmbedding, self).__init__()
        if vocab.rebuild:
            vocab.build_vocab()
        assert vocab.padding is not None, "Vocabulary must have a padding entry."
        self._word_vocab = vocab
        self._word_pad_index = vocab.padding_idx
        if word_dropout > 0:
            assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word."
        self.word_dropout = word_dropout
        self._word_unk_index = vocab.unknown_idx
        self.dropout_layer = nn.Dropout(dropout)
    def drop_word(self, words):
        r"""
        Randomly replace entries of words with unknown_index according to the configured rate.

        :param torch.LongTensor words: batch_size x max_len
        :return:
        """
        if self.word_dropout > 0 and self.training:
            mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device)
            mask = torch.bernoulli(mask).eq(1)  # the larger word_dropout is, the more positions become 1
            pad_mask = words.ne(self._word_pad_index)
            mask = mask.__and__(pad_mask)
            words = words.masked_fill(mask, self._word_unk_index)
        return words
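    # The masking logic above, spelled out on plain tensors (an illustrative sketch, not part of the
    # original module): positions are dropped with probability word_dropout, but padding positions are
    # never dropped. The concrete indices below (pad=0, unk=1, rate=0.5) are assumptions.
    #
    #     >>> import torch
    #     >>> words = torch.tensor([[5, 8, 3, 0, 0]])                                    # 0 is the padding index
    #     >>> mask = torch.bernoulli(torch.full_like(words, 0.5, dtype=torch.float)).eq(1)
    #     >>> mask = mask & words.ne(0)                                                  # keep padding untouched
    #     >>> words.masked_fill(mask, 1)                                                 # dropped ids become unk (1)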
    def dropout(self, words):
        r"""
        Apply dropout to the word representations produced by the embedding.

        :param torch.FloatTensor words: batch_size x max_len x embed_size
        :return:
        """
        return self.dropout_layer(words)
    @property
    def requires_grad(self):
        r"""
        Whether the parameters of the Embedding may be optimized. True: all parameters may be optimized;
        False: no parameter may be optimized; None: some parameters may be optimized while others may not.

        :return:
        """
        requires_grads = set([param.requires_grad for param in self.parameters()])
        if len(requires_grads) == 1:
            return requires_grads.pop()
        else:
            return None

    @requires_grad.setter
    def requires_grad(self, value):
        for param in self.parameters():
            param.requires_grad = value

    def __len__(self):
        return len(self._word_vocab)

    @property
    def embed_size(self) -> int:
        return self._embed_size

    @property
    def embedding_dim(self) -> int:
        return self._embed_size

    @property
    def num_embeddings(self) -> int:
        r"""
        This value may be larger than the actual size of the embedding matrix.

        :return:
        """
        return len(self._word_vocab)
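    # An illustrative sketch (not part of the original module): the requires_grad property toggles
    # optimization for every parameter of the embedding at once, e.g. to freeze pre-trained weights.
    # How ``embed`` was constructed is left to the caller and is an assumption here.
    #
    #     >>> embed.requires_grad = False   # freeze all embedding parameters
    #     >>> embed.requires_grad           # -> False (None would mean a mix of frozen and trainable)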
    def get_word_vocab(self):
        r"""
        Return the vocabulary of the embedding.

        :return: Vocabulary
        """
        return self._word_vocab
    @property
    def size(self):
        return torch.Size((self.num_embeddings, self._embed_size))

    @abstractmethod
    def forward(self, words):
        raise NotImplementedError
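# A minimal sketch of how a concrete embedding subclasses TokenEmbedding (illustrative only, not part of
# fastNLP): forward() is expected to apply drop_word to the ids and dropout to the resulting vectors.
# The class name LookupEmbedding, the plain nn.Embedding lookup and the toy vocabulary are assumptions.
#
#     >>> import torch.nn as nn
#     >>> from fastNLP import Vocabulary
#     >>> class LookupEmbedding(TokenEmbedding):
#     ...     def __init__(self, vocab, embed_dim, word_dropout=0.0, dropout=0.0):
#     ...         super().__init__(vocab, word_dropout=word_dropout, dropout=dropout)
#     ...         self._embed_size = embed_dim
#     ...         self.embedding = nn.Embedding(len(vocab), embed_dim, padding_idx=self._word_pad_index)
#     ...     def forward(self, words):
#     ...         words = self.drop_word(words)              # randomly replace ids with unk while training
#     ...         return self.dropout(self.embedding(words))  # look up vectors, then apply dropout
#     >>> vocab = Vocabulary()
#     >>> vocab.add_word_lst(["the", "quick", "brown", "fox"])
#     >>> embed = LookupEmbedding(vocab, embed_dim=50, word_dropout=0.1)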