Source code for fastNLP.models.star_transformer

r"""
Star-Transformer 的 Pytorch 实现。
"""
__all__ = [
    "StarTransEnc",
    "STNLICls",
    "STSeqCls",
    "STSeqLabel",
]

import torch
from torch import nn

from ..core.const import Const
from ..core.utils import seq_len_to_mask
from ..embeddings.utils import get_embeddings
from ..modules.encoder.star_transformer import StarTransformer


class StarTransEnc(nn.Module):
    r"""
    Star-Transformer encoder with word embeddings.
    """

    def __init__(self, embed,
                 hidden_size,
                 num_layers,
                 num_head,
                 head_dim,
                 max_len,
                 emb_dropout,
                 dropout):
        r"""
        :param embed: the word embedding. Can be a tuple (num_embeddings, embedding_dim),
            i.e. the size of the embedding and the dimension of each word vector, or an
            nn.Embedding object, which is then used as the embedding directly.
        :param hidden_size: feature dimension of the model.
        :param num_layers: number of layers.
        :param num_head: number of heads in the multi-head attention.
        :param head_dim: feature dimension of each head in the multi-head attention.
        :param max_len: maximum input length the model accepts.
        :param emb_dropout: dropout probability of the word embedding.
        :param dropout: dropout probability of all parts of the model except the word embedding.
        """
        super(StarTransEnc, self).__init__()
        self.embedding = get_embeddings(embed)
        emb_dim = self.embedding.embedding_dim
        self.emb_fc = nn.Linear(emb_dim, hidden_size)
        # self.emb_drop = nn.Dropout(emb_dropout)
        self.encoder = StarTransformer(hidden_size=hidden_size,
                                       num_layers=num_layers,
                                       num_head=num_head,
                                       head_dim=head_dim,
                                       dropout=dropout,
                                       max_len=max_len)

    def forward(self, x, mask):
        r"""
        :param LongTensor x: [batch, length] word indices of the input sequence
        :param ByteTensor mask: [batch, length] padding mask of the input sequence;
            0 where there is no content (padding), 1 otherwise
        :return: [batch, length, hidden] the encoded output sequence, and
            [batch, hidden] the global relay node; see the paper for details
        """
        x = self.embedding(x)
        x = self.emb_fc(x)
        nodes, relay = self.encoder(x, mask)
        return nodes, relay
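

# A minimal usage sketch for StarTransEnc, not part of the original module.
# The vocabulary size, dimensions and random inputs below are assumptions for
# illustration; passing a (num_embeddings, embedding_dim) tuple as `embed`
# makes get_embeddings construct a fresh nn.Embedding.
def _star_trans_enc_example():
    enc = StarTransEnc(embed=(1000, 100), hidden_size=300, num_layers=4,
                       num_head=8, head_dim=32, max_len=512,
                       emb_dropout=0.1, dropout=0.1)
    words = torch.randint(0, 1000, (2, 7))        # [batch, length] word indices
    mask = seq_len_to_mask(torch.tensor([7, 5]))  # [batch, length]; 0 marks padding
    nodes, relay = enc(words, mask)
    return nodes.shape, relay.shape               # [2, 7, 300] and [2, 300]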


class _Cls(nn.Module):
    def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1):
        super(_Cls, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(in_dim, hid_dim),
            nn.LeakyReLU(),
            nn.Dropout(dropout),
            nn.Linear(hid_dim, num_cls),
        )

    def forward(self, x):
        h = self.fc(x)
        return h


class _NLICls(nn.Module):
    def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1):
        super(_NLICls, self).__init__()
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(in_dim * 4, hid_dim),  # 4
            nn.LeakyReLU(),
            nn.Dropout(dropout),
            nn.Linear(hid_dim, num_cls),
        )

    def forward(self, x1, x2):
        x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1)
        h = self.fc(x)
        return h
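

# A small sketch, not part of the original module, of the matching features
# _NLICls.forward builds (and the reason for the `in_dim * 4` above): for two
# sentence vectors of size hidden, the classifier sees their concatenation,
# absolute difference and element-wise product. The batch size and hidden
# dimension are assumptions for illustration.
def _nli_feature_example():
    x1 = torch.randn(2, 300)  # sentence-1 representation [batch, hidden]
    x2 = torch.randn(2, 300)  # sentence-2 representation [batch, hidden]
    feats = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1)
    return feats.shape        # [2, 1200] == [batch, 4 * hidden]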


class STSeqLabel(nn.Module):
    r"""
    Star-Transformer model for sequence labeling.
    """

    def __init__(self, embed, num_cls,
                 hidden_size=300,
                 num_layers=4,
                 num_head=8,
                 head_dim=32,
                 max_len=512,
                 cls_hidden_size=600,
                 emb_dropout=0.1,
                 dropout=0.1,
                 ):
        r"""
        :param embed: the word embedding. Can be a tuple (num_embeddings, embedding_dim),
            i.e. the size of the embedding and the dimension of each word vector, or an
            nn.Embedding object, which is then used as the embedding directly.
        :param num_cls: number of output classes
        :param hidden_size: feature dimension of the model. Default: 300
        :param num_layers: number of layers. Default: 4
        :param num_head: number of heads in the multi-head attention. Default: 8
        :param head_dim: feature dimension of each head in the multi-head attention. Default: 32
        :param max_len: maximum input length the model accepts. Default: 512
        :param cls_hidden_size: hidden-layer dimension of the classifier. Default: 600
        :param emb_dropout: dropout probability of the word embedding. Default: 0.1
        :param dropout: dropout probability of all parts of the model except the word embedding. Default: 0.1
        """
        super(STSeqLabel, self).__init__()
        self.enc = StarTransEnc(embed=embed,
                                hidden_size=hidden_size,
                                num_layers=num_layers,
                                num_head=num_head,
                                head_dim=head_dim,
                                max_len=max_len,
                                emb_dropout=emb_dropout,
                                dropout=dropout)
        self.cls = _Cls(hidden_size, num_cls, cls_hidden_size)

    def forward(self, words, seq_len):
        r"""
        :param words: [batch, seq_len] input sequence
        :param seq_len: [batch,] lengths of the input sequences
        :return output: [batch, num_cls, seq_len] class scores for each element of the sequence
        """
        mask = seq_len_to_mask(seq_len)
        nodes, _ = self.enc(words, mask)
        output = self.cls(nodes)
        output = output.transpose(1, 2)  # move the class dimension to dim 1
        return {Const.OUTPUT: output}  # [bsz, n_cls, seq_len]

    def predict(self, words, seq_len):
        r"""
        :param words: [batch, seq_len] input sequence
        :param seq_len: [batch,] lengths of the input sequences
        :return output: [batch, seq_len] predicted class of each element of the sequence
        """
        y = self.forward(words, seq_len)
        _, pred = y[Const.OUTPUT].max(1)
        return {Const.OUTPUT: pred}
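

# A minimal end-to-end sketch for STSeqLabel, not part of the original module;
# the vocabulary size, class count and inputs are assumptions for
# illustration. Both forward() and predict() return their result under the
# Const.OUTPUT key.
def _st_seq_label_example():
    model = STSeqLabel(embed=(1000, 100), num_cls=5)
    words = torch.randint(0, 1000, (2, 7))  # [batch, seq_len] word indices
    seq_len = torch.tensor([7, 5])          # [batch,] true sequence lengths
    scores = model(words, seq_len)[Const.OUTPUT]        # [2, 5, 7]
    tags = model.predict(words, seq_len)[Const.OUTPUT]  # [2, 7]
    return scores.shape, tags.shape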


class STSeqCls(nn.Module):
    r"""
    Star-Transformer for sequence classification.
    """

    def __init__(self, embed, num_cls,
                 hidden_size=300,
                 num_layers=4,
                 num_head=8,
                 head_dim=32,
                 max_len=512,
                 cls_hidden_size=600,
                 emb_dropout=0.1,
                 dropout=0.1,
                 ):
        r"""
        :param embed: the word embedding. Can be a tuple (num_embeddings, embedding_dim),
            i.e. the size of the embedding and the dimension of each word vector, or an
            nn.Embedding object, which is then used as the embedding directly.
        :param num_cls: number of output classes
        :param hidden_size: feature dimension of the model. Default: 300
        :param num_layers: number of layers. Default: 4
        :param num_head: number of heads in the multi-head attention. Default: 8
        :param head_dim: feature dimension of each head in the multi-head attention. Default: 32
        :param max_len: maximum input length the model accepts. Default: 512
        :param cls_hidden_size: hidden-layer dimension of the classifier. Default: 600
        :param emb_dropout: dropout probability of the word embedding. Default: 0.1
        :param dropout: dropout probability of all parts of the model except the word embedding. Default: 0.1
        """
        super(STSeqCls, self).__init__()
        self.enc = StarTransEnc(embed=embed,
                                hidden_size=hidden_size,
                                num_layers=num_layers,
                                num_head=num_head,
                                head_dim=head_dim,
                                max_len=max_len,
                                emb_dropout=emb_dropout,
                                dropout=dropout)
        self.cls = _Cls(hidden_size, num_cls, cls_hidden_size, dropout=dropout)

    def forward(self, words, seq_len):
        r"""
        :param words: [batch, seq_len] input sequence
        :param seq_len: [batch,] lengths of the input sequences
        :return output: [batch, num_cls] class scores of the sequence
        """
        mask = seq_len_to_mask(seq_len)
        nodes, relay = self.enc(words, mask)
        y = 0.5 * (relay + nodes.max(1)[0])
        output = self.cls(y)  # [bsz, n_cls]
        return {Const.OUTPUT: output}

    def predict(self, words, seq_len):
        r"""
        :param words: [batch, seq_len] input sequence
        :param seq_len: [batch,] lengths of the input sequences
        :return output: [batch,] predicted class of the sequence
        """
        y = self.forward(words, seq_len)
        _, pred = y[Const.OUTPUT].max(1)
        return {Const.OUTPUT: pred}
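

# A minimal usage sketch for STSeqCls, not part of the original module; sizes
# and inputs are assumptions for illustration. As in forward() above, the
# sequence representation fed to the classifier is the average of the relay
# node and the max-pooled token nodes.
def _st_seq_cls_example():
    model = STSeqCls(embed=(1000, 100), num_cls=3)
    words = torch.randint(0, 1000, (2, 7))
    seq_len = torch.tensor([7, 5])
    logits = model(words, seq_len)[Const.OUTPUT]        # [2, 3]
    pred = model.predict(words, seq_len)[Const.OUTPUT]  # [2]
    return logits.shape, pred.shape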


class STNLICls(nn.Module):
    r"""
    Star-Transformer for natural language inference (NLI).
    """

    def __init__(self, embed, num_cls,
                 hidden_size=300,
                 num_layers=4,
                 num_head=8,
                 head_dim=32,
                 max_len=512,
                 cls_hidden_size=600,
                 emb_dropout=0.1,
                 dropout=0.1,
                 ):
        r"""
        :param embed: the word embedding. Can be a tuple (num_embeddings, embedding_dim),
            i.e. the size of the embedding and the dimension of each word vector, or an
            nn.Embedding object, which is then used as the embedding directly.
        :param num_cls: number of output classes
        :param hidden_size: feature dimension of the model. Default: 300
        :param num_layers: number of layers. Default: 4
        :param num_head: number of heads in the multi-head attention. Default: 8
        :param head_dim: feature dimension of each head in the multi-head attention. Default: 32
        :param max_len: maximum input length the model accepts. Default: 512
        :param cls_hidden_size: hidden-layer dimension of the classifier. Default: 600
        :param emb_dropout: dropout probability of the word embedding. Default: 0.1
        :param dropout: dropout probability of all parts of the model except the word embedding. Default: 0.1
        """
        super(STNLICls, self).__init__()
        self.enc = StarTransEnc(embed=embed,
                                hidden_size=hidden_size,
                                num_layers=num_layers,
                                num_head=num_head,
                                head_dim=head_dim,
                                max_len=max_len,
                                emb_dropout=emb_dropout,
                                dropout=dropout)
        self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size)

    def forward(self, words1, words2, seq_len1, seq_len2):
        r"""
        :param words1: [batch, seq_len] input sequence 1
        :param words2: [batch, seq_len] input sequence 2
        :param seq_len1: [batch,] lengths of the sequences in input 1
        :param seq_len2: [batch,] lengths of the sequences in input 2
        :return output: [batch, num_cls] class scores
        """
        mask1 = seq_len_to_mask(seq_len1)
        mask2 = seq_len_to_mask(seq_len2)

        def enc(seq, mask):
            nodes, relay = self.enc(seq, mask)
            return 0.5 * (relay + nodes.max(1)[0])

        y1 = enc(words1, mask1)
        y2 = enc(words2, mask2)
        output = self.cls(y1, y2)  # [bsz, n_cls]
        return {Const.OUTPUT: output}

    def predict(self, words1, words2, seq_len1, seq_len2):
        r"""
        :param words1: [batch, seq_len] input sequence 1
        :param words2: [batch, seq_len] input sequence 2
        :param seq_len1: [batch,] lengths of the sequences in input 1
        :param seq_len2: [batch,] lengths of the sequences in input 2
        :return output: [batch,] predicted class
        """
        y = self.forward(words1, words2, seq_len1, seq_len2)
        _, pred = y[Const.OUTPUT].max(1)
        return {Const.OUTPUT: pred}
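

# A minimal usage sketch for STNLICls, not part of the original module; sizes
# and inputs are assumptions for illustration. Both sentences are encoded by
# the same shared StarTransEnc and then matched by _NLICls.
def _st_nli_cls_example():
    model = STNLICls(embed=(1000, 100), num_cls=3)
    words1 = torch.randint(0, 1000, (2, 9))
    words2 = torch.randint(0, 1000, (2, 6))
    seq_len1 = torch.tensor([9, 7])
    seq_len2 = torch.tensor([6, 4])
    pred = model.predict(words1, words2, seq_len1, seq_len2)[Const.OUTPUT]
    return pred.shape  # [2]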