Create model.py
This commit is contained in:
Parent 9537cf59a8
Commit 7bdc032d5d

@@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from data import get_dataset
from lr import PolynomialDecayLR
import torch
import math
import torch.nn as nn
import pytorch_lightning as pl

from utils.flag import flag, flag_bounded

def init_model_params(module, n_layers):
    # Depth-scaled init: linear weights get std 0.02 / sqrt(n_layers), embeddings std 0.02.
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=0.02 / math.sqrt(n_layers))
        if module.bias is not None:
            module.bias.data.zero_()
    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=0.02)

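# Usage sketch (not part of this hunk): init_model_params is meant to be applied to
# every submodule via nn.Module.apply, closing over the model depth. The toy model
# and layer count below are hypothetical, purely to illustrate the call pattern.
toy_model = nn.Sequential(nn.Embedding(100, 64), nn.Linear(64, 64))
toy_model.apply(lambda m: init_model_params(m, n_layers=12))
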
# pre_ln: On Layer Normalization in the Transformer Architecture (http://proceedings.mlr.press/v119/xiong20b/xiong20b.pdf)
class EncoderLayer(nn.Module):
    def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, head_size):
        super(EncoderLayer, self).__init__()

        self.self_attention_norm = nn.LayerNorm(hidden_size)
        self.self_attention = MultiHeadAttention(hidden_size, attention_dropout_rate, head_size)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        self.ffn_norm = nn.LayerNorm(hidden_size)
        self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, attn_bias=None):
        # Pre-LN self-attention sub-layer: normalize, attend, dropout, then residual add.
        y = self.self_attention_norm(x)
        y = self.self_attention(y, y, y, attn_bias)
        y = self.self_attention_dropout(y)
        x = x + y

        # Pre-LN feed-forward sub-layer with its own residual connection.
        y = self.ffn_norm(x)
        y = self.ffn(y)
        y = self.ffn_dropout(y)
        x = x + y
        return x
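# FeedForwardNetwork is referenced above but not defined in this hunk (the file
# continues past the 44 lines shown). Below is a minimal sketch of a standard
# Transformer FFN matching the (hidden_size, ffn_size, dropout_rate) constructor
# used above; the real definition in this commit may differ, e.g. in activation
# choice or where dropout is applied. Relies on the imports at the top of the file.
class FeedForwardNetwork(nn.Module):
    def __init__(self, hidden_size, ffn_size, dropout_rate):
        super(FeedForwardNetwork, self).__init__()
        self.layer1 = nn.Linear(hidden_size, ffn_size)
        self.gelu = nn.GELU()
        self.layer2 = nn.Linear(ffn_size, hidden_size)

    def forward(self, x):
        # Expand to ffn_size, apply the nonlinearity, project back to hidden_size.
        x = self.layer1(x)
        x = self.gelu(x)
        x = self.layer2(x)
        return x
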
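# MultiHeadAttention is likewise not defined in this hunk. The sketch below matches
# the constructor and call signature used above, i.e.
# MultiHeadAttention(hidden_size, attention_dropout_rate, head_size) and
# self_attention(q, k, v, attn_bias), and assumes attn_bias is an additive bias on
# the attention scores (as in Graphormer-style structural encodings). It is an
# illustration, not the implementation shipped in this commit.
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, attention_dropout_rate, head_size):
        super(MultiHeadAttention, self).__init__()
        self.head_size = head_size
        self.att_size = att_size = hidden_size // head_size
        self.scale = att_size ** -0.5

        self.linear_q = nn.Linear(hidden_size, head_size * att_size)
        self.linear_k = nn.Linear(hidden_size, head_size * att_size)
        self.linear_v = nn.Linear(hidden_size, head_size * att_size)
        self.att_dropout = nn.Dropout(attention_dropout_rate)
        self.output_layer = nn.Linear(head_size * att_size, hidden_size)

    def forward(self, q, k, v, attn_bias=None):
        batch_size = q.size(0)
        # Project and split into heads: [batch, heads, seq_len, att_size].
        q = self.linear_q(q).view(batch_size, -1, self.head_size, self.att_size).transpose(1, 2)
        k = self.linear_k(k).view(batch_size, -1, self.head_size, self.att_size).transpose(1, 2)
        v = self.linear_v(v).view(batch_size, -1, self.head_size, self.att_size).transpose(1, 2)

        # Scaled dot-product scores, plus the optional additive attention bias.
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        if attn_bias is not None:
            scores = scores + attn_bias
        attn = torch.softmax(scores, dim=-1)
        attn = self.att_dropout(attn)

        # Weighted sum of values, then merge heads and project back to hidden_size.
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.head_size * self.att_size)
        return self.output_layer(out)
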
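# Illustrative forward pass with hypothetical sizes (not from this commit), assuming
# the EncoderLayer above plus MultiHeadAttention/FeedForwardNetwork definitions are in
# scope. Here attn_bias is shaped [batch, 1, nodes, nodes] and broadcast over heads;
# the commit's own bias tensors may use a different layout.
layer = EncoderLayer(hidden_size=64, ffn_size=128, dropout_rate=0.1,
                     attention_dropout_rate=0.1, head_size=8)
x = torch.randn(2, 10, 64)               # [batch, nodes, hidden_size]
attn_bias = torch.zeros(2, 1, 10, 10)    # additive bias on attention scores
out = layer(x, attn_bias)
print(out.shape)                         # torch.Size([2, 10, 64])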