Create model.py
This commit is contained in:
Parent 9537cf59a8
Commit 7bdc032d5d

@@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from data import get_dataset
from lr import PolynomialDecayLR
import torch
import math
import torch.nn as nn
import pytorch_lightning as pl

from utils.flag import flag, flag_bounded

def init_model_params(module, n_layers):
    # Depth-scaled init: linear weights get std 0.02 / sqrt(n_layers), embeddings std 0.02.
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=0.02 / math.sqrt(n_layers))
        if module.bias is not None:
            module.bias.data.zero_()
    if isinstance(module, nn.Embedding):
        module.weight.data.normal_(mean=0.0, std=0.02)

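# Usage sketch (not part of this hunk): init_model_params is meant to be applied to
# every submodule via nn.Module.apply, closing over the model depth. The toy model
# and layer count below are hypothetical, purely to illustrate the call pattern.
toy_model = nn.Sequential(nn.Embedding(100, 64), nn.Linear(64, 64))
toy_model.apply(lambda m: init_model_params(m, n_layers=12))
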
# pre_ln: On Layer Normalization in the Transformer Architecture (http://proceedings.mlr.press/v119/xiong20b/xiong20b.pdf)
class EncoderLayer(nn.Module):
    def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, head_size):
        super(EncoderLayer, self).__init__()

        self.self_attention_norm = nn.LayerNorm(hidden_size)
        self.self_attention = MultiHeadAttention(hidden_size, attention_dropout_rate, head_size)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        self.ffn_norm = nn.LayerNorm(hidden_size)
        self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, attn_bias=None):
        # Pre-LN self-attention sub-layer: normalize, attend, dropout, then residual add.
        y = self.self_attention_norm(x)
        y = self.self_attention(y, y, y, attn_bias)
        y = self.self_attention_dropout(y)
        x = x + y

        # Pre-LN feed-forward sub-layer with its own residual connection.
        y = self.ffn_norm(x)
        y = self.ffn(y)
        y = self.ffn_dropout(y)
        x = x + y
        return x
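# FeedForwardNetwork is referenced above but not defined in this hunk (the file
# continues past the 44 lines shown). Below is a minimal sketch of a standard
# Transformer FFN matching the (hidden_size, ffn_size, dropout_rate) constructor
# used above; the real definition in this commit may differ, e.g. in activation
# choice or where dropout is applied. Relies on the imports at the top of the file.
class FeedForwardNetwork(nn.Module):
    def __init__(self, hidden_size, ffn_size, dropout_rate):
        super(FeedForwardNetwork, self).__init__()
        self.layer1 = nn.Linear(hidden_size, ffn_size)
        self.gelu = nn.GELU()
        self.layer2 = nn.Linear(ffn_size, hidden_size)

    def forward(self, x):
        # Expand to ffn_size, apply the nonlinearity, project back to hidden_size.
        x = self.layer1(x)
        x = self.gelu(x)
        x = self.layer2(x)
        return x
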
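# MultiHeadAttention is likewise not defined in this hunk. The sketch below matches
# the constructor and call signature used above, i.e.
# MultiHeadAttention(hidden_size, attention_dropout_rate, head_size) and
# self_attention(q, k, v, attn_bias), and assumes attn_bias is an additive bias on
# the attention scores (as in Graphormer-style structural encodings). It is an
# illustration, not the implementation shipped in this commit.
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, attention_dropout_rate, head_size):
        super(MultiHeadAttention, self).__init__()
        self.head_size = head_size
        self.att_size = att_size = hidden_size // head_size
        self.scale = att_size ** -0.5

        self.linear_q = nn.Linear(hidden_size, head_size * att_size)
        self.linear_k = nn.Linear(hidden_size, head_size * att_size)
        self.linear_v = nn.Linear(hidden_size, head_size * att_size)
        self.att_dropout = nn.Dropout(attention_dropout_rate)
        self.output_layer = nn.Linear(head_size * att_size, hidden_size)

    def forward(self, q, k, v, attn_bias=None):
        batch_size = q.size(0)
        # Project and split into heads: [batch, heads, seq_len, att_size].
        q = self.linear_q(q).view(batch_size, -1, self.head_size, self.att_size).transpose(1, 2)
        k = self.linear_k(k).view(batch_size, -1, self.head_size, self.att_size).transpose(1, 2)
        v = self.linear_v(v).view(batch_size, -1, self.head_size, self.att_size).transpose(1, 2)

        # Scaled dot-product scores, plus the optional additive attention bias.
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        if attn_bias is not None:
            scores = scores + attn_bias
        attn = torch.softmax(scores, dim=-1)
        attn = self.att_dropout(attn)

        # Weighted sum of values, then merge heads and project back to hidden_size.
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.head_size * self.att_size)
        return self.output_layer(out)
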
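# Illustrative forward pass with hypothetical sizes (not from this commit), assuming
# the EncoderLayer above plus MultiHeadAttention/FeedForwardNetwork definitions are in
# scope. Here attn_bias is shaped [batch, 1, nodes, nodes] and broadcast over heads;
# the commit's own bias tensors may use a different layout.
layer = EncoderLayer(hidden_size=64, ffn_size=128, dropout_rate=0.1,
                     attention_dropout_rate=0.1, head_size=8)
x = torch.randn(2, 10, 64)               # [batch, nodes, hidden_size]
attn_bias = torch.zeros(2, 1, 10, 10)    # additive bias on attention scores
out = layer(x, attn_bias)
print(out.shape)                         # torch.Size([2, 10, 64])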