2. Add DeBERTa-v2 xlarge, xxlarge and MNLI xlarge-v2, xxlarge-v2 models
3. Fix GLUE data downloading issue.
4. Support plugin tasks
5. Update experiments
Pengcheng He 2021-01-26 15:17:10 -05:00
Parent d9e01c6461
Commit 839e3b45c7
72 changed files with 2077 additions and 2095 deletions

View file

@ -1 +1,3 @@
from .task_registry import tasks
import os
# This statement must be executed at the very beginning, i.e. before import torch
os.environ["OMP_NUM_THREADS"] = "1"

View file

@ -0,0 +1,3 @@
from .ner import *
from .multi_choice import *
from .sequence_classification import *

View file

@ -15,17 +15,17 @@ import torch
from torch.nn import CrossEntropyLoss
import math
from ..deberta import *
from ..utils import *
from ...deberta import *
from ...utils import *
import pdb
__all__ = ['MultiChoiceModel']
class MultiChoiceModel(NNModule):
def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
super().__init__(config)
self.bert = DeBERTa(config)
self.deberta = DeBERTa(config)
self.num_labels = num_labels
self.classifier = nn.Linear(config.hidden_size, 1)
self.classifier = torch.nn.Linear(config.hidden_size, 1)
drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.apply(self.init_weights)
@ -39,7 +39,7 @@ class MultiChoiceModel(NNModule):
position_ids = position_ids.view([-1, position_ids.size(-1)])
if input_mask is not None:
input_mask = input_mask.view([-1, input_mask.size(-1)])
encoder_layers = self.bert(input_ids, token_type_ids=type_ids, attention_mask=input_mask,
encoder_layers = self.deberta(input_ids, token_type_ids=type_ids, attention_mask=input_mask,
position_ids=position_ids, output_all_encoded_layers=True)
seqout = encoder_layers[-1]
cls = seqout[:,:1,:]

View file

@ -15,7 +15,7 @@ import torch
import math
from torch import nn
from torch.nn import CrossEntropyLoss
from ..deberta import DeBERTa,NNModule,ACT2FN,StableDropout
from ...deberta import DeBERTa,NNModule,ACT2FN,StableDropout
__all__ = ['NERModel']

View file

@ -14,22 +14,24 @@ from __future__ import print_function
import torch
from torch.nn import CrossEntropyLoss
import math
import pdb
from ..deberta import *
from ..utils import *
from ...deberta import *
from ...utils import *
__all__= ['SequenceClassificationModel']
class SequenceClassificationModel(NNModule):
def __init__(self, config, num_labels=2, drop_out=None, pre_trained=None):
super().__init__(config)
self.num_labels = num_labels
self.bert = DeBERTa(config, pre_trained=pre_trained)
self._register_load_state_dict_pre_hook(self._pre_load_hook)
self.deberta = DeBERTa(config, pre_trained=pre_trained)
if pre_trained is not None:
self.config = self.bert.config
self.config = self.deberta.config
else:
self.config = config
pool_config = PoolConfig(self.config)
output_dim = self.bert.config.hidden_size
output_dim = self.deberta.config.hidden_size
self.pooler = ContextPooler(pool_config)
output_dim = self.pooler.output_dim()
@ -37,10 +39,10 @@ class SequenceClassificationModel(NNModule):
drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
self.dropout = StableDropout(drop_out)
self.apply(self.init_weights)
self.bert.apply_state()
self.deberta.apply_state()
def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
encoder_layers = self.bert(input_ids, attention_mask=input_mask, token_type_ids=type_ids,
encoder_layers = self.deberta(input_ids, attention_mask=input_mask, token_type_ids=type_ids,
position_ids=position_ids, output_all_encoded_layers=True)
pooled_output = self.pooler(encoder_layers[-1])
pooled_output = self.dropout(pooled_output)
@ -69,3 +71,15 @@ class SequenceClassificationModel(NNModule):
loss = -((log_softmax(logits)*labels).sum(-1)*label_confidence).mean()
return (logits,loss)
def _pre_load_hook(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
new_state = dict()
bert_prefix = prefix + 'bert.'
deberta_prefix = prefix + 'deberta.'
for k in list(state_dict.keys()):
if k.startswith(bert_prefix):
nk = deberta_prefix + k[len(bert_prefix):]
value = state_dict[k]
del state_dict[k]
state_dict[nk] = value
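
The _pre_load_hook above keeps checkpoints saved before this rename (when the backbone attribute was still called bert) loadable into the new deberta attribute. A minimal, self-contained sketch of that key remapping; the toy checkpoint keys below are purely illustrative:

# Sketch of the remapping done by _pre_load_hook; the state-dict keys are illustrative only.
def remap_bert_to_deberta(state_dict, prefix=''):
    bert_prefix = prefix + 'bert.'
    deberta_prefix = prefix + 'deberta.'
    for k in list(state_dict.keys()):
        if k.startswith(bert_prefix):
            # rename bert.* keys in place; all other keys are left untouched
            state_dict[deberta_prefix + k[len(bert_prefix):]] = state_dict.pop(k)
    return state_dict

old_checkpoint = {'bert.embeddings.word_embeddings.weight': None, 'classifier.weight': None}
print(remap_bert_to_deberta(old_checkpoint))
# {'classifier.weight': None, 'deberta.embeddings.word_embeddings.weight': None}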

View file

@ -10,6 +10,7 @@
"""DeBERTa finetuning runner."""
import os
os.environ["OMP_NUM_THREADS"] = "1"
from collections import OrderedDict, Mapping, Sequence
import argparse
import random
@ -20,13 +21,14 @@ import math
import torch
import json
from torch.utils.data import DataLoader
from ..deberta import GPT2Tokenizer
from ..deberta import tokenizers,load_vocab
from ..utils import *
from ..utils import xtqdm as tqdm
from .task_registry import tasks
from .tasks import load_tasks,get_task
from ..training import DistributedTrainer, initialize_distributed, batch_to, set_random_seed,kill_children
from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, AsyncDataLoader
import LASER
from LASER.training import DistributedTrainer, initialize_distributed, batch_to, set_random_seed,kill_children
from LASER.data import DistributedBatchSampler, SequentialSampler, BatchSampler, AsyncDataLoader
def create_model(args, num_labels, model_class_fn):
# Prepare model
@ -38,6 +40,7 @@ def create_model(args, num_labels, model_class_fn):
if args.fp16:
model = model.half()
logger.info(f'Total parameters: {sum([p.numel() for p in model.parameters()])}')
return model
def train_model(args, model, device, train_data, eval_data):
@ -58,7 +61,7 @@ def train_model(args, model, device, train_data, eval_data):
_, loss = model(**data)
return loss.mean(), data['input_ids'].size(0)
trainer = DistributedTrainer(args, model, device, data_fn, loss_fn = loss_fn, eval_fn = eval_fn, dump_interval = args.dump_interval)
trainer = DistributedTrainer(args, args.output_dir, model, device, data_fn, loss_fn = loss_fn, eval_fn = eval_fn, dump_interval = args.dump_interval)
trainer.train()
def merge_distributed(data_list, max_len=None):
@ -191,54 +194,52 @@ def run_predict(args, model, device, eval_data, prefix=None):
batch_sampler = DistributedBatchSampler(batch_sampler, rank=args.rank, world_size=args.world_size)
eval_dataloader = DataLoader(eval_item.data, batch_sampler=batch_sampler, num_workers=args.workers)
model.eval()
predicts=None
predicts = []
for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=args.rank>0):
batch = batch_to(batch, device)
with torch.no_grad():
logits, _ = model(**batch)
if args.world_size>1:
logits_all = [torch.zeros_like(logits) for _ in range(args.world_size)]
torch.distributed.all_gather(logits_all, logits)
torch.cuda.synchronize()
logits = torch.cat(logits_all)
logits = logits.detach().cpu().numpy()
if predicts is None:
predicts = np.copy(logits)
else:
predicts = np.append(predicts, logits, axis=0)
predicts = predicts[:len(eval_item.data)]
predicts.append(logits)
predicts = merge_distributed(predicts, len(eval_item.data))
if args.rank<=0:
output_test_file = os.path.join(args.output_dir, "test_logits_{}_{}.txt".format(name, prefix))
logger.info("***** Dump prediction results-{}-{} *****".format(name, prefix))
logger.info("Location: {}".format(output_test_file))
np.savetxt(output_test_file, predicts, delimiter='\t')
predict_fn = eval_item.predict_fn
if predict_fn:
predict_fn(predicts, args.output_dir, name, prefix)
if isinstance(predicts, Sequence):
for k,pred in enumerate(predicts):
output_test_file = os.path.join(args.output_dir, f"test_logits_{name}@{k}_{prefix}.txt")
logger.info(f"***** Dump prediction results-{name}@{k}-{prefix} *****")
logger.info("Location: {}".format(output_test_file))
pred = pred.detach().cpu().numpy()
np.savetxt(output_test_file, pred, delimiter='\t')
predict_fn(pred, args.output_dir, name + f'@{k}', prefix)
else:
output_test_file = os.path.join(args.output_dir, "test_logits_{}_{}.txt".format(name, prefix))
logger.info("***** Dump prediction results-{}-{} *****".format(name, prefix))
logger.info("Location: {}".format(output_test_file))
np.savetxt(output_test_file, predicts.detach().cpu().numpy(), delimiter='\t')
predict_fn(predicts.detach().cpu().numpy(), args.output_dir, name, prefix)
def main(args):
if not args.do_train and not args.do_eval and not args.do_predict:
raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` must be True.")
os.makedirs(args.output_dir, exist_ok=True)
task_name = args.task_name.lower()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
tokenizer = GPT2Tokenizer()
processor = tasks[task_name](tokenizer = tokenizer, max_seq_len = args.max_seq_length, data_dir = args.data_dir)
label_list = processor.get_labels()
vocab_path, vocab_type = load_vocab(vocab_path = args.vocab_path, vocab_type = args.vocab_type, pretrained_id = args.init_model)
tokenizer = tokenizers[vocab_type](vocab_path)
task = get_task(args.task_name)(tokenizer = tokenizer, args=args, max_seq_len = args.max_seq_length, data_dir = args.data_dir)
label_list = task.get_labels()
eval_data = processor.eval_data(max_seq_len=args.max_seq_length)
eval_data = task.eval_data(max_seq_len=args.max_seq_length)
logger.info(" Evaluation batch size = %d", args.eval_batch_size)
if args.do_predict:
test_data = processor.test_data(max_seq_len=args.max_seq_length)
test_data = task.test_data(max_seq_len=args.max_seq_length)
logger.info(" Prediction batch size = %d", args.predict_batch_size)
if args.do_train:
train_data = processor.train_data(max_seq_len=args.max_seq_length, mask_gen = None, debug=args.debug)
model_class_fn = processor.get_model_class_fn()
train_data = task.train_data(max_seq_len=args.max_seq_length, mask_gen = None, debug=args.debug)
model_class_fn = task.get_model_class_fn()
model = create_model(args, len(label_list), model_class_fn)
if args.do_train:
with open(os.path.join(args.output_dir, 'model_config.json'), 'w', encoding='utf-8') as fs:
@ -257,20 +258,47 @@ def main(args):
if args.do_predict:
run_predict(args, model, device, test_data, prefix=args.tag)
class LoadTaskAction(argparse.Action):
_registered = False
def __call__(self, parser, args, values, option_string=None):
setattr(args, self.dest, values)
if not self._registered:
load_tasks(args.task_dir)
all_tasks = get_task()
if values=="*":
for task in all_tasks.values():
parser.add_argument_group(title=f'Task {task._meta["name"]}', description=task._meta["desc"])
return
assert values.lower() in all_tasks, f'{values} is not registed. Valid tasks {list(all_tasks.keys())}'
task = get_task(values)
group = parser.add_argument_group(title=f'Task {task._meta["name"]}', description=task._meta["desc"])
task.add_arguments(group)
type(self)._registered = True
def build_argument_parser():
parser = argparse.ArgumentParser()
parser = argparse.ArgumentParser(parents=[LASER.optims.get_args(), LASER.training.get_args()], formatter_class=argparse.ArgumentDefaultsHelpFormatter)
## Required parameters
parser.add_argument("--data_dir",
parser.add_argument("--task_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
required=False,
help="The directory to load customized tasks.")
parser.add_argument("--task_name",
default=None,
type=str,
action=LoadTaskAction,
required=True,
help="The name of the task to train.")
help="The name of the task to train. To list all registered tasks, use \"*\" as the name, e.g. \n"
"\npython -m DeBERTa.apps.run --task_name \"*\" --help")
parser.add_argument("--data_dir",
default=None,
type=str,
required=False,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--output_dir",
default=None,
type=str,
@ -284,96 +312,31 @@ def build_argument_parser():
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
default=False,
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
default=False,
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_predict",
default=False,
action='store_true',
help="Whether to run prediction on the test set.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=32,
type=int,
help="Total batch size for eval.")
parser.add_argument("--predict_batch_size",
default=32,
type=int,
help="Total batch size for prediction.")
parser.add_argument("--max_grad_norm",
default=1,
type=float,
help="The clip threshold of global gradient norm")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--epsilon",
default=1e-6,
type=float,
help="epsilon setting for Adam.")
parser.add_argument("--adam_beta1",
default=0.9,
type=float,
help="The beta1 parameter for Adam.")
parser.add_argument("--adam_beta2",
default=0.999,
type=float,
help="The beta2 parameter for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--lr_schedule_ends",
default=0,
type=float,
help="The ended learning rate scale for learning rate scheduling")
parser.add_argument("--lr_schedule",
default='warmup_linear',
type=str,
help="The learning rate scheduler used for traning. "
"E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=1234,
help="random seed for initialization")
parser.add_argument('--accumulative_update',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
type=boolean_string,
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=256,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--scale_steps',
type=int, default=1000,
help='The steps to wait to increase the loss scale.')
parser.add_argument('--init_model',
type=str,
@ -387,47 +350,12 @@ def build_argument_parser():
type=float,
default=None,
help="The config file model initialization and fine tuning.")
parser.add_argument('--weight_decay',
type=float,
default=0.01,
help="The weight decay rate")
parser.add_argument('--tag',
type=str,
default='final',
help="The tag name of current prediction/runs.")
parser.add_argument("--dump_interval",
default=10000,
type=int,
help="Interval steps for generating checkpoint.")
parser.add_argument('--lookahead_k',
default=-1,
type=int,
help="lookahead k parameter")
parser.add_argument('--lookahead_alpha',
default=0.5,
type=float,
help="lookahead alpha parameter")
parser.add_argument('--with_radam',
default=False,
type=boolean_string,
help="whether to use RAdam")
parser.add_argument('--opt_type',
type=str.lower,
default='adam',
choices=['adam', 'admax'],
help="The optimizer to be used.")
parser.add_argument('--workers',
type=int,
default=2,
help="The workers to load data.")
parser.add_argument('--debug',
default=False,
type=boolean_string,
@ -437,11 +365,24 @@ def build_argument_parser():
default=None,
type=str,
help="The path of pre-trained RoBERTa model")
parser.add_argument('--vocab_type',
default='gpt2',
type=str,
help="Vocabulary type: [spm, gpt2]")
parser.add_argument('--vocab_path',
default=None,
type=str,
help="The path of the vocabulary")
return parser
if __name__ == "__main__":
parser = build_argument_parser()
parser.parse_known_args()
args = parser.parse_args()
os.makedirs(args.output_dir, exist_ok=True)
logger = set_logger(args.task_name, os.path.join(args.output_dir, 'training_{}.log'.format(args.task_name)))
logger.info(args)
try:

View file

@ -1,17 +0,0 @@
from .glue_tasks import *
from .ner_task import *
from .race_task import *
tasks={
'mnli': MNLITask,
'anli': ANLITask,
'sts-b': STSBTask,
'sst-2': SST2Task,
'qqp': QQPTask,
'cola': ColaTask,
'mrpc': MRPCTask,
'rte': RTETask,
'qnli': QNLITask,
'race': RACETask,
'ner': NERTask
}

View file

@ -0,0 +1,5 @@
#from .ner_task import *
from .glue_tasks import *
#from .race_task import *
from .task import *
from .task_registry import *

View file

@ -23,20 +23,22 @@ import torch
import re
import ujson as json
from .metrics import *
from .task import EvalData, DataTask
from DeBERTa.utils import xtqdm as tqdm
from DeBERTa.data import ExampleInstance, ExampleSet, DynamicDataset,example_to_feature
from DeBERTa.data.example import _truncate_segments
from DeBERTa.data.example import *
import DeBERTa.utils as utils
from .task import EvalData, Task
from .task_registry import register_task
from ...utils import xtqdm as tqdm
from ...utils import get_logger
from ...data import ExampleInstance, ExampleSet, DynamicDataset,example_to_feature
from ...data.example import _truncate_segments
from ...data.example import *
logger=utils.get_logger()
logger=get_logger()
__all__ = ["MNLITask", "ANLITask", "STSBTask", "SST2Task", "QQPTask", "ColaTask", "MRPCTask", "RTETask", "QNLITask"]
class STSBTask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task("sts-b")
class STSBTask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
@ -109,9 +111,10 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["1"]
class RTETask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task("rte")
class RTETask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
@ -199,9 +202,10 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["not_entailment", "entailment"]
class MRPCTask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task('mrpc')
class MRPCTask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
@ -275,9 +279,10 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["0", "1"]
class QNLITask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task('qnli')
class QNLITask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
@ -350,11 +355,11 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["not_entailment", "entailment"]
class ColaTask(DataTask):
def __init__(self, data_dir, tokenizer, soft_threshold=0, with_dev_data=None, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task('cola')
class ColaTask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
self.soft_threshold = soft_threshold
if True:
input_src = os.path.join(self.data_dir, 'train.tsv')
assert os.path.exists(input_src), f"{input_src} doesn't exists"
@ -472,9 +477,10 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["0", "1"]
class SST2Task(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task('sst-2')
class SST2Task(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
@ -537,9 +543,10 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["0", "1"]
class QQPTask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task('qqp')
class QQPTask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
@ -601,9 +608,10 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["0", "1"]
class MNLITask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task('mnli')
class MNLITask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def get_feature_fn(self, max_seq_len = 512, mask_gen = None):
@ -709,10 +717,11 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
"""See base class."""
return ["contradiction", "neutral", "entailment"]
@register_task('anli')
class ANLITask(MNLITask):
def __init__(self, data_dir, tokenizer, **kwargs):
def __init__(self, data_dir, tokenizer, args, **kwargs):
data_dir = data_dir.replace('/ANLI', '/MNLI')
super().__init__(data_dir, tokenizer, **kwargs)
super().__init__(data_dir, tokenizer, args, **kwargs)
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
examples = []

View file

View file

@ -7,15 +7,16 @@ import pdb
import random
import torch
import ujson as json
from ..utils import xtqdm as tqdm
from ..utils import get_logger
from ...utils import xtqdm as tqdm
from ...utils import get_logger
from ..deberta import GPT2Tokenizer
from .ner import NERModel
from ..data import ExampleInstance, ExampleSet, DynamicDataset
from ..data.example import *
from ..data.example import _truncate_segments
from .task import EvalData, DataTask
from ...deberta import GPT2Tokenizer
from ..models import NERModel
from ...data import ExampleInstance, ExampleSet, DynamicDataset
from ...data.example import *
from ...data.example import _truncate_segments
from .task import EvalData, Task
from .task_registry import register_task
from seqeval import metrics as seq_metrics
@ -23,9 +24,10 @@ __all__ = ['NERTask']
logger = get_logger()
class NERTask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task(name="NER", desc="Named-entity recognition task")
class NERTask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):

View file

@ -19,20 +19,22 @@ import torch
import re
import ujson as json
from .metrics import *
from .task import EvalData, DataTask
from DeBERTa.utils import xtqdm as tqdm
from DeBERTa.data import ExampleInstance, ExampleSet, DynamicDataset,example_to_feature
from DeBERTa.data.example import *
import DeBERTa.utils as utils
from .multi_choice import MultiChoiceModel
from .task import EvalData, Task
from .task_registry import register_task
from ...utils import xtqdm as tqdm
from ...data import ExampleInstance, ExampleSet, DynamicDataset,example_to_feature
from ...data.example import *
from ...utils import get_logger
from ..models.multi_choice import MultiChoiceModel
logger=utils.get_logger()
logger=get_logger()
__all__ = ["RACETask"]
class RACETask(DataTask):
def __init__(self, data_dir, tokenizer, **kwargs):
super().__init__(tokenizer, **kwargs)
@register_task(name="RACE", desc="ReAding Comprehension dataset collected from English Examinations, http://www.qizhexie.com/data/RACE_leaderboard.html")
class RACETask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
@ -41,7 +43,7 @@ class RACETask(DataTask):
examples = ExampleSet(middle + high)
if dataset_size is None:
dataset_size = len(examples)*epochs
return StreamingDataset(examples, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len, mask_gen=mask_gen), \
return DynamicDataset(examples, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len, mask_gen=mask_gen), \
dataset_size = dataset_size, shuffle=True, **kwargs)
def eval_data(self, max_seq_len=512, dataset_size=None, **kwargs):
@ -54,7 +56,7 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
for d in ds:
if dataset_size is None:
_size = len(d.data)
d.data = StreamingDataset(d.data, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len), dataset_size = _size, **kwargs)
d.data = DynamicDataset(d.data, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len), dataset_size = _size, **kwargs)
return ds
def test_data(self,max_seq_len=512, dataset_size = None, **kwargs):
@ -67,7 +69,7 @@ dataset_size = dataset_size, shuffle=True, **kwargs)
for d in ds:
if dataset_size is None:
_size = len(d.data)
d.data = StreamingDataset(d.data, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len), dataset_size = _size, **kwargs)
d.data = DynamicDataset(d.data, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len), dataset_size = _size, **kwargs)
return ds
def _data(self, name, path, type_name = 'dev', ignore_metric=False):

View file

@ -8,14 +8,15 @@ import csv
import copy
from collections import OrderedDict,defaultdict,Sequence,Counter
import numpy as np
import DeBERTa.utils as utils
from DeBERTa.utils import xtqdm as tqdm
from ...utils import get_logger
from ...utils import xtqdm as tqdm
from ...data import example_to_feature
from .metrics import *
from .sequence_classification import SequenceClassificationModel
logger=utils.get_logger()
from ..models import SequenceClassificationModel
logger=get_logger()
__all__ = ['EvalData', 'DataTask']
__all__ = ['EvalData', 'Task']
class EvalData:
def __init__(self, name, examples, metrics_fn=None, predict_fn=None, ignore_metric=False, critial_metrics=None):
@ -39,9 +40,12 @@ class EvalData:
def __repr__(self):
return f'{self.name}, {type(self.data)}: {len(self.data)}, {self.predict_fn}, {self.metrics_fn}'
class DataTask():
def __init__(self, tokenizer, **kwargs):
class Task():
_meta={}
def __init__(self, tokenizer, args, **kwargs):
self.tokenizer = tokenizer
self.args = args
def eval_data(self, **kwargs):
raise NotImplementedError('Eval_data method not implemented yet.')
@ -90,8 +94,7 @@ class DataTask():
return lines
def get_feature_fn(self, max_seq_len = 512, mask_gen = None, label_type='int', training=False):
from ..data import example_to_feature
tokenizer = copy.deepcopy(self.tokenizer)
tokenizer = self.tokenizer
def _example_to_feature(example, rng=None, ext_params=None, **kwargs):
return example_to_feature(tokenizer, example, max_seq_len = max_seq_len, \
rng = rng, mask_generator = mask_gen, ext_params = ext_params, label_type=label_type, **kwargs)
@ -99,3 +102,12 @@ class DataTask():
def get_model_class_fn(self):
return SequenceClassificationModel.load_model
@classmethod
def add_arguments(cls, parser):
"""Add task specific arguments
e.g. parser.add_argument('--data_dir', type=str, help='The path of data directory.')
"""
parser.add_argument('--task_example_arg', type=str, default=None, help='An example task specific argument')
return parser

View file

@ -0,0 +1,70 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 01/25/2019
#
from glob import glob
import os
import importlib
import pdb
import sys
from ...utils import get_logger
from .task import Task
__all__ = ['load_tasks', 'register_task', 'get_task']
tasks={}
logger=get_logger()
def register_task(name=None, desc=None):
def register_task_x(cls):
_name = name
if _name is None:
_name = cls.__name__
_desc = desc
if _desc is None:
_desc = _name
_name = _name.lower()
if _name in tasks:
logger.warning(f'{_name} already registered in the registry: {tasks[_name]}.')
assert issubclass(cls, Task), f'Registered class must be a subclass of Task.'
tasks[_name] = cls
cls._meta = {
'name': _name,
'desc': _desc}
return cls
if type(name)==type:
cls = name
name = None
return register_task_x(cls)
return register_task_x
def load_tasks(task_dir = None):
script_dir = os.path.dirname(os.path.abspath(__file__))
sys_tasks = glob(os.path.join(script_dir, "*.py"))
for t in sys_tasks:
m = os.path.splitext(os.path.basename(t))[0]
if not m.startswith('_'):
importlib.import_module(f'DeBERTa.apps.tasks.{m}')
if task_dir:
assert os.path.exists(task_dir), f"{task_dir} must be a valid directory."
customer_tasks = glob(os.path.join(task_dir, "*.py"))
sys.path.append(task_dir)
for t in customer_tasks:
m = os.path.splitext(os.path.basename(t))[0]
if not m.startswith('_'):
importlib.import_module(f'{m}')
def get_task(name=None):
if name is None:
return tasks
return tasks[name.lower()]
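
With load_tasks and register_task above, new tasks can be plugged in from an external directory via the new --task_dir flag in run.py. A hedged sketch of such a plugin file; the task name, class, and extra argument are made up for illustration, and the data-loading methods (train_data, eval_data, test_data) that a real task would implement, like the GLUE tasks above, are omitted for brevity:

# my_task.py -- dropped into the directory passed as --task_dir; load_tasks() imports every *.py file there,
# so the decorator below registers the task at startup.
from DeBERTa.apps.tasks.task import Task
from DeBERTa.apps.tasks.task_registry import register_task

@register_task(name='my-task', desc='An illustrative custom sentence-classification task')
class MyTask(Task):
    def __init__(self, data_dir, tokenizer, args, **kwargs):
        super().__init__(tokenizer, args, **kwargs)
        self.data_dir = data_dir

    def get_labels(self):
        return ['0', '1']

    @classmethod
    def add_arguments(cls, parser):
        # Shown under "Task my-task" by: python -m DeBERTa.apps.run --task_name my-task --help
        parser.add_argument('--my_task_option', type=str, default=None,
                            help='An illustrative task-specific argument')
        return parser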

View file

@ -1,5 +1,3 @@
from .example import ExampleInstance,ExampleSet,example_to_feature
from .dataloader import SequentialDataLoader
from .async_data import AsyncDataLoader
from .data_sampler import *
from .dynamic_dataset import *

View file

@ -1,39 +0,0 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 05/15/2019
#
from queue import Queue,Empty
from threading import Thread
class AsyncDataLoader(object):
def __init__(self, dataloader, buffer_size=100):
self.buffer_size = buffer_size
self.dataloader = dataloader
def __iter__(self):
queue = Queue(self.buffer_size)
dl=iter(self.dataloader)
def _worker():
while True:
try:
queue.put(next(dl))
except StopIteration:
break
queue.put(None)
t=Thread(target=_worker)
t.start()
while True:
d = queue.get()
if d is None:
break
yield d
del t
del queue
def __len__(self):
return len(self.dataloader)

View file

@ -1,77 +0,0 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 05/15/2019
#
import os
import numpy as np
import math
import sys
from torch.utils.data import Sampler
__all__=['BatchSampler', 'DistributedBatchSampler', 'RandomSampler', 'SequentialSampler']
class BatchSampler(Sampler):
def __init__(self, sampler, batch_size):
self.sampler = sampler
self.batch_size = batch_size
def __iter__(self):
batch = []
for idx in self.sampler:
batch.append(idx)
if len(batch)==self.batch_size:
yield batch
batch = []
if len(batch)>0:
yield batch
def __len__(self):
return (len(self.sampler) + self.batch_size - 1)//self.batch_size
class DistributedBatchSampler(Sampler):
def __init__(self, sampler, rank=0, world_size = 1, drop_last = False):
self.sampler = sampler
self.rank = rank
self.world_size = world_size
self.drop_last = drop_last
def __iter__(self):
for b in self.sampler:
if len(b)%self.world_size != 0:
if self.drop_last:
break
else:
b.extend([b[0] for _ in range(self.world_size-len(b)%self.world_size)])
chunk_size = len(b)//self.world_size
yield b[self.rank*chunk_size:(self.rank+1)*chunk_size]
def __len__(self):
return len(self.sampler)
class RandomSampler(Sampler):
def __init__(self, total_samples:int, data_seed:int = 0):
self.indices = np.array(np.arange(total_samples))
self.rng = np.random.RandomState(data_seed)
def __iter__(self):
self.rng.shuffle(self.indices)
for i in self.indices:
yield i
def __len__(self):
return len(self.indices)
class SequentialSampler(Sampler):
def __init__(self, total_samples:int):
self.indices = np.array(np.arange(total_samples))
def __iter__(self):
for i in self.indices:
yield i
def __len__(self):
return len(self.indices)

View file

@ -18,4 +18,7 @@ from .disentangled_attention import *
from .ops import *
from .bert import *
from .gpt2_tokenizer import GPT2Tokenizer
from .spm_tokenizer import *
from .tokenizers import *
from .config import *
from .cache_utils import *

View file

@ -19,50 +19,15 @@ import pdb
import json
from .ops import *
from .disentangled_attention import *
from .da_utils import *
__all__ = ['BertEncoder', 'BertEmbeddings', 'ACT2FN', 'BertLayerNorm']
def gelu(x):
"""Implementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def swish(x):
return x * torch.sigmoid(x)
def linear_act(x):
return x
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "tanh": torch.nn.functional.tanh, "linear": linear_act, 'sigmoid': torch.sigmoid}
class BertLayerNorm(nn.Module):
"""LayerNorm module in the TF style (epsilon inside the square root).
"""
def __init__(self, size, eps=1e-12):
super().__init__()
self.weight = nn.Parameter(torch.ones(size))
self.bias = nn.Parameter(torch.zeros(size))
self.variance_epsilon = eps
def forward(self, x):
input_type = x.dtype
x = x.float()
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
x = x.to(input_type)
y = self.weight * x + self.bias
return y
__all__ = ['BertEncoder', 'BertEmbeddings', 'ACT2FN', 'LayerNorm']
class BertSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
@ -81,9 +46,8 @@ class BertAttention(nn.Module):
self.config = config
def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
self_output = self.self(hidden_states, attention_mask, return_att, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
if return_att:
self_output, att_matrix = self_output
output = self.self(hidden_states, attention_mask, return_att, query_states=query_states, relative_pos=relative_pos, rel_embeddings=rel_embeddings)
self_output, att_matrix, att_logits_=output['hidden_states'], output['attention_probs'], output['attention_logits']
if query_states is None:
query_states = hidden_states
attention_output = self.output(self_output, query_states, attention_mask)
@ -109,7 +73,7 @@ class BertOutput(nn.Module):
def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
@ -139,6 +103,29 @@ class BertLayer(nn.Module):
else:
return layer_output
class ConvLayer(nn.Module):
def __init__(self, config):
super().__init__()
kernel_size = getattr(config, 'conv_kernel_size', 3)
groups = getattr(config, 'conv_groups', 1)
self.conv_act = getattr(config, 'conv_act', 'tanh')
self.conv = torch.nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size, padding = (kernel_size-1)//2, groups = groups)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.config = config
def forward(self, hidden_states, residual_states, input_mask):
out = self.conv(hidden_states.permute(0,2,1).contiguous()).permute(0,2,1).contiguous()
if version.Version(torch.__version__) >= version.Version('1.2.0a'):
rmask = (1-input_mask).bool()
else:
rmask = (1-input_mask).byte()
out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0)
out = ACT2FN[self.conv_act](self.dropout(out))
output_states = MaskedLayerNorm(self.LayerNorm, residual_states + out, input_mask)
return output_states
class BertEncoder(nn.Module):
""" Modified BertEncoder with relative position bias support
"""
@ -151,9 +138,25 @@ class BertEncoder(nn.Module):
self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
if self.max_relative_positions <1:
self.max_relative_positions = config.max_position_embeddings
self.rel_embeddings = nn.Embedding(self.max_relative_positions*2, config.hidden_size)
self.position_buckets = getattr(config, 'position_buckets', -1)
pos_ebd_size = self.max_relative_positions*2
if self.position_buckets>0:
pos_ebd_size = self.position_buckets*2
self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size)
self.norm_rel_ebd = [x.strip() for x in getattr(config, 'norm_rel_ebd', 'none').lower().split('|')]
if 'layer_norm' in self.norm_rel_ebd:
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine = True)
kernel_size = getattr(config, 'conv_kernel_size', 0)
self.with_conv = False
if kernel_size > 0:
self.with_conv = True
self.conv = ConvLayer(config)
def get_rel_embedding(self):
rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None
if rel_embeddings is not None and ('layer_norm' in self.norm_rel_ebd):
rel_embeddings = self.LayerNorm(rel_embeddings)
return rel_embeddings
def get_attention_mask(self, attention_mask):
@ -169,10 +172,14 @@ class BertEncoder(nn.Module):
def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None):
if self.relative_attention and relative_pos is None:
q = query_states.size(-2) if query_states is not None else hidden_states.size(-2)
relative_pos = build_relative_position(q, hidden_states.size(-2), hidden_states.device)
relative_pos = build_relative_position(q, hidden_states.size(-2), bucket_size = self.position_buckets, max_position=self.max_relative_positions)
return relative_pos
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, return_att=False, query_states = None, relative_pos=None):
if attention_mask.dim()<=2:
input_mask = attention_mask
else:
input_mask = (attention_mask.sum(-2)>0).byte()
attention_mask = self.get_attention_mask(attention_mask)
relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos)
@ -188,6 +195,10 @@ class BertEncoder(nn.Module):
if return_att:
output_states, att_m = output_states
if i == 0 and self.with_conv:
prenorm = output_states #output['prenorm_states']
output_states = self.conv(hidden_states, prenorm, input_mask)
if query_states is not None:
query_states = output_states
if isinstance(hidden_states, Sequence):
@ -228,7 +239,7 @@ class BertEmbeddings(nn.Module):
if self.embedding_size != config.hidden_size:
self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps)
self.dropout = StableDropout(config.hidden_dropout_prob)
self.output_to_half = False
self.config = config
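
The encoder changes above are driven entirely by optional config attributes read with getattr. For reference, a sketch of the new knobs and the built-in defaults the code falls back to; the example values are assumptions for a v2-style model, not the released xlarge-v2/xxlarge-v2 configuration:

# Illustrative v2-style settings (assumed values); the defaults in the comments are what the code above uses.
v2_style_options = {
    'position_buckets': 256,        # log-bucketed relative positions when > 0 (default -1: plain relative positions)
    'norm_rel_ebd': 'layer_norm',   # apply LayerNorm to the relative embeddings (default 'none')
    'conv_kernel_size': 3,          # enable ConvLayer after the first encoder layer when > 0 (default 0: disabled)
    'conv_act': 'gelu',             # activation used inside ConvLayer (default 'tanh')
    'conv_groups': 1,               # groups of the 1-D convolution (default 1)
    'max_relative_positions': 512,  # falls back to max_position_embeddings when < 1
}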

View file

@ -18,7 +18,32 @@ from zipfile import ZipFile
from ..utils import get_logger
logger = get_logger()
def download_asset(name, tag=None, no_cache=False, cache_dir=None):
__all__ = ['pretrained_models', 'load_model_state', 'load_vocab']
class PretrainedModel:
def __init__(self, name, vocab, vocab_type, model='pytorch_model.bin', config='config.json', **kwargs):
self.__dict__.update(kwargs)
host = f'https://huggingface.co/microsoft/{name}/resolve/main/'
self.name = name
self.model_url = host + model
self.config_url = host + config
self.vocab_url = host + vocab
self.vocab_type = vocab_type
pretrained_models= {
'base': PretrainedModel('deberta-base', 'bpe_encoder.bin', 'gpt2'),
'large': PretrainedModel('deberta-large', 'bpe_encoder.bin', 'gpt2'),
'xlarge': PretrainedModel('deberta-xlarge', 'bpe_encoder.bin', 'gpt2'),
'base-mnli': PretrainedModel('deberta-base-mnli', 'bpe_encoder.bin', 'gpt2'),
'large-mnli': PretrainedModel('deberta-large-mnli', 'bpe_encoder.bin', 'gpt2'),
'xlarge-mnli': PretrainedModel('deberta-xlarge-mnli', 'bpe_encoder.bin', 'gpt2'),
'xlarge-v2': PretrainedModel('deberta-xlarge-v2', 'spm.model', 'spm'),
'xxlarge-v2': PretrainedModel('deberta-xxlarge-v2', 'spm.model', 'spm'),
'xlarge-v2-mnli': PretrainedModel('deberta-xlarge-v2-mnli', 'spm.model', 'spm'),
'xxlarge-v2-mnli': PretrainedModel('deberta-xxlarge-v2-mnli', 'spm.model', 'spm')
}
def download_asset(url, name, tag=None, no_cache=False, cache_dir=None):
_tag = tag
if _tag is None:
_tag = 'latest'
@ -29,18 +54,7 @@ def download_asset(name, tag=None, no_cache=False, cache_dir=None):
if os.path.exists(output) and (not no_cache):
return output
repo = 'https://api.github.com/repos/microsoft/DeBERTa/releases'
releases = requests.get(repo).json()
if tag and tag != 'latest':
release = [r for r in releases if r['name'].lower()==tag.lower()]
if len(release)!=1:
raise Exception(f'{tag} can\'t be found in the repository.')
else:
release = releases[0]
asset = [s for s in release['assets'] if s['name'].lower()==name.lower()]
if len(asset)!=1:
raise Exception(f'{name} can\'t be found in the release.')
url = asset[0]['url']
#repo=f'https://huggingface.co/microsoft/deberta-{name}/blob/main/bpe_encoder.bin'
headers = {}
headers['Accept'] = 'application/octet-stream'
resp = requests.get(url, stream=True, headers=headers)
@ -60,64 +74,51 @@ def download_asset(name, tag=None, no_cache=False, cache_dir=None):
return output
def load_model_state(name, tag=None, no_cache=False, cache_dir=None):
model_path = name
if model_path and (not os.path.exists(model_path)) and not (('/' in model_path) or ('\\' in model_path)):
def load_model_state(path_or_pretrained_id, tag=None, no_cache=False, cache_dir=None):
model_path = path_or_pretrained_id
if model_path and (not os.path.exists(model_path)) and (path_or_pretrained_id.lower() in pretrained_models):
_tag = tag
pretrained = pretrained_models[path_or_pretrained_id.lower()]
if _tag is None:
_tag = 'latest'
if not cache_dir:
cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/')
cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
os.makedirs(cache_dir, exist_ok=True)
out_dir = os.path.join(cache_dir, name)
model_path = os.path.join(out_dir, 'pytorch.model.bin')
model_path = os.path.join(cache_dir, 'pytorch_model.bin')
if (not os.path.exists(model_path)) or no_cache:
asset = download_asset(name + '.zip', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
with ZipFile(asset, 'r') as zipf:
for zip_info in zipf.infolist():
if zip_info.filename[-1] == '/':
continue
zip_info.filename = os.path.basename(zip_info.filename)
zipf.extract(zip_info, out_dir)
asset = download_asset(pretrained.model_url, 'pytorch_model.bin', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
asset = download_asset(pretrained.config_url, 'model_config.json', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
elif not model_path:
return None,None
config_path = os.path.join(os.path.dirname(model_path), 'model_config.json')
model_state = torch.load(model_path, map_location='cpu')
logger.info("Loaded pre-trained model file {}".format(model_path))
logger.info("Loaded pretrained model file {}".format(model_path))
if 'config' in model_state:
model_config = ModelConfig.from_dict(model_state['config'])
elif os.path.exists(config_path):
model_config = ModelConfig.from_json_file(config_path)
return model_state, model_config
def load_vocab(name=None, tag=None, no_cache=False, cache_dir=None):
if name is None:
name = 'bpe_encoder'
model_path = name
if model_path and (not os.path.exists(model_path)) and not (('/' in model_path) or ('\\' in model_path)):
def load_vocab(vocab_path=None, vocab_type=None, pretrained_id=None, tag=None, no_cache=False, cache_dir=None):
if pretrained_id and (pretrained_id.lower() in pretrained_models):
_tag = tag
if _tag is None:
_tag = 'latest'
if not cache_dir:
cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/')
os.makedirs(cache_dir, exist_ok=True)
out_dir = os.path.join(cache_dir, name)
model_path =os.path.join(out_dir, 'bpe_encoder.bin')
if (not os.path.exists(model_path)) or no_cache:
asset = download_asset(name + '.zip', tag=tag, no_cache=no_cache, cache_dir=cache_dir)
with ZipFile(asset, 'r') as zipf:
for zip_info in zipf.infolist():
if zip_info.filename[-1] == '/':
continue
zip_info.filename = os.path.basename(zip_info.filename)
zipf.extract(zip_info, out_dir)
elif not model_path:
return None,None
encoder_state = torch.load(model_path)
return encoder_state
pretrained = pretrained_models[pretrained_id.lower()]
if not cache_dir:
cache_dir = os.path.join(pathlib.Path.home(), f'.~DeBERTa/assets/{_tag}/{pretrained.name}')
os.makedirs(cache_dir, exist_ok=True)
vocab_type = pretrained.vocab_type
url = pretrained.vocab_url
outname = os.path.basename(url)
vocab_path =os.path.join(cache_dir, outname)
if (not os.path.exists(vocab_path)) or no_cache:
asset = download_asset(url, outname, tag=tag, no_cache=no_cache, cache_dir=cache_dir)
if vocab_type is None:
vocab_type = 'spm'
return vocab_path, vocab_type
def test_download():
vocab = load_vocab()
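
The pretrained_models table above now resolves checkpoints and vocabularies from huggingface.co instead of GitHub release assets. A usage sketch mirroring how run.py in this commit builds its tokenizer; it assumes the DeBERTa package from this commit is importable:

# Resolve the vocabulary for one of the new v2 checkpoints and build the matching tokenizer.
from DeBERTa.deberta import load_vocab, tokenizers

vocab_path, vocab_type = load_vocab(pretrained_id='xxlarge-v2')  # downloads and caches spm.model from the HF hub
tokenizer = tokenizers[vocab_type](vocab_path)                   # vocab_type is 'spm' for the v2 models
print(vocab_type, vocab_path)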

View file

@ -0,0 +1,33 @@
import torch
import pdb
from functools import lru_cache
import numpy as np
__all__=['build_relative_position', 'make_log_bucket_position']
def make_log_bucket_position(relative_pos, bucket_size, max_position):
sign = np.sign(relative_pos)
mid = bucket_size//2
abs_pos = np.where((relative_pos<mid) & (relative_pos > -mid), mid-1, np.abs(relative_pos))
log_pos = np.ceil(np.log(abs_pos/mid)/np.log((max_position-1)/mid) * (mid-1)) + mid
bucket_pos = np.where(abs_pos<=mid, relative_pos, log_pos*sign).astype(np.int)
return bucket_pos
@lru_cache(maxsize=128)
def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1):
q_ids = np.arange(0, query_size)
k_ids = np.arange(0, key_size)
rel_pos_ids = q_ids[:, None] - np.tile(k_ids, (q_ids.shape[0],1))
if bucket_size>0 and max_position > 0:
rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position)
rel_pos_ids = torch.tensor(rel_pos_ids, dtype=torch.long)
rel_pos_ids = rel_pos_ids[:query_size, :]
rel_pos_ids = rel_pos_ids.unsqueeze(0)
return rel_pos_ids
def test_log_bucket():
x=np.arange(-511,511)
y=make_log_bucket_position(x, 128, 512)
pdb.set_trace()
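
For reference, a self-contained sketch of the log-bucket mapping above: offsets within half the bucket window keep their value, while larger offsets are compressed logarithmically into the remaining buckets. It mirrors make_log_bucket_position but casts with plain int so the sketch runs on current NumPy:

import numpy as np

def log_bucket(relative_pos, bucket_size, max_position):
    # Same arithmetic as make_log_bucket_position above, with a plain int cast.
    sign = np.sign(relative_pos)
    mid = bucket_size // 2
    abs_pos = np.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, np.abs(relative_pos))
    log_pos = np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1)) + mid
    return np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(int)

offsets = np.array([-511, -200, -64, 0, 64, 200, 511])
print(log_bucket(offsets, bucket_size=128, max_position=512))
# small offsets map to themselves; the most distant ones land in buckets around +/-127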

View file

@ -16,6 +16,7 @@ from .ops import *
from .bert import *
from .config import ModelConfig
from .cache_utils import load_model_state
import pdb
__all__ = ['DeBERTa']
@ -36,11 +37,6 @@ class DeBERTa(torch.nn.Module):
def __init__(self, config=None, pre_trained=None):
super().__init__()
if config:
self.z_steps = getattr(config, 'z_steps', 0)
else:
self.z_steps = 0
state = None
if pre_trained is not None:
state, model_config = load_model_state(pre_trained)
@ -121,17 +117,6 @@ class DeBERTa(torch.nn.Module):
if return_att:
encoded_layers, att_matrixs = encoded_layers
if self.z_steps>1:
hidden_states = encoded_layers[-2]
layers = [self.encoder.layer[-1] for _ in range(self.z_steps)]
query_states = encoded_layers[-1]
rel_embeddings = self.encoder.get_rel_embedding()
attention_mask = self.encoder.get_attention_mask(attention_mask)
rel_pos = self.encoder.get_rel_pos(embedding_output)
for layer in layers[1:]:
query_states = layer(hidden_states, attention_mask, return_att=False, query_states = query_states, relative_pos=rel_pos, rel_embeddings=rel_embeddings)
encoded_layers.append(query_states)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1:]
@ -161,3 +146,4 @@ class DeBERTa(torch.nn.Module):
for c in current.keys():
current[c] = state[key_match(c, state.keys())]
self.load_state_dict(current)

View file

@ -11,145 +11,67 @@
Disentangled SelfAttention module
"""
import torch
import numpy as np
import math
import torch
from torch import nn
import functools
import pdb
from .ops import *
from .da_utils import build_relative_position
__all__=['build_relative_position', 'DisentangledSelfAttention']
from ..utils import get_logger
logger=get_logger()
def build_relative_position(query_size, key_size, device):
""" Build relative position according to the query and key
We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key :math:`P_k` is range from (0, key_size),
The relative positions from query to key is
:math:`R_{q \\rightarrow k} = P_q - P_k`
Args:
query_size (int): the length of query
key_size (int): the length of key
Return:
:obj:`torch.LongTensor`: A tensor with shape [1, query_size, key_size]
"""
q_ids = torch.arange(query_size, dtype=torch.long, device=device)
k_ids = torch.arange(key_size, dtype=torch.long, device=device)
rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1)
rel_pos_ids = rel_pos_ids[:query_size, :]
rel_pos_ids = rel_pos_ids.unsqueeze(0)
return rel_pos_ids
@torch.jit.script
def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)])
@torch.jit.script
def p2c_dynamic_expand(c2p_pos, query_layer, key_layer):
return c2p_pos.expand([query_layer.size(0), query_layer.size(1), key_layer.size(-2), key_layer.size(-2)])
@torch.jit.script
def pos_dynamic_expand(pos_index, p2c_att, key_layer):
return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2)))
class DisentangledSelfAttention(torch.nn.Module):
""" Disentangled self-attention module
Parameters:
config (:obj:`str`):
A model config class instance with the configuration to build a new model. The schema is similar to `BertConfig`, \
for more details, please refer :class:`~DeBERTa.deberta.ModelConfig`
"""
__all__=['DisentangledSelfAttention']
class DisentangledSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
_attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.attention_head_size = getattr(config, 'attention_head_size', _attention_head_size)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False)
self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c
self.relative_attention = getattr(config, 'relative_attention', False)
self.talking_head = getattr(config, 'talking_head', False)
self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
if self.talking_head:
self.head_logits_proj = torch.nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
self.head_weights_proj = torch.nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False)
self.share_att_key = getattr(config, 'share_att_key', False)
self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'c2p').lower().split('|')] # c2p|p2c
self.relative_attention = getattr(config, 'relative_attention', False)
if self.relative_attention:
self.position_buckets = getattr(config, 'position_buckets', -1)
self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
if self.max_relative_positions <1:
self.max_relative_positions = config.max_position_embeddings
self.pos_ebd_size = self.max_relative_positions
if self.position_buckets>0:
self.pos_ebd_size = self.position_buckets
# For backward compitable
self.pos_dropout = StableDropout(config.hidden_dropout_prob)
if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False)
if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)
if (not self.share_att_key):
if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=True)
if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = StableDropout(config.attention_probs_dropout_prob)
self._register_load_state_dict_pre_hook(self._pre_load_hook)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
def transpose_for_scores(self, x, attention_heads):
new_x_shape = x.size()[:-1] + (attention_heads, -1)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1))
def forward(self, hidden_states, attention_mask, return_att=False, query_states=None, relative_pos=None, rel_embeddings=None):
""" Call the module
Args:
hidden_states (:obj:`torch.FloatTensor`):
Input states to the module usally the output from previous layer, it will be the Q,K and V in `Attention(Q,K,V)`
attention_mask (:obj:`torch.ByteTensor`):
An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j` th token.
return_att (:obj:`bool`, optional):
Whether return the attention maxitrix.
query_states (:obj:`torch.FloatTensor`, optional):
The `Q` state in `Attention(Q,K,V)`.
relative_pos (:obj:`torch.LongTensor`):
The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with values ranging in [`-max_relative_positions`, `max_relative_positions`].
rel_embeddings (:obj:`torch.FloatTensor`):
The embedding of relative distances. It's a tensor of shape [:math:`2 \\times \\text{max_relative_positions}`, `hidden_size`].
"""
if query_states is None:
qp = self.in_proj(hidden_states) #.split(self.all_head_size, dim=-1)
query_layer,key_layer,value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1)
else:
def linear(w,b,x):
if b is not None:
return torch.matmul(x, w.t()) + b.t()
else:
return torch.matmul(x, w.t()) # + b.t()
ws = self.in_proj.weight.chunk(self.num_attention_heads*3, dim=0)
qkvw = [torch.cat([ws[i*3+k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)]
qkvb = [None]*3
q = linear(qkvw[0], qkvb[0], query_states)
k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)]
query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]]
query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0))
value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0))
query_states = hidden_states
query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads)
key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads)
value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads)
rel_att = None
# Take the dot product between "query" and "key" to get the raw attention scores.
@ -161,37 +83,34 @@ class DisentangledSelfAttention(torch.nn.Module):
if 'p2p' in self.pos_att_type:
scale_factor += 1
scale = math.sqrt(query_layer.size(-1)*scale_factor)
query_layer = query_layer/scale
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2))/scale
if self.relative_attention:
rel_embeddings = self.pos_dropout(rel_embeddings)
rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)
rel_att = self.disentangled_attention_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor)
if rel_att is not None:
attention_scores = (attention_scores + rel_att)
attention_scores = attention_scores
attention_scores = attention_scores.view(-1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1))
# bxhxlxd
if self.talking_head:
attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2)
attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
attention_probs = self.dropout(attention_probs)
if self.talking_head:
attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
_attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
attention_probs = self.dropout(_attention_probs)
context_layer = torch.bmm(attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer)
context_layer = context_layer.view(-1, self.num_attention_heads, context_layer.size(-2), context_layer.size(-1)).permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (-1,)
context_layer = context_layer.view(*new_context_layer_shape)
if return_att:
return (context_layer, attention_probs)
else:
return context_layer
def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
return {
'hidden_states': context_layer,
'attention_probs': _attention_probs,
'attention_logits': attention_scores
}
def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor):
if relative_pos is None:
q = query_layer.size(-2)
relative_pos = build_relative_position(q, key_layer.size(-2), query_layer.device)
relative_pos = build_relative_position(q, key_layer.size(-2), bucket_size = self.position_buckets, max_position = self.max_relative_positions)
if relative_pos.dim()==2:
relative_pos = relative_pos.unsqueeze(0).unsqueeze(0)
elif relative_pos.dim()==3:
@ -200,41 +119,91 @@ class DisentangledSelfAttention(torch.nn.Module):
elif relative_pos.dim()!=4:
raise ValueError(f'Relative position ids must be of dim 2, 3 or 4. {relative_pos.dim()}')
att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
att_span = self.pos_ebd_size
relative_pos = relative_pos.long().to(query_layer.device)
rel_embeddings = rel_embeddings[self.max_relative_positions - att_span:self.max_relative_positions + att_span, :].unsqueeze(0)
if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
pos_key_layer = self.pos_proj(rel_embeddings)
pos_key_layer = self.transpose_for_scores(pos_key_layer)
if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
pos_query_layer = self.pos_q_proj(rel_embeddings)
pos_query_layer = self.transpose_for_scores(pos_query_layer)
rel_embeddings = rel_embeddings[self.pos_ebd_size - att_span:self.pos_ebd_size + att_span, :].unsqueeze(0) #.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1)
if self.share_att_key:
pos_query_layer = self.transpose_for_scores(self.query_proj(rel_embeddings), self.num_attention_heads)\
.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)
pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads)\
.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)
else:
if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
pos_key_layer = self.transpose_for_scores(self.pos_key_proj(rel_embeddings), self.num_attention_heads)\
.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)
if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
pos_query_layer = self.transpose_for_scores(self.pos_query_proj(rel_embeddings), self.num_attention_heads)\
.repeat(query_layer.size(0)//self.num_attention_heads, 1, 1) #.split(self.all_head_size, dim=-1)
score = 0
# content->position
if 'c2p' in self.pos_att_type:
c2p_att = torch.matmul(query_layer, pos_key_layer.transpose(-1, -2))
scale = math.sqrt(pos_key_layer.size(-1)*scale_factor)
c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2))
c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span*2-1)
c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos))
score += c2p_att
c2p_att = torch.gather(c2p_att, dim=-1, index=c2p_pos.squeeze(0).expand([query_layer.size(0), query_layer.size(1), relative_pos.size(-1)]))
score += c2p_att/scale
# position->content
if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
pos_query_layer /= math.sqrt(pos_query_layer.size(-1)*scale_factor)
if query_layer.size(-2) != key_layer.size(-2):
r_pos = build_relative_position(key_layer.size(-2), key_layer.size(-2), query_layer.device)
scale = math.sqrt(pos_query_layer.size(-1)*scale_factor)
if key_layer.size(-2) != query_layer.size(-2):
r_pos = build_relative_position(key_layer.size(-2), key_layer.size(-2), bucket_size = self.position_buckets, max_position = self.max_relative_positions).to(query_layer.device)
r_pos = r_pos.unsqueeze(0)
else:
r_pos = relative_pos
p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span*2-1)
if query_layer.size(-2) != key_layer.size(-2):
pos_index = relative_pos[:, :, :, 0].unsqueeze(-1)
if 'p2c' in self.pos_att_type:
p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2))
p2c_att = torch.gather(p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer)).transpose(-1,-2)
p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2))
p2c_att = torch.gather(p2c_att, dim=-1, index=p2c_pos.squeeze(0).expand([query_layer.size(0), key_layer.size(-2), key_layer.size(-2)])).transpose(-1,-2)
if query_layer.size(-2) != key_layer.size(-2):
p2c_att = torch.gather(p2c_att, dim=-2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer))
score += p2c_att
p2c_att = torch.gather(p2c_att, dim=-2, index=pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2))))
score += p2c_att/scale
# position->position
if 'p2p' in self.pos_att_type:
pos_query = pos_query_layer[:,:,att_span:,:]
p2p_att = torch.matmul(pos_query, pos_key_layer.transpose(-1, -2))
p2p_att = p2p_att.expand(query_layer.size()[:2] + p2p_att.size()[2:])
if query_layer.size(-2) != key_layer.size(-2):
p2p_att = torch.gather(p2p_att, dim=-2, index=pos_index.expand(query_layer.size()[:2] + (pos_index.size(-2), p2p_att.size(-1))))
p2p_att = torch.gather(p2p_att, dim=-1, index=c2p_pos.expand([query_layer.size(0), query_layer.size(1), query_layer.size(2), relative_pos.size(-1)]))
score += p2p_att
return score
def _pre_load_hook(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
self_state = self.state_dict()
if ((prefix + 'query_proj.weight') not in state_dict) and ((prefix + 'in_proj.weight') in state_dict):
v1_proj = state_dict[prefix+'in_proj.weight']
v1_proj = v1_proj.unsqueeze(0).reshape(self.num_attention_heads, -1, v1_proj.size(-1))
q,k,v=v1_proj.chunk(3, dim=1)
state_dict[prefix + 'query_proj.weight'] = q.reshape(-1, v1_proj.size(-1))
state_dict[prefix + 'key_proj.weight'] = k.reshape(-1, v1_proj.size(-1))
state_dict[prefix + 'key_proj.bias'] = self_state['key_proj.bias']
state_dict[prefix + 'value_proj.weight'] = v.reshape(-1, v1_proj.size(-1))
v1_query_bias = state_dict[prefix + 'q_bias']
state_dict[prefix + 'query_proj.bias'] = v1_query_bias
v1_value_bias = state_dict[prefix +'v_bias']
state_dict[prefix + 'value_proj.bias'] = v1_value_bias
v1_pos_key_proj = state_dict[prefix + 'pos_proj.weight']
state_dict[prefix + 'pos_key_proj.weight'] = v1_pos_key_proj
v1_pos_query_proj = state_dict[prefix + 'pos_q_proj.weight']
state_dict[prefix + 'pos_query_proj.weight'] = v1_pos_query_proj
v1_pos_query_proj_bias = state_dict[prefix + 'pos_q_proj.bias']
state_dict[prefix + 'pos_query_proj.bias'] = v1_pos_query_proj_bias
state_dict[prefix + 'pos_key_proj.bias'] = self_state['pos_key_proj.bias']
del state_dict[prefix + 'in_proj.weight']
del state_dict[prefix + 'q_bias']
del state_dict[prefix + 'v_bias']
del state_dict[prefix + 'pos_proj.weight']
del state_dict[prefix + 'pos_q_proj.weight']
del state_dict[prefix + 'pos_q_proj.bias']

View file

@ -59,7 +59,7 @@ class GPT2Tokenizer(object):
self.sep_token_id = self.add_symbol(self.sep_token)
self.unk_token_id = self.add_symbol(self.unk_token)
self.gpt2_encoder = load_vocab(vocab_file)
self.gpt2_encoder = torch.load(vocab_file)
self.bpe = get_encoder(self.gpt2_encoder['encoder'], self.gpt2_encoder['vocab'])
for w,n in self.gpt2_encoder['dict_map']:
self.add_symbol(w, n)

View file

@ -10,7 +10,7 @@ import torch
from torch import nn
import pdb
from .bert import BertLayerNorm,ACT2FN
from .bert import LayerNorm,ACT2FN
__all__ = ['MLMPredictionHead']
@ -22,7 +22,7 @@ class MLMPredictionHead(nn.Module):
self.transform_act_fn = ACT2FN[config.hidden_act] \
if isinstance(config.hidden_act, str) else config.hidden_act
self.LayerNorm = BertLayerNorm(self.embedding_size, config.layer_norm_eps)
self.LayerNorm = LayerNorm(self.embedding_size, config.layer_norm_eps)
self.bias = nn.Parameter(torch.zeros(vocab_size))
self.pre_norm = PreLayerNorm(config)

View file

@ -130,4 +130,5 @@ class NNModule(nn.Module):
if child is not None:
load(child, prefix + name + '.')
load(model)
logger.warning(f'Missing keys: {missing_keys}, unexpected_keys: {unexpected_keys}, error_msgs: {error_msgs}')
return model

View file

@ -10,6 +10,7 @@
import math
from packaging import version
import torch
from torch.nn import LayerNorm
from ..utils.jit_tracing import traceable
if version.Version(torch.__version__) >= version.Version('1.0.0'):
@ -17,7 +18,7 @@ if version.Version(torch.__version__) >= version.Version('1.0.0'):
else:
from torch import softmax_backward_data as _softmax_backward_data
__all__ = ['StableDropout', 'MaskedLayerNorm', 'XSoftmax']
__all__ = ['StableDropout', 'MaskedLayerNorm', 'XSoftmax', 'ACT2FN', 'LayerNorm']
@traceable
class XSoftmax(torch.autograd.Function):
@ -170,7 +171,7 @@ def MaskedLayerNorm(layerNorm, input, mask = None):
""" Masked LayerNorm which will apply mask over the output of LayerNorm to avoid inaccurate updatings to the LayerNorm module.
Args:
layernorm (:obj:`~DeBERTa.deberta.BertLayerNorm`): LayerNorm module or function
layernorm (:obj:`~DeBERTa.deberta.LayerNorm`): LayerNorm module or function
input (:obj:`torch.tensor`): The input tensor
mask (:obj:`torch.IntTensor`): The mask to be applied to the output of LayerNorm, where `0` indicates that the output of that element will be ignored, i.e. set to `0`
@ -179,7 +180,7 @@ def MaskedLayerNorm(layerNorm, input, mask = None):
# Create a tensor b x n x d
x = torch.randn([1,10,100])
m = torch.tensor([[1,1,1,0,0,0,0,0,0,0]], dtype=torch.int)
LayerNorm = DeBERTa.deberta.BertLayerNorm(100)
LayerNorm = DeBERTa.deberta.LayerNorm(100)
y = MaskedLayerNorm(LayerNorm, x, m)
"""
@ -192,3 +193,21 @@ def MaskedLayerNorm(layerNorm, input, mask = None):
mask = mask.unsqueeze(2)
mask = mask.to(output.dtype)
return output*mask
def gelu(x):
"""Implementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def swish(x):
return x * torch.sigmoid(x)
def linear_act(x):
return x
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "tanh": torch.tanh, "linear": linear_act, 'sigmoid': torch.sigmoid}

View file

@ -0,0 +1,2 @@

322
DeBERTa/deberta/spm_tokenizer.py Executable file
View file

@ -0,0 +1,322 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 11/15/2020
#
import six
import unicodedata
import os
import sentencepiece as sp
import regex as re
from .cache_utils import load_vocab
from ..utils import get_logger
logger=get_logger()
import pdb
__all__ = ['SPMTokenizer']
class SPMTokenizer:
def __init__(self, vocab_file, do_lower_case=False, special_tokens=None, bpe_dropout=0, split_by_punct=False):
self.split_by_punct = split_by_punct
spm = sp.SentencePieceProcessor()
assert os.path.exists(vocab_file)
spm.load(vocab_file)
bpe_vocab_size = spm.GetPieceSize()
# Token map
# <unk> 0+1
# <s> 1+1
# </s> 2+1
self.vocab = {spm.IdToPiece(i):i for i in range(bpe_vocab_size)}
self.id_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
#self.vocab['[PAD]'] = 0
#self.vocab['[CLS]'] = 1
#self.vocab['[SEP]'] = 2
#self.vocab['[UNK]'] = 3
_special_tokens = ['[MASK]', '[SEP]', '[PAD]', '[UNK]', '[CLS]']
self.special_tokens = []
if special_tokens is not None:
_special_tokens.extend(special_tokens)
for t in _special_tokens:
self.add_special_token(t)
self.spm = spm
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
def tokenize(self, text):
pieces = self._encode_as_pieces(text)
def _norm(x):
if x not in self.vocab or x=='<unk>':
return '[UNK]'
else:
return x
pieces = [_norm(p) for p in pieces]
return pieces
def convert_tokens_to_ids(self, tokens):
return [self.vocab[t] if t in self.vocab else 1 for t in tokens]
def convert_ids_to_tokens(self, ids):
tokens = []
for i in ids:
tokens.append(self.id_to_tokens[i])
return tokens
def decode(self, tokens, start=-1, end=-1, raw_text=None):
if raw_text is None:
return self.spm.decode_pieces([t for t in tokens if t not in self.special_tokens])
else:
words = self.split_to_words(raw_text)
word_tokens = [self.tokenize(w) for w in words]
wt = [w for t in word_tokens for w in t]
#assert tokens == wt, f'{tokens} || {wt}'
if wt!=tokens:
for a,b in zip(wt, tokens):
if a!=b:
pdb.set_trace()
token2words = [0]*len(tokens)
tid = 0
for i,w in enumerate(word_tokens):
for k,t in enumerate(w):
token2words[tid] = i
tid += 1
word_start = token2words[start]
word_end = token2words[end] if end <len(tokens) else len(words)
text = ''.join(words[word_start:word_end])
return text
def add_special_token(self, token):
if token not in self.special_tokens:
self.special_tokens.append(token)
if token not in self.vocab:
self.vocab[token] = len(self.vocab)
self.id_to_tokens.append(token)
return self.id(token)
def part_of_whole_word(self, token, is_bos=False):
if is_bos:
return True
if (len(token)==1 and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))) or token in self.special_tokens:
return False
word_start = b'\xe2\x96\x81'.decode('utf-8')
return not token.startswith(word_start)
def pad(self):
return '[PAD]'
def bos(self):
return '[CLS]'
def eos(self):
return '[SEP]'
def unk(self):
return '[UNK]'
def mask(self):
return '[MASK]'
def sym(self, id):
return self.id_to_tokens[id]
def id(self, sym):
return self.vocab[sym] if sym in self.vocab else 1
def _encode_as_pieces(self, text):
text = convert_to_unicode(text)
if self.split_by_punct:
words = self._run_split_on_punc(text)
pieces = [self.spm.encode_as_pieces(w) for w in words]
return [p for w in pieces for p in w]
else:
return self.spm.encode_as_pieces(text)
def split_to_words(self, text):
pieces = self._encode_as_pieces(text)
word_start = b'\xe2\x96\x81'.decode('utf-8')
words = []
offset = 0
prev_end = 0
for i,p in enumerate(pieces):
if p.startswith(word_start):
if offset>prev_end:
words.append(text[prev_end:offset])
prev_end = offset
w = p.replace(word_start, '')
else:
w = p
try:
s = text.index(w, offset)
pn = ""
k = i+1
while k < len(pieces):
pn = pieces[k].replace(word_start, '')
if len(pn)>0:
break
k += 1
if len(pn)>0 and pn in text[offset:s]:
offset = offset + 1
else:
offset = s + len(w)
except:
offset = offset + 1
if prev_end< offset:
words.append(text[prev_end:offset])
return words
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
#words = list(re.findall(self.pat, text))
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")

View file

@ -0,0 +1,16 @@
#
# Author: penhe@microsoft.com
# Date: 04/25/2019
#
""" tokenizers
"""
from .gpt2_tokenizer import GPT2Tokenizer
from .spm_tokenizer import *
__all__ = ['tokenizers']
tokenizers={
'gpt2': GPT2Tokenizer,
'spm': SPMTokenizer
}

View file

@ -1,10 +0,0 @@
#
# Author: penhe@microsoft.com
# Date: 04/25/2019
#
""" optimizers
"""
from .xadam import XAdam
from .fp16_optimizer import *
from .lr_schedulers import SCHEDULES

View file

@ -1,279 +0,0 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 05/30/2019
#
""" FP16 optimizer wrapper
"""
from collections import defaultdict
import numpy as np
import math
import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
import ctypes
from ..utils import get_logger
logger=get_logger()
try:
lib = ctypes.cdll.LoadLibrary(None)
lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
lib.THCudaHalfTensor_normall.restype = ctypes.c_float
except:
lib = None
logger.warning('Failed to load half normal.')
pass
__all__ = ['Fp16Optimizer', 'ExpLossScaler']
def get_world_size():
try:
wd = dist.get_world_size()
return wd
except:
return 1
def fused_norm(input):
if input.type() == 'torch.cuda.HalfTensor':
if (lib is not None):
return lib.THCudaHalfTensor_normall(torch.cuda._state_cdata, input._cdata, 16384)
else:
return input.norm()
else:
return input.norm()
class OptParameter(torch.Tensor):
def __new__(cls, data, out_data=None, grad=None, name=None):
param = torch.Tensor._make_subclass(cls, data)
param._xgrad = grad
param.out_data = out_data
param._name = name
return param
@property
def name(self):
return self._name
@property
def grad(self):
return self._xgrad
@grad.setter
def grad(self, grad):
self._xgrad = grad
class Fp16Optimizer(object):
def __init__(self, param_groups, optimizer_fn, loss_scaler=None, grad_clip_norm = 1.0, lookahead_k = -1, lookahead_alpha = 0.5):
# all parameters should be on the same device
groups = []
original_groups = []
for group in param_groups:
params = group['params'] # parameter
flattened_params = _flatten_dense_tensors([p.data for p in params])
unflattend_params = _unflatten_dense_tensors(flattened_params, [p.data for p in params])
for uf,p in zip(unflattend_params, params):
p.data = uf
if params[0].dtype==torch.half:
master_params = flattened_params.clone().to(torch.float).detach_().to(flattened_params.device)
group['params'] = [OptParameter(master_params, flattened_params, name='master')]
else:
group['params'] = [OptParameter(flattened_params, None, name='master')]
o_group = defaultdict(list)
o_group['names'] = group['names']
o_group['params'] = params
group['names'] = ['master']
original_groups.append(o_group)
groups.append(group)
self.param_groups = groups
self.loss_scaler = loss_scaler
self.optimizer = optimizer_fn(self.param_groups)
self.original_param_groups = original_groups
self.max_grad_norm = grad_clip_norm
self.lookahead_k = lookahead_k
self.lookahead_alpha = lookahead_alpha
def backward(self, loss):
if self.loss_scaler:
loss_scale, loss, step_loss = self.loss_scaler.scale(loss)
else:
loss_scale = 1
step_loss = loss.item()
loss.backward()
return loss_scale, step_loss
def step(self, lr_scale, loss_scale = 1):
grad_scale = self._grad_scale(loss_scale)
if grad_scale is None or math.isinf(grad_scale):
self.loss_scaler.update(False)
return False
if self.lookahead_k > 0:
for p in self.param_groups:
if 'la_count' not in p:
# init
#make old copy
p['la_count'] = 0
p['slow_params'] = [x.data.detach().clone().requires_grad_(False) for x in p['params']]
self.optimizer.step(grad_scale, lr_scale)
if self.lookahead_k > 0:
for p in self.param_groups:
p['la_count'] += 1
if p['la_count'] == self.lookahead_k:
p['la_count'] = 0
for s,f in zip(p['slow_params'], p['params']):
s.mul_(1-self.lookahead_alpha)
s.add_(f.data.detach()*self.lookahead_alpha)
f.data.copy_(s, non_blocking=True)
if hasattr(f, 'out_data') and f.out_data is not None:
f.out_data.copy_(f.data, non_blocking=True)
if self.loss_scaler:
self.loss_scaler.update(True)
return True
def zero_grad(self):
for group, o_group in zip(self.param_groups, self.original_param_groups):
for p in group['params']:
p.grad = None
for p in o_group['params']:
p.grad = None
def get_master_params(self):
params = []
for g, o_g in zip(self.param_groups, self.original_param_groups):
ops = [p.data for p in o_g['params']]
unflattend_params = _unflatten_dense_tensors(g['params'][0].data, ops)
for up, op, n in zip(unflattend_params, o_g['params'], o_g['names']):
params.append((n, torch.nn.Parameter(up)))
state = defaultdict(dict)
state['params'] = params
return state
def get_params_norm(self):
norm0 = torch.zeros(1, dtype=torch.float)
norm1 = torch.zeros(1, dtype=torch.float)
for g in self.param_groups:
for p in g['params']:
norm0 = norm0.to(p.data.device)
norm1 = norm1.to(p.data.device)
norm0 += fused_norm(p.data)
if p.out_data is not None:
norm1 += fused_norm(p.out_data)
norm = torch.cat((norm0, norm1))
wd = get_world_size()
if wd > 1:
norms = [torch.zeros_like(norm) for _ in range(wd)]
dist.all_gather(norms, norm)
else:
norms = [norm]
return norms
def state_dict(self):
state = defaultdict(dict)
opt_state = self.optimizer.state_dict()
state['optimizer'] = opt_state
master_params = [p['params'][0].data for p in self.param_groups if p['params'][0].out_data is not None]
state['master_params'] = master_params
if self.loss_scaler is not None:
state['scaler'] = self.loss_scaler.state_dict()
return state
def load_state_dict(self, state):
opt_state = state['optimizer']
self.optimizer.load_state_dict(opt_state)
self.param_groups = self.optimizer.param_groups
saved_master_params = state['master_params']
master_params = [p['params'][0].data for p in self.param_groups if p['params'][0].out_data is not None]
assert len(saved_master_params) == len(master_params), f'Saved master parameters must match the master parameters in the object.'
for s, m in zip(saved_master_params, master_params):
assert s.size()==m.size()
m.copy_(s)
if self.loss_scaler is not None:
self.loss_scaler.load_state_dict(state['scaler'])
def _grad_scale(self, loss_scale = 1):
norm = torch.zeros(1, dtype=torch.float)
for group, o_g in zip(self.param_groups, self.original_param_groups):
grads = [p.grad if p.grad is not None else torch.zeros_like(p.data) for p in o_g['params']]
flattened_grads = _flatten_dense_tensors(grads)
wd = get_world_size()
if wd > 1:
loss_scale *= wd
dist.all_reduce(flattened_grads)
torch.cuda.synchronize()
norm = norm.to(flattened_grads.device)
norm = norm + fused_norm(flattened_grads)**2
group['params'][0].grad = flattened_grads
norm = norm**0.5
if torch.isnan(norm) or torch.isinf(norm) :
return None
grad_scale = loss_scale
if self.max_grad_norm>0:
scale = norm/(loss_scale*self.max_grad_norm)
if scale>1:
grad_scale *= scale
return grad_scale
class ExpLossScaler:
def __init__(self, init_scale=2**16, scale_interval=1000):
self.cur_scale = init_scale
self.scale_interval = scale_interval
self.invalid_cnt = 0
self.last_scale = 0
self.steps = 0
self.down_scale_smooth = 0
def scale(self, loss):
assert self.cur_scale > 0, self.init_scale
step_loss = loss.float().detach().item()
if step_loss != 0 and math.isfinite(step_loss):
loss_scale = self.cur_scale
else:
loss_scale = 1
loss = loss.float()*loss_scale
return (loss_scale, loss, step_loss)
def update(self, is_valid = True):
if not is_valid:
self.invalid_cnt += 1
if self.invalid_cnt>self.down_scale_smooth:
self.cur_scale /= 2
self.cur_scale = max(self.cur_scale, 1)
self.last_scale = self.steps
else:
self.invalid_cnt = 0
if self.steps - self.last_scale>self.scale_interval:
self.cur_scale *= 2
self.last_scale = self.steps
self.steps += 1
def state_dict(self):
state = defaultdict(float)
state['steps'] = self.steps
state['invalid_cnt'] = self.invalid_cnt
state['cur_scale'] = self.cur_scale
state['last_scale'] = self.last_scale
return state
def load_state_dict(self, state):
self.steps = state['steps']
self.invalid_cnt = state['invalid_cnt']
self.cur_scale = state['cur_scale']
self.last_scale = state['last_scale']

View file

@ -1,63 +0,0 @@
""" Learning rate schedulers
"""
import math
import torch
from torch.optim import Optimizer
from torch.nn.utils import clip_grad_norm_
def warmup_cosine(step, total, warmup=0.002, ends = 0):
x = step/total
x = x-int(x)
if x < warmup:
return x/warmup
return 0.5 * (1.0 + math.cos(math.pi * x))
def warmup_constant(step, total, warmup=0.002, ends = 0):
x = step/total
x = x-int(x)
if x < warmup:
return x/warmup
return 1.0
def warmup_linear(step, total, warmup=0.002, ends = 0):
x = step/total
x = x-int(x)
if x < warmup:
return x/warmup
return (1-ends)*(1.0 - x) + ends
def warmup_linear_cosine(step, total, warmup=0.002, ends = 0):
x = step/total
x = x-int(x)
if x < warmup:
return x/warmup
return (1-ends)*max(0.5*(1+math.cos(math.pi*(x-warmup)/(1-warmup))), 0) + ends
def warmup_cyclic_linear_cosine(step, total, warmup=0.002, ends = 0):
x = step/total
if x < warmup:
return x/warmup
total = total - int(total*warmup)
step = step - int(total*warmup)
n_epoch = 4
period = total//n_epoch
k = step//period
s = 1-k/n_epoch + 1/(2*n_epoch)*(math.pow(-1, k)*math.cos(math.pi*step/period)-1)
return (1-ends)*max(s, 0) + ends
def warmup_linear_shift(step, total, warmup=0.002, ends = 0):
x = step/total
x = x-int(x)
if x < warmup:
return x/warmup
return (1-ends)*(1.0 - (x-warmup)/(1-warmup)) + ends
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
'warmup_linear_cosine':warmup_linear_cosine,
'warmup_cyclic_linear_cosine':warmup_cyclic_linear_cosine,
'warmup_linear_shift':warmup_linear_shift,
}

View file

@ -1,186 +0,0 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 04/25/2019
#
""" Optimizer
"""
import math
import torch
from torch.optim import Optimizer
from torch.nn.utils import clip_grad_norm_
from torch import distributed as dist
import pdb
from .lr_schedulers import SCHEDULES
from ..utils import get_logger
def adamw(data,
out_data,
next_m,
next_v,
grad,
lr,
beta1,
beta2,
eps,
grad_scale, #combined_scale, g = g/scale
step,
eps_mode = 1, #self.eps_mode, eps inside sqrt: 0, outside: 1, only update with momentum: 2
bias_correction = 0,
weight_decay = 0):
if bias_correction > 0:
lr *= bias_correction
beta1_ = 1 - beta1
beta2_ = 1 - beta2
if grad_scale != 1:
grad *= 1/grad_scale
grad = grad.to(next_m)
next_m.mul_(beta1).add_(beta1_, grad)
# admax
admax = eps_mode>>4
eps_mode = eps_mode&0xF
if admax > 0:
torch.max(next_v.mul_(beta2), grad.abs().to(next_v), out=next_v)
update = next_m/(next_v+eps)
else:
next_v.mul_(beta2).addcmul_(beta2_, grad, grad)
if eps_mode == 0:
update = next_m*(next_v+eps).rsqrt()
elif eps_mode == 1:
update = next_m/(next_v.sqrt()+eps)
else: #=2
update = next_m
if weight_decay>0:
update.add_(weight_decay, data)
data.add_(-lr, update)
if (out_data is not None) and len(out_data)>0:
out_data.copy_(data)
class XAdam(Optimizer):
"""Implements optimized version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
b1: Adams b1. Default: 0.9
b2: Adams b2. Default: 0.999
e: Adams epsilon. Default: 1e-6
weight_decay_rate: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
with_radam: Whether to enable radam. Default: False
radam_th: RAdam threshold for tractable variance. Default: 4
opt_type: The type of optimizer, [adam, admax], default: adam
"""
def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-8, weight_decay_rate=0.01,
lr_ends = 0,
max_grad_norm = 1.0,
with_radam = False,
radam_th = 4,
opt_type=None):
if not lr >= 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
self.defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate,
lr_ends = lr_ends,
max_grad_norm=max_grad_norm,
with_radam = with_radam, radam_th = radam_th)
self.opt_type = opt_type.lower() if opt_type is not None else ""
super().__init__(params, self.defaults)
def step(self, grad_scale = 1, lr_scale = 1):
"""Performs a single optimization step.
Arguments:
grad_scale: divide grad by grad_scale
lr_scale: scale the learning rate by lr_scale
"""
loss = None
if 'global_step' not in self.state:
self.state['global_step'] = 0
self.state['global_step'] += 1
self.last_grad_scale = grad_scale
for group in self.param_groups:
for param in group['params']:
grad = param.grad
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.get_state(param)
if group['t_total'] > 0:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step'], group['t_total'], group['warmup'], group['lr_ends'])
else:
lr_scheduled = group['lr']
lr = lr_scale*lr_scheduled
next_m, next_v = state['next_m'], state['next_v']
beta1, beta2 = group['b1'], group['b2']
state['step'] += 1
# Support for RAdam
t = state['step']
eps_mode = 1
if group['with_radam']:
rou_ = 2/(1-beta2) - 1
rou_t = rou_ - 2*t/(beta2**-t - 1)
bias_c = 1/(1-beta1**t)
if rou_t > group['radam_th']:
bias_c *= math.sqrt(1 - beta2**t)
bias_c *= math.sqrt(((rou_t - 4)*(rou_t - 2)*rou_)/((rou_ - 4)*(rou_ - 2)*rou_t))
else:
eps_mode = 2
bias_c = 0
lr *= bias_c
if self.opt_type == 'admax':
eps_mode |= 0x10
with torch.cuda.device(param.device.index):
out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None
if out_p is None or out_p.dtype != grad.dtype:
out_p = torch.tensor([], dtype=torch.float).to(param.data)
adamw(param.data,
out_p,
next_m,
next_v,
grad,
lr,
beta1,
beta2,
group['e'],
grad_scale, #combined_scale, g = g/scale
state['step'],
eps_mode, #self.eps_mode, eps inside sqrt: 0, outside: 1, only update with momentum: 2
0, #bias_correction,
group['weight_decay_rate'])
out_p = param.out_data if hasattr(param, 'out_data') and (param.out_data is not None) else None
if out_p is not None and out_p.dtype != grad.dtype:
out_p.copy_(param.data)
def get_state(self, param):
state = self.state[param]
# State initialization
if len(state) == 0:
state['step'] = 0
state['next_m'] = torch.zeros_like(param.data)
state['next_v'] = torch.zeros_like(param.data)
return state

View file

@ -1,3 +0,0 @@
from .trainer import DistributedTrainer, set_random_seed
from .dist_launcher import initialize_distributed,kill_children
from ._utils import batch_to,batch_apply

View file

@ -1,16 +0,0 @@
import torch
from collections import Sequence, Mapping
def batch_apply(batch, fn):
if isinstance(batch, torch.Tensor):
return fn(batch)
elif isinstance(batch, Sequence):
return [batch_apply(x, fn) for x in batch]
elif isinstance(batch, Mapping):
return {x:batch_apply(batch[x], fn) for x in batch}
else:
raise NotImplementedError(f'Type {type(batch)} is not supported in batch_apply')
def batch_to(batch, device):
return batch_apply(batch, lambda x: x.to(device))

View file

@ -1,164 +0,0 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 05/15/2019
#
import os
import time
import pdb
import signal
import torch
from multiprocessing import Process,Pool
from collections import defaultdict
import sys
import psutil
from ..utils import set_logger, get_logger
logger = get_logger()
def kill_children(proc=None, recursive = True):
if proc is None:
proc = psutil.Process()
_children = proc.children(recursive=False)
for c in _children:
try:
if recursive:
kill_children(c, recursive=recursive)
os.kill(c.pid, signal.SIGKILL)
except:
pass
for c in _children:
try:
c.wait(1)
except:
pass
def gc(i):
return torch.cuda.device_count()
def get_ngpu():
with Pool(1) as p:
return p.map(gc, range(1))[0]
def _setup_distributed_group(args):
"""Initialize torch.distributed."""
torch.backends.cudnn.enabled = False
if args.world_size == 1:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
set_logger(args.task_name, os.path.join(args.output_dir, f'training_{args.task_name}_{args.rank}.log'), rank=args.rank, verbose=1 if args.local_rank==0 else 0)
device_id = args.rank % args.n_gpu
if args.local_rank >= 0:
device_id = args.local_rank
device = torch.device("cuda", device_id)
init_method = 'tcp://'
init_method += args.master_ip + ':' + args.master_port
distributed_backend = getattr(args, 'distributed_backend', 'nccl')
torch.distributed.init_process_group(
backend=distributed_backend,
world_size=args.world_size, rank=args.rank,
init_method=init_method)
torch.cuda.set_device(device)
n_gpu = torch.cuda.device_count()
logger.info("device=%s, n_gpu=%d, distributed training=%r, world_size=%d", device, n_gpu, bool(args.world_size != 1), args.world_size)
return device
def _get_world_size(args):
world_size = int(os.getenv("WORLD_SIZE", '1'))
if not hasattr(args, 'n_gpu') or args.n_gpu is None:
n_gpu = get_ngpu()
return n_gpu * world_size
def initialize_distributed(args, join=True):
args.world_size = int(os.getenv("WORLD_SIZE", '1'))
args.rank = int(os.getenv('RANK', '0'))
args.master_ip = os.getenv('MASTER_ADDR', 'localhost')
args.master_port = os.getenv('MASTER_PORT', '17006')
if args.world_size == 1:
args.rank = 0
args.master_ip = 'localhost'
if not hasattr(args, 'n_gpu') or args.n_gpu is None:
args.n_gpu = get_ngpu()
args.node_rank = args.rank
args.world_size = args.n_gpu * args.world_size
seed = args.seed
is_child = False
if args.world_size>1:
children = []
for r in range(args.n_gpu):
args.rank = r + args.n_gpu*args.node_rank
args.local_rank = r
args.seed = seed + args.rank
child = os.fork()
if child>0:
children.append(child)
else:
signal.signal(signal.SIGINT, signal.SIG_IGN)
is_child = True
break
else:
is_child = True
if is_child:
return _setup_distributed_group(args)
else:
if join:
try:
for c in children:
cid, ccode = os.waitpid(0,0)
logger.debug(f'Worker {c} done with code {ccode}')
if ccode != 0:
logger.error(f'Worker {c} : {cid} failed with code {ccode}')
kill_children()
raise ValueError(f'Job failed. {cid}:{ccode}')
except (KeyboardInterrupt, SystemExit):
logger.warning('Keyboard interrupt by user. Terminating all processes')
kill_children(None)
return children
def test_dist_launch():
def test_functions(args):
global logger
set_logger(args.task_name, os.path.join(args.output_dir, f'training_{args.task_name}_{args.node_rank}.log'), rank=args.rank)
logger.info(args)
class Args:
def __init__(self):
pass
def __repr__(self):
return str(self.__dict__)
args = Args()
args.task_name = 'test'
args.seed = 0
args.n_gpu = None
args.no_cuda=False
args.output_dir = '/tmp'
distributed_launch(args, test_functions, (args,))
def test_init_dist():
class Args:
def __init__(self):
pass
def __repr__(self):
return str(self.__dict__)
args = Args()
args.task_name = 'test'
args.seed = 0
args.n_gpu = None
args.no_cuda=False
args.output_dir = '/tmp'
device = initialize_distributed(args)
if isinstance(device, torch.device):
return 0
else:
return 1

View file

@ -1,78 +0,0 @@
#
# Author: penhe@microsoft.com
# Date: 05/15/2019
#
from collections import defaultdict
import numpy as np
import pdb
import torch
import re
from ..optims import Fp16Optimizer,XAdam,ExpLossScaler
from ..utils import get_logger
logger=get_logger()
def xadam_factory(args, training_steps=None):
def optimizer_fn(param_groups, max_grad_norm=None):
with_radam = getattr(args, 'with_radam', False)
opt_type = getattr(args, 'opt_type', None)
optimizer = XAdam(param_groups,
lr=args.learning_rate,
b1=args.adam_beta1,
b2=args.adam_beta2,
lr_ends=args.lr_schedule_ends,
e=args.epsilon,
warmup=args.warmup_proportion if args.warmup_proportion<1 else args.warmup_proportion/training_steps,
t_total=training_steps,
schedule=args.lr_schedule,
max_grad_norm = args.max_grad_norm if max_grad_norm is None else max_grad_norm,
weight_decay_rate = args.weight_decay,
with_radam = with_radam,
opt_type = opt_type)
return optimizer
return optimizer_fn
def create_xoptimizer(model, args, num_train_steps=None, no_decay=['bias', 'LayerNorm.weight']):
if args.fp16:
loss_scaler = ExpLossScaler(scale_interval = args.scale_steps, init_scale=args.loss_scale)
else:
loss_scaler = None
_no_decay = [x.strip() for x in getattr(args, 'no_decay', '').split('|') if len(x.strip())>0]
if len(_no_decay)>0:
no_decay = _no_decay
opt_fn = xadam_factory(args, num_train_steps)
named_params = list(model.named_parameters())
type_groups = defaultdict(list)
for n,p in named_params:
key = ''
if any(re.search(nd,n) for nd in no_decay):
key += f'{str(p.dtype)}-nd'
else:
key += f'{str(p.dtype)}-d'
type_groups[key].append((n,p))
param_groups = []
for key, params in type_groups.items():
wd_theta = 0
weight_decay = args.weight_decay
if key.endswith('-nd'):
weight_decay = 0
group = dict(params=[],
weight_decay_rate=weight_decay,
wd_theta = wd_theta,
names=[])
for (n,p) in params:
group['params'].append(p)
group['names'].append(n)
param_groups.append(group)
lookahead_k = getattr(args, 'lookahead_k', -1)
lookahead_alpha = getattr(args, 'lookahead_alpha', 0.5)
optimizer = Fp16Optimizer(param_groups, opt_fn, loss_scaler, args.max_grad_norm, lookahead_k = lookahead_k,\
lookahead_alpha = lookahead_alpha)
return optimizer

View file

@ -1,225 +0,0 @@
# Copyright (c) Microsoft, Inc. 2020
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Author: penhe@microsoft.com
# Date: 05/15/2019
#
import os
import torch
import random
import time
import numpy as np
import pdb
from collections import defaultdict, Mapping, Sequence, OrderedDict
from torch.utils.data import DataLoader
from ..data import BatchSampler, DistributedBatchSampler,RandomSampler,SequentialSampler, AsyncDataLoader
from ..utils import get_logger
logger = get_logger()
from .dist_launcher import get_ngpu
from .optimizer_utils import create_xoptimizer
from ._utils import batch_to
def set_random_seed(seed, cpu_only=False):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
n_gpu = get_ngpu()
if n_gpu > 0 and not cpu_only:
torch.cuda.manual_seed_all(seed)
class TrainerState:
def __init__(self, training_steps):
self.__dict__ = defaultdict(float)
self.loss = 0.0
self.examples = 0
self.steps = 0
self._last_report_step = 0
self.epochs = 0
self.next_batch = 0
self.num_training_steps = training_steps
self._last_report_time = time.time()
self.best_steps = 0
self.best_metric = -1e9
self.run_id = None
def update_step(self, loss, examples, loss_scale):
self.examples += examples
self.loss += loss
self.steps += 1
self.next_batch += 1
self.loss_scale = loss_scale
def report_state(self):
if self.steps <= self._last_report_step:
return
end = time.time()
start = self._last_report_time
logger.info('[{:0.1f}%][{:0.2f}h] Steps={}, loss={}, examples={}, loss_scale={:0.1f}, {:0.1f}s'.format(100*self.steps/self.num_training_steps, \
(self.num_training_steps - self.steps)*(start-end)/((self.steps-self._last_report_step)*3600), self.steps, self.loss/self.steps, self.examples, self.loss_scale, end-start))
self._last_report_time = end
self._last_report_step = self.steps
class DistributedTrainer:
def __init__(self, args, model, device, data_fn, loss_fn=None, optimizer_fn=None, eval_fn=None, init_fn=None, update_fn=None, dump_interval = 10000, **kwargs):
"""
data_fn returns a tuple (training_dataset, training_steps, train_sampler, batch_scheduler); training_dataset is required
loss_fn returns the loss of the current mini-batch and the size of the batch
optimizer_fn returns the created optimizer
eval_fn returns metrics for model selection
"""
self.__dict__.update(kwargs)
self.args = args
self.device = device
self.eval_fn = eval_fn
self.accumulative_update = 1
if hasattr(args, 'accumulative_update'):
self.accumulative_update = args.accumulative_update
train_data, training_steps, train_sampler = data_fn(self)
self.train_data = train_data
self.train_sampler = train_sampler if train_sampler is not None else RandomSampler(len(train_data))
self.training_epochs = int(getattr(args, 'num_train_epochs', 1))
if training_steps is None:
training_steps = getattr(args, 'training_steps', (len(training_data) + self.args.train_batch_size-1)//self.args.train_batch_size*self.training_epochs)
self.training_steps = training_steps
self.init_fn = init_fn
self.trainer_state = TrainerState(self.training_steps)
self.dump_interval = dump_interval
self.model = self._setup_model(args, model)
def _opt_fn(trainer, model, training_steps):
return create_xoptimizer(model, args, num_train_steps = training_steps)
optimizer_fn = optimizer_fn if optimizer_fn is not None else _opt_fn
self.optimizer = optimizer_fn(self, model, training_steps)
def _loss_fn(trainer, model, batch):
_,loss = model(**batch)
batch_size = batch['input_ids'].size(0)
return loss.mean(), batch_size
self.loss_fn = loss_fn if loss_fn is not None else _loss_fn
self.initialized = False
self.update_fn = update_fn
def initialize(self):
set_random_seed(self.args.seed)
if self.args.world_size>1:
torch.distributed.barrier()
self.initialized = True
def train(self):
if not self.initialized:
self.initialize()
rank = self.args.rank
world_size = self.args.world_size
for n_epoch in range(self.trainer_state.epochs, self.training_epochs):
batch_sampler = BatchSampler(self.train_sampler, self.args.train_batch_size)
batch_sampler = DistributedBatchSampler(batch_sampler, rank = rank, world_size = world_size)
batch_sampler.next = self.trainer_state.next_batch
num_workers = getattr(self.args, 'workers', 2)
train_dataloader = DataLoader(self.train_data, batch_sampler=batch_sampler, num_workers=num_workers, worker_init_fn=self.init_fn, pin_memory=True)
torch.cuda.empty_cache()
for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)):
if self.trainer_state.steps >= self.training_steps:
break
bs_scale = 1
batch = batch_to(batch, self.device)
self._train_step(batch, bs_scale)
if self.trainer_state.steps%100 == 0:
self.trainer_state.report_state()
if self.trainer_state.steps%self.dump_interval == 0:
self._eval_model()
# Save model
self.trainer_state.epochs += 1
self.trainer_state.next_batch = 0
self.trainer_state.report_state()
self._eval_model()
def save_model(self, args, checkpoint_dir, chk_postfix, model, optimizer):
save_path= os.path.join(checkpoint_dir, f'pytorch.model-{chk_postfix}.bin')
param_optimizer = optimizer.get_master_params()['params']
if hasattr(model, 'module'):
model_state = OrderedDict([(n,p) for n,p in model.module.state_dict().items()])
else:
model_state = OrderedDict([(n,p) for n,p in model.state_dict().items()])
if args.fp16:
model_state.update([(n[len('module.'):] if n.startswith('module.') else n ,p.detach().cpu().requires_grad_(model_state[n].requires_grad)) for n,p in param_optimizer])
if args.rank < 1:
torch.save(model_state, save_path)
return save_path
def _eval_model(self, with_checkpoint=True):
if with_checkpoint:
checkpoint_dir = getattr(self.args, 'checkpoint_dir', None)
checkpoint_dir = checkpoint_dir if checkpoint_dir is not None else self.args.output_dir
chk_postfix = f'{self.trainer_state.steps:06}'
self.save_model(self.args, checkpoint_dir, chk_postfix, self.model, self.optimizer)
_metric = self.trainer_state.best_metric
_steps = self.trainer_state.best_steps
if self.eval_fn is not None:
metric = self.eval_fn(self, self.model, self.device, tag=f'{self.trainer_state.steps:06}-{self.training_steps}')
if metric > _metric:
_metric = metric
_steps = self.trainer_state.steps
logger.info(f'Best metric: {_metric}@{_steps}')
self.trainer_state.best_metric, self.trainer_state.best_steps = _metric, _steps
def _train_step(self, data, bs_scale):
self.model.train()
go_next=False
def split(batch, parts):
sub_batches = [{} for _ in range(parts)]
for k in batch.keys():
b = batch[k].size(0)
s = (b + parts - 1)//parts
v = batch[k].split(s)
for i,z in enumerate(v):
sub_batches[i][k]=z
chunks = [b for b in sub_batches if len(b)>0]
return chunks
if self.accumulative_update>1:
data_chunks = split(data, self.accumulative_update)
else:
data_chunks = [data]
while not go_next:
step_loss = 0
batch_size = 0
self.optimizer.zero_grad()
for i, sub in enumerate(data_chunks):
loss, sub_size = self.loss_fn(self, self.model, sub)
loss = loss/len(data_chunks)
if i == 0:
loss_scale, _loss = self.optimizer.backward(loss)
else:
_loss = loss.float().detach().item()
loss = loss.float() * loss_scale
loss.backward()
step_loss += _loss
batch_size += sub_size
if not self.optimizer.step(bs_scale, loss_scale):
self.optimizer.zero_grad()
continue
go_next = True
self.trainer_state.update_step(step_loss, batch_size , loss_scale)
if self.update_fn is not None:
self.update_fn(self, self.model, loss_scale)
self.optimizer.zero_grad()
def _setup_model(self, args, model):
if args.world_size > 1:
for p in model.parameters():
torch.distributed.broadcast(p.data, 0)
torch.cuda.synchronize()
return model

103
README.md
View file

@ -3,25 +3,52 @@
This repository is the official implementation of [ **DeBERTa**: **D**ecoding-**e**nhanced **BERT** with Disentangled **A**ttention ](https://arxiv.org/abs/2006.03654)
## News
### 2/03/2021
DeBERTa v2 code and the **900M, 1.5B** [models](https://huggingface.co/models?search=microsoft%2Fdeberta) are here now. This includes the 1.5B model used for our SuperGLUE single-model submission, which achieves 89.9, versus the human baseline of 89.8. You can find more details about this submission in our [blog](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/)
#### What's new in v2
- **Vocabulary** In v2 we use a new vocabulary of size 128K built from the training data. Instead of the GPT2 tokenizer, we use the [sentencepiece](https://github.com/google/sentencepiece) tokenizer.
- **nGiE (nGram Induced Input Encoding)** In v2 we use an additional convolution layer alongside the first transformer layer to better learn the local dependency of input tokens. We will add more ablation studies on this feature.
- **Sharing the position projection matrix with the content projection matrix in the attention layer** Based on our previous experiments, we found this saves parameters without affecting the performance.
- **Applying log buckets to encode relative positions** In v2 we use log buckets to encode relative positions, similar to T5 (see the sketch after this list).
- **900M model & 1.5B model** In v2 we scale our model size to 900M and 1.5B, which significantly improves the performance of downstream tasks.
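For intuition, the bucketing works like T5's relative-position buckets: small relative distances keep their exact value, while larger distances are compressed logarithmically so that every distance fits into a fixed number of buckets. A minimal NumPy sketch of the idea (the function name and the `bucket_size`/`max_position` defaults are illustrative, not the repo's API):

``` python
import numpy as np

def log_bucket_position(relative_pos, bucket_size=256, max_position=512):
    # Distances inside (-bucket_size/2, bucket_size/2) keep their exact value;
    # larger distances are squashed logarithmically into the remaining buckets.
    sign = np.sign(relative_pos)
    mid = bucket_size // 2
    abs_pos = np.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, np.abs(relative_pos))
    log_pos = np.ceil(np.log(abs_pos / mid) / np.log((max_position - 1) / mid) * (mid - 1)) + mid
    return np.where(abs_pos <= mid, relative_pos, log_pos * sign).astype(int)

# Small distances map to themselves; 300 and 400 land in nearby log buckets.
print(log_bucket_position(np.array([0, 10, 300, 400])))
```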
### 12/29/2020
With the DeBERTa 1.5B model, we surpass the T5 11B model and human performance on the SuperGLUE leaderboard. Code and model will be released soon. Please check out our paper for more details.
### 06/13/2020
We released the pre-trained models, source code, and fine-tuning scripts to reproduce some of the experimental results in the paper. You can follow similar scripts to apply DeBERTa to your own experiments or applications. Pre-training scripts will be released in the next step.
## TODOs
- [ ] Add SuperGLUE tasks
- [ ] Add SiFT code
- [ ] Add Pretraining code
## Introduction to DeBERTa
DeBERTa (Decoding-enhanced BERT with disentangled attention) improves the BERT and RoBERTa models using two novel techniques. The first is the disentangled attention mechanism, where each word is represented using two vectors that encode its content and position, respectively, and the attention weights among words are computed using disentangled matrices on their contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency of model pre-training and performance of downstream tasks.
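To make the first technique concrete, the attention score between tokens `i` and `j` decomposes into content-to-content, content-to-position and position-to-content terms computed from separate content and relative-position projections, and the sum is scaled by sqrt(3d). A toy PyTorch sketch of that decomposition (tensor names and shapes are illustrative, not the repository's implementation):

``` python
import torch

B, H, N, d, K = 2, 4, 8, 16, 32                              # batch, heads, seq len, head dim, position buckets
Qc, Kc = torch.randn(B, H, N, d), torch.randn(B, H, N, d)    # content projections of queries/keys
Qr, Kr = torch.randn(H, K, d), torch.randn(H, K, d)          # relative-position projections
rel = torch.randint(0, K, (N, N))                            # bucketed relative position delta(i, j)

c2c = torch.einsum('bhid,bhjd->bhij', Qc, Kc)                                                       # content-to-content
c2p = torch.einsum('bhid,hkd->bhik', Qc, Kr).gather(-1, rel.expand(B, H, N, N))                     # content-to-position
p2c = torch.einsum('bhjd,hkd->bhjk', Kc, Qr).gather(-1, rel.expand(B, H, N, N)).transpose(-1, -2)   # position-to-content
scores = (c2c + c2p + p2c) / (3 * d) ** 0.5
probs = torch.softmax(scores, dim=-1)                        # attention weights over keys
```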
# Pre-trained Models
Our pre-trained models are packaged into zipped files. You can download them from our [releasements](https://github.com/microsoft/DeBERTa/releases), or download an individual model via the links below:
- [Large](https://github.com/microsoft/DeBERTa/releases/download/v0.1/large.zip): the pre-trained Large model
- [Base](https://github.com/microsoft/DeBERTa/releases/download/v0.1/base.zip) : the pre-trained Base model
- [Large MNLI](https://github.com/microsoft/DeBERTa/releases/download/v0.1/large_mnli.zip): Large model fine-tuned with MNLI task
- [Base MNLI](https://github.com/microsoft/DeBERTa/releases/download/v0.1/base_mnli.zip): Base model fine-tuned with MNLI task
Our pre-trained models are packaged into zipped files. You can download them from our [releases](https://huggingface.co/models?search=microsoft%2Fdeberta), or download an individual model via the links below:
|Model | Parameters| Hidden Size | Layers| Note|
|-------------|------|-----|-----|---------|
|**[XXLarge-V2](https://huggingface.co/microsoft/deberta-xxlarge-v2)<sup>1</sup>**|1.5B|1536| 48|128K new SPM vocab |
|[XLarge-V2](https://huggingface.co/microsoft/deberta-xlarge-v2)|900M|1536| 24| 128K new SPM vocab|
|[XLarge](https://huggingface.co/microsoft/deberta-xlarge)|750M|1024|48| Same vocab as RoBERTa|
|[Large](https://huggingface.co/microsoft/deberta-large)|400M|1024|24|Same vocab as RoBERTa|
|[Base](https://huggingface.co/microsoft/deberta-base)|140M|768|12|Same vocab as RoBERTa|
|[XXLarge-V2-MNLI](https://huggingface.co/microsoft/deberta-xxlarge-v2-mnli)|1.5B|1536| 48|Fine-tuned with MNLI |
|[XLarge-V2-MNLI](https://huggingface.co/microsoft/deberta-xlarge-v2-mnli)|900M|1536| 24|Fine-tuned with MNLI |
|[XLarge-MNLI](https://huggingface.co/microsoft/deberta-xlarge-mnli)|750M|1024|48|Fine-tuned with MNLI|
|[Large-MNLI](https://huggingface.co/microsoft/deberta-large-mnli)|400M|1024|24|Fine-tuned with MNLI|
|[Base-MNLI](https://huggingface.co/microsoft/deberta-base-mnli)|140M|768|12|Fine-tuned with MNLI|
# Try the code
## Note
- 1 This is the model (89.9) that surpassed **T5 11B (89.3) and human performance (89.8)** on **SuperGLUE** for the first time. 128K new SPM vocab.
# Try the model
Read our [documentation](https://deberta.readthedocs.io/en/latest/)
@ -37,6 +64,7 @@ Read our [documentation](https://deberta.readthedocs.io/en/latest/)
There are several ways to try our code,
### Use docker
Docker is the recommended way to run the code, as we have already built every dependency into our docker image [bagai/deberta](https://hub.docker.com/r/bagai/deberta); you can follow the [docker official site](https://docs.docker.com/engine/install/ubuntu/) to install docker on your machine.
To run with docker, make sure your system fulfills the requirements in the above list. Here are the steps to try the GLUE experiments: pull the code, then run `./run_docker.sh`
@ -60,11 +88,11 @@ class MyModel(torch.nn.Module):
def __init__(self):
super().__init__()
# Your existing model code
self.bert = deberta.DeBERTa(pre_trained='base') # Or 'large' or 'base_mnli' or 'large_mnli'
self.deberta = deberta.DeBERTa(pre_trained='base') # Or 'large' 'base-mnli' 'large-mnli' 'xlarge' 'xlarge-mnli' 'xlarge-v2' 'xxlarge-v2'
# Your existing model code
# do initialization as before
#
self.bert.apply_state() # Apply the pre-trained model of DeBERTa at the end of the constructor
self.deberta.apply_state() # Apply the pre-trained model of DeBERTa at the end of the constructor
#
def forward(self, input_ids):
# The inputs to DeBERTa forward are
@ -78,11 +106,12 @@ class MyModel(torch.nn.Module):
# - If it's an attention mask then it will be torch.LongTensor of shape [batch_size, sequence_length, sequence_length].
# In this case, it's a mask indicating which tokens in the sequence should be attended to by other tokens in the sequence.
# `output_all_encoded_layers`: whether to output results of all encoder layers, default, True
encoding = self.bert(input_ids)[-1]
encoding = self.deberta(input_ids)[-1]
# 2. Change your tokenizer to the tokenizer built into DeBERTa
from DeBERTa import deberta
tokenizer = deberta.GPT2Tokenizer()
vocab_path, vocab_type = deberta.load_vocab(pretrained_id='base')
tokenizer = deberta.tokenizers[vocab_type](vocab_path)
# We apply the same schema of special tokens as BERT, e.g. [CLS], [SEP], [MASK]
max_seq_len = 512
tokens = tokenizer.tokenize('Examples input text of DeBERTa')
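# A minimal sketch of how the tokens above are typically turned into padded model inputs
# (the [CLS]/[SEP] schema follows the comment above; the variable names here are illustrative):
tokens = tokens[:max_seq_len - 2]                     # leave room for [CLS] and [SEP]
tokens = ['[CLS]'] + tokens + ['[SEP]']
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
padding = max_seq_len - len(input_ids)                # pad up to max_seq_len
input_ids = input_ids + [0] * padding
input_mask = input_mask + [0] * padding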
@ -108,7 +137,8 @@ For glue tasks,
1. Get the data
``` bash
cache_dir=/tmp/DeBERTa/
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/master/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
cd experiments/glue
./download_data.sh $cache_dir/glue_tasks
```
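If the download succeeds, each task ends up in its own folder with TSV splits under the cache directory (layout assumed from the existence checks in the experiment scripts), for example:
``` bash
ls $cache_dir/glue_tasks/STS-B
# train.tsv  dev.tsv  test.tsv  ...
```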
2. Run task
@ -117,7 +147,7 @@ curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/master/scripts/downlo
task=STS-B
OUTPUT=/tmp/DeBERTa/exps/$task
export OMP_NUM_THREADS=1
python3 -m DeBERTa.apps.train --task_name $task --do_train \
python3 -m DeBERTa.apps.run --task_name $task --do_train \
--data_dir $cache_dir/glue_tasks/$task \
--eval_batch_size 128 \
--predict_batch_size 128 \
@ -132,10 +162,9 @@ python3 -m DeBERTa.apps.train --task_name $task --do_train \
--max_seq_len 128
```
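After training finishes, the same entry point can be reused for evaluation or prediction only. This is a sketch that assumes the `--do_eval`/`--do_predict` flags of the older `experiments/utils/train.sh` wrapper are also accepted by `DeBERTa.apps.run`; the checkpoint path is a placeholder:
``` bash
python3 -m DeBERTa.apps.run --task_name $task --do_eval --do_predict \
  --data_dir $cache_dir/glue_tasks/$task \
  --init_model <path to the fine-tuned checkpoint> \
  --eval_batch_size 128 \
  --predict_batch_size 128 \
  --output_dir $OUTPUT
```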
## Important Notes
1. To run our code on multiple GPUs, you must `OMP_NUM_THREADS=1` before launch our training code
2. By default we will cache the pre-trained model and tokenizer at `$HOME/.~DeBERTa`, you may need to clean it if the downloading failed unexpectedly.
## Notes
- 1. By default we cache the pre-trained models and tokenizer at `$HOME/.~DeBERTa`; you may need to clean it if a download failed unexpectedly.
- 2. You can also try our models with [HF Transformers](https://github.com/huggingface/transformers), but when you try the XXLarge model you need to specify the --sharded_ddp argument. Please check our [XXLarge model card](https://huggingface.co/microsoft/deberta-xxlarge-v2) for more details; a minimal loading sketch follows below.
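For instance, a minimal sketch using the standard Transformers API (the model id comes from the table above; this uses the plain base checkpoint, not the repo's own loader, and newer checkpoints may require a newer Transformers release):
``` python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
model = AutoModel.from_pretrained("microsoft/deberta-base")

inputs = tokenizer("DeBERTa improves BERT with disentangled attention", return_tensors="pt")
outputs = model(**inputs)   # outputs.last_hidden_state has shape [1, seq_len, 768]
```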
## Experiments
Our fine-tuning experiments are carried out on half a DGX-2 node with 8x 32GB V100 GPU cards; the results may vary due to different GPU models, drivers, CUDA SDK versions, use of FP16 or FP32, and random seeds.
@ -143,22 +172,44 @@ We report our numbers based on multiple runs with different random seeds here. He
|Task |Command |Results |Running Time(8x32G V100 GPUs)|
|--------|---------------|---------------|-------------------------|
|MNLI xlarge| `experiments/glue/mnli_xlarge.sh`| 91.5/91.4 +/-0.1| 2.5h|
|MNLI large| `experiments/glue/mnli_large.sh`| 91.2/91.0 +/-0.1| 2.5h|
|QQP large| `experiments/glue/qqp_large.sh`| 92.3 +/-0.1| 6h|
|QNLI large| `experiments/glue/qnli_large.sh`| 95.3 +/-0.2| 2h|
|MRPC large| `experiments/glue/mrpc_large.sh`| 93.4 +/-0.5| 0.5h|
|RTE large| `experiments/glue/rte_large.sh`| 87.7 +/-1.0| 0.5h|
|SST-2 large| `experiments/glue/sst2_large.sh`| 96.7 +/-0.3| 1h|
|STS-b large| `experiments/glue/Stsb_large.sh`| 92.5 +/-0.3| 0.5h|
|CoLA large| `experiments/glue/cola_large.sh`| 70.5 +/-1.0| 0.5h|
|**MNLI xxlarge v2**| `experiments/glue/mnli.sh xxlarge-v2`| **91.7/91.9** +/-0.1| 4h|
|MNLI xlarge v2| `experiments/glue/mnli.sh xlarge-v2`| 91.7/91.6 +/-0.1| 2.5h|
|MNLI xlarge| `experiments/glue/mnli.sh xlarge`| 91.5/91.2 +/-0.1| 2.5h|
|MNLI large| `experiments/glue/mnli.sh large`| 91.3/91.1 +/-0.1| 2.5h|
|QQP large| `experiments/glue/qqp.sh large`| 92.3 +/-0.1| 6h|
|QNLI large| `experiments/glue/qnli.sh large`| 95.3 +/-0.2| 2h|
|MRPC large| `experiments/glue/mrpc.sh large`| 91.9 +/-0.5| 0.5h|
|RTE large| `experiments/glue/rte.sh large`| 86.6 +/-1.0| 0.5h|
|SST-2 large| `experiments/glue/sst2.sh large`| 96.7 +/-0.3| 1h|
|STS-b large| `experiments/glue/stsb.sh large`| 92.5 +/-0.3| 0.5h|
|CoLA large| `experiments/glue/cola.sh large`| 70.5 +/-1.0| 0.5h|
And here are the results from the Base model
|Task |Command |Results |Running Time(8x32G V100 GPUs)|
|--------|---------------|---------------|-------------------------|
|MNLI base| `experiments/glue/mnli_base.sh`| 88.8/88.5 +/-0.2| 1.5h|
|MNLI base| `experiments/glue/mnli.sh base`| 88.8/88.5 +/-0.2| 1.5h|
#### Fine-tuning on NLU tasks
We present the dev results on SQuAD 1.1/2.0 and several GLUE benchmark tasks.
| Model | SQuAD 1.1 | SQuAD 2.0 | MNLI-m/mm | SST-2 | QNLI | CoLA | RTE | MRPC | QQP |STS-B |
|---------------------------|-----------|-----------|-------------|-------|------|------|--------|-------|-------|------|
| | F1/EM | F1/EM | Acc | Acc | Acc | MCC | Acc |Acc/F1 |Acc/F1 |P/S |
| BERT-Large | 90.9/84.1 | 81.8/79.0 | 86.6/- | 93.2 | 92.3 | 60.6 | 70.4 | 88.0/- | 91.3/- |90.0/- |
| RoBERTa-Large | 94.6/88.9 | 89.4/86.5 | 90.2/- | 96.4 | 93.9 | 68.0 | 86.6 | 90.9/- | 92.2/- |92.4/- |
| XLNet-Large | 95.1/89.7 | 90.6/87.9 | 90.8/- | 97.0 | 94.9 | 69.0 | 85.9 | 90.8/- | 92.3/- |92.5/- |
| [DeBERTa-Large](https://huggingface.co/microsoft/deberta-large)<sup>1</sup> | 95.5/90.1 | 90.7/88.0 | 91.3/91.1| 96.5|95.3| 69.5| 91.0| 92.6/94.6| 92.3/- |92.8/92.5 |
| [DeBERTa-XLarge](https://huggingface.co/microsoft/deberta-xlarge)<sup>1</sup> | -/- | -/- | 91.5/91.2| 97.0 | - | - | 93.1 | 92.1/94.3 | - |92.9/92.7|
| [DeBERTa-XLarge-V2](https://huggingface.co/microsoft/deberta-xlarge-v2)<sup>1</sup>|95.8/90.8| 91.4/88.9|91.7/91.6| **97.5**| 95.8|71.1|**93.9**|92.0/94.2|92.3/89.8|92.9/92.9|
|**[DeBERTa-XXLarge-V2](https://huggingface.co/microsoft/deberta-xxlarge-v2)<sup>1</sup>**|**96.1/91.4**|**92.2/89.7**|**91.7/91.9**|97.2|**96.0**|**72.0**| 93.5| **93.1/94.9**|**92.7/90.3** |**93.2/93.1** |
--------
##### Notes.
- <sup>1</sup> Following RoBERTa, for RTE, MRPC, and STS-B, we fine-tune the tasks based on [DeBERTa-Large-MNLI](https://huggingface.co/microsoft/deberta-large-mnli), [DeBERTa-XLarge-MNLI](https://huggingface.co/microsoft/deberta-xlarge-mnli), [DeBERTa-XLarge-V2-MNLI](https://huggingface.co/microsoft/deberta-xlarge-v2-mnli), and [DeBERTa-XXLarge-V2-MNLI](https://huggingface.co/microsoft/deberta-xxlarge-v2-mnli). The results of SST-2/QQP/QNLI/SQuAD v2 would also be slightly improved when starting from MNLI fine-tuned models; however, we only report the numbers fine-tuned from the pretrained base models for those 4 tasks.
## Contacts
Pengcheng He(penhe@microsoft.com), Xiaodong Liu(xiaodl@microsoft.com), Jianfeng Gao(jfgao@microsoft.com), Weizhu Chen(wzchen@microsoft.com)

Просмотреть файл

@ -1 +1 @@
0.1.8
0.1.11

Просмотреть файл

@ -1,7 +1,8 @@
# This is the docker file built for GLUE task
FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
RUN apt-get update && apt-get install -y vim gawk wget software-properties-common vim-addon-manager locales gdb
RUN apt-get update && apt-get install -y python3.6 python3.6-dev python3.6-doc python3.6-dbg python-distutils
RUN apt-get update && apt-get install -y python3.6 python3.6-dev python3.6-doc python3.6-dbg
#python-distutils
ENV LANG="en_US.UTF-8" LC_ALL="en_US.UTF-8" LANGUAGE="en_US.UTF-8" LC_TYPE="en_US.UTF-8" TERM=xterm-256color
RUN locale-gen en_US en_US.UTF-8

Просмотреть файл

@ -1,20 +1,20 @@
# GLUE fine-tuning task
To run the experiment, you need to
run `./mnli_base.sh` for fine-tuning mnli base model,
run `./mnli.sh base` for fine-tuning the MNLI base model,
run `./mnli_large.sh` for fine-tuning mnli large model.
run `./mnli.sh large` for fine-tuning the MNLI large model.
run `./cola_large.sh` for fine-tuning cola large model.
run `./cola.sh large` for fine-tuning the CoLA large model.
run `./sst2_large.sh` for fine-tuning sst2 large model.
run `./sst2.sh large` for fine-tuning the SST-2 large model.
run `./stsb_large.sh` for fine-tuning stsb large model.
run `./stsb.sh large` for fine-tuning the STS-B large model.
run `./rte_large.sh` for fine-tuning rte large model.
run `./rte.sh large` for fine-tuning the RTE large model.
run `./qqp_large.sh` for fine-tuning qqp large model.
run `./qqp.sh large` for fine-tuning the QQP large model.
run `./qnli_large.sh` for fine-tuning qnli large model.
run `./qnli.sh large` for fine-tuning the QNLI large model.
run `./mrpc_large.sh` for fine-tuning mrpc large model.
run `./mrpc.sh large` for fine-tuning the MRPC large model.

81
experiments/glue/cola.sh Executable file
Просмотреть файл

@ -0,0 +1,81 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=CoLA
setup_glue_data $Task
# The performance will be better when it's initialized with MNLI fine-tuned models
init=$1
tag=$init
case ${init,,} in
base)
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 2e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 "
;;
large)
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 7e-6 \
--train_batch_size 32 \
--cls_drop_out 0.15 \
--fp16 True "
;;
xlarge)
parameters=" --num_train_epochs 8 \
--warmup 100 \
--learning_rate 5e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xlarge-v2)
parameters=" --num_train_epochs 8 \
--warmup 100 \
--learning_rate 3e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xxlarge-v2)
parameters=" --num_train_epochs 10 \
--warmup 100 \
--learning_rate 2.5e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2-mnli - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2-mnli - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--do_train \
--tag $tag \
--max_seq_len 64 \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$task $parameters
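A typical invocation passes the pretrained configuration name as the only argument, matching the usage message above; the other GLUE scripts in this directory follow the same pattern:
``` bash
cd experiments/glue
./cola.sh large         # fine-tune the large model on CoLA
./cola.sh xxlarge-v2    # fine-tune the XXLarge-V2 model on CoLA
```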

Просмотреть файл

@ -1,22 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
fi
}
init=large
tag=Large
Task=CoLA
setup_glue_data $Task
../utils/train.sh --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -i $init -o /tmp/ttonly/$tag/$task -- --num_train_epochs 5 --accumulative_update 1 --warmup 100 --learning_rate 1e-5 --train_batch_size 32 --max_seq_len 64

Просмотреть файл

@ -0,0 +1,17 @@
#!/bin/bash
cache_dir=$1
task=$2
if [[ -z $cache_dir ]]; then
cache_dir=/tmp/DeBERTa/glue
fi
mkdir -p $cache_dir
curl -s -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py -o $cache_dir/glue.py
patch $cache_dir/glue.py patch.diff
if [[ -z $task ]]; then
python3 $cache_dir/glue.py --data_dir $cache_dir/
else
python3 $cache_dir/glue.py --data_dir $cache_dir/ --tasks $task
fi
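Typical usage of this helper, with the arguments defined above (target directory first, then an optional single task):
``` bash
./download_data.sh /tmp/DeBERTa/glue_tasks          # download all GLUE tasks
./download_data.sh /tmp/DeBERTa/glue_tasks MNLI     # download a single task
```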

81
experiments/glue/mnli.sh Executable file
Просмотреть файл

@ -0,0 +1,81 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=MNLI
setup_glue_data $Task
init=$1
tag=$init
case ${init,,} in
base)
parameters=" --num_train_epochs 3 \
--warmup 1000 \
--learning_rate 2e-5 \
--train_batch_size 64 \
--cls_drop_out 0.1 "
;;
large)
parameters=" --num_train_epochs 3 \
--warmup 1000 \
--learning_rate 1e-5 \
--train_batch_size 64 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xlarge)
parameters=" --num_train_epochs 3 \
--warmup 1000 \
--learning_rate 5e-6 \
--train_batch_size 64 \
--cls_drop_out 0.3 \
--fp16 True "
;;
xlarge-v2)
parameters=" --num_train_epochs 3 \
--warmup 1000 \
--learning_rate 4e-6 \
--train_batch_size 64 \
--cls_drop_out 0.25 \
--fp16 True "
;;
xxlarge-v2)
parameters=" --num_train_epochs 3 \
--warmup 1000 \
--learning_rate 3e-6 \
--train_batch_size 64 \
--cls_drop_out 0.3 \
--fp16 True "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2 - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2 - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--do_train \
--max_seq_len 256 \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$task $parameters

Просмотреть файл

@ -1,21 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks --tasks $task
fi
}
init=base
tag=Base
Task=MNLI
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$task -- --num_train_epochs 3 --accumulative_update 1 --warmup 1000 --learning_rate 2e-5 --train_batch_size 64 --max_seq_length 448 --dump 5000 --cls_drop 0.1 #--fp16 True

Просмотреть файл

@ -1,21 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks --tasks $task
fi
}
init=large
tag=Large
Task=MNLI
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$task -- --num_train_epochs 2 --accumulative_update 1 --warmup 500 --learning_rate 8e-6 --train_batch_size 64 --max_seq_length 448 --dump 5000 --cls_drop 0.15 --fp16 True

Просмотреть файл

@ -1,21 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks --tasks $task
fi
}
init=xlarge
tag=XLarge
Task=MNLI
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$task -- --num_train_epochs 2 --accumulative_update 2 --warmup 500 --learning_rate 8e-6 --train_batch_size 64 --max_seq_length 256 --dump 5000 --cls_drop 0.15 --fp16 True --max_grad_norm 10

90
experiments/glue/mrpc.sh Executable file
Просмотреть файл

@ -0,0 +1,90 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=MRPC
setup_glue_data $Task
# The performance will be better when it's initialized with MNLI fine-tuned models
init=$1
tag=$init
case ${init,,} in
base)
init=${init}-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 2e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 \
--max_seq_len 128"
;;
large)
init=${init}-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 1e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 \
--fp16 True \
--max_seq_len 128 "
;;
xlarge)
init=${init}-mnli
parameters=" --num_train_epochs 8 \
--warmup 50 \
--learning_rate 7e-6 \
--train_batch_size 32 \
--cls_drop_out 0.3 \
--fp16 True \
--max_seq_len 128 "
;;
xlarge-v2)
init=${init}-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 3.5e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True \
--max_seq_len 128 "
;;
xxlarge-v2)
init=${init}-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 2e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True \
--max_seq_len 128 "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2-mnli - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2-mnli - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--do_train \
--tag $tag \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$task $parameters

Просмотреть файл

@ -1,20 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
fi
}
init=large
tag=Large
Task=MRPC
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$Task -- --num_train_epochs 6 --accumulative_update 1 --warmup 50 --learning_rate 1e-5 --train_batch_size 32 --max_seq_length 128 --dump 500 --cls_drop 0.15

Просмотреть файл

@ -0,0 +1,32 @@
--- download_glue_data.py 2021-02-01 18:22:04.664290174 -0500
+++ download_glue_data_fixed.py 2021-02-01 18:21:13.399941815 -0500
@@ -31,18 +31,18 @@
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {
- "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", # noqa
- "SST": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8", # noqa
- "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc", # noqa
- "QQP": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277", # noqa
- "STS": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5", # noqa
- "MNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce", # noqa
- "SNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df", # noqa
- "QNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601", # noqa
- "RTE": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb", # noqa
- "WNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf", # noqa
+ "CoLA": "https://dl.fbaipublicfiles.com/glue/data/CoLA.zip",
+ "SST": "https://dl.fbaipublicfiles.com/glue/data/SST-2.zip",
+ "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc",
+ "QQP": "https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip",
+ "STS": "https://dl.fbaipublicfiles.com/glue/data/STS-B.zip",
+ "MNLI": "https://dl.fbaipublicfiles.com/glue/data/MNLI.zip",
+ "SNLI": "https://dl.fbaipublicfiles.com/glue/data/SNLI.zip",
+ "QNLI": "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip",
+ "RTE": "https://dl.fbaipublicfiles.com/glue/data/RTE.zip",
+ "WNLI": "https://dl.fbaipublicfiles.com/glue/data/WNLI.zip",
"diagnostic": [
- "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D", # noqa
+ "https://dl.fbaipublicfiles.com/glue/data/AX.tsv",
"https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1",
],
}

89
experiments/glue/qnli.sh Executable file
Просмотреть файл

@ -0,0 +1,89 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=QNLI
setup_glue_data $Task
init=$1
tag=$init
case ${init,,} in
base)
parameters=" --num_train_epochs 4 \
--warmup 500 \
--learning_rate 2e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 \
--do_train \
--max_seq_len 512 "
;;
large)
parameters=" --num_train_epochs 4 \
--warmup 500 \
--learning_rate 1e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 \
--do_train \
--fp16 True \
--max_seq_len 512 "
;;
xlarge)
parameters=" --num_train_epochs 4 \
--warmup 500 \
--learning_rate 7e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--do_train \
--fp16 True \
--max_seq_len 512 "
;;
xlarge-v2)
parameters=" --num_train_epochs 4 \
--warmup 500 \
--learning_rate 4e-6 \
--train_batch_size 64 \
--cls_drop_out 0.2 \
--do_train \
--fp16 True \
--max_seq_len 512 "
;;
xxlarge-v2)
parameters=" --num_train_epochs 4 \
--warmup 500 \
--learning_rate 3e-6 \
--train_batch_size 64 \
--cls_drop_out 0.2 \
--do_train \
--fp16 True \
--max_seq_len 512 "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2 - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2 - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$task $parameters

Просмотреть файл

@ -1,21 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks --tasks $task
fi
}
init=large
tag=Large
Task=QNLI
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$task -- --num_train_epochs 4 --accumulative_update 1 --warmup 500 --learning_rate 1e-5 --train_batch_size 32 --max_seq_length 512 --dump 5000 --cls_drop 0.15 --fp16 True

81
experiments/glue/qqp.sh Executable file
Просмотреть файл

@ -0,0 +1,81 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=QQP
setup_glue_data $Task
init=$1
tag=$init
case ${init,,} in
base)
parameters=" --num_train_epochs 8 \
--warmup 500 \
--learning_rate 2e-5 \
--train_batch_size 64 \
--cls_drop_out 0.15 "
;;
large)
parameters=" --num_train_epochs 8 \
--warmup 500 \
--learning_rate 1e-5 \
--train_batch_size 64 \
--cls_drop_out 0.15 \
--fp16 True "
;;
xlarge)
parameters=" --num_train_epochs 8 \
--warmup 500 \
--learning_rate 7e-6 \
--train_batch_size 64 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xlarge-v2)
parameters=" --num_train_epochs 10 \
--warmup 1000 \
--learning_rate 4e-6 \
--train_batch_size 64 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xxlarge-v2)
parameters=" --num_train_epochs 10 \
--warmup 1000 \
--learning_rate 3e-6 \
--train_batch_size 64 \
--cls_drop_out 0.2 \
--fp16 True "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2 - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2 - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--do_train \
--max_seq_len 320 \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$task $parameters

Просмотреть файл

@ -1,21 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks --tasks $task
fi
}
init=large
tag=Large
Task=QQP
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$task -- --num_train_epochs 8 --accumulative_update 1 --warmup 500 --learning_rate 1e-5 --train_batch_size 32 --max_seq_length 512 --dump 5000 --cls_drop 0.15 --fp16 True

89
experiments/glue/rte.sh Executable file
Просмотреть файл

@ -0,0 +1,89 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=RTE
setup_glue_data $Task
# The performance will be better when it's initialized with MNLI fine-tuned models
init=$1
tag=$init
case ${init,,} in
base)
init=$init-mnli
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 2e-5 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--max_seq_len 320"
;;
large)
init=$init-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 1e-5 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--max_seq_len 320"
;;
xlarge)
init=$init-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 5e-6 \
--train_batch_size 48 \
--cls_drop_out 0.2\
--fp16 True \
--max_seq_len 320"
;;
xlarge-v2)
init=$init-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 4e-6 \
--train_batch_size 48 \
--cls_drop_out 0.2 \
--fp16 True \
--max_seq_len 320"
;;
xxlarge-v2)
init=$init-mnli
parameters=" --num_train_epochs 6 \
--warmup 50 \
--learning_rate 3e-6 \
--train_batch_size 48 \
--cls_drop_out 0.2 \
--fp16 True \
--max_seq_len 320"
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2 - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2 - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--do_train \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$task $parameters

Просмотреть файл

@ -1,20 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
fi
}
init=base_mnli
tag=base
Task=RTE
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$Task -- --num_train_epochs 8 --accumulative_update 1 --warmup 100 --learning_rate 8e-6 --train_batch_size 16 --max_seq_length 320 --dump 500 --cls_drop 0.50 --fp16 True

Просмотреть файл

@ -1,20 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
fi
}
init=large
tag=Large
Task=RTE
setup_glue_data $Task
../utils/train.sh -i $init --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -o /tmp/ttonly/$tag/$Task -- --num_train_epochs 8 --accumulative_update 1 --warmup 100 --learning_rate 8e-6 --train_batch_size 32 --max_seq_length 320 --dump 500 --cls_drop 0.50

81
experiments/glue/sst2.sh Executable file
Просмотреть файл

@ -0,0 +1,81 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=SST-2
setup_glue_data $Task
init=$1
tag=$init
case ${init,,} in
base)
parameters=" --num_train_epochs 6 \
--warmup 500 \
--learning_rate 2e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 "
;;
large)
parameters=" --num_train_epochs 6 \
--warmup 500 \
--learning_rate 1e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 \
--fp16 True "
;;
xlarge)
parameters=" --num_train_epochs 6 \
--warmup 500 \
--learning_rate 3e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xlarge-v2)
parameters=" --num_train_epochs 4 \
--warmup 500 \
--learning_rate 4e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xxlarge-v2)
parameters=" --num_train_epochs 4 \
--warmup 500 \
--learning_rate 3e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2 - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2 - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--do_train \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--max_seq_len 128 \
--output_dir /tmp/ttonly/$tag/$task $parameters

Просмотреть файл

@ -1,22 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
fi
}
init=large
tag=Large
Task=SST-2
setup_glue_data $Task
../utils/train.sh --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -i $init -o /tmp/ttonly/$tag/$task -- --num_train_epochs 6 --accumulative_update 1 --warmup 100 --learning_rate 1e-5 --train_batch_size 32 --max_seq_len 128

85
experiments/glue/stsb.sh Executable file
Просмотреть файл

@ -0,0 +1,85 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
./download_data.sh $cache_dir/glue_tasks
fi
}
Task=STS-B
setup_glue_data $Task
init=$1
tag=$init
case ${init,,} in
base)
init=${init}-mnli
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 2e-5 \
--train_batch_size 32 \
--cls_drop_out 0.15 "
;;
large)
init=${init}-mnli
parameters=" --num_train_epochs 4 \
--warmup 100 \
--learning_rate 7e-6 \
--train_batch_size 32 \
--cls_drop_out 0.15 \
--fp16 True "
;;
xlarge)
init=${init}-mnli
parameters=" --num_train_epochs 4 \
--warmup 100 \
--learning_rate 7e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xlarge-v2)
init=${init}-mnli
parameters=" --num_train_epochs 4 \
--warmup 100 \
--learning_rate 4e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
xxlarge-v2)
init=${init}-mnli
parameters=" --num_train_epochs 4 \
--warmup 100 \
--learning_rate 3e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--fp16 True "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2 - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2 - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--do_train \
--task_name $Task \
--data_dir $cache_dir/glue_tasks/$Task \
--init_model $init \
--max_seq_len 128 \
--output_dir /tmp/ttonly/$tag/$task $parameters

Просмотреть файл

@ -1,22 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
fi
}
init=base
tag=Base
Task=STS-B
setup_glue_data $Task
../utils/train.sh --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -i $init -o /tmp/ttonly/$tag/$task -- --num_train_epochs 6 --accumulative_update 1 --warmup 100 --learning_rate 2e-5 --train_batch_size 32 --max_seq_len 128

Просмотреть файл

@ -1,22 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
function setup_glue_data(){
task=$1
mkdir -p $cache_dir
if [[ ! -e $cache_dir/glue_tasks/${task}/train.tsv ]]; then
curl -J -L https://raw.githubusercontent.com/nyu-mll/jiant/v1.3.2/scripts/download_glue_data.py | python3 - --data_dir $cache_dir/glue_tasks
fi
}
init=large
tag=Large
Task=STS-B
setup_glue_data $Task
../utils/train.sh --config config.json -t $Task --data $cache_dir/glue_tasks/$Task --tag $tag -i $init -o /tmp/ttonly/$tag/$task -- --num_train_epochs 6 --accumulative_update 1 --warmup 100 --learning_rate 1e-5 --train_batch_size 32 --max_seq_len 128

Просмотреть файл

@ -0,0 +1,3 @@
This is an example to show how to make your own task.
It's the RACE task; to get started you need to download the data from http://www.cs.cmu.edu/~glai1/data/race/ . A minimal skeleton is sketched below, followed by the full task implementation.
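The sketch below is a minimal, hypothetical plugin-task skeleton distilled from the full `MyRACETask` file that follows; the decorator, constructor signature, and method names are taken from that file, while the task name, label set, and method bodies are illustrative:
``` python
from DeBERTa.apps.tasks import EvalData, Task, register_task

@register_task(name="MyTask", desc="A custom task loaded via --task_dir/--task_name")
class MyTask(Task):
  def __init__(self, data_dir, tokenizer, args, **kwargs):
    super().__init__(tokenizer, args, **kwargs)
    self.data_dir = data_dir

  def train_data(self, max_seq_len=512, **kwargs):
    # Build and return a dataset of training features (see the full example below)
    ...

  def eval_data(self, max_seq_len=512, **kwargs):
    # Return a list of EvalData objects, one per evaluation split
    ...

  def get_labels(self):
    return ["0", "1"]
```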

Просмотреть файл

@ -0,0 +1,176 @@
#
# Author: penhe@microsoft.com
# Date: 01/25/2019
#
from glob import glob
from collections import OrderedDict,defaultdict,Sequence
import copy
import math
from scipy.special import softmax
import numpy as np
import pdb
import os
import sys
import csv
import random
import torch
import re
import ujson as json
from DeBERTa.apps.tasks.metrics import *
from DeBERTa.apps.tasks import EvalData, Task,register_task
from DeBERTa.utils import xtqdm as tqdm
from DeBERTa.data import ExampleInstance, ExampleSet, DynamicDataset,example_to_feature
from DeBERTa.data.example import *
from DeBERTa.utils import get_logger
from DeBERTa.data.example import _truncate_segments
from DeBERTa.apps.models.multi_choice import MultiChoiceModel
logger=get_logger()
__all__ = ["MyRACETask"]
@register_task(name="MyRACE", desc="ReAding Comprehension dataset collected from English Examinations, http://www.qizhexie.com/data/RACE_leaderboard.html")
class MyRACETask(Task):
def __init__(self, data_dir, tokenizer, args, **kwargs):
super().__init__(tokenizer, args, **kwargs)
self.data_dir = data_dir
def train_data(self, max_seq_len=512, dataset_size=None, epochs=1, mask_gen=None, **kwargs):
middle = self.load_jsonl(os.path.join(self.data_dir, 'train_middle.jsonl'))
high = self.load_jsonl(os.path.join(self.data_dir, 'train_high.jsonl'))
examples = ExampleSet(middle + high)
if dataset_size is None:
dataset_size = len(examples)*epochs
return DynamicDataset(examples, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len, mask_gen=mask_gen), \
dataset_size = dataset_size, shuffle=True, **kwargs)
def eval_data(self, max_seq_len=512, dataset_size=None, **kwargs):
ds = [
self._data('test-high', 'test_high.jsonl', 'test', ignore_metric=True),
self._data('test-middle', 'test_middle.jsonl', 'test', ignore_metric=True),
self._data('test', ['test_middle.jsonl', 'test_high.jsonl'], 'test'),
]
for d in ds:
_size = len(d.data) if dataset_size is None else dataset_size
d.data = DynamicDataset(d.data, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len), dataset_size = _size, **kwargs)
return ds
def test_data(self,max_seq_len=512, dataset_size = None, **kwargs):
"""See base class."""
ds = [
self._data('high', 'test_high.jsonl', 'test'),
self._data('middle', 'test_middle.jsonl', 'test'),
]
for d in ds:
_size = len(d.data) if dataset_size is None else dataset_size
d.data = DynamicDataset(d.data, feature_fn = self.get_feature_fn(max_seq_len=max_seq_len), dataset_size = _size, **kwargs)
return ds
def _data(self, name, path, type_name = 'dev', ignore_metric=False):
if isinstance(path, str):
path = [path]
data = []
for p in path:
input_src = os.path.join(self.data_dir, p)
assert os.path.exists(input_src), f"{input_src} doesn't exist"
data.extend(self.load_jsonl(input_src))
predict_fn = self.get_predict_fn()
examples = ExampleSet(data)
return EvalData(name, examples,
metrics_fn = self.get_metrics_fn(), predict_fn = predict_fn, ignore_metric=ignore_metric, critial_metrics=['accuracy'])
def get_metrics_fn(self):
"""Calcuate metrics based on prediction results"""
def metrics_fn(logits, labels):
metrics = OrderedDict(accuracy= metric_accuracy(logits, labels))
return metrics
return metrics_fn
def get_labels(self):
"""See base class."""
return ["A", "B", "C", "D"]
def load_jsonl(self, path):
examples = []
with open(path, encoding='utf-8') as fs:
data = [json.loads(l) for l in fs]
for d in data:
page = d["article"]
for q,o,a in zip(d["questions"], d["options"], d["answers"]):
example = ExampleInstance(segments=[page, q, *o], label=self.label2id(a))
examples.append(example)
return examples
def get_feature_fn(self, max_seq_len = 512, mask_gen = None):
def _example_to_feature(example, rng=None, ext_params=None, **kwargs):
return self.example_to_feature(self.tokenizer, example, max_seq_len = max_seq_len, \
rng = rng, mask_generator = mask_gen, ext_params = ext_params, **kwargs)
return _example_to_feature
def example_to_feature(self, tokenizer, example, max_seq_len=512, rng=None, mask_generator = None, ext_params=None, label_type='int', **kwargs):
if not rng:
rng = random
max_num_tokens = max_seq_len - 3
def _normalize(text):
text = re.sub(r'\s+', ' ', text.strip('\t \r\n_').replace('\n', ' ')).strip()
return text
# page,question,options
context = tokenizer.tokenize(_normalize(example.segments[0]))
features = OrderedDict(input_ids = [],
type_ids = [],
position_ids = [],
input_mask = [])
for option in example.segments[2:]:
#TODO: truncate
question = example.segments[1]
qa_cat = " ".join([question, option])
qa_cat = tokenizer.tokenize(_normalize(qa_cat))[:160]
segments = [context, qa_cat]
segments = _truncate_segments(segments, max_num_tokens, rng)
tokens = ['[CLS]']
type_ids = [0]
for i,s in enumerate(segments):
tokens.extend(s)
tokens.append('[SEP]')
type_ids.extend([i]*(len(s)+1))
token_ids = tokenizer.convert_tokens_to_ids(tokens)
pos_ids = list(range(len(token_ids)))
rel_pos = []
input_mask = [1]*len(token_ids)
features['input_ids'].append(token_ids)
features['type_ids'].append(type_ids)
features['position_ids'].append(pos_ids)
features['input_mask'].append(input_mask)
padding_size = max(0, max_seq_len - len(token_ids))
for f in features:
features[f][-1].extend([0]*padding_size)
for f in features:
features[f] = torch.tensor(features[f], dtype=torch.int)
if example.label is not None:
label_type = torch.int if label_type=='int' else torch.float
features['labels'] = torch.tensor(example.label, dtype=label_type)
return features
def get_model_class_fn(self):
def partial_class(*wargs, **kwargs):
return MultiChoiceModel.load_model(*wargs, **kwargs)
return partial_class
@classmethod
def add_arguments(cls, parser):
"""Add task specific arguments
e.g. parser.add_argument('--data_dir', type=str, help='The path of data directory.')
"""
parser.add_argument('--task_example_arg', type=str, default=None, help='An example task specific argument')
return parser

80
experiments/my_exp/run.sh Executable file
Просмотреть файл

@ -0,0 +1,80 @@
#!/bin/bash
#
# This is an example script to show how to make a customized task
#
#
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
Task=MyRACE
# Download the data from http://www.cs.cmu.edu/~glai1/data/race/
init=$1
tag=$init
case ${init,,} in
base-mnli)
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 2e-5 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--max_seq_len 320"
;;
large-mnli)
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 8e-6 \
--train_batch_size 32 \
--cls_drop_out 0.2 \
--max_seq_len 320"
;;
xlarge-mnli)
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 7e-6 \
--train_batch_size 48 \
--cls_drop_out 0.2\
--fp16 True \
--max_seq_len 320"
;;
xlarge-v2-mnli)
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 4e-6 \
--train_batch_size 48 \
--cls_drop_out 0.2 \
--fp16 True \
--max_seq_len 320"
;;
xxlarge-v2-mnli)
parameters=" --num_train_epochs 6 \
--warmup 100 \
--learning_rate 3e-6 \
--train_batch_size 48 \
--cls_drop_out 0.2 \
--fp16 True \
--max_seq_len 320"
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base-mnli - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large-mnli - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge-mnli - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2-mnli - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2-mnli - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--do_train \
--task_dir . \
--task_name $Task \
--data_dir <Your race data directory> \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$Task $parameters

85
experiments/ner/ner.sh Executable file
Просмотреть файл

@ -0,0 +1,85 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
data_dir=$cache_dir/NER/data
function get_data(){
mkdir -p $data_dir
if [[ ! -e $data_dir/train.txt ]]; then
pip install seqeval
curl -L https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train -o "$data_dir/train.txt"
curl -L https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa -o "$data_dir/valid.txt"
curl -L https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb -o "$data_dir/test.txt"
fi
}
Task=NER
get_data
init=$1
tag=$init
case ${init,,} in
base)
parameters=" --num_train_epochs 15 \
--warmup 0.1 \
--learning_rate 2e-5 \
--train_batch_size 16 \
--cls_drop_out 0 "
;;
large)
parameters=" --num_train_epochs 15 \
--warmup 0.1 \
--learning_rate 1e-5 \
--train_batch_size 16 \
--cls_drop_out 0 \
--fp16 True "
;;
xlarge)
parameters=" --num_train_epochs 15 \
--warmup 0.1 \
--learning_rate 7e-6 \
--train_batch_size 16 \
--cls_drop_out 0 \
--fp16 True "
;;
xlarge-v2)
parameters=" --num_train_epochs 15 \
--warmup 0.1 \
--learning_rate 4e-6 \
--train_batch_size 16 \
--cls_drop_out 0 \
--fp16 True "
;;
xxlarge-v2)
parameters=" --num_train_epochs 15 \
--warmup 0.1 \
--learning_rate 2.5e-6 \
--train_batch_size 16 \
--cls_drop_out 0 \
--fp16 True "
;;
*)
echo "usage $0 <Pretrained model configuration>"
echo "Supported configurations"
echo "base - Pretrained DeBERTa v1 model with 140M parameters (12 layers, 768 hidden size)"
echo "large - Pretrained DeBERta v1 model with 380M parameters (24 layers, 1024 hidden size)"
echo "xlarge - Pretrained DeBERTa v1 model with 750M parameters (48 layers, 1024 hidden size)"
echo "xlarge-v2 - Pretrained DeBERTa v2 model with 900M parameters (24 layers, 1536 hidden size)"
echo "xxlarge-v2 - Pretrained DeBERTa v2 model with 1.5B parameters (48 layers, 1536 hidden size)"
exit 0
;;
esac
python -m DeBERTa.apps.run --model_config config.json \
--tag $tag \
--do_train \
--max_seq_len 512 \
--task_name $Task \
--data_dir $data_dir \
--init_model $init \
--output_dir /tmp/ttonly/$tag/$Task $parameters

Просмотреть файл

@ -1,25 +0,0 @@
#!/bin/bash
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
cd $SCRIPT_DIR
cache_dir=/tmp/DeBERTa/
data_dir=$cache_dir/NER/data
function get_data(){
mkdir -p $data_dir
if [[ ! -e $data_dir/train.txt ]]; then
pip install seqeval
curl -L https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train -o "$data_dir/train.txt"
curl -L https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa -o "$data_dir/valid.txt"
curl -L https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testb -o "$data_dir/test.txt"
fi
}
model_name=large
Task=NER
get_data
init=large
tag=large
../utils/train.sh --config config.json --vocab $cache_dir/vocab -t $Task --data $data_dir --tag $tag -i $init -o /tmp/ttonly/$tag/$Task -- --num_train_epochs 15 --accumulative_update 1 --warmup 0.1 --learning_rate 1e-5 --train_batch_size 16 --cls_drop 0 --max_seq_length 512

Просмотреть файл

@ -1,128 +0,0 @@
#!/bin/bash
SOURCE=$(dirname "$(readlink -f "$0")")/../
if [[ ! -d $SOURCE/DeBERTa ]]; then
SOURCE=$(dirname "$(readlink -f "$0")")/../../
fi
export PYTHONPATH=${SOURCE}
SCRIPT=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT")
SCRIPT_FILE=$(basename "$SCRIPT")
BashArgs=${@}
DEBUG=False
RESUME=False
Predict=False
ModelType=DeBERTa
ModelSize=Large
INIT_MODEL=""
while [ $# -gt 0 ]; do
case ${1,,} in
--debug|-d)
DEBUG=True
;;
--resume|-r)
RESUME=True
;;
--init|-i)
INIT_MODEL=$2
shift
;;
--predict|-p)
Predict=True
;;
--output|-o)
OUTPUT_DIR=$2
shift
;;
--config|-c)
CONFIG=$2
shift
;;
--task|-t)
Task=$2
shift
;;
--data)
Data=$2
shift
;;
--tag)
Tag=$2
shift
;;
--)
shift
ExtraArgs=${@}
break
;;
--help|-h|*)
echo "Usage $0 [options] -d|--debug -r|--resume -- <job args>:
-d|--debug whether to debug
-r|--resume whether to resume
"
exit 0
;;
esac
shift
done
export OMP_NUM_THREADS=1
if [[ ${DEBUG,,} = 'true' ]]; then
export CUDA_VISIBLE_DEVICES=0 #,1 #,1 #,1,2,3,4,5,6,7
fi
export CUDA_VISIBLE_DEVICES=$(python3 -c "import torch; x=[str(x) for x in range(torch.cuda.device_count()) if torch.cuda.get_device_capability(x)[0]>=6]; print(','.join(x))" 2>/dev/null)
IFS=',' read -a DEVICE_CNT <<< "$CUDA_VISIBLE_DEVICES"
MODEL=$INIT_MODEL
if [[ -z $Task ]]; then
Task=MNLI
fi
if [[ -z $CONFIG ]]; then
CONFIG=config.json
fi
DUMP=5000
LR_SCH=warmup_linear
CLS_DP=0.15
TAG=${ModelType,,}_${Tag}
if [[ ! -z ${OUTPUT_DIR} ]]; then
OUTPUT=${OUTPUT_DIR}
else
OUTPUT=/tmp/job_runs/$Task/$TAG
fi
[ -e $OUTPUT/script ] || mkdir -p $OUTPUT/script
cp -f $CONFIG $OUTPUT/model_config.json
if [[ ! ${Predict,,} = 'true' ]]; then
CMD=" --do_train"
else
CMD="--do_eval --do_predict"
fi
parameters="--task_name $Task $CMD \
--data_dir $Data \
--init_model $MODEL \
--model_config $OUTPUT/model_config.json \
--max_seq_length 512 \
--eval_batch_size 128 \
--predict_batch_size 128 \
--output_dir $OUTPUT \
--scale_steps 250 \
--loss_scale 16384 \
--tag $TAG \
--lr_schedule $LR_SCH \
--accumulative_update 1 \
--dump_interval $DUMP \
--with_radam False \
--cls_drop_out ${CLS_DP} $ExtraArgs "
python3 -m DeBERTa.apps.train $parameters

Просмотреть файл

@ -9,6 +9,8 @@ tqdm
ujson
seqeval
psutil
sentencepiece
laser
#GitPython
torch
#torchvision