diff --git a/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb b/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb index 1fc74fc..02e993f 100644 --- a/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb +++ b/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb @@ -4,7 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Named Entity Recognition Using BERT" + "# Named Entity Recognition Using BERT\n", + "## Summary\n", + "This notebook demonstrates how to fine-tune a [pretrained BERT model](https://github.com/huggingface/pytorch-pretrained-BERT) for the token-level named entity recognition (NER) task. A few utility functions and classes in the NLP Best Practices repo are used to facilitate data preprocessing, model training, and model evaluation. \n", + "\n", + "[BERT (Bidirectional Transformers for Language Understanding)](https://arxiv.org/pdf/1810.04805.pdf) is a powerful pre-trained language model that can be used for multiple NLP tasks, including text classification, question answering, and named entity recognition. It is able to achieve state-of-the-art performance with only a few epochs of fine-tuning. \n", + "The figure below illustrates how BERT can be fine-tuned for NER tasks. The input data is a list of tokens representing a sentence. In the training data, each token has an entity label. After fine-tuning, the model predicts an entity label for each token of a given sentence in the testing data. \n", + "\n", + "![](bert_architecture.png)" ] }, { @@ -12,6 +19,7 @@ "metadata": {}, "source": [ "### Required packages\n", + "* pytorch\n", "* pytorch-pretrained-bert\n", "* pandas\n", "* seqeval" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 1, "metadata": { "scrolled": false }, @@ -28,16 +36,14 @@ "import sys\n", "import os\n", "import yaml\n", + "import pprint\n", "import random\n", - "import pandas as pd\n", - "import numpy as np\n", - "from tqdm import tqdm, trange\n", "from seqeval.metrics import f1_score\n", "\n", "import torch\n", + "from torch.optim import Adam\n", "\n", "from pytorch_pretrained_bert.tokenization import BertTokenizer\n", - "from torch.optim import Adam\n", "\n", "bert_utils_path = os.path.abspath('../../utils_nlp/bert')\n", "if bert_utils_path not in sys.path:\n", @@ -46,8 +52,7 @@ "from configs import BERTFineTuneConfig\n", "from bert_data_utils import KaggleNERProcessor\n", "from bert_utils import (BertTokenClassifier, \n", - " convert_examples_to_token_features, \n", - " print_dict, \n", + " create_token_feature_dataset, \n", " get_device)\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "config_file = \"config.yaml\"" + "config_file = \"config.yaml\"\n", + "ner_data_dir = \"./data/NER/ner_dataset.csv\"\n", + "random_seed = 42\n", + "random.seed(random_seed)\n", + "torch.manual_seed(random_seed)" ] }, { @@ -76,9 +96,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "{TrainConfig: {'batch_size': 32, 'num_train_epochs': 3}\n", - "ModelConfig: {'bert_model': 'bert-base-uncased', 'max_seq_length': 75, 'do_lower_case': True}\n", - "OptimizerConfig: {'optimizer_name': 'BertAdam', 'learning_rate': 3e-05, 'no_decay_params': ['bias', 'gamma', 'beta'], 'params_weight_decay': 0.01, 'clip_gradient': True, 'max_gradient_norm': 1.0}}\n" +
"{'ModelConfig': {'bert_model': 'bert-base-uncased',\n", + " 'do_lower_case': True,\n", + " 'max_seq_length': 75},\n", + " 'OptimizerConfig': {'clip_gradient': True,\n", + " 'learning_rate': 3e-05,\n", + " 'max_gradient_norm': 1.0,\n", + " 'no_decay_params': ['bias', 'gamma', 'beta'],\n", + " 'optimizer_name': 'Adam',\n", + " 'params_weight_decay': 0.01},\n", + " 'TrainConfig': {'batch_size': 32, 'num_train_epochs': 2}}\n" ] } ], @@ -86,7 +113,7 @@ "with open(config_file, 'r') as ymlfile:\n", " config_dict = yaml.safe_load(ymlfile)\n", "\n", - "print_dict(config_dict)" + "pprint.pprint(config_dict)" ] }, { @@ -110,7 +137,7 @@ "metadata": {}, "source": [ "### Create training and validation examples\n", - "KaggleNERProcessor is a dataset specific class that generates training and evaluation examples in the format accepted by all utility functions. " + "`KaggleNERProcessor` is a dataset specific class that splits the whole dataset into training and validation datasets according to `dev_percentage`. The `get_train_examples` and `get_dev_examples` return the training and validation datasets respectively. The `get_labels` method returns a list of all unique labels " ] }, { @@ -121,7 +148,7 @@ }, "outputs": [], "source": [ - "kaggle_ner_processor = KaggleNERProcessor(data_dir=\"./data/NER/ner_dataset.csv\", dev_percentage = 0.1)" + "kaggle_ner_processor = KaggleNERProcessor(data_dir=ner_data_dir, dev_percentage = 0.1)" ] }, { @@ -130,11 +157,30 @@ "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['I-gpe', 'B-art', 'B-eve', 'I-geo', 'B-tim', 'I-art', 'I-eve', 'B-nat', 'B-org', 'I-org', 'O', 'B-geo', 'B-gpe', 'I-tim', 'I-per', 'I-nat', 'B-per', 'X']\n" + ] + } + ], "source": [ - "train_examples = kaggle_ner_processor.get_train_examples(data_dir=\"./data/NER/ner_dataset.csv\")\n", - "dev_examples = kaggle_ner_processor.get_dev_examples(data_dir=\"./data/NER/ner_dataset.csv\")\n", - "label_list = kaggle_ner_processor.get_labels()" + "train_examples = kaggle_ner_processor.get_train_examples()\n", + "dev_examples = kaggle_ner_processor.get_dev_examples()\n", + "label_list = kaggle_ner_processor.get_labels()\n", + "print(label_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`KaggleNERProcessor` generates training and evaluation examples in `BertInputData` type. `BertInputData` is a `namedtuple` with the following three fields:\n", + "* text_a: text string of the first sentence.\n", + "* text_b: text string of the second setence. This is only required for two-sentence tasks.\n", + "* label: required for training and validation data." 
] }, { @@ -152,25 +198,33 @@ "Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .\n", "\n", "Sample sentence labels: \n", - "['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O']\n", + "['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']\n", "\n" ] } ], "source": [ "print('Sample sentence: \\n{}\\n'.format(train_examples[0].text_a))\n", - "print('Sample sentence labels: \\n{}\\n'.format(train_examples[1].label))" + "print('Sample sentence labels: \\n{}\\n'.format(train_examples[0].label))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Convert examples to features\n", - "The function `convert_examples_to_token_features` converts raw string data to numerical features, involving the following steps:\n", - "1. Tokenization\n", - "2. Convert tokens and labels to numerical values\n", - "3. Sequence padding or truncation" + "### Convert raw input to a feature dataset\n", + "The function `create_token_feature_dataset` converts raw string data to a PyTorch `TensorDataset` containing numerical features, involving the following steps:\n", + "1. Tokenization.\n", + "2. Conversion of tokens and labels to numerical values, i.e. token ids and label ids.\n", + "3. Sequence padding or truncation according to the `max_seq_length` configuration.\n", + "4. Conversion of numpy arrays to a PyTorch `TensorDataset`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create a dictionary that maps labels to numerical values**" ] }, { @@ -182,6 +236,13 @@ "label_map = {label: i for i, label in enumerate(label_list)}" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create a tokenizer**" + ] + }, { "cell_type": "code", "execution_count": 9, @@ -189,55 +250,100 @@ "outputs": [], "source": [ "tokenizer = BertTokenizer.from_pretrained(config.bert_model,\n", - " do_lower_case=config.do_lower_case)\n", - "\n", - "train_features = convert_examples_to_token_features(examples=train_examples,\n", - " tokenizer=tokenizer,\n", - " label_map=label_map, \n", - " config=config)\n", - "dev_features = convert_examples_to_token_features(examples=dev_examples,\n", - " tokenizer=tokenizer,\n", - " label_map=label_map, \n", - " config=config)" + " do_lower_case=config.do_lower_case)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create feature TensorDataset** \n", + "Note there is an argument called `trailing_piece_tag`. BERT uses a WordPiece tokenizer that breaks some words into multiple tokens, e.g. \"playing\" is tokenized into \"play\" and \"##ing\". Since the input data comes with only one token label for \"playing\", within `create_token_feature_dataset`, the original token label is assigned to the first token \"play\" and the second token \"##ing\" is labeled as \"X\". By default, `trailing_piece_tag` is set to \"X\". If \"X\" already exists in your data, you can set `trailing_piece_tag` to another value that doesn't exist in your data. A conceptual sketch of this alignment follows below. 
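Editor's note: the following is a conceptual sketch of the label-alignment behavior described above, not the repo's exact implementation of `create_token_feature_dataset`; the helper name is hypothetical and only illustrates how word-level labels are spread over WordPiece sub-tokens.

```python
# Conceptual sketch (not the repo's exact code) of how word-level labels are
# aligned with WordPiece sub-tokens, with trailing pieces tagged "X".
def align_labels_with_wordpieces(words, labels, tokenizer, trailing_piece_tag="X"):
    tokens, token_labels = [], []
    for word, label in zip(words, labels):
        pieces = tokenizer.tokenize(word)  # e.g. "playing" -> "play", "##ing" (per the note above)
        tokens.extend(pieces)
        # the original label stays on the first piece; trailing pieces get the tag
        token_labels.extend([label] + [trailing_piece_tag] * (len(pieces) - 1))
    return tokens, token_labels

# Usage with the BertTokenizer created above:
# tokens, tags = align_labels_with_wordpieces(sentence.split(), sentence_labels, tokenizer)
```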
" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = create_token_feature_dataset(data=train_examples,\n", + " tokenizer=tokenizer,\n", + " label_map=label_map,\n", + " true_label_available=True,\n", + " max_seq_length=config.max_seq_length, \n", + " trailing_piece_tag=\"X\")\n", + "dev_dataset = create_token_feature_dataset(data=dev_examples,\n", + " tokenizer=tokenizer,\n", + " label_map=label_map, \n", + " true_label_available=True,\n", + " max_seq_length=config.max_seq_length, \n", + " trailing_piece_tag=\"X\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`create_token_feature_dataset` outputs a `TensorDataset` with four tensors: \n", + "1. token ids: numerical values each corresponds to a token.\n", + "2. attention mask: 1 for input tokens and 0 for padded tokens, so that padded tokens are not attended to. \n", + "3. segment ids: 0 for the first sentence and 1 for the second sentence, only used in two sentence tasks, not used in NER.\n", + "4. label ids: numerical values each corresponds to an entity label. " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sample token id:\n", - "[5190, 1997, 28337, 2031, 9847, 2083, 2414, 2000, 6186, 1996, 2162, 1999, 5712, 1998, 5157, 1996, 10534, 1997, 2329, 3629, 2013, 2008, 2406, 1012, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n", + "tensor([ 5190, 1997, 28337, 2031, 9847, 2083, 2414, 2000, 6186, 1996,\n", + " 2162, 1999, 5712, 1998, 5157, 1996, 10534, 1997, 2329, 3629,\n", + " 2013, 2008, 2406, 1012, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0])\n", "\n", "Sample attention mask:\n", - "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n", + "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0])\n", "\n", "Sample label ids:\n", - "[6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]\n", + "tensor([10, 10, 10, 10, 10, 10, 11, 10, 10, 10, 10, 10, 11, 10, 10, 10, 10, 10,\n", + " 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,\n", + " 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,\n", + " 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,\n", + " 10, 10, 10])\n", "\n" ] } ], "source": [ - "print(\"Sample token id:\\n{}\\n\".format(train_features[0].input_ids))\n", - "print(\"Sample attention 
mask:\\n{}\\n\".format(train_features[0].input_mask))\n", - "print(\"Sample label ids:\\n{}\\n\".format(train_features[0].label_id))" + "print(\"Sample token id:\\n{}\\n\".format(train_dataset[0][0]))\n", + "print(\"Sample attention mask:\\n{}\\n\".format(train_dataset[0][1]))\n", + "print(\"Sample label ids:\\n{}\\n\".format(train_dataset[0][3]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Token Classifier" + "## Create Token Classifier\n", + "`get_device` is a helper function that detects whether a GPU is available and how many GPUs are available. " ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "scrolled": false }, @@ -248,24 +354,17 @@ "text": [ "BERT fine tune configurations:\n", "batch_size=32\n", - "num_train_epochs=3\n", + "num_train_epochs=2\n", "bert_model=bert-base-uncased\n", "max_seq_length=75\n", "do_lower_case=True\n", - "optimizer_name=BertAdam\n", + "optimizer_name=Adam\n", "learning_rate=3e-05\n", "no_decay_params=['bias', 'gamma', 'beta']\n", "params_weight_decay=0.01\n", "clip_gradient=True\n", "max_gradient_norm=1.0\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "t_total value of -1 results in schedule not being applied\n" - ] } ], "source": [ @@ -285,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "scrolled": true }, @@ -294,1417 +393,90 @@ "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 0%| | 0/3 [00:00 0: @@ -98,30 +107,25 @@ def create_token_feature_dataset(data, segment_ids += padding new_labels += label_padding - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - assert len(new_labels) == max_seq_length - label_ids = [label_map[label] for label in new_labels] features.append((input_ids, input_mask, segment_ids, label_ids)) - all_input_ids = torch.tensor([f[0] for f in features], - dtype=torch.long) - all_input_mask = torch.tensor([f[1] for f in features], - dtype=torch.long) - all_segment_ids = torch.tensor([f[2] for f in features], - dtype=torch.long) + all_input_ids = torch.tensor([f[0] for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f[1] for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f[2] for f in features], dtype=torch.long) if true_label_available: - all_label_ids = torch.tensor([f[3] for f in features], - dtype=torch.long) - tensor_data = TensorDataset(all_input_ids, all_input_mask, - all_segment_ids, all_label_ids) + all_label_ids = torch.tensor( + [f[3] for f in features], dtype=torch.long + ) + tensor_data = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids, all_label_ids + ) else: - tensor_data = TensorDataset(all_input_ids, all_input_mask, - all_segment_ids) + tensor_data = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids + ) return tensor_data @@ -139,23 +143,32 @@ class Language(Enum): class BertTokenClassifier: """BERT-based token classifier.""" - def __init__(self, config, label_map, device, n_gpu, - language=Language.ENGLISH, cache_dir="."): + def __init__( + self, + config, + label_map, + device, + n_gpu, + language=Language.ENGLISH, + cache_dir=".", + ): """ - Initializes the classifier and the underlying pretrained model and + Initializes the classifier and the underlying pre-trained model and optimizer. Args: config (BERTFineTuneConfig): A configuration object contains settings of model, training, and optimizer. 
label_map (dict): Dictionary used to map token labels to - integers during data preprocessing. + integers during data pre-processing. This is used to + convert token ids back to original token labels during + prediction. device (str): "cuda" or "cpu". Can be obtained by calling get_device. - n_gpu (int): Number of GPUs available.Can be obtained by calling + n_gpu (int): Number of GPUs available. Can be obtained by calling get_device. - language (Language, optinal): The pretrained model's language. + language (Language, optional): The pre-trained model's language. Defaults to Language.ENGLISH. cache_dir (str, optional): Location of BERT's cache directory. Defaults to ".". @@ -190,10 +203,12 @@ class BertTokenClassifier: self._is_trained = False def _load_model(self): - """Loads the pretrained BERT model.""" + """Loads the pre-trained BERT model.""" model = BertForTokenClassification.from_pretrained( - self.bert_model, cache_dir=self.cache_dir, - num_labels=self.num_labels) + self.bert_model, + cache_dir=self.cache_dir, + num_labels=self.num_labels, + ) model.to(self.device) @@ -209,20 +224,32 @@ class BertTokenClassifier: """ param_optimizer = list(self.model.named_parameters()) optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if - not any(nd in n for nd in self.no_decay_params)], - 'weight_decay': self.params_weight_decay}, - {'params': [p for n, p in param_optimizer if - any(nd in n for nd in self.no_decay_params)], - 'weight_decay': 0.0} + { + "params": [ + p + for n, p in param_optimizer + if not any(nd in n for nd in self.no_decay_params) + ], + "weight_decay": self.params_weight_decay, + }, + { + "params": [ + p + for n, p in param_optimizer + if any(nd in n for nd in self.no_decay_params) + ], + "weight_decay": 0.0, + }, ] - if self.optimizer_name == 'BertAdam': - optimizer = BertAdam(optimizer_grouped_parameters, - lr=self.learning_rate) - elif self.optimizer_name == 'Adam': - optimizer = Adam(optimizer_grouped_parameters, - lr=self.learning_rate) + if self.optimizer_name == "BertAdam": + optimizer = BertAdam( + optimizer_grouped_parameters, lr=self.learning_rate + ) + elif self.optimizer_name == "Adam": + optimizer = Adam( + optimizer_grouped_parameters, lr=self.learning_rate + ) return optimizer @@ -233,30 +260,36 @@ class BertTokenClassifier: Args: train_dataset (TensorDataset): TensorDataset consisted of the following numerical feature tensors. - 1. token ids - 2. attention mask - 3. segment ids - 4. label ids + 1. token ids: numerical values each corresponds to a token. + 2. attention mask: 1 for input tokens and 0 for padded tokens, + so that padded tokens are not attended to. + 3. segment ids: 0 for the first sentence and 1 for the second + sentence, only used in two sentence tasks. + 4. 
label ids: numerical values each corresponds to a token + label """ train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, - batch_size=self.batch_size) + train_dataloader = DataLoader( + train_dataset, sampler=train_sampler, batch_size=self.batch_size + ) global_step = 0 self.model.train() for _ in trange(int(self.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 - for step, batch in enumerate(tqdm(train_dataloader, - desc="Iteration", - mininterval=30)): + for step, batch in enumerate( + tqdm(train_dataloader, desc="Iteration", mininterval=30) + ): batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch - loss = self.model(input_ids=input_ids, - token_type_ids=segment_ids, - attention_mask=input_mask, - labels=label_ids) + loss = self.model( + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=input_mask, + labels=label_ids, + ) if self.n_gpu > 1: # mean() to average on multi-gpu. @@ -270,13 +303,14 @@ class BertTokenClassifier: if self.clip_gradient: torch.nn.utils.clip_grad_norm_( parameters=self.model.parameters(), - max_norm=self.max_gradient_norm) + max_norm=self.max_gradient_norm, + ) self.optimizer.step() self.optimizer.zero_grad() global_step += 1 - train_loss = tr_loss/nb_tr_steps + train_loss = tr_loss / nb_tr_steps print("Train loss: {}".format(train_loss)) self._is_trained = True @@ -288,32 +322,38 @@ class BertTokenClassifier: Args: test_dataset (TensorDataset): TensorDataset consisted of the following numerical feature tensors. - 1. token ids - 2. attention mask - 3. segment ids - 4. label ids, optional + 1. token ids: numerical values each corresponds to a token. + 2. attention mask: 1 for input tokens and 0 for padded tokens, + so that padded tokens are not attended to. + 3. segment ids: 0 for the first sentence and 1 for the second + sentence, only used in two sentence tasks. + 4. label ids: numerical values each corresponds to a token + label, optional Returns: tuple: The first element of the tuple is the predicted token - labels. If the testing dataset contain label ids, the second - element of the tuple is the true token labels. + labels. If the testing dataset contains label ids, the second + element of the tuple is the true token labels, otherwise + it's None. """ test_sampler = SequentialSampler(test_dataset) - test_dataloader = DataLoader(test_dataset, sampler=test_sampler, - batch_size=self.batch_size) + test_dataloader = DataLoader( + test_dataset, sampler=test_sampler, batch_size=self.batch_size + ) if not self._is_trained: - raise Exception("Model is not trained. Please train model before " - "predict.") + raise Exception( + "Model is not trained. Please train model before " "predict." 
+ ) self.model.eval() predictions = [] true_labels = [] - eval_loss, eval_accuracy = 0, 0 + eval_loss = 0 nb_eval_steps = 0 - for step, batch in enumerate(tqdm(test_dataloader, - desc="Iteration", - mininterval=10)): + for step, batch in enumerate( + tqdm(test_dataloader, desc="Iteration", mininterval=10) + ): batch = tuple(t.to(self.device) for t in batch) true_label_available = False if len(batch) == 3: @@ -324,19 +364,23 @@ class BertTokenClassifier: with torch.no_grad(): if true_label_available: - tmp_eval_loss = self.model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask, - labels=b_labels) - logits = self.model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask) + tmp_eval_loss = self.model( + b_input_ids, + token_type_ids=b_segment_ids, + attention_mask=b_input_mask, + labels=b_labels, + ) + logits = self.model( + b_input_ids, + token_type_ids=b_segment_ids, + attention_mask=b_input_mask, + ) logits = logits.detach().cpu().numpy() predictions.extend([list(p) for p in np.argmax(logits, axis=2)]) if true_label_available: - label_ids = b_labels.to('cpu').numpy() + label_ids = b_labels.to("cpu").numpy() true_labels.append(label_ids) eval_loss += tmp_eval_loss.mean().item() @@ -346,16 +390,17 @@ class BertTokenClassifier: print("Validation loss: {}".format(validation_loss)) reversed_label_map = {v: k for k, v in self.label_map.items()} - pred_tags = [[reversed_label_map[p_i] for p_i in p] for p in - predictions] + pred_tags = [ + [reversed_label_map[p_i] for p_i in p] for p in predictions + ] if true_label_available: - valid_tags = [[reversed_label_map[l_ii] for l_ii in l_i] for - l in true_labels for l_i in l] + valid_tags = [ + [reversed_label_map[l_ii] for l_ii in l_i] + for l in true_labels + for l_i in l + ] return pred_tags, valid_tags else: - return pred_tags, - - - + return (pred_tags,) diff --git a/utils_nlp/bert/configs.py b/utils_nlp/bert/configs.py index 3200236..afbce7f 100644 --- a/utils_nlp/bert/configs.py +++ b/utils_nlp/bert/configs.py @@ -1,7 +1,8 @@ class BERTFineTuneConfig: """ - Configurations for fine tuning pre-trained bert model. + Configurations for fine tuning pre-trained BERT models. """ + def __init__(self, config_dict): """ Initializes a BERTFineTuneConfig object. @@ -9,23 +10,30 @@ class BERTFineTuneConfig: Args: config_dict (dict): A nested dictionary containing three key, value pairs. - "ModelConfig": model configuration dictionary - "bert_model": str, name of the bert pretrained model. - Accepted values are + "ModelConfig": model configuration dictionary: + "bert_model": str, name of the bert pre-trained model. + Accepted values are: + "bert-base-uncased" + "bert-large-uncased" + "bert-base-cased" + "bert-large-cased" + "bert-base-multilingual-uncased" + "bert-base-multilingual-cased" + "bert-base-chinese" "max_seq_length": int, optional. Maximum length of token sequence. Default value is 512. "do_lower_case": bool, optional. Whether to convert capital letters to lower case during tokenization. Default value is True. - "TrainConfig" (optional): training configuration dictionary + "TrainConfig" (optional): training configuration dictionary: "batch_size": int, optional. Default value is 32. "num_train_epochs": int, optional. Default value is 3. "OptimizerConfig" (optional): optimizer configuration - dictionary + dictionary: "optimizer_name": str, optional. Name of the optimizer to use. Accepted values are "BertAdam", "Adam". 
Default value is "BertAdam" - "learning_rate": float, optional, default value is 35e-05, + "learning_rate": float, optional, default value is 5e-05, "no_decay_params": list of strings, optional. Names of parameters to apply weight decay on. Default value is []. @@ -61,15 +69,21 @@ class BERTFineTuneConfig: self.batch_size = config_dict["batch_size"] else: self.batch_size = 32 - print("batch_size is set to default value: {}.".format( - self.batch_size)) + print( + "batch_size is set to default value: {}.".format( + self.batch_size + ) + ) if "num_train_epochs" in config_dict: self.num_train_epochs = config_dict["num_train_epochs"] else: self.num_train_epochs = 3 - print("num_train_epochs is set to default value: {}.".format( - self.num_train_epochs)) + print( + "num_train_epochs is set to default value: {}.".format( + self.num_train_epochs + ) + ) def _configure_model_settings(self, config_dict): self.bert_model = config_dict["bert_model"] @@ -78,63 +92,88 @@ class BERTFineTuneConfig: self.max_seq_length = config_dict["max_seq_length"] else: self.max_seq_length = 512 - print("max_seq_length is set to default value: {}.".format( - self.max_seq_length)) + print( + "max_seq_length is set to default value: {}.".format( + self.max_seq_length + ) + ) - if 'do_lower_case' in config_dict: - self.do_lower_case = config_dict['do_lower_case'] + if "do_lower_case" in config_dict: + self.do_lower_case = config_dict["do_lower_case"] else: self.do_lower_case = True - print("do_lower_case is set to default value: {}.".format( - self.do_lower_case)) + print( + "do_lower_case is set to default value: {}.".format( + self.do_lower_case + ) + ) def _configure_optimizer_settings(self, config_dict): if "optimizer_name" in config_dict: self.optimizer_name = config_dict["optimizer_name"] else: self.optimizer_name = "BertAdam" - print("optimizer_name is set to default value: {}.".format( - self.optimizer_name)) + print( + "optimizer_name is set to default value: {}.".format( + self.optimizer_name + ) + ) - if 'learning_rate' in config_dict: - self.learning_rate = config_dict['learning_rate'] + if "learning_rate" in config_dict: + self.learning_rate = config_dict["learning_rate"] else: self.learning_rate = 5e-5 - print("learning_rate is set to default value: {}.".format( - self.learning_rate)) + print( + "learning_rate is set to default value: {}.".format( + self.learning_rate + ) + ) if "no_decay_params" in config_dict: self.no_decay_params = config_dict["no_decay_params"] else: self.no_decay_params = [] - print("no_decay_params is set to default value: {}.".format( - self.no_decay_params)) + print( + "no_decay_params is set to default value: {}.".format( + self.no_decay_params + ) + ) if "params_weight_decay" in config_dict: self.params_weight_decay = config_dict["params_weight_decay"] else: self.params_weight_decay = 0.01 - print("Default params_weight_decay, {}, is used".format( - self.params_weight_decay)) + print( + "Default params_weight_decay, {}, is used".format( + self.params_weight_decay + ) + ) if "clip_gradient" in config_dict: self.clip_gradient = config_dict["clip_gradient"] else: self.clip_gradient = False - print("clip_gradient is set to default value: {}.".format( - self.clip_gradient)) + print( + "clip_gradient is set to default value: {}.".format( + self.clip_gradient + ) + ) if "max_gradient_norm" in config_dict: self.max_gradient_norm = config_dict["max_gradient_norm"] else: self.max_gradient_norm = 1.0 - print("max_gradient_norm is set to default value: {}.".format( - self.max_gradient_norm)) + 
print( + "max_gradient_norm is set to default value: {}.".format( + self.max_gradient_norm + ) + ) def __str__(self): sb = [] for key in self.__dict__: sb.append( - "{key}={value}".format(key=key, value=self.__dict__[key])) + "{key}={value}".format(key=key, value=self.__dict__[key]) + ) - return '\n'.join(sb) \ No newline at end of file + return "\n".join(sb)
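Editor's note: as a rough end-to-end illustration of how the pieces touched by this diff fit together, here is a sketch assembled from the notebook above. It assumes `utils_nlp/bert` is on the import path (the notebook adds it to `sys.path`), that `get_device()` returns a `(device, n_gpu)` pair as the docstrings suggest, and that `config.yaml` and the Kaggle NER CSV are laid out as in the notebook; it is illustrative, not a tested script.

```python
import yaml
from seqeval.metrics import f1_score
from pytorch_pretrained_bert.tokenization import BertTokenizer

from configs import BERTFineTuneConfig
from bert_data_utils import KaggleNERProcessor
from bert_utils import BertTokenClassifier, create_token_feature_dataset, get_device

# Load fine-tuning settings from the YAML file used in the notebook.
with open("config.yaml", "r") as ymlfile:
    config = BERTFineTuneConfig(yaml.safe_load(ymlfile))

# Split the Kaggle NER dataset into training and validation examples.
processor = KaggleNERProcessor(data_dir="./data/NER/ner_dataset.csv", dev_percentage=0.1)
label_map = {label: i for i, label in enumerate(processor.get_labels())}

tokenizer = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=config.do_lower_case)

# Convert raw examples into TensorDatasets of token/mask/segment/label ids.
train_dataset = create_token_feature_dataset(
    data=processor.get_train_examples(),
    tokenizer=tokenizer,
    label_map=label_map,
    true_label_available=True,
    max_seq_length=config.max_seq_length,
    trailing_piece_tag="X",
)
dev_dataset = create_token_feature_dataset(
    data=processor.get_dev_examples(),
    tokenizer=tokenizer,
    label_map=label_map,
    true_label_available=True,
    max_seq_length=config.max_seq_length,
    trailing_piece_tag="X",
)

# Fine-tune and evaluate; predict() returns both predicted and true tags here
# because the validation set carries label ids.
device, n_gpu = get_device()  # assumed to return (device, n_gpu)
classifier = BertTokenClassifier(config, label_map, device, n_gpu)
classifier.fit(train_dataset)
pred_tags, true_tags = classifier.predict(dev_dataset)
print("F1:", f1_score(true_tags, pred_tags))
```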