From 2732da27175cc9df32f842adfa92021b862c827e Mon Sep 17 00:00:00 2001 From: Hong Lu Date: Thu, 16 May 2019 18:09:40 -0400 Subject: [PATCH] Updated NER notebook with new BertTokenClassifier class. --- .../NER_bert-demo-new-updated.ipynb | 7072 ++++++++++------- 1 file changed, 4255 insertions(+), 2817 deletions(-) diff --git a/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb b/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb index 83d5678..1fc74fc 100644 --- a/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb +++ b/examples/named_entity_recognition/NER_bert-demo-new-updated.ipynb @@ -14,13 +14,12 @@ "### Required packages\n", "* pytorch-pretrained-bert\n", "* pandas\n", - "* seqeval\n", - "* unicode" + "* seqeval" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 22, "metadata": { "scrolled": false }, @@ -28,11 +27,14 @@ "source": [ "import sys\n", "import os\n", + "import yaml\n", + "import random\n", "import pandas as pd\n", "import numpy as np\n", "from tqdm import tqdm, trange\n", + "from seqeval.metrics import f1_score\n", + "\n", "import torch\n", - "import random\n", "\n", "from pytorch_pretrained_bert.tokenization import BertTokenizer\n", "from torch.optim import Adam\n", @@ -41,21 +43,21 @@ "if bert_utils_path not in sys.path:\n", " sys.path.insert(0, bert_utils_path)\n", "\n", - "from configs import (PathConfig,\n", - " GlobalConfig, \n", - " DeviceConfig, \n", - " ModelConfig, \n", - " OptimizerConfig, \n", - " TrainConfig, \n", - " EvalConfig)\n", + "from configs import BERTFineTuneConfig\n", "from bert_data_utils import KaggleNERProcessor\n", - "from bert_utils import (convert_examples_to_token_features,\n", - " create_train_dataloader, \n", - " create_eval_dataloader, \n", - " load_model, \n", - " get_optimizer_params, \n", - " train_model, \n", - " eval_token_model)" + "from bert_utils import (BertTokenClassifier, \n", + " convert_examples_to_token_features, \n", + " print_dict, \n", + " get_device)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "config_file = \"config.yaml\"" ] }, { @@ -65,185 +67,35 @@ "## Configurations" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Path configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "path_config_dict = {\"data_dir\": \"./data/NER/\", \n", - " \"output_dir\": \"./NER_output/\"}\n", - "path_config = PathConfig(path_config_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Global configuration" - ] - }, { "cell_type": "code", "execution_count": 3, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "global_config_dict = {\"fp16\": False}\n", - "global_config = GlobalConfig(global_config_dict)" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "### Device configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": false - }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "device name: Tesla K80\n", - "number of gpus: 1\n" + "{TrainConfig: {'batch_size': 32, 'num_train_epochs': 3}\n", + "ModelConfig: {'bert_model': 'bert-base-uncased', 'max_seq_length': 75, 'do_lower_case': True}\n", + "OptimizerConfig: {'optimizer_name': 'BertAdam', 'learning_rate': 3e-05, 'no_decay_params': ['bias', 'gamma', 'beta'], 'params_weight_decay': 0.01, 'clip_gradient': True, 'max_gradient_norm': 1.0}}\n" ] } ], "source": [ - "device_config_dict = {\"no_cuda\": False}\n", - "device_config = DeviceConfig(device_config_dict)\n", - "print(\"device name: {}\".format(torch.cuda.get_device_name(0)))\n", - "print(\"number of gpus: {}\".format(device_config.n_gpu))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model configuration" + "with open(config_file, 'r') as ymlfile:\n", + " config_dict = yaml.safe_load(ymlfile)\n", + "\n", + "print_dict(config_dict)" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "scrolled": false - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ - "model_config_dict = {\"bert_model\": \"bert-base-uncased\",\n", - " \"max_seq_length\": 75,\n", - " \"num_labels\": 18,\n", - " \"model_type\": \"token\"}\n", - "model_config = ModelConfig(model_config_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Optimizer configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "optimizer_config_dict = {\"no_decay_params\": ['bias', 'gamma', 'beta'],\n", - " \"learning_rate\": 3e-5}\n", - "optimizer_config = OptimizerConfig(optimizer_config_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Train configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "train_config_dict = {\"train_batch_size\": 32,\n", - " \"num_train_epochs\": 2, \n", - " \"clip_gradient\": True}\n", - "train_config = TrainConfig(train_config_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Evaluation configuration" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "eval_config = EvalConfig({\"eval_batch_size\":32})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set random seeds" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "random.seed(global_config.seed)\n", - "np.random.seed(global_config.seed)\n", - "torch.manual_seed(global_config.seed)" + "config = BERTFineTuneConfig(config_dict)" ] }, { @@ -263,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": { "scrolled": false }, @@ -274,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": { "scrolled": false }, @@ -287,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": { "scrolled": false }, @@ -323,25 +175,35 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "tokenizer = BertTokenizer.from_pretrained(model_config.bert_model,\n", - " do_lower_case=model_config.do_lower_case)\n", - "train_features = convert_examples_to_token_features(examples=train_examples,\n", - " tokenizer=tokenizer,\n", - " label_list=label_list, \n", - " model_config=model_config)\n", - "dev_features = convert_examples_to_token_features(examples=dev_examples,\n", - " tokenizer=tokenizer,\n", - " label_list=label_list, \n", - " model_config=model_config)" + "label_map = {label: i for i, label in enumerate(label_list)}" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = BertTokenizer.from_pretrained(config.bert_model,\n", + " do_lower_case=config.do_lower_case)\n", + "\n", + "train_features = convert_examples_to_token_features(examples=train_examples,\n", + " tokenizer=tokenizer,\n", + " label_map=label_map, \n", + " config=config)\n", + "dev_features = convert_examples_to_token_features(examples=dev_examples,\n", + " tokenizer=tokenizer,\n", + " label_map=label_map, \n", + " config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -355,7 +217,7 @@ "[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n", "\n", "Sample label ids:\n", - "[1, 1, 1, 1, 1, 1, 13, 1, 1, 1, 1, 1, 13, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n", + "[6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]\n", "\n" ] } @@ -370,42 +232,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Create dataloaders\n", - "The utility functions `create_train_dataloader` and `create_eval_dataloader` creates Pytorch dataloaders from features, which can be used for model training and evaluation. The following two steps are performed:\n", - "1. Convert numpy arrays to Pytorch tensors\n", - "2. Create dataloader for sampling and serving data in batches" + "## Create Token Classifier" ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "train_dataloader = create_train_dataloader(train_features=train_features,\n", - " model_config=model_config,\n", - " train_config=train_config,\n", - " device_config=device_config)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "valid_dataloader = create_eval_dataloader(eval_features=dev_features, \n", - " model_config=model_config, \n", - " eval_config=eval_config)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": { "scrolled": false }, @@ -414,97 +246,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sample token id tensor:\n", - "tensor([ 1057, 29625, 2015, 29625, 29624, 3709, 2749, 1999, 7041, 2360,\n", - " 2027, 2730, 2321, 17671, 2076, 2019, 11585, 3169, 1999, 1996,\n", - " 2264, 1010, 2096, 2334, 4584, 2758, 2216, 2730, 2020, 9272,\n", - " 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0])\n", - "\n", - "Sample attention mask tensor:\n", - "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0])\n", - "\n", - "Sample label id tensor:\n", - "tensor([ 1, 17, 17, 17, 17, 17, 1, 1, 13, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1])\n", - "\n" + "BERT fine tune configurations:\n", + "batch_size=32\n", + "num_train_epochs=3\n", + "bert_model=bert-base-uncased\n", + "max_seq_length=75\n", + "do_lower_case=True\n", + "optimizer_name=BertAdam\n", + "learning_rate=3e-05\n", + "no_decay_params=['bias', 'gamma', 'beta']\n", + "params_weight_decay=0.01\n", + "clip_gradient=True\n", + "max_gradient_norm=1.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "t_total value of -1 results in schedule not being applied\n" ] } ], "source": [ - "it = iter(train_dataloader)\n", - "first = next(it)\n", - "print(\"Sample token id tensor:\\n{}\\n\".format(first[0][0]))\n", - "print(\"Sample attention mask tensor:\\n{}\\n\".format(first[1][0]))\n", - "print(\"Sample label id tensor:\\n{}\\n\".format(first[3][0]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Model" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "model = load_model(model_config=model_config, \n", - " path_config=path_config, \n", - " device_config=device_config,\n", - " global_config=global_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure Optimizer\n", - "This step must be done after loading the model, because the load_model function moves all model parameters to the device, e.g. GPU. " - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "optimizer_config = get_optimizer_params(optimizer_config=optimizer_config,\n", - " train_config=train_config, \n", - " device_config=device_config, \n", - " model=model, \n", - " num_train_examples=len(train_dataloader))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# optimizer = BertAdam(optimizer_grouped_parameters,\n", - "# lr=optimizer_config.learning_rate,\n", - "# warmup=optimizer_config.warmup_proportion,\n", - "# t_total=num_train_optimization_steps)\n", - "optimizer = Adam(optimizer_config.grouped_parameters, lr=optimizer_config.learning_rate)" + "device, n_gpu = get_device()\n", + "token_classifier = BertTokenClassifier(config=config, \n", + " label_map=label_map, \n", + " device=device, \n", + " n_gpu=n_gpu)" ] }, { @@ -516,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "metadata": { "scrolled": true }, @@ -525,1417 +294,1417 @@ "name": "stderr", "output_type": "stream", "text": [ - "Epoch: 0%| | 0/2 [00:00