{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook uses flaml to finetune a transformer model from Huggingface transformers library.\n", "\n", "**Requirements.** This notebook has additional requirements:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [] }, "outputs": [], "source": [ "# %pip install torch transformers datasets ipywidgets flaml[blendsearch,ray]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenizer" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "MODEL_CHECKPOINT = \"distilbert-base-uncased\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer(\"this is a test\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "TASK = \"cola\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import datasets" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n" ] } ], "source": "raw_dataset = datasets.load_dataset(\"glue\", TASK, trust_remote_code=True)" }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# define tokenization function used to process data\n", "COLUMN_NAME = \"sentence\"\n", "def tokenize(examples):\n", " return tokenizer(examples[COLUMN_NAME], truncation=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0dcf9ca8ce024a2b832606a6a3219b17", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c58845729f0a4261830ad679891e7c77", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9716d177a40748008cc6089e3d52a1d5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "encoded_dataset = raw_dataset.map(tokenize, batched=True)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " 'idx': 0,\n", " 'input_ids': [101,\n", " 2256,\n", " 2814,\n", " 2180,\n", " 1005,\n", " 1056,\n", " 4965,\n", " 2023,\n", " 4106,\n", " 1010,\n", " 2292,\n", " 2894,\n", " 1996,\n", " 2279,\n", " 2028,\n", " 2057,\n", " 16599,\n", " 1012,\n", " 102],\n", " 'label': 1,\n", " 'sentence': \"Our friends won't buy this analysis, let alone the next one we propose.\"}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoded_dataset[\"train\"][0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForSequenceClassification" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n", "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "NUM_LABELS = 2\n", "model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DistilBertForSequenceClassification(\n", " (distilbert): DistilBertModel(\n", " (embeddings): Embeddings(\n", " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", " (position_embeddings): Embedding(512, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (transformer): Transformer(\n", " (layer): ModuleList(\n", " (0): TransformerBlock(\n", " (attention): MultiHeadSelfAttention(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", " )\n", " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (ffn): FFN(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", " )\n", " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " )\n", " (1): TransformerBlock(\n", " (attention): MultiHeadSelfAttention(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", " )\n", " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (ffn): FFN(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", " )\n", " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " )\n", " (2): TransformerBlock(\n", " (attention): MultiHeadSelfAttention(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", " )\n", " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (ffn): FFN(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", " )\n", " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " )\n", " (3): TransformerBlock(\n", " (attention): MultiHeadSelfAttention(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", " )\n", " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (ffn): FFN(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", " )\n", " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " )\n", " (4): TransformerBlock(\n", " (attention): MultiHeadSelfAttention(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", " )\n", " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (ffn): FFN(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", " )\n", " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " )\n", " (5): TransformerBlock(\n", " (attention): MultiHeadSelfAttention(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n", " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n", " )\n", " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (ffn): FFN(\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n", " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n", " )\n", " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " )\n", " )\n", " )\n", " )\n", " (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n", " (classifier): Linear(in_features=768, out_features=2, bias=True)\n", " (dropout): Dropout(p=0.2, inplace=False)\n", ")" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Metric" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": "metric = datasets.load_metric(\"glue\", TASK, trust_remote_code=True)" }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Metric(name: \"glue\", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: \"\"\"\n", "Compute GLUE evaluation metric associated to each GLUE dataset.\n", "Args:\n", " predictions: list of predictions to score.\n", " Each translation should be tokenized into a list of tokens.\n", " references: list of lists of references for each translation.\n", " Each reference should be tokenized into a list of tokens.\n", "Returns: depending on the GLUE subset, one or several of:\n", " \"accuracy\": Accuracy\n", " \"f1\": F1 score\n", " \"pearson\": Pearson Correlation\n", " \"spearmanr\": Spearman Correlation\n", " \"matthews_correlation\": Matthew Correlation\n", "Examples:\n", "\n", " >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of [\"mnli\", \"mnli_mismatched\", \"mnli_matched\", \"qnli\", \"rte\", \"wnli\", \"hans\"]\n", " >>> references = [0, 1]\n", " >>> predictions = [0, 1]\n", " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", " >>> print(results)\n", " {'accuracy': 1.0}\n", "\n", " >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp'\n", " >>> references = [0, 1]\n", " >>> predictions = [0, 1]\n", " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", " >>> print(results)\n", " {'accuracy': 1.0, 'f1': 1.0}\n", "\n", " >>> glue_metric = datasets.load_metric('glue', 'stsb')\n", " >>> references = [0., 1., 2., 3., 4., 5.]\n", " >>> predictions = [0., 1., 2., 3., 4., 5.]\n", " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", " >>> print({\"pearson\": round(results[\"pearson\"], 2), \"spearmanr\": round(results[\"spearmanr\"], 2)})\n", " {'pearson': 1.0, 'spearmanr': 1.0}\n", "\n", " >>> glue_metric = datasets.load_metric('glue', 'cola')\n", " >>> references = [0, 1]\n", " >>> predictions = [0, 1]\n", " >>> results = glue_metric.compute(predictions=predictions, references=references)\n", " >>> print(results)\n", " {'matthews_correlation': 1.0}\n", "\"\"\", stored examples: 0)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metric" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "def compute_metrics(eval_pred):\n", " predictions, labels = eval_pred\n", " predictions = np.argmax(predictions, axis=1)\n", " return metric.compute(predictions=predictions, references=labels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training (aka Finetuning)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from transformers import Trainer\n", "from transformers import TrainingArguments" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "args = TrainingArguments(\n", " output_dir='output',\n", " do_eval=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "trainer = Trainer(\n", " model=model,\n", " args=args,\n", " train_dataset=encoded_dataset[\"train\"],\n", " eval_dataset=encoded_dataset[\"validation\"],\n", " tokenizer=tokenizer,\n", " compute_metrics=compute_metrics,\n", ")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "text/html": [ "\n", "
Step | \n", "Training Loss | \n", "
---|---|
500 | \n", "0.571000 | \n", "
1000 | \n", "0.515400 | \n", "
1500 | \n", "0.356100 | \n", "
"
],
"text/plain": [
"
Memory usage on this node: 4.3/7.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects
Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54
Number of trials: 1/infinite (1 RUNNING)
"
],
"text/plain": [
"
Memory usage on this node: 4.5/7.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects
Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54
Number of trials: 2/infinite (1 PENDING, 1 RUNNING)
"
],
"text/plain": [
"
Memory usage on this node: 4.6/7.7 GiB
Using FIFO scheduling algorithm.
Resources requested: 4.0/4 CPUs, 4.0/4 GPUs, 0.0/2.34 GiB heap, 0.0/1.17 GiB objects
Result logdir: /home/ec2-user/FLAML/notebook/logs/train_distilbert_2021-12-01_23-35-54
Number of trials: 2/infinite (1 PENDING, 1 RUNNING)
"
],
"text/plain": [
"