Mirror of https://github.com/microsoft/archai.git

Merge pull request #208 from microsoft/tfpp_fix
TF++ fixes and basic docs page

Commit fe957c5236
@@ -58,11 +58,6 @@ class CodeGenModel(CodeGenPreTrainedModel):
         ])
 
         self.ln_f = nn.LayerNorm(self.hidden_size, eps=hf_config.layer_norm_epsilon)
-
-        if all(hf_config.n_ctx // block_config.pick('total_heads') < hf_config.rotary_dim\
-               for block_config in arch_config.pick('hidden_layers')):
-            raise ValueError('The maximum number of heads is 64. Please, reduce the number of heads in the model.')
-
         self.rotary_dim = hf_config.rotary_dim
         self.gradient_checkpointing = False
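Aside: the deleted guard divided the context length (n_ctx) by the head count before comparing against rotary_dim. The constraint rotary embeddings typically impose is on the per-head dimension instead; a minimal sketch of that check, with illustrative names not taken from this diff:

    # Rotary embeddings rotate the first rotary_dim channels of each attention
    # head, so rotary_dim cannot exceed the per-head dimension.
    head_dim = embed_dim // n_head
    assert rotary_dim <= head_dim, 'rotary_dim must be <= embed_dim // n_head'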
@@ -1,3 +1,5 @@
+''' Adapted from https://github.com/HazyResearch/H3/blob/main/src/ops/fftconv.py '''
+
 import math
 
 import torch
@@ -1,4 +1,4 @@
-# Modified from https://github.com/HazyResearch/flash-attention/
+''' Modified from https://github.com/HazyResearch/flash-attention/ '''
 
 import math
 from warnings import warn
@@ -1,5 +1,4 @@
 # Modified from S4: https://github.com/HazyResearch/state-spaces/blob/main/src/models/sequence/ss/s4.py
-# We will release the whole codebase upon acceptance.
 from functools import partial
 import math
 
@@ -49,7 +49,7 @@ class TfppSearchSpace(ConfigSearchSpace):
                 tuple([
                     (op_name, float(item)) for op_name, item in zip(op_subset.keys(), alloc)
                 ])
-                for alloc in np.eye(len(OPS), dtype=np.uint).tolist()
+                for alloc in np.eye(len(op_subset), dtype=np.uint).tolist()
             ]
 
             to_tuple = lambda x: (x, ) if not isinstance(x, (tuple, list)) else x
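The one-line fix above changes the size of the identity matrix used to enumerate one-hot op allocations. A standalone illustration of why it matters, with hypothetical op names that are not from the repo:

    import numpy as np

    op_subset = ['mha', 'sgconv', 'local_attn']  # a 3-op subset
    TOTAL_OPS = 8                                # stand-in for len(OPS), the full registry

    # Buggy: one row per op in the FULL registry; zip truncates each row to 3
    # entries, so rows 3..7 collapse into duplicate all-zero allocations
    buggy = [tuple(zip(op_subset, row)) for row in np.eye(TOTAL_OPS, dtype=np.uint).tolist()]

    # Fixed: exactly one one-hot allocation per op actually in the subset
    fixed = [tuple(zip(op_subset, row)) for row in np.eye(len(op_subset), dtype=np.uint).tolist()]

    print(len(buggy), len(fixed))  # 8 3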
@@ -11,3 +11,4 @@ Natural Language Processing
     NVIDIA Trainer <nlp/nvidia_trainer.ipynb>
     ONNX Export <nlp/onnx_export.ipynb>
     PyTorch Quantization <nlp/torch_quantization.ipynb>
+    Transformer++ Search Space <nlp/tfpp_ss.ipynb>
@@ -0,0 +1,283 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ed2325a2",
   "metadata": {},
   "source": [
    "# Transformer++ Search Space"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab9d2f3a",
   "metadata": {},
   "source": [
    "```{warning}\n",
    "This is an experimental feature and could change at any time\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f1b88f37",
   "metadata": {},
   "source": [
    "This notebook shows how to use Archai's Transformer++ search space for Language Modelling.\n",
    "\n",
    "This search space consists of 8 different token-mixing primitives that can be used to create a wide variety of architectures. The Transformer++ model functions like a regular decoder-only Transformer, comprising an embedding layer followed by a sequence of $L$ decoder layers and a final language model head.\n",
    "\n",
    "The Transformer++ search space supports using one or more primitives per decoder layer by sharding the embedding dimension across multiple primitives (a sketch follows the diagram below):"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "763a6c32",
   "metadata": {},
   "source": [
    "![Search Space Diagram](./tfpp_ss.png)"
   ]
  },
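  {
   "cell_type": "markdown",
   "id": "c3f1a9b0",
   "metadata": {},
   "source": [
    "A minimal sketch of the sharding idea (not Archai's internal implementation; shapes, names, and cell ids here are illustrative): the embedding dimension is split into contiguous shards, each shard is processed by a different token-mixing primitive, and the outputs are concatenated back."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4e2b8a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "# Toy stand-ins for two token-mixing primitives operating on channel shards\n",
    "mixer_a = torch.nn.Linear(512, 512)   # e.g. an attention-like mixer\n",
    "mixer_b = torch.nn.Linear(256, 256)   # e.g. a convolution-like mixer\n",
    "\n",
    "x = torch.randn(2, 16, 768)           # (batch, seq_len, embed_dim)\n",
    "xa, xb = x.split([512, 256], dim=-1)  # shard the embedding dimension\n",
    "y = torch.cat([mixer_a(xa), mixer_b(xb)], dim=-1)\n",
    "assert y.shape == x.shape"
   ]
  },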
  {
   "cell_type": "markdown",
   "id": "b21d1dff",
   "metadata": {},
   "source": [
    "### List of Available Primitives"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5da196ac",
   "metadata": {},
   "source": [
    "| Primitive | Extra params | Custom CUDA Kernel | Reference |\n",
    "|---|---|---|---|\n",
    "| Multihead Self-Attention | | 🗸 | [Link](https://arxiv.org/abs/1706.03762) |\n",
    "| SGConv | `kernel_size` | 🗸 | [Link](https://openreview.net/forum?id=TGJSPbRpJX-) |\n",
    "| SGConv3 | `kernel_size` | 🗸 | |\n",
    "| Local Attention | `window_size` | | [Link](https://arxiv.org/abs/2004.05150v2) |\n",
    "| LSH Attention | `bucket_size`, `num_buckets`, `num_hashes` | | [Link](https://arxiv.org/abs/2001.04451) |\n",
    "| Separable Conv1D | `kernel_size` | | |"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be3a3a4a",
   "metadata": {},
   "source": [
    "# Examples"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f8de7e1",
   "metadata": {},
   "source": [
    "#### Sampling architectures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7a506d23",
   "metadata": {},
   "outputs": [],
   "source": [
    "from archai.discrete_search.search_spaces.nlp import TfppSearchSpace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "014f6329",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "65d581c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "ss = TfppSearchSpace(\n",
    "    backbone='codegen', embed_dims=[768, 768*2], inner_dims=[768*4, 1024*4], total_heads=[12],\n",
    "    total_layers=range(6), op_subset=['mha', 'sgconv', 'local_attn'],\n",
    "    local_attn_window_sizes=[256, 512], sgconv_kernel_sizes=[128, 256],\n",
    "    mixed_ops=False,  # Only one primitive per layer\n",
    "    homogeneous=False,\n",
    "    seed=42,\n",
    "\n",
    "    # Huggingface kwargs\n",
    "    n_positions=8192,  # Maximum sequence length\n",
    "    vocab_size=50257\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c0bffcdd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LanguageModel(\n",
       "  (model): CodeGenForCausalLM(\n",
       "    (transformer): CodeGenModel(\n",
       "      (wte): Embedding(50257, 1536)\n",
       "      (embed_dropout): Dropout(p=0.0, inplace=False)\n",
       "      (h): ModuleList()\n",
       "      (ln_f): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n",
       "    )\n",
       "    (lm_head): Linear(in_features=1536, out_features=50257, bias=True)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m = ss.random_sample()\n",
    "m.arch"
   ]
  },
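  {
   "cell_type": "markdown",
   "id": "a7c90e11",
   "metadata": {},
   "source": [
    "Sampled models are regular PyTorch modules, so standard introspection applies. For instance, counting trainable parameters (an added minimal check, not part of the original notebook):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8d01f22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total trainable parameters of the sampled architecture\n",
    "sum(p.numel() for p in m.arch.parameters() if p.requires_grad)"
   ]
  },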
  {
   "cell_type": "markdown",
   "id": "f832ff43",
   "metadata": {},
   "source": [
    "Model forward pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9a39e0d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
    "tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})\n",
    "\n",
    "x = tokenizer(['Just testing', 'something'], return_tensors='pt', padding=True, truncation=True)\n",
    "m.arch(**x)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26361bcb",
   "metadata": {},
   "source": [
    "#### Use with custom CUDA Kernels"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1bfa7a59",
   "metadata": {},
   "source": [
    "Some primitives have custom CUDA kernels that can be used, depending on the hardware available. For installation instructions, see the [flash_attention](https://github.com/HazyResearch/flash-attention) and [H3](https://github.com/HazyResearch/H3/tree/main) repositories by HazyResearch.\n",
    "\n",
    "To install Archai with the flash-attention kernel dependencies, use\n",
    "\n",
    "```shell\n",
    "python3 -m pip install archai[flash-attn]\n",
    "```"
   ]
  },
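  {
   "cell_type": "markdown",
   "id": "e9f12c33",
   "metadata": {},
   "source": [
    "Before enabling the kernel flags below, it can help to verify that the optional extensions are importable in the current environment. A minimal check; the module names besides `flash_attn` are illustrative guesses and may differ across versions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0a23d44",
   "metadata": {},
   "outputs": [],
   "source": [
    "import importlib.util\n",
    "\n",
    "# 'flash_attn' is the flash-attention package name; the other module names\n",
    "# are illustrative guesses for the fused-dense and FFT-conv extensions\n",
    "for mod in ['flash_attn', 'fused_dense_lib', 'fftconv']:\n",
    "    found = importlib.util.find_spec(mod) is not None\n",
    "    print(mod, 'available' if found else 'missing')"
   ]
  },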
  {
   "cell_type": "markdown",
   "id": "7d2e8a99",
   "metadata": {},
   "source": [
    "Available CUDA Kernels:\n",
    "\n",
    "* FusedDense (for linear projections)\n",
    "* FusedMLP\n",
    "* FlashAttention (used in MHA)\n",
    "* FlashRotaryEmb (used in MHA)\n",
    "* FastFFTConv (used in SGconv and SGconv3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "170e1e79",
   "metadata": {},
   "outputs": [],
   "source": [
    "ss = TfppSearchSpace(\n",
    "    backbone='codegen', embed_dims=[768, 768*2], inner_dims=[768*4, 1024*4], total_heads=[12],\n",
    "    total_layers=range(1, 6), op_subset=['mha', 'sgconv', 'local_attn'],\n",
    "    local_attn_window_sizes=[256, 512], sgconv_kernel_sizes=[128, 256],\n",
    "    mixed_ops=False,  # Only one primitive per layer\n",
    "    homogeneous=False,\n",
    "    seed=42,\n",
    "\n",
    "    # Extra kwargs\n",
    "    n_positions=8192,  # Maximum sequence length\n",
    "    vocab_size=50257,\n",
    "\n",
    "    # CUDA kernel flags\n",
    "    fused_mlp=True,\n",
    "    fused_dense=True,\n",
    "    fast_fftconv=True,\n",
    "    flash_attn=True,\n",
    "    flash_rotary_emb=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a0f3e0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "m = ss.random_sample()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Binary file not shown (After: Size 105 KiB)