Mirror of https://github.com/microsoft/archai.git

Merge pull request #208 from microsoft/tfpp_fix
TF++ fixes and basic docs page

Commit fe957c5236
@@ -58,11 +58,6 @@ class CodeGenModel(CodeGenPreTrainedModel):
         ])
 
         self.ln_f = nn.LayerNorm(self.hidden_size, eps=hf_config.layer_norm_epsilon)
-
-        if all(hf_config.n_ctx // block_config.pick('total_heads') < hf_config.rotary_dim\
-               for block_config in arch_config.pick('hidden_layers')):
-            raise ValueError('The maximum number of heads is 64. Please, reduce the number of heads in the model.')
-
         self.rotary_dim = hf_config.rotary_dim
         self.gradient_checkpointing = False
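Aside: the deleted guard divided the context length (n_ctx) by the head count before comparing against rotary_dim. The constraint rotary embeddings typically impose is on the per-head dimension instead; a minimal sketch of that check, with illustrative names not taken from this diff:

    # Rotary embeddings rotate the first rotary_dim channels of each attention
    # head, so rotary_dim cannot exceed the per-head dimension.
    head_dim = embed_dim // n_head
    assert rotary_dim <= head_dim, 'rotary_dim must be <= embed_dim // n_head'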
@@ -1,3 +1,5 @@
+''' Adapted from https://github.com/HazyResearch/H3/blob/main/src/ops/fftconv.py '''
+
 import math
 
 import torch
@@ -1,4 +1,4 @@
-# Modified from https://github.com/HazyResearch/flash-attention/
+''' Modified from https://github.com/HazyResearch/flash-attention/ '''
 
 import math
 from warnings import warn
@@ -1,5 +1,4 @@
 # Modified from S4: https://github.com/HazyResearch/state-spaces/blob/main/src/models/sequence/ss/s4.py
-# We will release the whole codebase upon acceptance.
 from functools import partial
 import math
 
@@ -49,7 +49,7 @@ class TfppSearchSpace(ConfigSearchSpace):
                 tuple([
                     (op_name, float(item)) for op_name, item in zip(op_subset.keys(), alloc)
                 ])
-                for alloc in np.eye(len(OPS), dtype=np.uint).tolist()
+                for alloc in np.eye(len(op_subset), dtype=np.uint).tolist()
             ]
 
             to_tuple = lambda x: (x, ) if not isinstance(x, (tuple, list)) else x
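The one-line fix above changes the size of the identity matrix used to enumerate one-hot op allocations. A standalone illustration of why it matters, with hypothetical op names that are not from the repo:

    import numpy as np

    op_subset = ['mha', 'sgconv', 'local_attn']  # a 3-op subset
    TOTAL_OPS = 8                                # stand-in for len(OPS), the full registry

    # Buggy: one row per op in the FULL registry; zip truncates each row to 3
    # entries, so rows 3..7 collapse into duplicate all-zero allocations
    buggy = [tuple(zip(op_subset, row)) for row in np.eye(TOTAL_OPS, dtype=np.uint).tolist()]

    # Fixed: exactly one one-hot allocation per op actually in the subset
    fixed = [tuple(zip(op_subset, row)) for row in np.eye(len(op_subset), dtype=np.uint).tolist()]

    print(len(buggy), len(fixed))  # 8 3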
@@ -11,3 +11,4 @@ Natural Language Processing
     NVIDIA Trainer <nlp/nvidia_trainer.ipynb>
     ONNX Export <nlp/onnx_export.ipynb>
     PyTorch Quantization <nlp/torch_quantization.ipynb>
+    Transformer++ Search Space <nlp/tfpp_ss.ipynb>
@@ -0,0 +1,283 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ed2325a2",
   "metadata": {},
   "source": [
    "# Transformer++ Search Space"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab9d2f3a",
   "metadata": {},
   "source": [
    "```{warning}\n",
    "This is an experimental feature and could change at any time\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f1b88f37",
   "metadata": {},
   "source": [
    "This notebook shows how to use Archai's Transformer++ search space for Language Modelling.\n",
    "\n",
    "This search space consists of 8 different token-mixing primitives that can be used to create a wide variety of architectures. The Transformer++ model functions like a regular decoder-only Transformer, comprising an embedding layer followed by a sequence of $L$ decoder layers and a final language model head.\n",
    "\n",
    "The Transformer++ search space supports using one or more primitives per decoder layer by sharding the embedding dimension across multiple primitives (a sketch follows the diagram below):"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "763a6c32",
   "metadata": {},
   "source": [
    "![Search Space Diagram](./tfpp_ss.png)"
   ]
  },
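  {
   "cell_type": "markdown",
   "id": "c3f1a9b0",
   "metadata": {},
   "source": [
    "A minimal sketch of the sharding idea (not Archai's internal implementation; shapes, names, and cell ids here are illustrative): the embedding dimension is split into contiguous shards, each shard is processed by a different token-mixing primitive, and the outputs are concatenated back."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4e2b8a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "# Toy stand-ins for two token-mixing primitives operating on channel shards\n",
    "mixer_a = torch.nn.Linear(512, 512)   # e.g. an attention-like mixer\n",
    "mixer_b = torch.nn.Linear(256, 256)   # e.g. a convolution-like mixer\n",
    "\n",
    "x = torch.randn(2, 16, 768)           # (batch, seq_len, embed_dim)\n",
    "xa, xb = x.split([512, 256], dim=-1)  # shard the embedding dimension\n",
    "y = torch.cat([mixer_a(xa), mixer_b(xb)], dim=-1)\n",
    "assert y.shape == x.shape"
   ]
  },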
  {
   "cell_type": "markdown",
   "id": "b21d1dff",
   "metadata": {},
   "source": [
    "### List of Available Primitives"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5da196ac",
   "metadata": {},
   "source": [
    "| Primitive | Extra params | Custom CUDA Kernel | Reference |\n",
    "|---|---|---|---|\n",
    "| Multihead Self-Attention | | 🗸 | [Link](https://arxiv.org/abs/1706.03762) |\n",
    "| SGConv | `kernel_size` | 🗸 | [Link](https://openreview.net/forum?id=TGJSPbRpJX-) |\n",
    "| SGConv3 | `kernel_size` | 🗸 | |\n",
    "| Local Attention | `window_size` | | [Link](https://arxiv.org/abs/2004.05150v2) |\n",
    "| LSH Attention | `bucket_size`, `num_buckets`, `num_hashes` | | [Link](https://arxiv.org/abs/2001.04451) |\n",
    "| Separable Conv1D | `kernel_size` | | |"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be3a3a4a",
   "metadata": {},
   "source": [
    "# Examples"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f8de7e1",
   "metadata": {},
   "source": [
    "#### Sampling architectures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7a506d23",
   "metadata": {},
   "outputs": [],
   "source": [
    "from archai.discrete_search.search_spaces.nlp import TfppSearchSpace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "014f6329",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "65d581c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "ss = TfppSearchSpace(\n",
    "    backbone='codegen', embed_dims=[768, 768*2], inner_dims=[768*4, 1024*4], total_heads=[12],\n",
    "    total_layers=range(6), op_subset=['mha', 'sgconv', 'local_attn'],\n",
    "    local_attn_window_sizes=[256, 512], sgconv_kernel_sizes=[128, 256],\n",
    "    mixed_ops=False,  # Only one primitive per layer\n",
    "    homogeneous=False,\n",
    "    seed=42,\n",
    "\n",
    "    # Huggingface kwargs\n",
    "    n_positions=8192,  # Maximum sequence length\n",
    "    vocab_size=50257\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c0bffcdd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LanguageModel(\n",
       "  (model): CodeGenForCausalLM(\n",
       "    (transformer): CodeGenModel(\n",
       "      (wte): Embedding(50257, 1536)\n",
       "      (embed_dropout): Dropout(p=0.0, inplace=False)\n",
       "      (h): ModuleList()\n",
       "      (ln_f): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n",
       "    )\n",
       "    (lm_head): Linear(in_features=1536, out_features=50257, bias=True)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m = ss.random_sample()\n",
    "m.arch"
   ]
  },
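  {
   "cell_type": "markdown",
   "id": "a7c90e11",
   "metadata": {},
   "source": [
    "Sampled models are regular PyTorch modules, so standard introspection applies. For instance, counting trainable parameters (an added minimal check, not part of the original notebook):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8d01f22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total trainable parameters of the sampled architecture\n",
    "sum(p.numel() for p in m.arch.parameters() if p.requires_grad)"
   ]
  },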
  {
   "cell_type": "markdown",
   "id": "f832ff43",
   "metadata": {},
   "source": [
    "Model forward pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "9a39e0d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained('gpt2')\n",
    "tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})\n",
    "\n",
    "x = tokenizer(['Just testing', 'something'], return_tensors='pt', padding=True, truncation=True)\n",
    "m.arch(**x)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26361bcb",
   "metadata": {},
   "source": [
    "#### Use with custom CUDA Kernels"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1bfa7a59",
   "metadata": {},
   "source": [
    "Some primitives have custom CUDA kernels that can be used, depending on the hardware available. For installation instructions, see the [flash_attention](https://github.com/HazyResearch/flash-attention) and [H3](https://github.com/HazyResearch/H3/tree/main) repositories by HazyResearch.\n",
    "\n",
    "To install Archai with the flash-attention kernel dependencies, use\n",
    "\n",
    "```shell\n",
    "python3 -m pip install archai[flash-attn]\n",
    "```"
   ]
  },
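  {
   "cell_type": "markdown",
   "id": "e9f12c33",
   "metadata": {},
   "source": [
    "Before enabling the kernel flags below, it can help to verify that the optional extensions are importable in the current environment. A minimal check; the module names besides `flash_attn` are illustrative guesses and may differ across versions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0a23d44",
   "metadata": {},
   "outputs": [],
   "source": [
    "import importlib.util\n",
    "\n",
    "# 'flash_attn' is the flash-attention package name; the other module names\n",
    "# are illustrative guesses for the fused-dense and FFT-conv extensions\n",
    "for mod in ['flash_attn', 'fused_dense_lib', 'fftconv']:\n",
    "    found = importlib.util.find_spec(mod) is not None\n",
    "    print(mod, 'available' if found else 'missing')"
   ]
  },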
  {
   "cell_type": "markdown",
   "id": "7d2e8a99",
   "metadata": {},
   "source": [
    "Available CUDA Kernels:\n",
    "\n",
    "* FusedDense (for linear projections)\n",
    "* FusedMLP\n",
    "* FlashAttention (used in MHA)\n",
    "* FlashRotaryEmb (used in MHA)\n",
    "* FastFFTConv (used in SGconv and SGconv3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "170e1e79",
   "metadata": {},
   "outputs": [],
   "source": [
    "ss = TfppSearchSpace(\n",
    "    backbone='codegen', embed_dims=[768, 768*2], inner_dims=[768*4, 1024*4], total_heads=[12],\n",
    "    total_layers=range(1, 6), op_subset=['mha', 'sgconv', 'local_attn'],\n",
    "    local_attn_window_sizes=[256, 512], sgconv_kernel_sizes=[128, 256],\n",
    "    mixed_ops=False,  # Only one primitive per layer\n",
    "    homogeneous=False,\n",
    "    seed=42,\n",
    "\n",
    "    # Extra kwargs\n",
    "    n_positions=8192,  # Maximum sequence length\n",
    "    vocab_size=50257,\n",
    "\n",
    "    # CUDA kernel flags\n",
    "    fused_mlp=True,\n",
    "    fused_dense=True,\n",
    "    fast_fftconv=True,\n",
    "    flash_attn=True,\n",
    "    flash_rotary_emb=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a0f3e0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "m = ss.random_sample()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Binary file not shown (After: Size 105 KiB)