chore(tasks): Adds boilerplate for a text generation task with Archai.

Gustavo Rosa 2023-01-26 10:45:25 -03:00
Parent 5b0b58d903
Commit 576a215da6
5 changed files with 176 additions and 1 deletions

View file

@@ -0,0 +1,104 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import argparse

from transformers import (
    AutoTokenizer,
    CodeGenConfig,
    CodeGenForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)

from archai.datasets.nlp.hf_dataset_provider import HfHubDatasetProvider
from archai.datasets.nlp.hf_dataset_provider_utils import tokenize_dataset
from archai.trainers.nlp.hf_trainer import HfTrainer


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Trains a CodeGen model using the Hugging Face trainer.")

    parser.add_argument(
        "-dn",
        "--dataset_name",
        type=str,
        default="wikitext",
        help="Name of the dataset to use (via the datasets library).",
    )

    parser.add_argument(
        "-dcn",
        "--dataset_config_name",
        type=str,
        default="wikitext-103-raw-v1",
        help="Configuration name of the dataset to use (via the datasets library).",
    )

    parser.add_argument("-seq", "--seq_len", type=int, default=192, help="Sequence length.")

    parser.add_argument("-ls", "--logging_steps", type=int, default=10, help="Number of steps between logs.")

    parser.add_argument("-es", "--eval_steps", type=int, default=100, help="Number of steps between evaluations.")

    parser.add_argument("-bsz", "--per_device_train_batch_size", type=int, default=64, help="Batch size per device.")

    parser.add_argument("-lr", "--learning_rate", type=float, default=0.01, help="Learning rate.")

    parser.add_argument("-wd", "--weight_decay", type=float, default=0.0, help="Weight decay.")

    parser.add_argument("-n", "--max_steps", type=int, default=250, help="Maximum number of steps.")

    args = parser.parse_args()

    return args


if __name__ == "__main__":
    args = parse_args()

    # CodeGen's tokenizer has no padding token by default, so re-use EOS for padding.
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono", model_max_length=args.seq_len)
    tokenizer.pad_token = tokenizer.eos_token

    # Collator for causal language modeling (mlm=False disables masked-LM targets).
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Loads the dataset from the Hugging Face Hub through Archai's provider.
    dataset_provider = HfHubDatasetProvider(dataset=args.dataset_name, subset=args.dataset_config_name)
    train_dataset = dataset_provider.get_train_dataset()
    eval_dataset = dataset_provider.get_val_dataset()

    # Tokenizes both splits with Archai's tokenization helper.
    encoded_train_dataset = train_dataset.map(tokenize_dataset, batched=True, fn_kwargs={"tokenizer": tokenizer})
    encoded_eval_dataset = eval_dataset.map(tokenize_dataset, batched=True, fn_kwargs={"tokenizer": tokenizer})

    # Small CodeGen configuration, trained from scratch (no pre-trained weights).
    config = CodeGenConfig(
        n_positions=args.seq_len,
        n_embd=768,
        n_layer=12,
        n_head=12,
        rotary_dim=16,
        bos_token_id=0,
        eos_token_id=0,
        vocab_size=50295,
    )
    model = CodeGenForCausalLM(config=config)

    print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")

    training_args = TrainingArguments(
        "hf-codegen",
        evaluation_strategy="steps",
        logging_steps=args.logging_steps,
        eval_steps=args.eval_steps,
        per_device_train_batch_size=args.per_device_train_batch_size,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        max_steps=args.max_steps,
    )
    trainer = HfTrainer(
        model=model,
        args=training_args,
        data_collator=collator,
        train_dataset=encoded_train_dataset,
        eval_dataset=encoded_eval_dataset,
    )

    trainer.train()
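
Note: the data pipeline above leans on two Archai helpers, HfHubDatasetProvider and tokenize_dataset. As a rough, illustrative sketch (an assumption about what those helpers wrap, not their documented behavior), the same steps written directly against the datasets and transformers libraries would look roughly like this:

# Illustrative sketch only: assumes the Archai helpers are thin wrappers
# around datasets.load_dataset and a plain tokenizer call.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono", model_max_length=192)
tokenizer.pad_token = tokenizer.eos_token

raw = load_dataset("wikitext", "wikitext-103-raw-v1")


def tokenize(examples):
    # Tokenize the raw "text" column, truncating to the model's maximum length.
    return tokenizer(examples["text"], truncation=True)


encoded_train = raw["train"].map(tokenize, batched=True)
encoded_eval = raw["validation"].map(tokenize, batched=True)

The resulting columns (input_ids, attention_mask) are what DataCollatorForLanguageModeling pads into batches at training time.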

View file

View file

View file

@@ -23,7 +23,7 @@ def parse_args() -> argparse.Namespace:
         "-mt",
         "--model_type",
         type=str,
-        choices=["gpt2", "gpt2-flex", "mem-transformer", "opt", "transfo-xl"],
+        choices=["codegen", "gpt2", "gpt2-flex", "mem-transformer", "opt", "transfo-xl"],
         default="gpt2",
         help="Type of model.",
     )

View file

@@ -0,0 +1,71 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from transformers import (
    AutoTokenizer,
    CodeGenConfig,
    CodeGenForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)

from archai.datasets.nlp.hf_dataset_provider import HfHubDatasetProvider
from archai.datasets.nlp.hf_dataset_provider_utils import tokenize_contiguous_dataset
from archai.trainers.nlp.hf_trainer import HfTrainer


if __name__ == "__main__":
    # CodeGen's tokenizer has no padding token by default, so re-use EOS for padding.
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono", model_max_length=192)
    tokenizer.pad_token = tokenizer.eos_token

    # Collator for causal language modeling (mlm=False disables masked-LM targets).
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Loads the dataset from the Hugging Face Hub through Archai's provider.
    dataset_provider = HfHubDatasetProvider(dataset="wikitext", subset="wikitext-103-raw-v1")
    train_dataset = dataset_provider.get_train_dataset()
    eval_dataset = dataset_provider.get_val_dataset()

    # Tokenizes both splits into contiguous blocks of 192 tokens, dropping the raw columns.
    encoded_train_dataset = train_dataset.map(
        tokenize_contiguous_dataset,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer, "model_max_length": 192},
        remove_columns=train_dataset.column_names,
    )
    encoded_eval_dataset = eval_dataset.map(
        tokenize_contiguous_dataset,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer, "model_max_length": 192},
        remove_columns=eval_dataset.column_names,
    )

    # Small CodeGen configuration, trained from scratch (no pre-trained weights).
    config = CodeGenConfig(
        n_positions=192,
        n_embd=768,
        n_layer=12,
        n_head=12,
        rotary_dim=16,
        bos_token_id=0,
        eos_token_id=0,
        vocab_size=50295,
    )
    model = CodeGenForCausalLM(config=config)

    print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")

    training_args = TrainingArguments(
        "hf-codegen",
        evaluation_strategy="steps",
        logging_steps=10,
        eval_steps=125,
        per_device_train_batch_size=32,
        learning_rate=0.01,
        weight_decay=0.0,
        max_steps=250,
    )
    trainer = HfTrainer(
        model=model,
        args=training_args,
        data_collator=collator,
        train_dataset=encoded_train_dataset,
        eval_dataset=encoded_eval_dataset,
    )

    trainer.train()
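
The main functional difference from the script in the first file is the use of tokenize_contiguous_dataset (plus hard-coded hyperparameters instead of command-line arguments). Judging by its name and arguments, that helper follows the usual concatenate-and-chunk packing pattern for causal language modeling; the sketch below shows that pattern as an assumption about the helper, not a copy of its implementation:

# Assumed packing logic: concatenate all tokenized examples and split the
# stream into fixed-size blocks so no tokens are spent on padding.
from itertools import chain


def tokenize_contiguous(examples, tokenizer, model_max_length):
    tokenized = tokenizer(examples["text"])

    # Flatten every column (input_ids, attention_mask, ...) into one long stream.
    concatenated = {k: list(chain(*v)) for k, v in tokenized.items()}

    # Drop the trailing remainder that does not fill a whole block.
    total_length = (len(concatenated["input_ids"]) // model_max_length) * model_max_length

    result = {
        k: [v[i : i + model_max_length] for i in range(0, total_length, model_max_length)]
        for k, v in concatenated.items()
    }

    # For causal LM training, the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

Such a function would be passed to Dataset.map with batched=True and fn_kwargs={"tokenizer": tokenizer, "model_max_length": 192}, exactly as the script above does with the Archai helper.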