Mirror of https://github.com/microsoft/archai.git
chore(tasks): Adds boilerplate for text generation task with Archai.
Parent
5b0b58d903
Commit
576a215da6
@@ -0,0 +1,104 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import argparse

from transformers import (
    AutoTokenizer,
    CodeGenConfig,
    CodeGenForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)

from archai.datasets.nlp.hf_dataset_provider import HfHubDatasetProvider
from archai.datasets.nlp.hf_dataset_provider_utils import tokenize_dataset
from archai.trainers.nlp.hf_trainer import HfTrainer


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Trains a CodeGen model using the Huggingface trainer.")

    parser.add_argument(
        "-dn",
        "--dataset_name",
        type=str,
        default="wikitext",
        help="Name of the dataset to use (via the datasets library).",
    )

    parser.add_argument(
        "-dcn",
        "--dataset_config_name",
        type=str,
        default="wikitext-103-raw-v1",
        help="Configuration name of the dataset to use (via the datasets library).",
    )

    parser.add_argument("-seq", "--seq_len", type=int, default=192, help="Sequence length.")

    parser.add_argument("-ls", "--logging_steps", type=int, default=10, help="Number of steps between logs.")

    parser.add_argument("-es", "--eval_steps", type=int, default=100, help="Number of steps between evaluations.")

    parser.add_argument("-bsz", "--per_device_train_batch_size", type=int, default=64, help="Batch size per device.")

    parser.add_argument("-lr", "--learning_rate", type=float, default=0.01, help="Learning rate.")

    parser.add_argument("-wd", "--weight_decay", type=float, default=0.0, help="Weight decay.")

    parser.add_argument("-n", "--max_steps", type=int, default=250, help="Maximum number of steps.")

    args = parser.parse_args()

    return args


if __name__ == "__main__":
    args = parse_args()

    # CodeGen has no padding token by default, so EOS is reused for padding.
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono", model_max_length=args.seq_len)
    tokenizer.pad_token = tokenizer.eos_token

    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Loads the dataset splits from the Hugging Face Hub and tokenizes them.
    dataset_provider = HfHubDatasetProvider(dataset=args.dataset_name, subset=args.dataset_config_name)
    train_dataset = dataset_provider.get_train_dataset()
    eval_dataset = dataset_provider.get_val_dataset()

    encoded_train_dataset = train_dataset.map(tokenize_dataset, batched=True, fn_kwargs={"tokenizer": tokenizer})
    encoded_eval_dataset = eval_dataset.map(tokenize_dataset, batched=True, fn_kwargs={"tokenizer": tokenizer})

    # Small CodeGen configuration, trained from scratch.
    config = CodeGenConfig(
        n_positions=args.seq_len,
        n_embd=768,
        n_layer=12,
        n_head=12,
        rotary_dim=16,
        bos_token_id=0,
        eos_token_id=0,
        vocab_size=50295,
    )
    model = CodeGenForCausalLM(config=config)

    print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")

    training_args = TrainingArguments(
        "hf-codegen",
        evaluation_strategy="steps",
        logging_steps=args.logging_steps,
        eval_steps=args.eval_steps,
        per_device_train_batch_size=args.per_device_train_batch_size,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        max_steps=args.max_steps,
    )
    trainer = HfTrainer(
        model=model,
        args=training_args,
        data_collator=collator,
        train_dataset=encoded_train_dataset,
        eval_dataset=encoded_eval_dataset,
    )

    trainer.train()
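A minimal, hedged sketch of how a checkpoint produced by the script above could be sampled from. It assumes the trained model was saved to the "hf-codegen" output directory named in TrainingArguments (e.g. via trainer.save_model("hf-codegen"), a standard Hugging Face Trainer method); that save call and the prompt are illustrative assumptions, not part of the commit.

# Illustrative only -- not part of the commit. Assumes the trained model was
# saved to the "hf-codegen" output directory (e.g. via trainer.save_model).
import torch
from transformers import AutoTokenizer, CodeGenForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono", model_max_length=192)
tokenizer.pad_token = tokenizer.eos_token

model = CodeGenForCausalLM.from_pretrained("hf-codegen")
model.eval()

# Encode a prompt and sample a short continuation from the causal LM.
inputs = tokenizer("The history of", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=True,
        temperature=0.8,
        max_length=64,
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
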
@@ -23,7 +23,7 @@ def parse_args() -> argparse.Namespace:
         "-mt",
         "--model_type",
         type=str,
-        choices=["gpt2", "gpt2-flex", "mem-transformer", "opt", "transfo-xl"],
+        choices=["codegen", "gpt2", "gpt2-flex", "mem-transformer", "opt", "transfo-xl"],
         default="gpt2",
         help="Type of model.",
     )
@@ -0,0 +1,71 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from transformers import (
    AutoTokenizer,
    CodeGenConfig,
    CodeGenForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)

from archai.datasets.nlp.hf_dataset_provider import HfHubDatasetProvider
from archai.datasets.nlp.hf_dataset_provider_utils import tokenize_contiguous_dataset
from archai.trainers.nlp.hf_trainer import HfTrainer


if __name__ == "__main__":
    # CodeGen has no padding token by default, so EOS is reused for padding.
    tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono", model_max_length=192)
    tokenizer.pad_token = tokenizer.eos_token

    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    dataset_provider = HfHubDatasetProvider(dataset="wikitext", subset="wikitext-103-raw-v1")
    train_dataset = dataset_provider.get_train_dataset()
    eval_dataset = dataset_provider.get_val_dataset()

    # Packs the tokenized corpus into contiguous blocks of `model_max_length` tokens.
    encoded_train_dataset = train_dataset.map(
        tokenize_contiguous_dataset,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer, "model_max_length": 192},
        remove_columns=train_dataset.column_names,
    )
    encoded_eval_dataset = eval_dataset.map(
        tokenize_contiguous_dataset,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer, "model_max_length": 192},
        remove_columns=eval_dataset.column_names,
    )

    # Small CodeGen configuration, trained from scratch.
    config = CodeGenConfig(
        n_positions=192,
        n_embd=768,
        n_layer=12,
        n_head=12,
        rotary_dim=16,
        bos_token_id=0,
        eos_token_id=0,
        vocab_size=50295,
    )
    model = CodeGenForCausalLM(config=config)

    print(f"Total parameters: {sum(p.numel() for p in model.parameters())}")

    training_args = TrainingArguments(
        "hf-codegen",
        evaluation_strategy="steps",
        logging_steps=10,
        eval_steps=125,
        per_device_train_batch_size=32,
        learning_rate=0.01,
        weight_decay=0.0,
        max_steps=250,
    )
    trainer = HfTrainer(
        model=model,
        args=training_args,
        data_collator=collator,
        train_dataset=encoded_train_dataset,
        eval_dataset=encoded_eval_dataset,
    )

    trainer.train()
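For context, tokenize_contiguous_dataset (unlike the per-example tokenize_dataset used in the first script) packs the corpus into fixed-length blocks, which is why remove_columns is passed to map. The following is only a rough sketch of that packing idea against the datasets batched-map interface, not Archai's actual helper; the function name, the "text" column, and the body are assumptions for illustration.

# Illustrative sketch of "contiguous" tokenization: concatenate all tokens in a
# batch and split them into fixed-length blocks. Not Archai's implementation.
def tokenize_contiguous_sketch(examples, tokenizer=None, model_max_length=192):
    token_lists = tokenizer(examples["text"])["input_ids"]
    concatenated = [token_id for sequence in token_lists for token_id in sequence]

    # Drop the trailing remainder so every block has exactly model_max_length tokens.
    total_length = (len(concatenated) // model_max_length) * model_max_length
    input_ids = [
        concatenated[i : i + model_max_length]
        for i in range(0, total_length, model_max_length)
    ]

    # For causal LM training the collator copies input_ids into labels, so only
    # input_ids and a full attention mask are needed here.
    return {
        "input_ids": input_ids,
        "attention_mask": [[1] * model_max_length for _ in input_ids],
    }
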