Added script to play with HF featurization.

This commit is contained in:
Debadeepta Dey 2022-07-12 05:40:12 -07:00 committed by Gustavo Rosa
Parent 3072394d65
Commit 2e4d69c8b3
2 changed files: 53 additions and 0 deletions

.vscode/launch.json

@@ -1044,6 +1044,15 @@
"args": ["--list-of-npys", "/home/dedey/archai_experiment_reports/transnasbench_heatmaps/list.txt",
"--out-dir", "/home/dedey/archai_experiment_reports/transnasbench_heatmaps"]
},
{
"name": "HF Featurize",
"type": "python",
"request": "launch",
"program": "${cwd}/scripts/misc/hf_featurize.py",
"console": "integratedTerminal",
"args": [],
"env": {"HF_DATASETS_OFFLINE": "0"}
},
{
"name": "CurrentFile",
"type": "python",

scripts/misc/hf_featurize.py
@@ -0,0 +1,44 @@
"""
Script to play with featurizing a large dataset like The Pile, with the
motivation to use downstream data summarization techniques on it.
"""
from datasets import load_dataset
def main():
# load wikitext-103
dataset = load_dataset("wikitext", 'wikitext-103-v1')
# grab checkpoint and tokenizer
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
    # tokenization method 1
    # mimics https://huggingface.co/course/chapter3/2?fw=pt
    # this is bad because it doesn't return a Dataset but a plain
    # dictionary keyed by 'input_ids', and it is slow
    # tokenized_dataset = tokenizer(dataset['train']['text'])
    # tokenization method 2
    # mimics https://huggingface.co/course/chapter3/2?fw=pt
    # and is the recommended way of tokenizing a dataset,
    # as it returns an actual dataset with the tokenized columns added
    def tokenize_function(examples):
        return tokenizer(examples["text"])

    # note: map() takes no 'split' argument, so select the split first
    tokenized_dataset = dataset['validation'].map(tokenize_function, batched=True,
                                                  batch_size=20000, num_proc=48)
    tokenized_dataset.save_to_disk('tokenized-wt103-v1')
    print("done")


if __name__ == '__main__':
    main()
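The script stops at tokenization, while the docstring's stated goal is featurization for downstream data summarization. A minimal sketch (not part of this commit; all names are illustrative) of one way to turn the saved split into fixed-size feature vectors by mean-pooling GPT-2's last hidden states:

import torch
from datasets import load_from_disk
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
model = GPT2Model.from_pretrained('gpt2').eval()

# path written by hf_featurize.py above; map() keeps the raw 'text' column
dataset = load_from_disk('tokenized-wt103-v1')

@torch.no_grad()
def featurize(texts):
    # re-tokenize with padding/truncation so examples batch cleanly
    enc = tokenizer(texts, return_tensors='pt', padding=True,
                    truncation=True, max_length=512)
    hidden = model(**enc).last_hidden_state  # (batch, seq_len, 768)
    # pad positions are included in the mean for simplicity
    return hidden.mean(dim=1)                # one 768-d vector per example

features = featurize(dataset['text'][:8])
print(features.shape)  # torch.Size([8, 768])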