Found good configuration for featurizing.

Debadeepta Dey 2022-07-15 08:25:28 -07:00 committed by Gustavo Rosa
Parent ea9bf6cd47
Commit 0dd8076014
3 changed files with 49 additions and 6 deletions

.vscode/launch.json

@@ -1053,6 +1053,15 @@
            "args": [],
            "env": {"HF_DATASETS_OFFLINE": "0"}
        },
        {
            "name": "HF Explore Pile",
            "type": "python",
            "request": "launch",
            "program": "${cwd}/scripts/misc/hf_explore_pile.py",
            "console": "integratedTerminal",
            "args": [],
            "env": {"HF_DATASETS_OFFLINE": "0"}
        },
        {
            "name": "CurrentFile",
            "type": "python",

scripts/misc/hf_explore_pile.py

@@ -0,0 +1,23 @@
from datasets import load_dataset


def main():
    dataset = load_dataset("the_pile")

    def calc_len(examples):
        return {"lengths": [len(t.split()) for t in examples['text']]}

    dataset.map(calc_len, batched=True, batch_size=10000, num_proc=1)

    chunk_size = int(1e6)
    num_train = len(dataset['train'])
    for i in range(0, num_train, chunk_size):
        start = i
        stop = min(i+chunk_size, num_train)
        chunk = dataset['train'][start:stop]
        print(len(chunk))

    print('done')


if __name__ == '__main__':
    main()
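Note that in the script above the Dataset.map call discards its result (map returns a new dataset rather than adding the column in place), and slicing a split yields a dict of columns, so len(chunk) counts columns rather than rows. A minimal sketch of keeping the computed lengths, assuming the same the_pile setup (variable names here are illustrative):

from datasets import load_dataset


def main():
    # the_pile is large; any text dataset with a "text" column behaves the same way
    dataset = load_dataset("the_pile")

    def calc_len(examples):
        # whitespace-split word counts per document
        return {"lengths": [len(t.split()) for t in examples["text"]]}

    # Dataset.map returns a new dataset; keep the result to retain the "lengths" column
    train = dataset["train"].map(calc_len, batched=True, batch_size=10000, num_proc=1)

    chunk_size = int(1e6)
    num_train = len(train)
    for start in range(0, num_train, chunk_size):
        stop = min(start + chunk_size, num_train)
        # slicing a Dataset gives a dict of columns; measure one column to get the row count
        chunk = train[start:stop]
        print(start, stop, len(chunk["lengths"]))

    print("done")


if __name__ == "__main__":
    main()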


@@ -14,18 +14,26 @@ from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForCausalLM
def main():
    checkpoint = "facebook/opt-1.3b" # "facebook/opt-350m" # "gpt2"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # "facebook/opt-1.3b", "facebook/opt-350m" # "gpt2"
    checkpoint = "facebook/opt-350m"
    model = AutoModel.from_pretrained(checkpoint)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
    # load wikitext-103 validation and extract features on it
    # using pipeline batching \
    # https://huggingface.co/docs/transformers/main_classes/pipelines
    dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
    # dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
    dataset = load_dataset("the_pile", split='train')
    # max_length is not getting passed through properly
    # for idx, out in enumerate(embedder(KeyDataset(dataset, "text"),
    #                                    batch_size=100,
    #                                    truncation=None,
    #                                    truncation=True,
    #                                    padding="max_length",
    #                                    max_length=50,
    #                                    num_workers=20)):
    #     print(f'idx: {idx}, shape: {torch.tensor(out).shape}')
@@ -36,11 +44,14 @@ def main():
        encoded_input = tokenizer(examples["text"],
                                  return_tensors='pt',
                                  padding=True,
                                  truncation=True)
                                  truncation=True,
                                  max_length=512).to(device)
        output = model(**encoded_input)
        return {"embedding" : output['last_hidden_state']}
        return {"embedding" : output['last_hidden_state'].detach().cpu().numpy()}
    dataset.map(featurize, batched=True, batch_size=1000)
    dataset = dataset.shard(21000, index=0)
    print(f"Sharded dataset length: {dataset.num_rows}")
    fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
    print('done')
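For reference, piecing the two hunks together, the featurizing configuration this commit lands on looks roughly like the sketch below. Everything outside the shown hunks (the imports, the featurize signature, the entry point) is assumed, and the torch.no_grad() guard is an addition the diff does not show.

import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer


def main():
    checkpoint = "facebook/opt-350m"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = AutoModel.from_pretrained(checkpoint)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    dataset = load_dataset("the_pile", split="train")

    def featurize(examples):
        encoded_input = tokenizer(examples["text"],
                                  return_tensors="pt",
                                  padding=True,
                                  truncation=True,
                                  max_length=512).to(device)
        with torch.no_grad():  # assumed; the diff does not show gradient handling
            output = model(**encoded_input)
        # move to CPU numpy so datasets can serialize the new column
        return {"embedding": output["last_hidden_state"].detach().cpu().numpy()}

    # keep 1/21000th of the train split so the featurization run stays small
    dataset = dataset.shard(21000, index=0)
    print(f"Sharded dataset length: {dataset.num_rows}")

    fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
    print("done")


if __name__ == "__main__":
    main()

The batch_size of 42 together with the 1/21000 shard keeps a single-GPU run small, which is presumably the "good configuration" the commit message refers to.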
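The commented-out block in the first hunk notes that max_length is not getting passed through when calling the feature-extraction pipeline directly. Newer transformers releases let the feature-extraction pipeline take a tokenize_kwargs argument for this; whether the version used here already supports it is an assumption, so the following is only a sketch of that alternative:

import torch
from datasets import load_dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

# assumes a transformers version whose feature-extraction pipeline accepts tokenize_kwargs
embedder = pipeline(
    "feature-extraction",
    model="facebook/opt-350m",
    device=0 if torch.cuda.is_available() else -1,
    tokenize_kwargs={"truncation": True, "padding": "max_length", "max_length": 50},
)

dataset = load_dataset("wikitext", "wikitext-103-v1", split="validation")
for idx, out in enumerate(embedder(KeyDataset(dataset, "text"), batch_size=100)):
    print(f"idx: {idx}, shape: {torch.tensor(out).shape}")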