From 0dd8076014c19c73de7bcd15a64e3db8d65a7586 Mon Sep 17 00:00:00 2001
From: Debadeepta Dey
Date: Fri, 15 Jul 2022 08:25:28 -0700
Subject: [PATCH] Found good configuration for featurizing.

---
 .vscode/launch.json             |  9 +++++++++
 scripts/misc/hf_explore_pile.py | 23 +++++++++++++++++++++++
 scripts/misc/hf_featurize.py    | 23 +++++++++++++++++------
 3 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 scripts/misc/hf_explore_pile.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
index e5f4ac26..ec9c4dd6 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1053,6 +1053,15 @@
             "args": [],
             "env": {"HF_DATASETS_OFFLINE": "0"}
         },
+        {
+            "name": "HF Explore Pile",
+            "type": "python",
+            "request": "launch",
+            "program": "${cwd}/scripts/misc/hf_explore_pile.py",
+            "console": "integratedTerminal",
+            "args": [],
+            "env": {"HF_DATASETS_OFFLINE": "0"}
+        },
         {
             "name": "CurrentFile",
             "type": "python",
diff --git a/scripts/misc/hf_explore_pile.py b/scripts/misc/hf_explore_pile.py
new file mode 100644
index 00000000..de454d94
--- /dev/null
+++ b/scripts/misc/hf_explore_pile.py
@@ -0,0 +1,23 @@
+from datasets import load_dataset
+
+def main():
+
+    dataset = load_dataset("the_pile")
+
+    def calc_len(examples):
+        return {"lengths": [len(t.split()) for t in examples['text']]}
+
+    dataset.map(calc_len, batched=True, batch_size=10000, num_proc=1)
+
+    chunk_size = int(1e6)
+    num_train = len(dataset['train'])
+    for i in range(0, num_train, chunk_size):
+        start = i
+        stop = min(i+chunk_size, num_train)
+        chunk = dataset['train'][start:stop]
+        print(len(chunk))
+
+    print('done')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/misc/hf_featurize.py b/scripts/misc/hf_featurize.py
index cebde526..be1597eb 100644
--- a/scripts/misc/hf_featurize.py
+++ b/scripts/misc/hf_featurize.py
@@ -14,18 +14,26 @@ from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForCausalL
 
 def main():
 
-    checkpoint = "facebook/opt-1.3b" # "facebook/opt-350m" # "gpt2"
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+    # "facebook/opt-1.3b", "facebook/opt-350m" # "gpt2"
+    checkpoint = "facebook/opt-350m"
     model = AutoModel.from_pretrained(checkpoint)
+    model.to(device)
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
 
     # load wikitext-103 validation and extract features on it
     # using pipeline batching \
     # https://huggingface.co/docs/transformers/main_classes/pipelines
-    dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
+    # dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
+    dataset = load_dataset("the_pile", split='train')
+
     # max_length is not getting passed through properly
     # for idx, out in enumerate(embedder(KeyDataset(dataset, "text"),
     #                                    batch_size=100,
-    #                                    truncation=None,
+    #                                    truncation=True,
+    #                                    padding="max_length",
+    #                                    max_length=50,
     #                                    num_workers=20)):
     #     print(f'idx: {idx}, shape: {torch.tensor(out).shape}')
@@ -36,11 +44,14 @@ def main():
         encoded_input = tokenizer(examples["text"],
                                   return_tensors='pt',
                                   padding=True,
-                                  truncation=True)
+                                  truncation=True,
+                                  max_length=512).to(device)
         output = model(**encoded_input)
-        return {"embedding" : output['last_hidden_state']}
+        return {"embedding" : output['last_hidden_state'].detach().cpu().numpy()}
 
-    dataset.map(featurize, batched=True, batch_size=1000)
+    dataset = dataset.shard(21000, index=0)
+    print(f"Sharded dataset length: {dataset.num_rows}")
+    fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
 
     print('done')
 
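A minimal, self-contained sketch of the featurization pattern the hunks above converge on (tokenizer truncation with max_length=512, a GPU-resident OPT-350m encoder, embeddings moved back to CPU/NumPy, and a sharded dataset.map() pass). The checkpoint, max_length, and batch_size values are taken from the patch; the wikitext validation split, the shard count of 100, and the save_to_disk() output path are illustrative assumptions, not part of this commit.

import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
checkpoint = "facebook/opt-350m"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)
model.eval()

def featurize(examples):
    # Tokenize with an explicit max_length so long documents cannot overflow the model.
    encoded = tokenizer(examples["text"],
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=512).to(device)
    with torch.no_grad():
        output = model(**encoded)
    # Move hidden states to CPU NumPy so the datasets library can serialize them.
    return {"embedding": output["last_hidden_state"].cpu().numpy()}

# Assumed smoke-test setup: a small shard of wikitext-103 validation instead of the full Pile.
dataset = load_dataset("wikitext", "wikitext-103-v1", split="validation")
dataset = dataset.shard(num_shards=100, index=0)
fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
fdataset.save_to_disk("opt350m_features")  # assumed output location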