Found good configuration for featurizing.

Debadeepta Dey 2022-07-15 08:25:28 -07:00 committed by Gustavo Rosa
Parent ea9bf6cd47
Commit 0dd8076014
3 changed files with 49 additions and 6 deletions

.vscode/launch.json

@@ -1053,6 +1053,15 @@
            "args": [],
            "env": {"HF_DATASETS_OFFLINE": "0"}
        },
        {
            "name": "HF Explore Pile",
            "type": "python",
            "request": "launch",
            "program": "${cwd}/scripts/misc/hf_explore_pile.py",
            "console": "integratedTerminal",
            "args": [],
            "env": {"HF_DATASETS_OFFLINE": "0"}
        },
        {
            "name": "CurrentFile",
            "type": "python",

scripts/misc/hf_explore_pile.py

@@ -0,0 +1,23 @@
from datasets import load_dataset


def main():
    dataset = load_dataset("the_pile")

    def calc_len(examples):
        return {"lengths": [len(t.split()) for t in examples['text']]}

    dataset.map(calc_len, batched=True, batch_size=10000, num_proc=1)

    chunk_size = int(1e6)
    num_train = len(dataset['train'])
    for i in range(0, num_train, chunk_size):
        start = i
        stop = min(i+chunk_size, num_train)
        chunk = dataset['train'][start:stop]
        print(len(chunk))

    print('done')


if __name__ == '__main__':
    main()
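Note that in the script above the Dataset.map call discards its result (map returns a new dataset rather than adding the column in place), and slicing a split yields a dict of columns, so len(chunk) counts columns rather than rows. A minimal sketch of keeping the computed lengths, assuming the same the_pile setup (variable names here are illustrative):

from datasets import load_dataset


def main():
    # the_pile is large; any text dataset with a "text" column behaves the same way
    dataset = load_dataset("the_pile")

    def calc_len(examples):
        # whitespace-split word counts per document
        return {"lengths": [len(t.split()) for t in examples["text"]]}

    # Dataset.map returns a new dataset; keep the result to retain the "lengths" column
    train = dataset["train"].map(calc_len, batched=True, batch_size=10000, num_proc=1)

    chunk_size = int(1e6)
    num_train = len(train)
    for start in range(0, num_train, chunk_size):
        stop = min(start + chunk_size, num_train)
        # slicing a Dataset gives a dict of columns; measure one column to get the row count
        chunk = train[start:stop]
        print(start, stop, len(chunk["lengths"]))

    print("done")


if __name__ == "__main__":
    main()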


@@ -14,18 +14,26 @@ from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForCausalLM
def main():
    checkpoint = "facebook/opt-1.3b" # "facebook/opt-350m" # "gpt2"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # "facebook/opt-1.3b", "facebook/opt-350m" # "gpt2"
    checkpoint = "facebook/opt-350m"
    model = AutoModel.from_pretrained(checkpoint)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
    # load wikitext-103 validation and extract features on it
    # using pipeline batching \
    # https://huggingface.co/docs/transformers/main_classes/pipelines
    dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
    # dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
    dataset = load_dataset("the_pile", split='train')
    # max_length is not getting passed through properly
    # for idx, out in enumerate(embedder(KeyDataset(dataset, "text"),
    #                                    batch_size=100,
    #                                    truncation=None,
    #                                    truncation=True,
    #                                    padding="max_length",
    #                                    max_length=50,
    #                                    num_workers=20)):
    #     print(f'idx: {idx}, shape: {torch.tensor(out).shape}')
@@ -36,11 +44,14 @@ def main():
        encoded_input = tokenizer(examples["text"],
                                  return_tensors='pt',
                                  padding=True,
                                  truncation=True)
                                  truncation=True,
                                  max_length=512).to(device)
        output = model(**encoded_input)
        return {"embedding" : output['last_hidden_state']}
        return {"embedding" : output['last_hidden_state'].detach().cpu().numpy()}
    dataset.map(featurize, batched=True, batch_size=1000)
    dataset = dataset.shard(21000, index=0)
    print(f"Sharded dataset length: {dataset.num_rows}")
    fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
    print('done')
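For reference, piecing the two hunks together, the featurizing configuration this commit lands on looks roughly like the sketch below. Everything outside the shown hunks (the imports, the featurize signature, the entry point) is assumed, and the torch.no_grad() guard is an addition the diff does not show.

import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer


def main():
    checkpoint = "facebook/opt-350m"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = AutoModel.from_pretrained(checkpoint)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    dataset = load_dataset("the_pile", split="train")

    def featurize(examples):
        encoded_input = tokenizer(examples["text"],
                                  return_tensors="pt",
                                  padding=True,
                                  truncation=True,
                                  max_length=512).to(device)
        with torch.no_grad():  # assumed; the diff does not show gradient handling
            output = model(**encoded_input)
        # move to CPU numpy so datasets can serialize the new column
        return {"embedding": output["last_hidden_state"].detach().cpu().numpy()}

    # keep 1/21000th of the train split so the featurization run stays small
    dataset = dataset.shard(21000, index=0)
    print(f"Sharded dataset length: {dataset.num_rows}")

    fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
    print("done")


if __name__ == "__main__":
    main()

The batch_size of 42 together with the 1/21000 shard keeps a single-GPU run small, which is presumably the "good configuration" the commit message refers to.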
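The commented-out block in the first hunk notes that max_length is not getting passed through when calling the feature-extraction pipeline directly. Newer transformers releases let the feature-extraction pipeline take a tokenize_kwargs argument for this; whether the version used here already supports it is an assumption, so the following is only a sketch of that alternative:

import torch
from datasets import load_dataset
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

# assumes a transformers version whose feature-extraction pipeline accepts tokenize_kwargs
embedder = pipeline(
    "feature-extraction",
    model="facebook/opt-350m",
    device=0 if torch.cuda.is_available() else -1,
    tokenize_kwargs={"truncation": True, "padding": "max_length", "max_length": 50},
)

dataset = load_dataset("wikitext", "wikitext-103-v1", split="validation")
for idx, out in enumerate(embedder(KeyDataset(dataset, "text"), batch_size=100)):
    print(f"idx: {idx}, shape: {torch.tensor(out).shape}")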