From 0dd8076014c19c73de7bcd15a64e3db8d65a7586 Mon Sep 17 00:00:00 2001
From: Debadeepta Dey
Date: Fri, 15 Jul 2022 08:25:28 -0700
Subject: [PATCH] Found good configuration for featurizing.

---
 .vscode/launch.json             |  9 +++++++++
 scripts/misc/hf_explore_pile.py | 23 +++++++++++++++++++++++
 scripts/misc/hf_featurize.py    | 23 +++++++++++++++++------
 3 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 scripts/misc/hf_explore_pile.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
index e5f4ac26..ec9c4dd6 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1053,6 +1053,15 @@
             "args": [],
             "env": {"HF_DATASETS_OFFLINE": "0"}
         },
+        {
+            "name": "HF Explore Pile",
+            "type": "python",
+            "request": "launch",
+            "program": "${cwd}/scripts/misc/hf_explore_pile.py",
+            "console": "integratedTerminal",
+            "args": [],
+            "env": {"HF_DATASETS_OFFLINE": "0"}
+        },
         {
             "name": "CurrentFile",
             "type": "python",
diff --git a/scripts/misc/hf_explore_pile.py b/scripts/misc/hf_explore_pile.py
new file mode 100644
index 00000000..de454d94
--- /dev/null
+++ b/scripts/misc/hf_explore_pile.py
@@ -0,0 +1,23 @@
+from datasets import load_dataset
+
+def main():
+
+    dataset = load_dataset("the_pile")
+
+    def calc_len(examples):
+        return {"lengths": [len(t.split()) for t in examples['text']]}
+
+    dataset.map(calc_len, batched=True, batch_size=10000, num_proc=1)
+
+    chunk_size = int(1e6)
+    num_train = len(dataset['train'])
+    for i in range(0, num_train, chunk_size):
+        start = i
+        stop = min(i+chunk_size, num_train)
+        chunk = dataset['train'][start:stop]
+        print(len(chunk))
+
+    print('done')
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/misc/hf_featurize.py b/scripts/misc/hf_featurize.py
index cebde526..be1597eb 100644
--- a/scripts/misc/hf_featurize.py
+++ b/scripts/misc/hf_featurize.py
@@ -14,18 +14,26 @@ from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForCausalL
 
 def main():
 
-    checkpoint = "facebook/opt-1.3b" # "facebook/opt-350m" # "gpt2"
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+    # "facebook/opt-1.3b", "facebook/opt-350m" # "gpt2"
+    checkpoint = "facebook/opt-350m"
     model = AutoModel.from_pretrained(checkpoint)
+    model.to(device)
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
 
     # load wikitext-103 validation and extract features on it
     # using pipeline batching \
     # https://huggingface.co/docs/transformers/main_classes/pipelines
-    dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
+    # dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
+    dataset = load_dataset("the_pile", split='train')
+
     # max_length is not getting passed through properly
     # for idx, out in enumerate(embedder(KeyDataset(dataset, "text"),
     #                                    batch_size=100,
-    #                                    truncation=None,
+    #                                    truncation=True,
+    #                                    padding="max_length",
+    #                                    max_length=50,
     #                                    num_workers=20)):
     #     print(f'idx: {idx}, shape: {torch.tensor(out).shape}')
@@ -36,11 +44,14 @@ def main():
         encoded_input = tokenizer(examples["text"],
                                   return_tensors='pt',
                                   padding=True,
-                                  truncation=True)
+                                  truncation=True,
+                                  max_length=512).to(device)
         output = model(**encoded_input)
-        return {"embedding" : output['last_hidden_state']}
+        return {"embedding" : output['last_hidden_state'].detach().cpu().numpy()}
 
-    dataset.map(featurize, batched=True, batch_size=1000)
+    dataset = dataset.shard(21000, index=0)
+    print(f"Sharded dataset length: {dataset.num_rows}")
+    fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
 
     print('done')
 
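A minimal, self-contained sketch of the featurization pattern the hunks above converge on (tokenizer truncation with max_length=512, a GPU-resident OPT-350m encoder, embeddings moved back to CPU/NumPy, and a sharded dataset.map() pass). The checkpoint, max_length, and batch_size values are taken from the patch; the wikitext validation split, the shard count of 100, and the save_to_disk() output path are illustrative assumptions, not part of this commit.

import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
checkpoint = "facebook/opt-350m"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).to(device)
model.eval()

def featurize(examples):
    # Tokenize with an explicit max_length so long documents cannot overflow the model.
    encoded = tokenizer(examples["text"],
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        max_length=512).to(device)
    with torch.no_grad():
        output = model(**encoded)
    # Move hidden states to CPU NumPy so the datasets library can serialize them.
    return {"embedding": output["last_hidden_state"].cpu().numpy()}

# Assumed smoke-test setup: a small shard of wikitext-103 validation instead of the full Pile.
dataset = load_dataset("wikitext", "wikitext-103-v1", split="validation")
dataset = dataset.shard(num_shards=100, index=0)
fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
fdataset.save_to_disk("opt350m_features")  # assumed output location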