Added script to play with HF featurization.

This commit is contained in:
Debadeepta Dey 2022-07-12 05:40:12 -07:00 committed by Gustavo Rosa
Parent 3072394d65
Commit 2e4d69c8b3
2 changed files: 53 additions and 0 deletions

.vscode/launch.json

@@ -1044,6 +1044,15 @@
"args": ["--list-of-npys", "/home/dedey/archai_experiment_reports/transnasbench_heatmaps/list.txt",
"--out-dir", "/home/dedey/archai_experiment_reports/transnasbench_heatmaps"]
},
{
"name": "HF Featurize",
"type": "python",
"request": "launch",
"program": "${cwd}/scripts/misc/hf_featurize.py",
"console": "integratedTerminal",
"args": [],
"env": {"HF_DATASETS_OFFLINE": "0"}
},
{
"name": "CurrentFile",
"type": "python",

scripts/misc/hf_featurize.py
@@ -0,0 +1,44 @@
"""
Script to play with featurizing a large dataset like The Pile, with the
motivation to use downstream data summarization techniques on it.
"""
from datasets import load_dataset
def main():
# load wikitext-103
dataset = load_dataset("wikitext", 'wikitext-103-v1')
# grab checkpoint and tokenizer
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
    # tokenization method 1
    # mimics https://huggingface.co/course/chapter3/2?fw=pt
    # this is bad because it doesn't return a Dataset but a plain
    # dictionary keyed by 'input_ids', and it is slow
    # tokenized_dataset = tokenizer(dataset['train']['text'])
    # tokenization method 2
    # mimics https://huggingface.co/course/chapter3/2?fw=pt
    # and is the recommended way of tokenizing a dataset,
    # as it returns an actual dataset with the tokenized columns added
    def tokenize_function(examples):
        return tokenizer(examples["text"])

    # note: map() takes no 'split' argument, so select the split first
    tokenized_dataset = dataset['validation'].map(tokenize_function, batched=True,
                                                  batch_size=20000, num_proc=48)
    tokenized_dataset.save_to_disk('tokenized-wt103-v1')
    print("done")


if __name__ == '__main__':
    main()
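The script stops at tokenization, while the docstring's stated goal is featurization for downstream data summarization. A minimal sketch (not part of this commit; all names are illustrative) of one way to turn the saved split into fixed-size feature vectors by mean-pooling GPT-2's last hidden states:

import torch
from datasets import load_from_disk
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
model = GPT2Model.from_pretrained('gpt2').eval()

# path written by hf_featurize.py above; map() keeps the raw 'text' column
dataset = load_from_disk('tokenized-wt103-v1')

@torch.no_grad()
def featurize(texts):
    # re-tokenize with padding/truncation so examples batch cleanly
    enc = tokenizer(texts, return_tensors='pt', padding=True,
                    truncation=True, max_length=512)
    hidden = model(**enc).last_hidden_state  # (batch, seq_len, 768)
    # pad positions are included in the mean for simplicity
    return hidden.mean(dim=1)                # one 768-d vector per example

features = featurize(dataset['text'][:8])
print(features.shape)  # torch.Size([8, 768])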