Mirror of https://github.com/microsoft/archai.git
Found good configuration for featurizing.
This commit is contained in:
Parent: ea9bf6cd47
Commit: 0dd8076014
@@ -1053,6 +1053,15 @@
             "args": [],
             "env": {"HF_DATASETS_OFFLINE": "0"}
         },
+        {
+            "name": "HF Explore Pile",
+            "type": "python",
+            "request": "launch",
+            "program": "${cwd}/scripts/misc/hf_explore_pile.py",
+            "console": "integratedTerminal",
+            "args": [],
+            "env": {"HF_DATASETS_OFFLINE": "0"}
+        },
         {
             "name": "CurrentFile",
             "type": "python",
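The keys in this hunk (`request`, `program`, `console`) match VS Code launch configurations, so it presumably edits .vscode/launch.json (the file name is not shown in this diff). HF_DATASETS_OFFLINE is the standard environment variable the `datasets` library reads at import time: "0" permits network access, "1" restricts loading to the local cache. A minimal sketch of how the flag behaves, not part of this commit:

    import os

    # `datasets` reads this variable when it is imported: "1" restricts
    # loading to the local cache, "0" (as configured above) allows downloads.
    print("offline:", os.environ.get("HF_DATASETS_OFFLINE", "0") == "1")

    from datasets import load_dataset

    # With offline mode off this call may download; with it on, the dataset
    # must already be in the local cache.
    dataset = load_dataset("wikitext", "wikitext-103-v1", split="validation")
    print(dataset.num_rows)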
@@ -0,0 +1,23 @@
+from datasets import load_dataset
+
+def main():
+
+    dataset = load_dataset("the_pile")
+
+    def calc_len(examples):
+        return {"lengths": [len(t.split()) for t in examples['text']]}
+
+    dataset.map(calc_len, batched=True, batch_size=10000, num_proc=1)
+
+    chunk_size = int(1e6)
+    num_train = len(dataset['train'])
+    for i in range(0, num_train, chunk_size):
+        start = i
+        stop = min(i+chunk_size, num_train)
+        chunk = dataset['train'][start:stop]
+        print(len(chunk))
+
+    print('done')
+
+if __name__ == '__main__':
+    main()
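This new file is presumably scripts/misc/hf_explore_pile.py, the program referenced by the launch configuration above. Two things worth noting: Dataset.map is not in-place, so the calc_len result is discarded unless reassigned, and slicing a Hugging Face Dataset (dataset['train'][start:stop]) returns a plain dict mapping column names to lists, so len(chunk) counts columns, not rows. A hedged sketch of a row-count fix, assuming The Pile's 'text' column:

    from datasets import load_dataset

    dataset = load_dataset("the_pile")

    chunk_size = int(1e6)
    num_train = len(dataset['train'])
    for start in range(0, num_train, chunk_size):
        stop = min(start + chunk_size, num_train)
        chunk = dataset['train'][start:stop]
        # A slice is a dict of column -> list; count rows via one column.
        print(len(chunk['text']))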
@@ -14,18 +14,26 @@ from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForCausalLM
 def main():
 
-    checkpoint = "facebook/opt-1.3b" # "facebook/opt-350m" # "gpt2"
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
+    # "facebook/opt-1.3b", "facebook/opt-350m" # "gpt2"
+    checkpoint = "facebook/opt-350m"
     model = AutoModel.from_pretrained(checkpoint)
     model.to(device)
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
 
     # load wikitext-103 validation and extract features on it
     # using pipeline batching
     # https://huggingface.co/docs/transformers/main_classes/pipelines
-    dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
+    # dataset = load_dataset("wikitext", 'wikitext-103-v1', split='validation')
+    dataset = load_dataset("the_pile", split='train')
+    # max_length is not getting passed through properly
     # for idx, out in enumerate(embedder(KeyDataset(dataset, "text"),
     #                                    batch_size=100,
-    #                                    truncation=None,
+    #                                    truncation=True,
     #                                    padding="max_length",
     #                                    max_length=50,
     #                                    num_workers=20)):
     #     print(f'idx: {idx}, shape: {torch.tensor(out).shape}')
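The commented-out loop records that max_length does not get passed through the pipeline call properly. One possible workaround, in transformers releases recent enough to support it, is passing tokenize_kwargs when constructing the feature-extraction pipeline; a hedged sketch, not part of this commit, so verify against your installed transformers version:

    from transformers import pipeline

    # tokenize_kwargs is forwarded to the tokenizer on every call, so
    # truncation/padding/max_length are applied consistently.
    embedder = pipeline(
        "feature-extraction",
        model="facebook/opt-350m",
        tokenize_kwargs={"truncation": True,
                         "padding": "max_length",
                         "max_length": 50},
    )

    out = embedder("hello world")
    # out is a nested list: [batch][tokens][hidden]; tokens == 50 after padding.
    print(len(out[0]))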
@@ -36,11 +44,14 @@ def main():
         encoded_input = tokenizer(examples["text"],
                                   return_tensors='pt',
                                   padding=True,
-                                  truncation=True)
+                                  truncation=True,
+                                  max_length=512).to(device)
         output = model(**encoded_input)
-        return {"embedding" : output['last_hidden_state']}
+        return {"embedding" : output['last_hidden_state'].detach().cpu().numpy()}
 
-    dataset.map(featurize, batched=True, batch_size=1000)
+    dataset = dataset.shard(21000, index=0)
+    print(f"Sharded dataset length: {dataset.num_rows}")
+    fdataset = dataset.map(featurize, batched=True, batch_size=42, num_proc=1)
 
     print('done')
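dataset.shard(21000, index=0) keeps only the first of 21000 near-equal slices, i.e. roughly num_rows / 21000 examples, which is why a small batch_size=42 map over it is cheap. Since featurize only runs inference, wrapping the forward pass in torch.no_grad() would skip autograd bookkeeping entirely and make the .detach() unnecessary; a hedged sketch (an assumption about intent, not the commit's own code) reusing the tokenizer, model, and device from the surrounding script:

    import torch

    def featurize(examples):
        # Tokenize a batch of texts; identifiers mirror the commit above.
        encoded_input = tokenizer(examples["text"],
                                  return_tensors='pt',
                                  padding=True,
                                  truncation=True,
                                  max_length=512).to(device)
        # no_grad() disables gradient tracking for the forward pass,
        # lowering peak memory; .detach() is then redundant.
        with torch.no_grad():
            output = model(**encoded_input)
        return {"embedding": output['last_hidden_state'].cpu().numpy()}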