OLive/examples/llama2/llama2_tensor_parallel.json

39 lines
1.2 KiB
JSON

{
  "input_model": {
    "type": "HfModel",
    "model_path": "meta-llama/Llama-2-7b-hf"
  },
  "systems": {
    "local_system": {
      "type": "LocalSystem",
      "accelerators": [
        {
          "device": "gpu",
          "execution_providers": [
            "CUDAExecutionProvider"
          ]
        }
      ]
    }
  },
  "passes": {
    "tensor_parallel": {
      "type": "PyTorchTensorParallel",
      "user_script": "llama2_tensor_parallel.py",
      "class_name": "LlamaPyTorchTensorParallel",
      "world_size": 4
    },
    "conversion": {
      "type": "OnnxConversion",
      "target_opset": 17,
      "save_as_external_data": true,
      "all_tensors_to_one_file": true
    },
    "transformers_optimization_fp16": {
      "type": "OrtTransformersOptimization",
      "save_as_external_data": true,
      "all_tensors_to_one_file": true,
      "model_type": "gpt2",
      "opt_level": 0,
      "only_onnxruntime": false,
      "keep_io_types": false,
      "float16": true,
      "use_gqa": true
    }
  },
  "host": "local_system",
  "target": "local_system",
  "cache_dir": "cache",
  "output_dir": "models/tensor_parallel"
}