# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team
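# Benchmark for a Hugging Face text-generation pipeline with optional DeepSpeed
# inference. A typical launch (script name and model are placeholders; adjust to
# your setup) might look like:
#
#   deepspeed --num_gpus 2 <this_script>.py --model facebook/opt-1.3b --deepspeed --dtype fp16 --kernel-inject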
import os
import torch
import time
import deepspeed
import argparse
from transformers import pipeline
from deepspeed.accelerator import get_accelerator
parser = argparse.ArgumentParser()
parser.add_argument("--model", "-m", type=str, help="hf model name")
parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference")
parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "fp32", "int8"], help="int8, fp16, or fp32")
parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on")
parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on")
parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens")
parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world size")
parser.add_argument("--trials", type=int, default=30, help="number of trials")
args = parser.parse_args()

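# Helper: drops the first `warmup` measurements, then reports the average and
# P50/P90/P95/P99/P999 latencies (in ms) over the remaining sorted samples.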
def print_latency(latency_set, title, warmup=3):
    # trim warmup queries
    latency_set = list(latency_set)
    latency_set = latency_set[warmup:]
    count = len(latency_set)
    if count > 0:
        latency_set.sort()
        n50 = (count - 1) * 0.5 + 1
        n90 = (count - 1) * 0.9 + 1
        n95 = (count - 1) * 0.95 + 1
        n99 = (count - 1) * 0.99 + 1
        n999 = (count - 1) * 0.999 + 1

        avg = sum(latency_set) / count
        p50 = latency_set[int(n50) - 1]
        p90 = latency_set[int(n90) - 1]
        p95 = latency_set[int(n95) - 1]
        p99 = latency_set[int(n99) - 1]
        p999 = latency_set[int(n999) - 1]

        print(f"====== latency stats {title} ======")
        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
        print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
        print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
        print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
        print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
        print("\tP999 Latency: {0:8.2f} ms".format(p999 * 1000))

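# Initialize the distributed backend (typically NCCL on CUDA devices) so each
# launched rank can take part in tensor-parallel inference; rank 0 prints the
# benchmark settings.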
deepspeed.init_distributed()

if args.local_rank == 0:
    print("BENCHMARK SETTINGS:")
    print(f"\tMODEL: {args.model}")
    print(f"\tMAX_TOKENS: {args.max_tokens}")
    print(f"\tDTYPE: {args.dtype}")
    print(f"\tCUDA_GRAPHS: {args.graphs}")
    print(f"\tKERNEL_INJECT: {args.kernel_inject}")

if args.dtype == "int8":
    dtype = torch.int8
elif args.dtype == "fp16":
    dtype = torch.float16
else:
    dtype = torch.float32

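# Build the Hugging Face pipeline on this rank's device. When --deepspeed is set,
# deepspeed.init_inference wraps the model for tensor-parallel execution across
# `world_size` GPUs, optionally injecting optimized kernels and enabling CUDA
# graphs; profile_model_time() turns on per-forward timing that model_times()
# reads back in the benchmark loop below.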
pipe = pipeline("text-generation", model=args.model, framework="pt", device=args.local_rank)

if dtype == torch.float16:
    pipe.model.half()

if args.deepspeed:
    pipe.model = deepspeed.init_inference(
        pipe.model,
        dtype=dtype,
        tensor_parallel={"tp_size": args.world_size},
        replace_with_kernel_inject=args.kernel_inject,
        enable_cuda_graph=args.graphs,
    )
    pipe.model.profile_model_time()

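# Benchmark loop: synchronize the accelerator before and after each generation so
# the wall-clock measurement includes all asynchronous GPU work. The first few
# trials are treated as warmup and dropped later by print_latency().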
responses = []
times = []
mtimes = []
for i in range(args.trials):
    get_accelerator().synchronize()
    start = time.time()
    r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
    get_accelerator().synchronize()
    end = time.time()
    responses.append(r)
    times.append(end - start)  # / (args.max_tokens - 3))
    if args.deepspeed:
        mtimes.append(sum(pipe.model.model_times()))

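# Rank 0 reports end-to-end latency, DeepSpeed's model-only (forward-pass)
# latency, and an approximate per-token latency, then prints one sample response.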
if args.local_rank == 0:
    print_latency(times, "(e2e) latency")
    if args.deepspeed:
        print_latency(mtimes, "(model-only) latency")
    print_latency(map(lambda t: t / (args.max_tokens - 3), times), "(e2e) per token latency")
    print(f"RESPONSE 0:")
    print("-" * 30)
    print(responses[0][0]["generated_text"])
    print("-" * 30)