Overview

RunPod is a GPU cloud provider that rents out GPUs in the form of Pods. Its nodes are located outside mainland China, so pulling models from Hugging Face and installing other dependencies is fast, and it offers a wide range of GPU models, which makes it a good fit for short-term testing.
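Once a pod is up, it is worth confirming that the GPU you rented is actually visible before running anything heavier. A minimal sanity check, assuming one of RunPod's PyTorch images where torch is preinstalled:

```python
import torch

# Confirm the GPU advertised by the pod type is visible to PyTorch.
print(torch.cuda.is_available())      # expect True
print(torch.cuda.get_device_name(0))  # should match the rented GPU model
print(torch.cuda.get_device_properties(0).total_memory / 1024**3, "GiB")
```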
Test script

The following script downloads facebook/opt-6.7b, loads it in float16 on a single GPU, runs 30 generate calls on a batch of two prompts, and prints latency percentiles:
```bash
pip install transformers
export TRANSFORMERS_CACHE=/workspace
cat > check.py <<"EOF"
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def print_latency(latency_set, title, warmup=3):
    # Trim warmup queries before computing statistics.
    latency_set = list(latency_set)
    latency_set = latency_set[warmup:]
    count = len(latency_set)
    if count > 0:
        latency_set.sort()
        # Nearest-rank indices into the sorted latency list.
        n50 = (count - 1) * 0.5 + 1
        n90 = (count - 1) * 0.9 + 1
        n95 = (count - 1) * 0.95 + 1
        n99 = (count - 1) * 0.99 + 1
        n999 = (count - 1) * 0.999 + 1
        avg = sum(latency_set) / count
        p50 = latency_set[int(n50) - 1]
        p90 = latency_set[int(n90) - 1]
        p95 = latency_set[int(n95) - 1]
        p99 = latency_set[int(n99) - 1]
        p999 = latency_set[int(n999) - 1]
        print(f"====== latency stats {title} ======")
        print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000))
        print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000))
        print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000))
        print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000))
        print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000))
        print("\tP999 Latency: {0:8.2f} ms".format(p999 * 1000))


device = torch.device("cuda:0")
print("start")

# Time the model download/load separately from inference.
t1 = time.time()
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b")
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b", torch_dtype=torch.float16
)
model = model.to(device)
t2 = time.time()
print("load model, using time: ", t2 - t1, " s")
model.eval()

# Batch of two identical prompts; equal lengths, so no padding is needed.
input_text = "The quick brown fox is"
input_texts = [input_text, input_text]
inputs = tokenizer(input_texts, return_tensors="pt").to(device)

time_list = []
for _ in range(30):
    t3 = time.time()
    output = model.generate(**inputs, max_length=200, num_return_sequences=1)
    torch.cuda.synchronize()  # ensure GPU work has finished before stopping the clock
    t4 = time.time()
    time_list.append(t4 - t3)
    # generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    # print("Generated text: ", generated_text)

print_latency(time_list, "predict_time")
EOF
python check.py
```
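Note that the wall-clock latencies above mix prompt processing with token-by-token decoding, so runs that happen to generate different output lengths are not directly comparable. A rough per-token throughput number can be derived from the same model and inputs; the sketch below is illustrative, and measure_throughput is a hypothetical helper rather than part of the script above:

```python
import time

import torch


def measure_throughput(model, inputs, max_new_tokens=128):
    """Hypothetical helper: new tokens generated per second, summed over the batch."""
    prompt_len = inputs["input_ids"].shape[1]
    t0 = time.time()
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    torch.cuda.synchronize()  # wait for GPU work before reading the clock
    elapsed = time.time() - t0
    # Count only tokens produced beyond the prompt, across the whole batch.
    new_tokens = (output.shape[1] - prompt_len) * output.shape[0]
    return new_tokens / elapsed
```

For example, appending `print(measure_throughput(model, inputs))` to the end of check.py would report decode throughput alongside the latency percentiles.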
References

- RunPod site