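"""Gradio demo for A/B-testing two LLM backends.

Loads a (possibly different) model on each of two text-generation-webui
instances, then sends the same prompt to both and shows the completions
side by side.
"""
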
import argparse
import asyncio
import os

import aiohttp
import gradio as gr
import requests

HOST_A = 'gd17-llm-002-text-generation-webui-api-a.vip.vip.com'
HOST_B = 'gd17-llm-002-text-generation-webui-api-b.vip.vip.com'
URI_A = f'http://{HOST_A}/api/v1/generate'
URI_B = f'http://{HOST_B}/api/v1/generate'
MODEL_A = f'http://{HOST_A}/api/v1/model'
MODEL_B = f'http://{HOST_B}/api/v1/model'
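
# Both backends expose text-generation-webui's blocking API:
#   POST /api/v1/generate returns {'results': [{'text': ...}]}
#   POST /api/v1/model with {'action': 'load', 'model_name': ...} loads a model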
async def make_request(uri, request):
    """POST a generation request and return the generated text, or an error string."""
    async with aiohttp.ClientSession() as session:
        async with session.post(uri, json=request) as response:
            if response.status == 200:
                return (await response.json())['results'][0]['text']
            return f"Error: request to {uri} failed with status {response.status}"

async def run_async(prompt):
    """Send the same prompt to both backends concurrently and return both completions."""
    request = {
        'prompt': prompt,
        'max_new_tokens': 1000,
    }
    # make_request opens its own session, so no shared session is needed here.
    results = await asyncio.gather(
        make_request(URI_A, request),
        make_request(URI_B, request),
    )
    for result in results:
        print(prompt + result)
    return results[0], results[1]

def load_model(uri, model, progress):
    """Ask a backend to load the given model; return the model name, or an error string."""
    progress(0.5, desc="Loading")
    request = {
        'action': "load",
        'model_name': model,
    }
    response = requests.post(uri, json=request)
    if response.status_code == 200:
        progress(1, desc="Done")
        return model
    return f"Error: failed to load {model} (status {response.status_code})"

def upload_a(model, progress=gr.Progress()):
    return load_model(MODEL_A, model, progress)

def upload_b(model, progress=gr.Progress()):
    return load_model(MODEL_B, model, progress)

def run(prompt):
    """Synchronous variant of run_async; kept as a reference for the full parameter set."""
    request = {
        'prompt': prompt,
        'max_new_tokens': 1000,
        # Generation params. If 'preset' is set to anything other than 'None',
        # the values in presets/preset-name.yaml are used instead of the
        # individual numbers below.
        'preset': 'None',
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'epsilon_cutoff': 0,  # In units of 1e-4
        'eta_cutoff': 0,  # In units of 1e-4
        'tfs': 1,
        'top_a': 0,
        'repetition_penalty': 1.18,
        'repetition_penalty_range': 0,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'mirostat_mode': 0,
        'mirostat_tau': 5,
        'mirostat_eta': 0.1,
        'seed': -1,
        'add_bos_token': True,
        'truncation_length': 2048,
        'ban_eos_token': False,
        'skip_special_tokens': True,
        'stopping_strings': []
    }
    response_a = requests.post(URI_A, json=request)
    response_b = requests.post(URI_B, json=request)
    if response_a.status_code == 200 and response_b.status_code == 200:
        result_a = response_a.json()['results'][0]['text']
        print(prompt + result_a)
        result_b = response_b.json()['results'][0]['text']
        print(prompt + result_b)
        return result_a, result_b
    return "Error: Failed to make request", "Error: Failed to make request"

default_model_options = ["Baichuan-13B-Chat", "BelleGroup_BELLE-7B-2M", "Chinese-LLaMA-13B", "Chinese-LLaMA-7B", "FreedomIntelligence_phoenix-inst-chat-7b", "Merged-BelleGroup_BELLE-LLaMA-7B-0.6M-enc", "Merged-BelleGroup_BELLE-LLaMA-EXT-13B", "Merged-chitanda-llama-panda-zh-coig-7b-delta", "Ziya-LLaMA-13B-v1.1", "Ziya-LLaMA-7B-Reward", "baichuan-13b-0719", "baichuan-13b-sft-0726v1", "baichuan-llama-13b-0-0", "baichuan-model_0_0", "baichuan_model_0_0", "bigscience_bloom-7b1", "bigscience_bloomz-7b1-mt", "bloom-7b1", "config-user.yam", "cpm-bee-10b", "facebook_llama-13b-hf", "facebook_llama-7b-hf", "falcon-7b", "llama-2-13b", "llama-2-13b-chat", "llama-2-7b", "llama-2-7b-chat", "model_0_7", "model_1_7", "opt-125m", "starchat-beta", "vip-models", "wh_model_07120_0", "ziya-llama-13b-alpaca-sft", "ziya-llama-13b-alpaca-sft-padding-vocab-size", "ziya-llama-13b-ppo-merge", "ziya-llama-13b-pretrain-merge", "ziya-llama-ppo-theme-v2"]
default_title = "# LLM A/B Inference\n## Usage notes\n1. Loading a model and running inference may take about 1-2 minutes; please be patient\n2. Wait until both model A and model B have finished loading before running inference\n3. Models must be reloaded after the page is refreshed\n4. There is only one inference backend, so concurrent users will interfere with each other's model loading and queued inference requests"
model_options = os.getenv("MODEL_OPTIONS", ",".join(default_model_options)).split(",")
title = os.getenv("TITLE", default_title)

def build_demo():
    with gr.Blocks() as demo:
        gr.Markdown(title)
        with gr.Row():
            with gr.Column():
                dd_a = gr.Dropdown(model_options, label="Model A", info="Model to load")
                btn_a = gr.Button(value="Select model A, then click to load")
                output_a = gr.Textbox(label="Model A load result")
                btn_a.click(upload_a, inputs=dd_a, outputs=output_a)
            with gr.Column():
                dd_b = gr.Dropdown(model_options, label="Model B", info="Model to load")
                btn_b = gr.Button(value="Select model B, then click to load")
                output_b = gr.Textbox(label="Model B load result")
                btn_b.click(upload_b, inputs=dd_b, outputs=output_b)
        inputbox = gr.Textbox(label="Enter a question and press ENTER", placeholder="Enter text and press ENTER")
        outputbox_a = gr.Textbox(label="Model A answer", placeholder="Generated result from the model")
        outputbox_b = gr.Textbox(label="Model B answer", placeholder="Generated result from the model")
        inputbox.submit(run_async, [inputbox], [outputbox_a, outputbox_b])
    return demo

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8001)
    args = parser.parse_args()
    demo = build_demo()
    demo.queue(concurrency_count=100).launch(server_name=args.host, server_port=args.port, share=False)
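
# Example invocation (file name is hypothetical):
#   python ab_inference.py --host 0.0.0.0 --port 8001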