# chat.py: interactive chat example built on the Xinference client (forked from xorbitsai/inference)
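"""Interactive command-line chat against a model served by Xinference.

Example invocation (the endpoint, model name, size, format, and quantization
below are illustrative assumptions; substitute whatever your Xinference server
actually provides):

    python chat.py --endpoint http://127.0.0.1:9997 \
        --model_name llama-2-chat \
        --model_size_in_billions 7 \
        --model_format ggmlv3 \
        --quantization q4_0
"""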
from typing import List

from xinference.client import Client
from xinference.types import ChatCompletionMessage

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--endpoint", type=str, required=True, help="Xinference endpoint, required"
    )
    parser.add_argument(
        "--model_name", type=str, required=True, help="Name of the model, required"
    )
    parser.add_argument(
        "--model_size_in_billions",
        type=int,
        required=False,
        help="Size of the model in billions",
    )
    parser.add_argument(
        "--model_format",
        type=str,
        required=False,
        help="Format of the model",
    )
    parser.add_argument("--quantization", type=str, required=False, help="Quantization")
    args = parser.parse_args()

    endpoint = args.endpoint
    model_name = args.model_name
    model_size_in_billions = args.model_size_in_billions
    model_format = args.model_format
    quantization = args.quantization

    print(f"Xinference endpoint: {endpoint}")
    print(f"Model Name: {model_name}")
    print(f"Model Size (in billions): {model_size_in_billions}")
    print(f"Model Format: {model_format}")
    print(f"Quantization: {quantization}")

    # Connect to the Xinference server and launch the requested model.
    client = Client(endpoint)
    model_uid = client.launch_model(
        model_name,
        model_size_in_billions=model_size_in_billions,
        model_format=model_format,
        quantization=quantization,
        n_ctx=2048,
    )
    model = client.get_model(model_uid)

    # Interactive chat loop: the accumulated history is sent with every turn,
    # and the latest user/assistant exchange is appended after each response.
    chat_history: List["ChatCompletionMessage"] = []
    while True:
        prompt = input("you: ")
        completion = model.chat(
            prompt=prompt,
            chat_history=chat_history,
            generate_config={"max_tokens": 1024},
        )
        content = completion["choices"][0]["message"]["content"]
        print(f"{model_name}: {content}")
        chat_history.append(ChatCompletionMessage(role="user", content=prompt))
        chat_history.append(ChatCompletionMessage(role="assistant", content=content))