Use Case

When you use ChatGPT, the model's reply pops out word by word instead of appearing as one complete block of text, producing a typewriter-like effect. This effect is really a by-product of dressing up the model's long prediction time, much like Apple's Dynamic Island: because the model has to keep predicting what to say next, completing the full reply takes quite a while.

[Figure: stream2.jpg]

If the page waited for the whole reply to finish generating before showing anything, the user experience would be poor. It is better to push every few predicted words to the page in real time, easing the user's anxiety while waiting.

Implementation

This effect is implemented with Server-Sent Events (SSE), a server-push technique built on the HTTP protocol that allows the server to send data and messages to the client. Unlike WebSocket, SSE is one-way: only the server can push messages to the client. SSE is part of the HTML5 specification and is very simple to use; it consists of the server-to-browser communication protocol (plain HTTP) plus the browser-side EventSource interface for handling server-sent events, and the server's response uses the content type "text/event-stream".
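For reference, a strict SSE endpoint frames each message as a "data:" line followed by a blank line. The minimal Flask sketch below (the route name and payload are made up for illustration) shows that framing; the demo later in this post uses a simpler line-delimited JSON stream instead:

import time
from flask import Flask, Response

app = Flask(__name__)

@app.route('/sse-demo')
def sse_demo():
    def event_stream():
        for i in range(5):
            # Each SSE event is a "data:" payload terminated by a blank line
            yield f"data: message {i}\n\n"
            time.sleep(1)
    # EventSource in the browser expects this content type
    return Response(event_stream(), mimetype='text/event-stream')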

Python Flask Implementation

Flask has no dedicated response object for streaming output. The way to implement a streaming response in Flask is to pass a generator to the Response object. (Strictly speaking, the demo below returns line-delimited JSON over a chunked HTTP response rather than SSE-framed events, which is why its content type is not "text/event-stream".)

import json
import time
from flask import Blueprint, Response, request

gpt_bp = Blueprint('gpt', __name__)


def chat_demo(prompt):
    # Simulate a model emitting the reply a few tokens at a time
    data = ["**ab**", "```cd```", "ef", "gh"]
    last_s = ''
    for d in data:
        s = last_s + d
        # Each chunk carries the full text accumulated so far, one JSON object per line
        message = json.dumps(dict(
            role=None,
            id=1,
            parentMessageId=2,
            conversationId=3,
            text=s,
        ))
        last_s = s
        time.sleep(1)
        yield message + '\n'


@gpt_bp.route('/chat', methods=['POST'], endpoint='chat')
def chat_service():
    prompt = request.json['prompt']

    rsp = Response(chat_demo(prompt))
    rsp.headers['Content-Type'] = "application/octet-stream"
    rsp.headers['Cache-Control'] = "no-cache"
    return rsp

The yield keyword makes chat_demo a generator. The Response object's Content-Type is changed to a type appropriate for a streamed response, and Cache-Control is set to "no-cache" to keep the browser from buffering the streamed response instead of rendering it as it arrives.
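Before hooking up the frontend, you can sanity-check the endpoint from a short script. This is just a sketch under assumptions: it uses the requests library with stream=True against a hypothetical local address, and prints the text field of each JSON line as it arrives:

import json
import requests

# Hypothetical address; adjust to wherever the Flask app is running
url = 'http://127.0.0.1:5000/chat'

with requests.post(url, json={'prompt': 'hello'}, stream=True) as rsp:
    # iter_lines yields each newline-delimited chunk as the server flushes it
    for line in rsp.iter_lines():
        if line:
            print(json.loads(line)['text'])

Each printed line should contain a longer prefix of the reply than the previous one, mirroring the typewriter effect.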

The corresponding Vue code is given below for reference:

<script setup lang="ts">
async function onConversation() {
  let message = prompt.value

  if (loading.value)
    return

  if (!message || message.trim() === '')
    return

  controller = new AbortController()

  // Append the user's message to the chat list
  addChat(
    +uuid,
    {
      dateTime: new Date().toLocaleString(),
      text: message,
      inversion: true,
      error: false,
      conversationOptions: null,
      requestOptions: { prompt: message, options: null },
    },
  )
  scrollToBottom()

  loading.value = true
  prompt.value = ''

  let options: Chat.ConversationRequest = {}
  const lastContext = conversationList.value[conversationList.value.length - 1]?.conversationOptions

  if (lastContext && usingContext.value)
    options = { ...lastContext }

  // Append a placeholder assistant message that will be updated as chunks arrive
  addChat(
    +uuid,
    {
      dateTime: new Date().toLocaleString(),
      text: t('chat.thinking'),
      loading: true,
      inversion: false,
      error: false,
      conversationOptions: null,
      requestOptions: { prompt: message, options: { ...options } },
    },
  )
  scrollToBottom()

  try {
    let lastText = ''
    const fetchChatAPIOnce = async () => {
      await ChatAPI<Chat.ConversationResponse>({
        prompt: message,
        options,
        signal: controller.signal,
        onDownloadProgress: ({ event }) => {
          const xhr = event.target
          const { responseText } = xhr
          // responseText accumulates all chunks received so far;
          // always process only the final complete line
          const lastIndex = responseText.lastIndexOf('\n', responseText.length - 2)
          let chunk = responseText
          if (lastIndex !== -1)
            chunk = responseText.substring(lastIndex)
          try {
            const data = JSON.parse(chunk)
            updateChat(
              +uuid,
              dataSources.value.length - 1,
              {
                dateTime: new Date().toLocaleString(),
                text: lastText + (data.text ?? ''),
                inversion: false,
                error: false,
                loading: true,
                conversationOptions: { conversationId: data.conversationId, parentMessageId: data.id },
                requestOptions: { prompt: message, options: { ...options } },
              },
            )

            // If the reply was cut off by the length limit, ask the model to continue
            if (openLongReply && data.detail.choices[0].finish_reason === 'length') {
              options.parentMessageId = data.id
              lastText = data.text
              message = ''
              return fetchChatAPIOnce()
            }

            scrollToBottomIfAtBottom()
          }
          catch (error) {
            // The last line may be an incomplete JSON fragment; ignore it and wait for more data
          }
        },
      })
      updateChatSome(+uuid, dataSources.value.length - 1, { loading: false })
    }

    await fetchChatAPIOnce()
  }
  catch (error: any) {
    const errorMessage = error?.message ?? t('common.wrong')

    if (error.message === 'canceled') {
      updateChatSome(
        +uuid,
        dataSources.value.length - 1,
        {
          loading: false,
        },
      )
      scrollToBottomIfAtBottom()
      return
    }

    const currentChat = getChatByUuidAndIndex(+uuid, dataSources.value.length - 1)

    if (currentChat?.text && currentChat.text !== '') {
      updateChatSome(
        +uuid,
        dataSources.value.length - 1,
        {
          text: `${currentChat.text}\n[${errorMessage}]`,
          error: false,
          loading: false,
        },
      )
      return
    }

    updateChat(
      +uuid,
      dataSources.value.length - 1,
      {
        dateTime: new Date().toLocaleString(),
        text: errorMessage,
        inversion: false,
        error: true,
        loading: false,
        conversationOptions: null,
        requestOptions: { prompt: message, options: { ...options } },
      },
    )
    scrollToBottomIfAtBottom()
  }
  finally {
    loading.value = false
  }
}
</script>