# llm.py
import json
from typing import Optional

import requests
from openai import OpenAI, AsyncOpenAI

from config import model_name_vllm_url_dict
from utils.get_logger import setup_logger
  6. logger = setup_logger(__name__)
  7. class VllmApi():
  8. def __init__(self, chat_json):
  9. openai_api_key = "sk-72f9d0e5bc894e1d828d73bdcc50ff0a"
  10. model = chat_json.get("model")
  11. # model = "Qwen3-Coder-30B-loft"
  12. vllm_url = model_name_vllm_url_dict.get(model)
  13. openai_api_base = vllm_url
  14. self.vllm_chat_url = f"{vllm_url}/chat/completions"
  15. self.vllm_generate_url = f"{vllm_url}/completions"
  16. self.client = AsyncOpenAI(
  17. # defaults to os.environ.get("OPENAI_API_KEY")
  18. api_key=openai_api_key,
  19. base_url=openai_api_base,
  20. )
  21. async def chat(self,
  22. prompt : str = "",
  23. model: str = "deepseek-r1:7b",
  24. stream: bool = False,
  25. top_p: float = 0.9,
  26. temperature: float = 0.6,
  27. max_tokens: int = 1024,
  28. history: list = [],
  29. enable_think: bool = False
  30. ):
  31. if history:
  32. messages = history
  33. else:
  34. messages = [{"role": "user", "content": prompt}]
  35. if model == "Qwen3-30B":
  36. if enable_think:
  37. temperature = 0.6
  38. top_p=0.95
  39. else:
  40. temperature = 0.7
  41. top_p=0.8
  42. extra_body = {
  43. "chat_template_kwargs": {"enable_thinking": enable_think},
  44. "top_k": 20
  45. }
  46. else:
  47. extra_body = {}
  48. chat_response = await self.client.chat.completions.create(
  49. model=model,
  50. messages=messages,
  51. stream=stream,
  52. top_p=top_p,
  53. temperature=temperature,
  54. max_tokens=max_tokens,
  55. # extra_body={
  56. # # "chat_template_kwargs": {"enable_thinking": enable_think},
  57. # "enable_thinking": enable_think,
  58. # "top_k":top_k
  59. # },
  60. extra_body=extra_body
  61. )
  62. # 针对deepseek的模型,是否输出think部分
  63. yield_reasoning_content = True
  64. yield_content = True
  65. has_reason = ""
  66. if stream:
  67. try:
  68. async for chunk in chat_response:
  69. logger.info(f"vllm返回的chunk信息:{chunk}")
  70. reasoning_content = None
  71. content = None
  72. chat_id = chunk.id
  73. # Check the content is reasoning_content or content
  74. if chunk.choices[0].delta.role == "assistant":
  75. continue
  76. elif hasattr(chunk.choices[0].delta, "reasoning_content"):
  77. reasoning_content = chunk.choices[0].delta.reasoning_content
  78. if reasoning_content:
  79. has_reason += reasoning_content
  80. elif hasattr(chunk.choices[0].delta, "content"):
  81. content = chunk.choices[0].delta.content
  82. if reasoning_content is not None:
  83. if yield_reasoning_content:
  84. yield_reasoning_content = False
  85. reasoning_content = "```think" + reasoning_content
  86. # print("reasoning_content:", end="", flush=True)
  87. # print(reasoning_content, end="", flush=True)
  88. # yield reasoning_content
  89. yield {"id": chat_id, "event": "add", "data": reasoning_content}
  90. elif content is not None:
  91. if yield_content:
  92. yield_content = False
  93. if has_reason:
  94. content = "think```" + content
  95. else:
  96. content = content
  97. # print("\ncontent:", end="", flush=True)
  98. # print(content, end="", flush=True)
  99. # yield content
  100. yield {"id": chat_id, "event": "add", "data": content}
  101. if chunk.choices[0].finish_reason:
  102. yield {"id": chat_id, "event": "finish", "data": ""}
  103. finally:
  104. await self.client.close()
  105. else:
  106. # print(f"chat response: {chat_response.model_dump_json()}")
  107. # yield chat_response.choices[0].message.content
  108. yield chat_response
  109. def generate(self,
  110. prompt: str,
  111. model: str = "deepseek-r1:7b",
  112. history: list = [],
  113. stream: bool = False
  114. ):
  115. completion = self.client.completions.create(
  116. model=model,
  117. prompt=prompt,
  118. max_tokens=1024,
  119. stream=stream
  120. )
  121. if stream:
  122. for chunk in completion:
  123. print(f"generate chunk: {chunk}")
  124. yield chunk
  125. else:
  126. return completion
  127. def request_generate(self, model, prompt, max_tokens: int = 1024, temperature: float = 0.6, stream: bool = False):
  128. json_data = {
  129. "model": model,
  130. "prompt": prompt,
  131. "max_tokens": max_tokens,
  132. "temperature": temperature,
  133. "stream": stream
  134. }
  135. response = requests.post(self.vllm_generate_url,json=json_data, stream=stream)
  136. response.raise_for_status()
  137. if stream:
  138. for line in response.iter_lines():
  139. if line:
  140. line_str = line.decode("utf-8")
  141. if line_str.startswith("data: "):
  142. json_str = line_str[len("data: "):]
  143. if json_str == "[DONE]":
  144. break
  145. print(f"返回的数据:{json.loads(json_str)}")
  146. yield json.loads(json_str)
  147. else:
  148. logger.info(f"直接返回结果:{response.json()}")
  149. yield response.json()
  150. def request_chat(self,
  151. model,
  152. prompt,
  153. history: list = [],
  154. temperature: float = 0.6,
  155. stream: bool = False,
  156. top_p: float = 0.7):
  157. history.append({"role": "user", "content": prompt})
  158. json_data = {
  159. "model": model,
  160. "messages": history,
  161. "temperature": temperature,
  162. "stream": stream,
  163. "top_p": top_p
  164. }
  165. response = requests.post(self.vllm_chat_url,json=json_data, stream=stream)
  166. response.raise_for_status()
  167. if stream:
  168. for line in response.iter_lines():
  169. if line:
  170. line_str = line.decode("utf-8")
  171. if line_str.startswith("data: "):
  172. json_str = line_str[len("data: "):]
  173. if json_str == "[DONE]":
  174. break
  175. print(f"chat模式返回的数据:{json.loads(json_str)}")
  176. yield json.loads(json_str)
  177. else:
  178. print(f"聊天模式直接返回结果:{response.json()}")
  179. return response.json()
  180. def main():
  181. history = [{"role": "system", "content": "你是一个非常有帮助的助手,在回答用户问题的时候请以<think>开头。"}]
  182. # prompt = "请帮我计算鸡兔同笼的问题。从上面数有35个头,从下面数有94只脚,请问分别多少只兔子多少只鸡?"
  183. prompt = "请帮我将下面提供的中文翻译成日文,要求:1、直接输出翻译的结果,2、不要进行任何解释。需要翻译的内容:我下飞机的时候行李丢了。"
  184. model = "DeepSeek-R1-Distill-Qwen-14B"
  185. vllm_chat_resp = VllmApi().request_chat(prompt=prompt, model=model, history=history, stream=True)
  186. # print("vllm 回复:")
  187. for chunk in vllm_chat_resp:
  188. pass
  189. # print(chunk, end='', flush=True)
  190. if __name__=="__main__":
  191. main()