from openai import OpenAI
import requests
import json

from utils.get_logger import setup_logger
from config import model_name_vllm_url_dict

logger = setup_logger(__name__)


class VllmApi:
    def __init__(self, chat_json):
        # vLLM exposes an OpenAI-compatible server, so any placeholder key works.
        openai_api_key = "EMPTY"
        model = chat_json.get("model")
        # vllm_url is expected to already point at the server's OpenAI-compatible
        # prefix (the same base is used by the OpenAI client and by raw requests).
        vllm_url = model_name_vllm_url_dict.get(model)
        openai_api_base = vllm_url
        self.vllm_chat_url = f"{vllm_url}/chat/completions"
        self.vllm_generate_url = f"{vllm_url}/completions"
        self.client = OpenAI(
            # defaults to os.environ.get("OPENAI_API_KEY")
            api_key=openai_api_key,
            base_url=openai_api_base,
        )

    def chat(self,
             prompt: str = "",
             model: str = "deepseek-r1:7b",
             stream: bool = False,
             top_p: float = 0.9,
             temperature: float = 0.6,
             max_tokens: int = 1024,
             history: list = None):
        if history:
            messages = history
        else:
            messages = [{"role": "user", "content": prompt}]
        chat_response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=stream,
            top_p=top_p,
            temperature=temperature,
            max_tokens=max_tokens
        )
        # For DeepSeek-style models: first-chunk flags used to wrap the "think"
        # (reasoning) section in ```think ... think``` markers for the frontend.
        yield_reasoning_content = True
        yield_content = True
        has_reason = ""
        if stream:
            for chunk in chat_response:
                logger.info(f"chunk returned by vLLM: {chunk}")
                reasoning_content = None
                content = None
                chat_id = chunk.id
                # Decide whether this delta carries reasoning_content or content.
                # This assumes reasoning deltas and answer deltas arrive in separate chunks.
                if chunk.choices[0].delta.role == "assistant":
                    continue
                elif hasattr(chunk.choices[0].delta, "reasoning_content"):
                    reasoning_content = chunk.choices[0].delta.reasoning_content
                    if reasoning_content:
                        has_reason += reasoning_content
                elif hasattr(chunk.choices[0].delta, "content"):
                    content = chunk.choices[0].delta.content
                if reasoning_content is not None:
                    if yield_reasoning_content:
                        yield_reasoning_content = False
                        # Open the reasoning block on the first reasoning chunk.
                        reasoning_content = "```think" + reasoning_content
                    yield {"id": chat_id, "event": "add", "data": reasoning_content}
                elif content is not None:
                    if yield_content:
                        yield_content = False
                        if has_reason:
                            # Close the reasoning block before the first answer chunk.
                            content = "think```" + content
                    yield {"id": chat_id, "event": "add", "data": content}
                if chunk.choices[0].finish_reason:
                    yield {"id": chat_id, "event": "finish", "data": ""}
        else:
            yield chat_response.choices[0].message.content

    def generate(self,
                 prompt: str,
                 model: str = "deepseek-r1:7b",
                 history: list = None,
                 stream: bool = False):
        completion = self.client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=1024,
            stream=stream
        )
        if stream:
            for chunk in completion:
                logger.info(f"generate chunk: {chunk}")
                yield chunk
        else:
            # This method is a generator, so yield the full completion instead of returning it.
            yield completion

    def request_generate(self, model, prompt,
                         max_tokens: int = 1024,
                         temperature: float = 0.6,
                         stream: bool = False):
        json_data = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }
        response = requests.post(self.vllm_generate_url, json=json_data, stream=stream)
        response.raise_for_status()
        if stream:
            # The server streams SSE lines of the form "data: {...}" ending with "data: [DONE]".
            for line in response.iter_lines():
                if line:
                    line_str = line.decode("utf-8")
                    if line_str.startswith("data: "):
                        json_str = line_str[len("data: "):]
                        if json_str == "[DONE]":
                            break
                        chunk = json.loads(json_str)
                        logger.info(f"streamed chunk: {chunk}")
                        yield chunk
        else:
            logger.info(f"non-streaming result: {response.json()}")
            yield response.json()

    def request_chat(self, model, prompt,
                     history: list = None,
                     temperature: float = 0.6,
                     stream: bool = False,
                     top_p: float = 0.7):
history.append({"role": "user", "content": prompt}) json_data = { "model": model, "messages": history, "temperature": temperature, "stream": stream, "top_p": top_p } response = requests.post(self.vllm_chat_url,json=json_data, stream=stream) response.raise_for_status() if stream: for line in response.iter_lines(): if line: line_str = line.decode("utf-8") if line_str.startswith("data: "): json_str = line_str[len("data: "):] if json_str == "[DONE]": break print(f"chat模式返回的数据:{json.loads(json_str)}") yield json.loads(json_str) else: print(f"聊天模式直接返回结果:{response.json()}") return response.json() def main(): history = [{"role": "system", "content": "你是一个非常有帮助的助手,在回答用户问题的时候请以开头。"}] # prompt = "请帮我计算鸡兔同笼的问题。从上面数有35个头,从下面数有94只脚,请问分别多少只兔子多少只鸡?" prompt = "请帮我将下面提供的中文翻译成日文,要求:1、直接输出翻译的结果,2、不要进行任何解释。需要翻译的内容:我下飞机的时候行李丢了。" model = "DeepSeek-R1-Distill-Qwen-14B" vllm_chat_resp = VllmApi().request_chat(prompt=prompt, model=model, history=history, stream=True) # print("vllm 回复:") for chunk in vllm_chat_resp: pass # print(chunk, end='', flush=True) if __name__=="__main__": main()