from openai import OpenAI
import requests
import json
from typing import Optional

from utils.get_logger import setup_logger
from config import model_name_vllm_url_dict

logger = setup_logger(__name__)
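# NOTE: `utils.get_logger.setup_logger` and `config.model_name_vllm_url_dict` are
# project-local helpers. `model_name_vllm_url_dict` is assumed to map a model name
# to the base URL of a vLLM OpenAI-compatible server (ending in /v1), e.g. the
# hypothetical {"DeepSeek-R1-Distill-Qwen-14B": "http://localhost:8000/v1"}.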
class VllmApi:
    """Thin wrapper around a vLLM OpenAI-compatible server."""

    def __init__(self, chat_json):
        # vLLM does not validate the API key, but the OpenAI client requires one.
        openai_api_key = "EMPTY"
        model = chat_json.get("model")
        vllm_url = model_name_vllm_url_dict.get(model)
        openai_api_base = vllm_url
        self.vllm_chat_url = f"{vllm_url}/chat/completions"
        self.vllm_generate_url = f"{vllm_url}/completions"
        self.client = OpenAI(
            # defaults to os.environ.get("OPENAI_API_KEY")
            api_key=openai_api_key,
            base_url=openai_api_base,
        )
    def chat(self,
             prompt: str = "",
             model: str = "deepseek-r1:7b",
             stream: bool = False,
             top_p: float = 0.9,
             temperature: float = 0.6,
             max_tokens: int = 1024,
             history: Optional[list] = None
             ):
        # Use the supplied conversation history if given; otherwise start a
        # single-turn conversation from the prompt.
        if history:
            messages = history
        else:
            messages = [{"role": "user", "content": prompt}]
        chat_response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=stream,
            top_p=top_p,
            temperature=temperature,
            max_tokens=max_tokens
        )
        # For DeepSeek-style models: whether to emit the "think" (reasoning) part,
        # and one-shot flags so the think markers are emitted exactly once.
        yield_reasoning_content = True
        yield_content = True
        has_reason = ""
        if stream:
            for chunk in chat_response:
                logger.info(f"chunk returned by vllm: {chunk}")
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta
                chat_id = chunk.id
                # A chunk may carry reasoning_content (the DeepSeek "think" part),
                # regular content, or only the assistant role on the first chunk.
                reasoning_content = getattr(delta, "reasoning_content", None)
                content = delta.content
                if reasoning_content:
                    has_reason += reasoning_content
                if reasoning_content is not None:
                    if yield_reasoning_content:
                        # Emit the opening think marker once, before the first reasoning token.
                        yield_reasoning_content = False
                        reasoning_content = "```think" + reasoning_content
                    yield {"id": chat_id, "event": "add", "data": reasoning_content}
                elif content is not None:
                    if yield_content:
                        yield_content = False
                        if has_reason:
                            # Emit the closing think marker once, before the first answer token.
                            content = "think```" + content
                    yield {"id": chat_id, "event": "add", "data": content}

                if chunk.choices[0].finish_reason:
                    yield {"id": chat_id, "event": "finish", "data": ""}

        else:
            yield chat_response.choices[0].message.content
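    # Example (sketch, not part of the original code): consuming the streaming
    # events produced by chat(); assumes a vLLM server is reachable at the base
    # URL configured for the model.
    #
    #   api = VllmApi({"model": "DeepSeek-R1-Distill-Qwen-14B"})
    #   for event in api.chat(prompt="hello",
    #                         model="DeepSeek-R1-Distill-Qwen-14B",
    #                         stream=True):
    #       print(event["data"], end="", flush=True)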
    def generate(self,
                 prompt: str,
                 model: str = "deepseek-r1:7b",
                 history: Optional[list] = None,  # currently unused
                 stream: bool = False
                 ):
        completion = self.client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=1024,
            stream=stream
        )
        if stream:
            for chunk in completion:
                print(f"generate chunk: {chunk}")
                yield chunk
        else:
            # This method is a generator (it contains `yield`), so the non-stream
            # result must also be yielded; a plain `return` would be lost on callers
            # that iterate the result.
            yield completion
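    # Example (sketch): generate() is a generator in both modes, so the non-stream
    # result is also obtained by iterating; the call below is illustrative only.
    #
    #   completion = next(api.generate(prompt="1 + 1 =", model="deepseek-r1:7b"))
    #   print(completion.choices[0].text)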
    def request_generate(self, model, prompt, max_tokens: int = 1024,
                         temperature: float = 0.6, stream: bool = False):
        # Call vLLM's /completions endpoint directly over HTTP instead of going
        # through the OpenAI client.
        json_data = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }
        response = requests.post(self.vllm_generate_url, json=json_data, stream=stream)
        response.raise_for_status()
        if stream:
            # The streaming response is server-sent events: each payload line is
            # prefixed with "data: " and the stream ends with "data: [DONE]".
            for line in response.iter_lines():
                if line:
                    line_str = line.decode("utf-8")
                    if line_str.startswith("data: "):
                        json_str = line_str[len("data: "):]
                        if json_str == "[DONE]":
                            break
                        payload = json.loads(json_str)
                        print(f"returned data: {payload}")
                        yield payload

        else:
            logger.info(f"non-streaming result: {response.json()}")
            yield response.json()
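    # Example (sketch): pulling incremental text out of the streamed /completions
    # payloads; the "choices"/"text" fields follow the OpenAI-compatible schema
    # served by vLLM, though the exact fields can vary by server version.
    #
    #   for payload in api.request_generate(model="deepseek-r1:7b",
    #                                       prompt="1 + 1 =", stream=True):
    #       print(payload["choices"][0].get("text", ""), end="", flush=True)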
    def request_chat(self,
                     model,
                     prompt,
                     history: Optional[list] = None,
                     temperature: float = 0.6,
                     stream: bool = False,
                     top_p: float = 0.7):
        # Call vLLM's /chat/completions endpoint directly over HTTP.
        history = history or []
        history.append({"role": "user", "content": prompt})
        json_data = {
            "model": model,
            "messages": history,
            "temperature": temperature,
            "stream": stream,
            "top_p": top_p
        }
        response = requests.post(self.vllm_chat_url, json=json_data, stream=stream)
        response.raise_for_status()
        if stream:
            for line in response.iter_lines():
                if line:
                    line_str = line.decode("utf-8")
                    if line_str.startswith("data: "):
                        json_str = line_str[len("data: "):]
                        if json_str == "[DONE]":
                            break
                        payload = json.loads(json_str)
                        print(f"chat-mode streamed data: {payload}")
                        yield payload
        else:
            # This method is a generator, so the non-stream result must be yielded
            # rather than returned.
            print(f"chat-mode non-streaming result: {response.json()}")
            yield response.json()
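# Example (sketch): extracting incremental text from the chat stream yielded by
# request_chat(); the "choices"/"delta" fields follow the OpenAI-compatible chat
# schema served by vLLM, but which keys appear can vary by model and version.
#
#   for payload in api.request_chat(model="DeepSeek-R1-Distill-Qwen-14B",
#                                   prompt="hello", stream=True):
#       delta = payload["choices"][0].get("delta", {})
#       print(delta.get("reasoning_content") or delta.get("content") or "",
#             end="", flush=True)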
def main():
    history = [{"role": "system", "content": "你是一个非常有帮助的助手,在回答用户问题的时候请以<think>开头。"}]
    # prompt = "请帮我计算鸡兔同笼的问题。从上面数有35个头,从下面数有94只脚,请问分别多少只兔子多少只鸡?"
    prompt = "请帮我将下面提供的中文翻译成日文,要求:1、直接输出翻译的结果,2、不要进行任何解释。需要翻译的内容:我下飞机的时候行李丢了。"
    model = "DeepSeek-R1-Distill-Qwen-14B"
    # VllmApi.__init__ expects a dict carrying the model name so it can look up
    # the matching vLLM base URL.
    vllm_chat_resp = VllmApi({"model": model}).request_chat(prompt=prompt, model=model,
                                                            history=history, stream=True)
    for chunk in vllm_chat_resp:
        print(chunk, end='', flush=True)


if __name__ == "__main__":
    main()