from openai import OpenAI
import requests
import json
from typing import Optional

from utils.get_logger import setup_logger
from config import model_name_vllm_url_dict

logger = setup_logger(__name__)
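# NOTE: `utils.get_logger.setup_logger` and `config.model_name_vllm_url_dict` are
# project-local helpers. `model_name_vllm_url_dict` is assumed to map a model name
# to the base URL of a vLLM OpenAI-compatible server (ending in /v1), e.g. the
# hypothetical {"DeepSeek-R1-Distill-Qwen-14B": "http://localhost:8000/v1"}.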
class VllmApi:
    """Thin wrapper around a vLLM OpenAI-compatible server."""

    def __init__(self, chat_json):
        # vLLM does not validate the API key, but the OpenAI client requires one.
        openai_api_key = "EMPTY"
        model = chat_json.get("model")
        vllm_url = model_name_vllm_url_dict.get(model)
        openai_api_base = vllm_url
        self.vllm_chat_url = f"{vllm_url}/chat/completions"
        self.vllm_generate_url = f"{vllm_url}/completions"
        self.client = OpenAI(
            # defaults to os.environ.get("OPENAI_API_KEY")
            api_key=openai_api_key,
            base_url=openai_api_base,
        )
    def chat(self,
             prompt: str = "",
             model: str = "deepseek-r1:7b",
             stream: bool = False,
             top_p: float = 0.9,
             temperature: float = 0.6,
             max_tokens: int = 1024,
             history: Optional[list] = None
             ):
        # Use the supplied conversation history if given; otherwise start a
        # single-turn conversation from the prompt.
        if history:
            messages = history
        else:
            messages = [{"role": "user", "content": prompt}]
        chat_response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            stream=stream,
            top_p=top_p,
            temperature=temperature,
            max_tokens=max_tokens
        )
        # For DeepSeek-style models: whether to emit the "think" (reasoning) part,
        # and one-shot flags so the think markers are emitted exactly once.
        yield_reasoning_content = True
        yield_content = True
        has_reason = ""
        if stream:
            for chunk in chat_response:
                logger.info(f"chunk returned by vllm: {chunk}")
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta
                chat_id = chunk.id
                # A chunk may carry reasoning_content (the DeepSeek "think" part),
                # regular content, or only the assistant role on the first chunk.
                reasoning_content = getattr(delta, "reasoning_content", None)
                content = delta.content
                if reasoning_content:
                    has_reason += reasoning_content
                if reasoning_content is not None:
                    if yield_reasoning_content:
                        # Emit the opening think marker once, before the first reasoning token.
                        yield_reasoning_content = False
                        reasoning_content = "```think" + reasoning_content
                    yield {"id": chat_id, "event": "add", "data": reasoning_content}
                elif content is not None:
                    if yield_content:
                        yield_content = False
                        if has_reason:
                            # Emit the closing think marker once, before the first answer token.
                            content = "think```" + content
                    yield {"id": chat_id, "event": "add", "data": content}

                if chunk.choices[0].finish_reason:
                    yield {"id": chat_id, "event": "finish", "data": ""}

        else:
            yield chat_response.choices[0].message.content
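    # Example (sketch, not part of the original code): consuming the streaming
    # events produced by chat(); assumes a vLLM server is reachable at the base
    # URL configured for the model.
    #
    #   api = VllmApi({"model": "DeepSeek-R1-Distill-Qwen-14B"})
    #   for event in api.chat(prompt="hello",
    #                         model="DeepSeek-R1-Distill-Qwen-14B",
    #                         stream=True):
    #       print(event["data"], end="", flush=True)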
    def generate(self,
                 prompt: str,
                 model: str = "deepseek-r1:7b",
                 history: Optional[list] = None,  # currently unused
                 stream: bool = False
                 ):
        completion = self.client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=1024,
            stream=stream
        )
        if stream:
            for chunk in completion:
                print(f"generate chunk: {chunk}")
                yield chunk
        else:
            # This method is a generator (it contains `yield`), so the non-stream
            # result must also be yielded; a plain `return` would be lost on callers
            # that iterate the result.
            yield completion
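    # Example (sketch): generate() is a generator in both modes, so the non-stream
    # result is also obtained by iterating; the call below is illustrative only.
    #
    #   completion = next(api.generate(prompt="1 + 1 =", model="deepseek-r1:7b"))
    #   print(completion.choices[0].text)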
    def request_generate(self, model, prompt, max_tokens: int = 1024,
                         temperature: float = 0.6, stream: bool = False):
        # Call vLLM's /completions endpoint directly over HTTP instead of going
        # through the OpenAI client.
        json_data = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }
        response = requests.post(self.vllm_generate_url, json=json_data, stream=stream)
        response.raise_for_status()
        if stream:
            # The streaming response is server-sent events: each payload line is
            # prefixed with "data: " and the stream ends with "data: [DONE]".
            for line in response.iter_lines():
                if line:
                    line_str = line.decode("utf-8")
                    if line_str.startswith("data: "):
                        json_str = line_str[len("data: "):]
                        if json_str == "[DONE]":
                            break
                        payload = json.loads(json_str)
                        print(f"returned data: {payload}")
                        yield payload

        else:
            logger.info(f"non-streaming result: {response.json()}")
            yield response.json()
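    # Example (sketch): pulling incremental text out of the streamed /completions
    # payloads; the "choices"/"text" fields follow the OpenAI-compatible schema
    # served by vLLM, though the exact fields can vary by server version.
    #
    #   for payload in api.request_generate(model="deepseek-r1:7b",
    #                                       prompt="1 + 1 =", stream=True):
    #       print(payload["choices"][0].get("text", ""), end="", flush=True)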
    def request_chat(self,
                     model,
                     prompt,
                     history: Optional[list] = None,
                     temperature: float = 0.6,
                     stream: bool = False,
                     top_p: float = 0.7):
        # Call vLLM's /chat/completions endpoint directly over HTTP.
        history = history or []
        history.append({"role": "user", "content": prompt})
        json_data = {
            "model": model,
            "messages": history,
            "temperature": temperature,
            "stream": stream,
            "top_p": top_p
        }
        response = requests.post(self.vllm_chat_url, json=json_data, stream=stream)
        response.raise_for_status()
        if stream:
            for line in response.iter_lines():
                if line:
                    line_str = line.decode("utf-8")
                    if line_str.startswith("data: "):
                        json_str = line_str[len("data: "):]
                        if json_str == "[DONE]":
                            break
                        payload = json.loads(json_str)
                        print(f"chat-mode streamed data: {payload}")
                        yield payload
        else:
            # This method is a generator, so the non-stream result must be yielded
            # rather than returned.
            print(f"chat-mode non-streaming result: {response.json()}")
            yield response.json()
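# Example (sketch): extracting incremental text from the chat stream yielded by
# request_chat(); the "choices"/"delta" fields follow the OpenAI-compatible chat
# schema served by vLLM, but which keys appear can vary by model and version.
#
#   for payload in api.request_chat(model="DeepSeek-R1-Distill-Qwen-14B",
#                                   prompt="hello", stream=True):
#       delta = payload["choices"][0].get("delta", {})
#       print(delta.get("reasoning_content") or delta.get("content") or "",
#             end="", flush=True)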
def main():
    history = [{"role": "system", "content": "你是一个非常有帮助的助手,在回答用户问题的时候请以<think>开头。"}]
    # prompt = "请帮我计算鸡兔同笼的问题。从上面数有35个头,从下面数有94只脚,请问分别多少只兔子多少只鸡?"
    prompt = "请帮我将下面提供的中文翻译成日文,要求:1、直接输出翻译的结果,2、不要进行任何解释。需要翻译的内容:我下飞机的时候行李丢了。"
    model = "DeepSeek-R1-Distill-Qwen-14B"
    # VllmApi.__init__ expects a dict carrying the model name so it can look up
    # the matching vLLM base URL.
    vllm_chat_resp = VllmApi({"model": model}).request_chat(prompt=prompt, model=model,
                                                            history=history, stream=True)
    for chunk in vllm_chat_resp:
        print(chunk, end='', flush=True)


if __name__ == "__main__":
    main()