# llm.py
import json
import os

import requests
from openai import OpenAI, AsyncOpenAI

from config import model_name_vllm_url_dict
from utils.get_logger import setup_logger
  6. logger = setup_logger(__name__)
  7. class VllmApi():
  8. def __init__(self, chat_json):
  9. openai_api_key = "sk-72f9d0e5bc894e1d828d73bdcc50ff0a"
  10. # model = chat_json.get("model")
  11. model = "/opt/vllm/models/Qwen/Qwen3-30B-A3B-Instruct-2507"
  12. vllm_url = model_name_vllm_url_dict.get(model)
  13. openai_api_base = vllm_url
  14. self.vllm_chat_url = f"{vllm_url}/chat/completions"
  15. self.vllm_generate_url = f"{vllm_url}/completions"
  16. self.client = AsyncOpenAI(
  17. # defaults to os.environ.get("OPENAI_API_KEY")
  18. api_key=openai_api_key,
  19. base_url=openai_api_base,
  20. )
  21. async def chat(self,
  22. prompt : str = "",
  23. model: str = "deepseek-r1:7b",
  24. stream: bool = False,
  25. top_p: float = 0.9,
  26. temperature: float = 0.6,
  27. max_tokens: int = 1024,
  28. history: list = [],
  29. enable_think: bool = False
  30. ):
  31. if history:
  32. messages = history
  33. messages.insert(0, {"role": "system","content": "你是一个基于检索增强生成(RAG)的专业问答助手。"})
  34. else:
  35. messages = [{"role": "user", "content": prompt}]
  36. if model == "Qwen3-30B":
  37. if enable_think:
  38. temperature = 0.6
  39. top_p=0.95
  40. else:
  41. temperature = 0.7
  42. top_p=0.8
  43. extra_body = {
  44. "chat_template_kwargs": {"enable_thinking": enable_think},
  45. "top_k": 20
  46. }
  47. else:
  48. extra_body = {}
  49. chat_response = await self.client.chat.completions.create(
  50. model=model,
  51. messages=messages,
  52. stream=stream,
  53. top_p=top_p,
  54. temperature=temperature,
  55. max_tokens=max_tokens,
  56. # extra_body={
  57. # # "chat_template_kwargs": {"enable_thinking": enable_think},
  58. # "enable_thinking": enable_think,
  59. # "top_k":top_k
  60. # },
  61. extra_body=extra_body
  62. )
  63. # 针对deepseek的模型,是否输出think部分
  64. yield_reasoning_content = True
  65. yield_content = True
  66. has_reason = ""
  67. if stream:
  68. try:
  69. async for chunk in chat_response:
  70. logger.info(f"vllm返回的chunk信息:{chunk}")
  71. reasoning_content = None
  72. content = None
  73. chat_id = chunk.id
  74. # Check the content is reasoning_content or content
  75. if chunk.choices[0].delta.role == "assistant":
  76. continue
  77. # elif hasattr(chunk.choices[0].delta, "reasoning_content"):
  78. # reasoning_content = chunk.choices[0].delta.reasoning_content
  79. # if reasoning_content:
  80. # has_reason += reasoning_content
  81. elif hasattr(chunk.choices[0].delta, "content"):
  82. content = chunk.choices[0].delta.content
  83. if reasoning_content is not None:
  84. if yield_reasoning_content:
  85. yield_reasoning_content = False
  86. reasoning_content = "```think" + reasoning_content
  87. # print("reasoning_content:", end="", flush=True)
  88. # print(reasoning_content, end="", flush=True)
  89. # yield reasoning_content
  90. yield {"id": chat_id, "event": "add", "data": reasoning_content}
  91. elif content is not None:
  92. if yield_content:
  93. yield_content = False
  94. if has_reason:
  95. content = "think```" + content
  96. else:
  97. content = content
  98. # print("\ncontent:", end="", flush=True)
  99. # print(content, end="", flush=True)
  100. # yield content
  101. logger.info(f"返回的信息是id: {chat_id}, event: add, data: {content}")
  102. yield {"id": chat_id, "event": "add", "data": content}
  103. if chunk.choices[0].finish_reason:
  104. yield {"id": chat_id, "event": "finish", "data": ""}
  105. finally:
  106. # await self.client.close()
  107. pass
  108. else:
  109. # print(f"chat response: {chat_response.model_dump_json()}")
  110. yield {"id": "", "event": "finish", "data": chat_response.choices[0].message.content}
  111. async def generate_non_stream_async(
  112. self,
  113. prompt: list,
  114. model: str,
  115. return_full_response: bool = False,
  116. ):
  117. # messages = [
  118. # {"role": "user", "content": prompt}
  119. # ]
  120. completion = await self.client.chat.completions.create(
  121. model=model,
  122. messages=prompt,
  123. stream=False,
  124. )
  125. if return_full_response:
  126. return completion
  127. return completion.choices[0].message.content
  128. def generate(self,
  129. prompt: str,
  130. model: str = "deepseek-r1:7b",
  131. history: list = [],
  132. stream: bool = False
  133. ):
  134. completion = self.client.completions.create(
  135. model=model,
  136. prompt=prompt,
  137. max_tokens=1024,
  138. stream=stream
  139. )
  140. if stream:
  141. for chunk in completion:
  142. print(f"generate chunk: {chunk}")
  143. yield chunk
  144. else:
  145. return completion
  146. def request_generate(self, model, prompt, max_tokens: int = 1024, temperature: float = 0.6, stream: bool = False):
  147. json_data = {
  148. "model": model,
  149. "prompt": prompt,
  150. "max_tokens": max_tokens,
  151. "temperature": temperature,
  152. "stream": stream
  153. }
  154. response = requests.post(self.vllm_generate_url,json=json_data, stream=stream)
  155. response.raise_for_status()
  156. if stream:
  157. for line in response.iter_lines():
  158. if line:
  159. line_str = line.decode("utf-8")
  160. if line_str.startswith("data: "):
  161. json_str = line_str[len("data: "):]
  162. if json_str == "[DONE]":
  163. break
  164. print(f"返回的数据:{json.loads(json_str)}")
  165. yield json.loads(json_str)
  166. else:
  167. logger.info(f"直接返回结果:{response.json()}")
  168. yield response.json()
  169. def request_chat(self,
  170. model,
  171. prompt,
  172. history: list = [],
  173. temperature: float = 0.6,
  174. stream: bool = False,
  175. top_p: float = 0.7):
  176. history.append({"role": "user", "content": prompt})
  177. json_data = {
  178. "model": model,
  179. "messages": history,
  180. "temperature": temperature,
  181. "stream": stream,
  182. "top_p": top_p
  183. }
  184. response = requests.post(self.vllm_chat_url,json=json_data, stream=stream)
  185. response.raise_for_status()
  186. if stream:
  187. for line in response.iter_lines():
  188. if line:
  189. line_str = line.decode("utf-8")
  190. if line_str.startswith("data: "):
  191. json_str = line_str[len("data: "):]
  192. if json_str == "[DONE]":
  193. break
  194. print(f"chat模式返回的数据:{json.loads(json_str)}")
  195. yield json.loads(json_str)
  196. else:
  197. print(f"聊天模式直接返回结果:{response.json()}")
  198. return response.json()
  199. def main():
  200. history = [{"role": "system", "content": "你是一个非常有帮助的助手,在回答用户问题的时候请以<think>开头。"}]
  201. # prompt = "请帮我计算鸡兔同笼的问题。从上面数有35个头,从下面数有94只脚,请问分别多少只兔子多少只鸡?"
  202. prompt = "请帮我将下面提供的中文翻译成日文,要求:1、直接输出翻译的结果,2、不要进行任何解释。需要翻译的内容:我下飞机的时候行李丢了。"
  203. model = "DeepSeek-R1-Distill-Qwen-14B"
  204. vllm_chat_resp = VllmApi().request_chat(prompt=prompt, model=model, history=history, stream=True)
  205. # print("vllm 回复:")
  206. for chunk in vllm_chat_resp:
  207. pass
  208. # print(chunk, end='', flush=True)
  209. if __name__=="__main__":
  210. main()