"""Embedding and rerank model configuration.

Exposes OpenAI-compatible dense embedding functions (via pymilvus) backed
by locally hosted vLLM endpoints, the rerank service endpoints, and the
tokenizer for the chat model. Module import has side effects: it creates
the embedding clients and loads the tokenizer from disk.
"""
import os

# Must be set BEFORE torch is imported / CUDA state is initialized;
# expandable segments reduce fragmentation-related CUDA OOMs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
from pymilvus import model
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Embedding endpoints (OpenAI-compatible vLLM servers, production hosts).
bge_m3_base_url = r"http://10.1.14.16:8787/v1"
bge_me_model = "bge-m3"  # NOTE(review): "me" looks like a typo for "m3"; name kept for backward compatibility
qwen_ed_base_url = r"http://10.1.14.16:8788/v1"
qwen_ed_model = "Qwen3-Embedding"


def _build_openai_ef(model_name, base_url, dim=1024):
    """Create an OpenAI-compatible embedding function for a served model.

    pymilvus only knows the dimensionality of official OpenAI models, so
    the served model's dim is patched into the client's internal metadata
    table after construction.

    Args:
        model_name: model id as served by the vLLM endpoint.
        base_url: base URL of the OpenAI-compatible API.
        dim: embedding dimensionality reported to Milvus (default 1024).

    Returns:
        A configured ``model.dense.OpenAIEmbeddingFunction`` instance.
    """
    ef = model.dense.OpenAIEmbeddingFunction(
        model_name=model_name,
        # The self-hosted endpoint does not validate the key; the client
        # library still requires one to be supplied.
        api_key='YOUR_API_KEY',
        base_url=base_url,
    )
    ef._openai_model_meta_info[model_name]["dim"] = dim
    return ef


bge_m3_ef = _build_openai_ef(bge_me_model, bge_m3_base_url)
qwen_ed_ef = _build_openai_ef(qwen_ed_model, qwen_ed_base_url)

# Rerank service endpoints (bge-reranker and Qwen3-Reranker servers).
rerank_bge_url = "http://10.1.14.16:8791"
rerank_bge_model = "bge-reranker-v2-m3"
rerank_qwen_url = "http://10.1.14.16:8790"
rerank_qwen_model = "Qwen3-Reranker-0.6B"

# Tokenizer for the chat model; trust_remote_code is needed because the
# model ships custom tokenizer code.
tokenizer = AutoTokenizer.from_pretrained(
    "/opt/vllm/models/Qwen/Qwen3-30B-A3B-Instruct-2507",
    trust_remote_code=True,
)