正文
如何魔改 Qwen2ForCausalLM 来支持 embedding 请求?
先直接用 vllm serve 启动 gte-7b 模型(下面命令中传入的模型参数为 7embed,对外的服务模型名为 e5_7b):
# Start `vllm serve` for the model argument `7embed`, with only CUDA device 0
# visible to the process, on port 7777, exposed to clients as `e5_7b`.
# The context limit is set to 32768 tokens via --max-model-len.
# NOTE(review): the API key is passed inline on the command line; consider
# replacing it with a placeholder or an environment variable before sharing.
CUDA_VISIBLE_DEVICES=0 vllm serve 7embed --dtype auto --api-key \
sk-1dwqsdv4r3wef3rvefg34ef1dwRv --tensor-parallel-size 1 \
--max-model-len 32768 --enforce-eager \
--disable-custom-all-reduce --port 7777 --served-model-name e5_7b
然后向该服务发送 embedding 请求,会报错(pooler not implemented)。
我们进一步看一下 vllm 中已支持的 qwen2 模型实现(vllm/model_executor/models/qwen2.py):
class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
"k_proj",
"v_proj",
],
"gate_up_proj": [
"gate_proj",
"up_proj",
],
}
# LoRA specific attributes
supported_lora_modules = [
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(
self,
config: Qwen2Config,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
) -> None:
# TODO (@robertgshaw2): see if this can be moved out
if (cache_config.sliding_window is not None
and hasattr(config, "max_window_layers")):
raise ValueError("Sliding window for some but all layers is not "
"supported. This model uses sliding window "
"but `max_window_layers` = %s is less than "
"`num_hidden_layers` = %s. Please open an issue "
"to discuss this feature." % (
config.max_window_layers,
config.num_hidden_layers,
))
super().__init__()
self.config = config
self.lora_config = lora_config
self.quant_config = quant_config
self.model = Qwen2Model(config, cache_config, quant_config)
if config.tie_word_embeddings:
self.lm_head = self.model.embed_tokens
else:
self.lm_head = ParallelLMHead(config.vocab_size,
config.hidden_size,
quant_config=quant_config)
self.logits_processor = LogitsProcessor(config