.01
.02
.03
# 如何使用
初始化模型和处理器
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from PIL import Image
import torch
import math
# Load the checkpoint in bfloat16 with FlashAttention-2 on the first CUDA
# device, then switch the module to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    'marco/mcdse-2b-v1',
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0",
)
model.eval()

# Pixel budget handed to the processor and reused by smart_resize().
min_pixels = 1 * 28 * 28
max_pixels = 960 * 28 * 28

processor = AutoProcessor.from_pretrained(
    'marco/mcdse-2b-v1',
    min_pixels=min_pixels,
    max_pixels=max_pixels,
)

# Pad on the left: the encoders below read the hidden state at the last
# sequence position, which must therefore never be a padding slot.
model.padding_side = "left"
processor.tokenizer.padding_side = "left"

# Chat-formatted prompts. Both carry an image placeholder; the query prompt
# additionally has a ``%s`` slot that receives the query text.
document_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>"
query_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
对查询进行编码
def encode_queries(queries: list[str], dimension: int):
    """Embed text queries with the module-level ``model`` and ``processor``.

    Args:
        queries: Query strings; each one is substituted into ``query_prompt``.
        dimension: Number of leading embedding components to keep before
            normalization.

    Returns:
        A tensor with one row per query: the last-layer hidden state at the
        final sequence position, truncated to ``dimension`` components and
        L2-normalized along the last axis.
    """
    # ``query_prompt`` contains an image placeholder, so every query is
    # paired with a blank 56x56 image.
    dummy_image = Image.new('RGB', (56, 56))
    inputs = processor(
        text=[query_prompt % x for x in queries],
        images=[dummy_image for _ in queries],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to('cuda:0')

    cache_position = torch.arange(0, len(queries))
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)

    with torch.no_grad():
        # This is a plain function, not a method: there is no ``self`` here,
        # so the module-level ``model`` must be called directly.
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )

    # Last layer, last sequence position (the tokenizer is configured for
    # left padding elsewhere in this file).
    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
对文档进行编码
def round_by_factor(number: float, factor: int) -> int:
    """Return the multiple of ``factor`` nearest to ``number``.

    Uses the built-in ``round``, so exact halves go to the even multiple
    (e.g. 1.5 factors round up to 2, 0.5 factors round down to 0).
    """
    return round(number / factor) * factor
def ceil_by_factor(number: float, factor: int) -> int:
    """Return the smallest multiple of ``factor`` that is >= ``number``."""
    return math.ceil(number / factor) * factor
def floor_by_factor(number: float, factor: int) -> int:
    """Return the largest multiple of ``factor`` that is <= ``number``."""
    return math.floor(number / factor) * factor
def smart_resize(height: int, width: int) -> tuple[int, int]:
    """Compute target dimensions that are multiples of 28 within the pixel budget.

    Both sides are first rounded to the nearest multiple of 28 (minimum 28).
    If the resulting area exceeds the module-level ``max_pixels``, the
    original dimensions are scaled down by a common factor and floored to a
    multiple of 28; if it falls below ``min_pixels``, they are scaled up and
    ceiled instead.

    Args:
        height: Source height in pixels.
        width: Source width in pixels.

    Returns:
        ``(height, width)`` of the target size -- note the order, which is
        the reverse of the ``(width, height)`` convention used by PIL.
    """
    factor = 28
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down so
        # the area stays within the budget.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too small: enlarge both sides by the same ratio, rounding up.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    return h_bar, w_bar
def resize(image: Image.Image):
    """Return a copy of ``image`` resized to the dimensions chosen by smart_resize().

    Args:
        image: The PIL image to resize.

    Returns:
        A new PIL image whose sides are the values computed by
        ``smart_resize(image.height, image.width)``.
    """
    height, width = smart_resize(image.height, image.width)
    # smart_resize() returns (height, width) but Image.resize() expects
    # (width, height); passing the tuple through unchanged would swap the
    # two sides and distort every non-square image.
    return image.resize((width, height))
def encode_documents(documents: list[Image.Image], dimension: int):
    """Embed document images with the module-level ``model`` and ``processor``.

    Args:
        documents: PIL images; each is passed through ``resize`` and paired
            with ``document_prompt``.
        dimension: Number of leading embedding components to keep before
            normalization.

    Returns:
        A tensor with one row per document: the last-layer hidden state at
        the final sequence position, truncated to ``dimension`` components
        and L2-normalized along the last axis.
    """
    inputs = processor(
        text=[document_prompt] * len(documents),
        images=[resize(x) for x in documents],
        videos=None,
        padding='longest',
        return_tensors='pt'
    ).to('cuda:0')

    # Sized from ``documents``: this function has no ``queries`` name in
    # scope, so referring to it would raise NameError.
    cache_position = torch.arange(0, len(documents))
    inputs = model.prepare_inputs_for_generation(
        **inputs, cache_position=cache_position, use_cache=False)

    with torch.no_grad():
        # This is a plain function, not a method: there is no ``self`` here,
        # so the module-level ``model`` must be called directly.
        output = model(
            **inputs,
            return_dict=True,
            output_hidden_states=True
        )

    # Last layer, last sequence position (the tokenizer is configured for
    # left padding elsewhere in this file).
    embeddings = output.hidden_states[-1][:, -1]
    return torch.nn.functional.normalize(embeddings[:, :dimension], p=2, dim=-1)
对比结果
.04
mcdse-2b-v1 的重要性
.05
参考:
https://huggingface.co/marco/mcdse-2b-v1