13 changes: 12 additions & 1 deletion vlmeval/config.py
@@ -45,6 +45,8 @@
"PLLaVA-34B": partial(
PLLaVA, model_path="ermu2001/pllava-34b", dir_root=PLLaVA_ROOT
),
"InternVideo-2.5-Chat-8B": partial(InternVideo, model_path="OpenGVLab/InternVideo2_5_Chat_8B", max_new_tokens=4096),
"VideoLLaMA3-7B": partial(VideoLLaMA3, model_path="DAMO-NLP-SG/VideoLLaMA3-7B", max_new_tokens=4096),
}

ungrouped = {
@@ -633,6 +635,8 @@
),
"MiniCPM-V-2_6": partial(MiniCPM_V_2_6, model_path="openbmb/MiniCPM-V-2_6"),
"MiniCPM-o-2_6": partial(MiniCPM_o_2_6, model_path="openbmb/MiniCPM-o-2_6"),
"MiniCPM-V-4": partial(MiniCPM_V_4, model_path="openbmb/MiniCPM-V-4"),
"MiniCPM-V-4_5": partial(MiniCPM_V_4_5, model_path="openbmb/MiniCPM-V-4_5",max_new_tokens=8192),
}

xtuner_series = {
@@ -692,6 +696,10 @@
"Thyme-7B": partial(Thyme, model_path="Kwai-Keye/Thyme-RL")
}

keye_vl_series = {
"Keye-VL-1_5-8B": partial(KeyeVL, model_path="/fs-computility/llm/shared/mllm/hub/models--Kwai-Keye--Keye-VL-1_5-8B/snapshots/3921b3d6a81870b107ff76e54c320d8aab66a0da", use_vllm=False, max_new_tokens=4096)
}

llava_series = {
"llava_v1.5_7b": partial(LLaVA, model_path="liuhaotian/llava-v1.5-7b"),
"llava_v1.5_13b": partial(LLaVA, model_path="liuhaotian/llava-v1.5-13b"),
@@ -759,6 +767,9 @@
"llava_video_qwen2_72b": partial(
LLaVA_OneVision, model_path="lmms-lab/LLaVA-Video-72B-Qwen2"
),
"llava_onevision_1_5_8b": partial(
LLaVA_OneVision_1_5, model_path="lmms-lab/LLaVA-OneVision-1.5-8B-Instruct", max_new_tokens=4096
),
}

varco_vision_series = {
@@ -1672,7 +1683,7 @@
aria_series, smolvlm_series, sail_series, valley_series, vita_series,
ross_series, emu_series, ola_series, ursa_series, gemma_series,
long_vita_series, ristretto_series, kimi_series, aguvis_series, hawkvl_series,
flash_vl, kimi_vllm_series, oryx_series, treevgr_series, varco_vision_series, qtunevl_series, xvl_series, thyme_series
flash_vl, kimi_vllm_series, oryx_series, treevgr_series, varco_vision_series, qtunevl_series, xvl_series, thyme_series, keye_vl_series
]

for grp in model_groups:
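
For reference, a registered entry can be instantiated by name once the groups are merged into the lookup table. A minimal sketch, assuming the groups above are collected into the supported_VLM dict by the for grp in model_groups loop; the image path and prompt are placeholders:

from vlmeval.config import supported_VLM

# Build the model from its registered partial (placeholder inputs below).
model = supported_VLM["MiniCPM-V-4_5"]()
response = model.generate([
    {"type": "image", "value": "demo.jpg"},   # placeholder path
    {"type": "text", "value": "Describe the image."},
])
print(response)
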
2 changes: 1 addition & 1 deletion vlmeval/dataset/mlvu.py
@@ -24,7 +24,7 @@ class MLVU(ConcatVideoDataset):
def __init__(self, dataset='MLVU', nframe=0, fps=-1):
self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded']
self.type_data_dict = {
'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'],
'M-Avg':['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning', 'order'],
'G-Avg':['sub_scene', 'summary']
}
super().__init__(dataset=dataset, nframe=nframe, fps=fps)
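
To illustrate what the grouping above drives, a hypothetical aggregation sketch (the real scoring is implemented elsewhere in the dataset class; the per-task scores here are placeholders):

# Placeholder scores keyed by the task names in type_data_dict above.
type_data_dict = {
    'M-Avg': ['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning', 'order'],
    'G-Avg': ['sub_scene', 'summary'],
}
task_scores = {task: 0.0 for tasks in type_data_dict.values() for task in tasks}
group_averages = {
    group: sum(task_scores[t] for t in tasks) / len(tasks)
    for group, tasks in type_data_dict.items()
}
print(group_averages)  # placeholder scores give {'M-Avg': 0.0, 'G-Avg': 0.0}
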
20 changes: 13 additions & 7 deletions vlmeval/dataset/video_base.py
@@ -67,7 +67,7 @@ def frame_paths_fps(self, video, num_frames):
return [osp.join(frame_root,
self.frame_tmpl_fps.format(i, num_frames, self.fps)) for i in range(1, num_frames + 1)]

def save_video_frames(self, video):
def save_video_frames(self, video, max_frames=-1):
import decord
if self.fps > 0:
vid_path = osp.join(self.data_root, video + '.mp4')
@@ -80,12 +80,18 @@ def save_video_frames(self, video):

# Compute the total number of frames to extract
required_frames = int(total_duration * self.fps)

# Compute the sampling interval between frames
step_size = video_fps / self.fps

# Compute the indices of the frames to extract
indices = [int(i * step_size) for i in range(required_frames)]
if max_frames > 0 and required_frames > max_frames:
print(f"video {video} requires {self.fps} fps sampling, \
but all need sampled frames {required_frames} > max_frames {max_frames}, sample down to {max_frames} frames")
required_frames = max_frames
step_size = total_frames / (required_frames+1)
indices = [int(i * step_size) for i in range(1, required_frames + 1)]
else:
# Compute the sampling interval between frames
step_size = video_fps / self.fps

# Compute the indices of the frames to extract
indices = [int(i * step_size) for i in range(required_frames)]

# Extract the frames and save them
frame_paths = self.frame_paths_fps(video, len(indices))
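
A self-contained sketch of the sampling logic added above (an illustrative re-implementation, not the code in video_base.py; it assumes total_duration = total_frames / video_fps as in the surrounding method): when fps-based sampling would exceed max_frames, the indices are instead spread uniformly over the clip.

def plan_frame_indices(total_frames, video_fps, target_fps, max_frames=-1):
    # Illustrative re-implementation of the index selection in save_video_frames.
    total_duration = total_frames / video_fps
    required_frames = int(total_duration * target_fps)
    if max_frames > 0 and required_frames > max_frames:
        # Too many frames at the requested fps: fall back to max_frames indices
        # spread uniformly across the whole video.
        required_frames = max_frames
        step_size = total_frames / (required_frames + 1)
        return [int(i * step_size) for i in range(1, required_frames + 1)]
    # Otherwise keep fps-based sampling: one frame every video_fps / target_fps frames.
    step_size = video_fps / target_fps
    return [int(i * step_size) for i in range(required_frames)]

# A 300 s video at 30 fps, sampled at 1 fps but capped at 64 frames:
print(len(plan_frame_indices(total_frames=9000, video_fps=30, target_fps=1, max_frames=64)))  # 64
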
2 changes: 1 addition & 1 deletion vlmeval/smp/file.py
@@ -169,7 +169,7 @@ def get_pred_file_format():
if pred_format == '':
return 'xlsx' # default format
else:
assert pred_format in ['tsv', 'xlsx', 'json'], f'Unsupported PRED_FORMAT {pred_format}'
assert pred_format in ['tsv', 'xlsx', 'csv'], f'Unsupported PRED_FORMAT {pred_format}'
return pred_format


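
Assuming the format is read from a PRED_FORMAT environment variable (the variable name is inferred from the error message, not shown in this diff), selecting the newly supported CSV output would look like:

import os

# Hypothetical usage; 'PRED_FORMAT' as the variable name is an assumption
# based on the assert message in get_pred_file_format().
os.environ["PRED_FORMAT"] = "csv"   # 'tsv' and 'xlsx' remain valid; 'json' no longer is
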
6 changes: 5 additions & 1 deletion vlmeval/vlm/__init__.py
@@ -6,6 +6,7 @@
from .base import BaseModel
from .hawk_vl import HawkVL
from .thyme import Thyme
from .keye_vl import KeyeVL
from .cogvlm import CogVlm, GLM4v, GLMThinking
from .emu import Emu, Emu3_chat, Emu3_gen
from .eagle_x import Eagle
@@ -20,10 +20,11 @@
LLaVA_Next2,
LLaVA_OneVision,
LLaVA_OneVision_HF,
LLaVA_OneVision_1_5,
)
from .vita import VITA, VITAQwen2
from .long_vita import LongVITA
from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6, MiniCPM_o_2_6
from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6, MiniCPM_o_2_6, MiniCPM_V_4, MiniCPM_V_4_5
from .minigpt4 import MiniGPT4
from .mmalaya import MMAlaya, MMAlaya2
from .monkey import Monkey, MonkeyChat
@@ -67,6 +69,8 @@
LLaMAVID,
VideoChat2_HD,
PLLaVA,
InternVideo,
VideoLLaMA3,
)
from .vila import VILA, NVILA
from .ovis import Ovis, Ovis1_6, Ovis1_6_Plus, Ovis2, OvisU1
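
With the exports above in place, the new wrappers should be importable straight from the package; a quick, assumed smoke test:

# Assumed smoke test for the exports added in vlmeval/vlm/__init__.py.
from vlmeval.vlm import (
    KeyeVL, MiniCPM_V_4, MiniCPM_V_4_5,
    LLaVA_OneVision_1_5, InternVideo, VideoLLaMA3,
)
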
2 changes: 1 addition & 1 deletion vlmeval/vlm/base.py
@@ -90,7 +90,7 @@ def preproc_content(self, inputs):
assert 'type' in item and 'value' in item
mime, s = parse_file(item['value'])
if mime is None:
assert item['type'] == 'text'
assert item['type'] == 'text', f'Invalid input type: {item}'
else:
assert mime.split('/')[0] == item['type']
item['value'] = s
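
The stricter assertion above now reports the offending item. For reference, a message that passes preproc_content might look like the sketch below (placeholder path; the image file must exist so its MIME type matches the declared type):

message = [
    {"type": "image", "value": "demo.jpg"},        # placeholder; must resolve to an image file
    {"type": "text", "value": "What is shown?"},   # not a file, so its type must be 'text'
]
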
182 changes: 182 additions & 0 deletions vlmeval/vlm/keye_vl.py
@@ -0,0 +1,182 @@


from transformers import AutoProcessor

from .base import BaseModel

class KeyeVL(BaseModel):

INSTALL_REQ = True
INTERLEAVE = True
VIDEO_LLM = True

def __init__(self, model_path="Kwai-Keye/Keye-VL-1_5-8B", use_vllm=True, **kwargs):

# check vllm and keye_vl_utils are installed
if use_vllm:
try:
from vllm import LLM, SamplingParams
from keye_vl_utils import process_vision_info
except Exception as e:
raise ImportError(
f"vllm and keye_vl_utils are not installed, please install them first, {e}"
"You can install them by running: "
"pip install keye-vl-utils==1.5.2 vllm>=0.10.2"
)
else:
try:
from transformers import AutoModel, AutoTokenizer
from keye_vl_utils import process_vision_info
except Exception as e:
raise ImportError(
f"transformers and keye_vl_utils are not installed, please install them first, {e}"
"You can install them by running: "
"pip install keye-vl-utils==1.5.2 transformers>=4.56.1"
)

self.use_vllm = use_vllm
self.fps = 1
self.max_frames = 64  # cap on sampled frames in the transformers path; 1024 is a possible higher setting
self.kwargs = kwargs
# min_pixels = 32 * 28 * 28
# max_pixels = 1280 * 28 * 28

self.model_path = model_path
if use_vllm:
try:
# Prefer eager mode to avoid torch.compile tracing of generators in custom model code
self.llm = LLM(
model=model_path,
limit_mm_per_prompt={"image": 10, "video": 10},
trust_remote_code=True,
enforce_eager=True,
)
except TypeError:
# Fallback for older vLLM versions without enforce_eager
self.llm = LLM(
model=model_path,
limit_mm_per_prompt={"image": 10, "video": 10},
trust_remote_code=True,
tensor_parallel_size=1,
gpu_memory_utilization=0.8,
max_num_batched_tokens=32768,
max_model_len=32768,
)
sampling_params = SamplingParams(
temperature=0.3,
max_tokens=4096,
)
self.sampling_params = sampling_params
self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
else:
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype="auto",
trust_remote_code=True,
# flash_attention_2 is recommended for better performance
attn_implementation="flash_attention_2",
).eval()
self.model.to("cuda")
self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

def generate_inner_vllm(self, message, dataset=None):
print(f'{self.model_path} is a video LLM served with vLLM; fps/nframe cannot be set here, so the default sampling method in keye_vl_utils is used')
content_list = []
for msg in message:
if msg["type"] == "text":
content_list.append(
{"type": "text", "text": msg["value"]}
)
elif msg["type"] == "image":
content_list.append(
{"type": "image", "image": msg["value"]}
)
elif msg["type"] == "video":
content_list.append(
{"type": "video", "video": msg["value"]}
)
else:
raise ValueError(f"Invalid message type: {msg['type']}, {msg}")
conversation = [
{"role": "user", "content": content_list}
]
prompt = self.processor.apply_chat_template(
conversation, tokenize=False, add_generation_prompt=True,
)
from keye_vl_utils import process_vision_info
image_inputs, video_inputs, video_kwargs = process_vision_info(
conversation
)

mm_data = {}
if image_inputs is not None:
mm_data["image"] = image_inputs
if video_inputs is not None:
mm_data["video"] = video_inputs

llm_inputs = {
"prompt": prompt,
"multi_modal_data": mm_data,
# FPS will be returned in video_kwargs
"mm_processor_kwargs": video_kwargs,
}

outputs = self.llm.generate([llm_inputs], sampling_params=self.sampling_params)
generated_text = outputs[0].outputs[0].text

return generated_text

def generate_inner_transformers(self, message, dataset=None):
content_list = []
for msg in message:
if msg["type"] == "text":
content_list.append(
{"type": "text", "text": msg["value"]}
)
elif msg["type"] == "image":
content_list.append(
{"type": "image", "image": msg["value"]}
)
elif msg["type"] == "video":
content_list.append(
{"type": "video", "video": msg["value"], "fps": self.fps, "max_frames": self.max_frames}
)
else:
raise ValueError(f"Invalid message type: {msg['type']}, {msg}")
conversation = [
{"role": "user", "content": content_list}
]
# Preparation for inference
text = self.processor.apply_chat_template(
conversation, tokenize=False, add_generation_prompt=True
)
from keye_vl_utils import process_vision_info
image_inputs, video_inputs, mm_processor_kwargs = process_vision_info(conversation)
inputs = self.processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
**mm_processor_kwargs
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = self.model.generate(**inputs, **self.kwargs)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = self.processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
return output_text[0]  # batch_decode returns a list; return the single decoded string


def generate_inner(self, message, dataset=None):
if self.use_vllm:
return self.generate_inner_vllm(message, dataset)
else:
return self.generate_inner_transformers(message, dataset)
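
A hypothetical end-to-end use of the wrapper defined above (video path and prompt are placeholders; use_vllm=False selects the transformers path):

# Hypothetical usage of KeyeVL; 'demo.mp4' and the prompt are placeholders.
model = KeyeVL(model_path="Kwai-Keye/Keye-VL-1_5-8B", use_vllm=False, max_new_tokens=4096)
message = [
    {"type": "video", "value": "demo.mp4"},
    {"type": "text", "value": "Describe what happens in this video."},
]
print(model.generate_inner(message))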


4 changes: 2 additions & 2 deletions vlmeval/vlm/llava/__init__.py
@@ -1,4 +1,4 @@
from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF, LLaVA_OneVision_1_5
from .llava_xtuner import LLaVA_XTuner

__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']
__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF', 'LLaVA_OneVision_1_5']