Add GLM4.1V model (Draft) #19331
Base: `main`
The first file changed is the vision-language offline-inference example; the hunk adds a `run_glm4_1v` runner between the existing `run_glm4v` and `run_h2ovl` runners.

`@@ -248,6 +248,42 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:`

```python
    )


# GLM-4.1V
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "THUDM/GLM-4.1V-9B-Thinking"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={"image": 1},
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
```
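For context, runners like this are consumed by the example's main loop, which builds an `LLM` from the returned `EngineArgs` and generates against the formatted prompts. A minimal sketch of that flow (assuming vLLM's standard `LLM`/`SamplingParams` API and the `ModelRequestData` container used above; `example.jpg` is a placeholder path):

```python
from dataclasses import asdict

from PIL import Image
from vllm import LLM, SamplingParams

req = run_glm4_1v(["What is shown in this image?"], modality="image")
llm = LLM(**asdict(req.engine_args))  # EngineArgs is a dataclass

image = Image.open("example.jpg")  # placeholder input image
outputs = llm.generate(
    {
        "prompt": req.prompts[0],
        "multi_modal_data": {"image": image},
    },
    sampling_params=SamplingParams(temperature=0.2, max_tokens=128),
)
print(outputs[0].outputs[0].text)
```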
The second file changed is the rotary positional embedding module. The first hunk adds an `itertools` import, used below to group tokens by modality:

`@@ -23,6 +23,7 @@`

```python
# See the License for the specific language governing permissions and
# limitations under the License.
"""Rotary Positional Embeddings."""
import itertools
import math
from typing import Any, Optional, Union
```
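For reference, `itertools.groupby` collapses consecutive equal elements into runs, which is how the new helper segments the token stream into text/image/video spans (a standalone illustration, not part of the diff):

```python
import itertools

types = ["text", "text", "image", "image", "text"]
runs = [(key, len(list(group)))
        for key, group in itertools.groupby(types)]
print(runs)  # [('text', 2), ('image', 2), ('text', 1)]
```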
The next hunk routes GLM-4V-style models to a dedicated M-RoPE helper in `get_input_positions_tensor`:

`@@ -1117,6 +1118,15 @@ def get_input_positions_tensor(`

```python
                audio_feature_lengths=audio_feature_lengths,
                use_audio_in_video=use_audio_in_video,
            )
        elif "glm4v" in hf_config.model_type:
            return cls._glm4v_get_input_positions_tensor(
                input_tokens=input_tokens,
                hf_config=hf_config,
                image_grid_thw=image_grid_thw,
                video_grid_thw=video_grid_thw,
                context_len=context_len,
                seq_len=seq_len,
            )
        else:
            return cls._vl_get_input_positions_tensor(
                input_tokens=input_tokens,
```
The final hunk implements the helper itself:

`@@ -1128,6 +1138,115 @@ def get_input_positions_tensor(`

```python
                seq_len=seq_len,
            )

    @classmethod
    def _glm4v_get_input_positions_tensor(
        cls,
        input_tokens: list[int],
        hf_config: PretrainedConfig,
        image_grid_thw: Union[list[list[int]], torch.Tensor],
        video_grid_thw: Union[list[list[int]], torch.Tensor],
        context_len: int = 0,
        seq_len: Optional[int] = None,
    ) -> tuple[torch.Tensor, int]:
        """Get mrope input positions and delta value for GLM4V."""

        image_token_id = hf_config.image_token_id
        video_start_token_id = hf_config.video_start_token_id
        video_end_token_id = hf_config.video_end_token_id
        spatial_merge_size = hf_config.vision_config.spatial_merge_size
        llm_pos_ids_list: list = []

        if not (image_grid_thw is None and video_grid_thw is None):
            if isinstance(image_grid_thw, torch.Tensor):
                image_grid_thw = image_grid_thw.tolist()

            # Classify each token: image tokens inside a
            # video_start/video_end span belong to a video.
            input_token_type: list[str] = []
            video_check_flg = False
            for token in input_tokens:
                if token == video_start_token_id:
                    video_check_flg = True
                elif token == video_end_token_id:
                    video_check_flg = False

                if (token == image_token_id) and (video_check_flg is False):
                    input_token_type.append("image")
                elif (token == image_token_id) and (video_check_flg is True):
                    input_token_type.append("video")
                else:
                    input_token_type.append("text")

            # Collapse consecutive tokens of the same type into
            # (type, start, end) runs.
            input_type_group: list[tuple[str, int, int]] = []
            for key, group_iter in itertools.groupby(
                    enumerate(input_token_type), lambda x: x[1]):
                group_list = list(group_iter)
                start_index = group_list[0][0]
                end_index = group_list[-1][0] + 1
                input_type_group.append((key, start_index, end_index))

            video_frame_num = 1
            mm_data_idx = 0
            for modality_type, start_idx, end_idx in input_type_group:
                st_idx = llm_pos_ids_list[-1].max() + 1 if len(
                    llm_pos_ids_list) > 0 else 0
                if modality_type == "image":
                    t, h, w = (
                        image_grid_thw[mm_data_idx][0],
                        image_grid_thw[mm_data_idx][1],
                        image_grid_thw[mm_data_idx][2],
                    )
                    llm_grid_t, llm_grid_h, llm_grid_w = \
                        t, h // spatial_merge_size, w // spatial_merge_size

                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
                        -1, llm_grid_h * llm_grid_w).flatten()
                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
                        llm_grid_t, -1, llm_grid_w).flatten()
                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
                        llm_grid_t, llm_grid_h, -1).flatten()
                    llm_pos_ids_list.append(
                        torch.stack([t_index, h_index, w_index]) + st_idx)
                    mm_data_idx += 1

                elif modality_type == "video":
                    t, h, w = (
                        video_frame_num,
                        image_grid_thw[mm_data_idx][1],
                        image_grid_thw[mm_data_idx][2],
                    )
```
Review comment on lines +1212 to +1215: in the video branch the temporal dimension is taken from `video_frame_num`, but the spatial dimensions are read from `image_grid_thw`; they should come from `video_grid_thw`:

```python
t, h, w = (
    video_frame_num,
    video_grid_thw[mm_data_idx][1],
    video_grid_thw[mm_data_idx][2],
)
```
The hunk continues:

```python
                    llm_grid_t, llm_grid_h, llm_grid_w = \
                        t, h // spatial_merge_size, w // spatial_merge_size

                    for t_idx in range(llm_grid_t):
                        t_index = torch.tensor(t_idx).view(-1, 1).expand(
                            -1, llm_grid_h * llm_grid_w).flatten()
                        h_index = torch.arange(llm_grid_h).view(
                            1, -1, 1).expand(1, -1, llm_grid_w).flatten()
                        w_index = torch.arange(llm_grid_w).view(
                            1, 1, -1).expand(1, llm_grid_h, -1).flatten()
                        llm_pos_ids_list.append(
                            torch.stack([t_index, h_index, w_index]) + st_idx)

                    mm_data_idx += 1
                    video_frame_num += 1

                else:
                    text_len = end_idx - start_idx
                    llm_pos_ids_list.append(
                        torch.arange(text_len).view(1, -1).expand(3, -1) +
                        st_idx)
                    video_frame_num = 1

        else:
            # Text-only prompt: plain sequential positions on all axes.
            text_len = len(input_tokens)
            llm_pos_ids_list.append(
                torch.arange(text_len).view(1, -1).expand(3, -1))

        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
        llm_positions = llm_positions[:, context_len:seq_len]
        mrope_position_delta = (llm_positions.max() + 1 -
                                len(input_tokens)).item()
        return llm_positions, mrope_position_delta

    @classmethod
    def _vl_get_input_positions_tensor(
        cls,
```

(A further review comment on lines +1246 to +1247 concerns the calculation of `mrope_position_delta`.)
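To make the position scheme concrete, here is a toy illustration (not part of the PR) of the indices the image branch produces for a single 1×4×4 image grid with `spatial_merge_size=2`, i.e. a 1×2×2 grid of merged patches, preceded by three text tokens:

```python
import torch

spatial_merge_size = 2
t, h, w = 1, 4, 4  # one frame, 4x4 patches before merging
llm_grid_t = t
llm_grid_h = h // spatial_merge_size  # 2
llm_grid_w = w // spatial_merge_size  # 2

# Text tokens share the same position on all three (t, h, w) axes.
text_pos = torch.arange(3).view(1, -1).expand(3, -1)

# Image tokens start right after the largest text position.
st_idx = text_pos.max() + 1  # 3
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
    -1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
    llm_grid_t, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(
    llm_grid_t, llm_grid_h, -1).flatten()
image_pos = torch.stack([t_index, h_index, w_index]) + st_idx

positions = torch.cat([text_pos, image_pos], dim=1)
print(positions)
# tensor([[0, 1, 2, 3, 3, 3, 3],
#         [0, 1, 2, 3, 3, 4, 4],
#         [0, 1, 2, 3, 4, 3, 4]])
```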