From 5df0353b85c9e65d61e6c0b3099db7eb65a2cfbe Mon Sep 17 00:00:00 2001
From: xffxff <1247714429@qq.com>
Date: Thu, 21 Nov 2024 03:24:04 +0000
Subject: [PATCH] run format.sh

Signed-off-by: xffxff <1247714429@qq.com>
---
 vllm/model_executor/models/aria.py | 221 ++++++++++++++---------------
 1 file changed, 108 insertions(+), 113 deletions(-)

diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index 4c229599a9428..ff02efd5c0830 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -80,6 +80,7 @@ def forward(self, x, *args, **kwargs):
 
 
 class AriaVisionTransformer(Idefics2VisionTransformer):
+
     def __init__(
         self,
         config: AriaVisionConfig,
@@ -214,12 +215,16 @@ def forward(self, x, hidden_states, attn_mask=None, add_residual=False):
         key = self.k_proj(x).permute(1, 0, 2)
         value = self.v_proj(x).permute(1, 0, 2)
 
-        attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask)
+        attn_output, _ = self.multihead_attn(query,
+                                             key,
+                                             value,
+                                             attn_mask=attn_mask)
 
         attn_output = attn_output.permute(1, 0, 2)
 
         if add_residual:
-            attn_output = hidden_states + self.dropout(self.linear(attn_output))
+            attn_output = hidden_states + self.dropout(
+                self.linear(attn_output))
         else:
             attn_output = self.dropout(self.linear(attn_output))
 
@@ -260,8 +265,7 @@ def __init__(
         self.num_heads = num_heads
 
         self.query = nn.Parameter(
-            torch.zeros(max(patch_to_query_dict.values()), self.embed_dim)
-        )
+            torch.zeros(max(patch_to_query_dict.values()), self.embed_dim))
 
         trunc_normal_(self.query, std=0.02)
 
@@ -296,9 +300,8 @@ def forward(self, x, attn_mask=None):
         queries = self.query.unsqueeze(0).repeat(bs, 1, 1)
 
         query_num = self.patch_to_query_dict.get(x.shape[1], None)
-        assert (
-            query_num is not None
-        ), f"Query number for {x.shape[1]} patches is not provided"
+        assert (query_num is not None
+                ), f"Query number for {x.shape[1]} patches is not provided"
 
         queries = queries[:, :query_num, :]
 
@@ -380,12 +383,16 @@ def forward(self, x, hidden_states, attn_mask=None, add_residual=False):
         key = self.k_proj(x).permute(1, 0, 2)
         value = self.v_proj(x).permute(1, 0, 2)
 
-        attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask)
+        attn_output, _ = self.multihead_attn(query,
+                                             key,
+                                             value,
+                                             attn_mask=attn_mask)
 
         attn_output = attn_output.permute(1, 0, 2)
 
         if add_residual:
-            attn_output = hidden_states + self.dropout(self.linear(attn_output))
+            attn_output = hidden_states + self.dropout(
+                self.linear(attn_output))
         else:
             attn_output = self.dropout(self.linear(attn_output))
 
@@ -426,8 +433,7 @@ def __init__(
         self.num_heads = num_heads
 
         self.query = nn.Parameter(
-            torch.zeros(max(patch_to_query_dict.values()), self.embed_dim)
-        )
+            torch.zeros(max(patch_to_query_dict.values()), self.embed_dim))
 
         trunc_normal_(self.query, std=0.02)
 
@@ -462,9 +468,8 @@ def forward(self, x, attn_mask=None):
         queries = self.query.unsqueeze(0).repeat(bs, 1, 1)
 
         query_num = self.patch_to_query_dict.get(x.shape[1], None)
-        assert (
-            query_num is not None
-        ), f"Query number for {x.shape[1]} patches is not provided"
+        assert (query_num is not None
+                ), f"Query number for {x.shape[1]} patches is not provided"
 
         queries = queries[:, :query_num, :]
 
@@ -520,13 +525,14 @@ def __init__(
 
 
 class Experts(nn.Module):
+
     def __init__(self, config: AriaMoELMConfig):
         super().__init__()
         self.config = config
 
         self.router_weight = nn.Parameter(
-            torch.empty((self.config.moe_num_experts, self.config.hidden_size))
-        )
+            torch.empty(
+                (self.config.moe_num_experts, self.config.hidden_size)))
 
         self.tp_size = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
@@ -536,62 +542,60 @@ def __init__(self, config: AriaMoELMConfig):
             )
 
         self.w1 = nn.Parameter(
-            torch.empty(
-                (
-                    config.moe_num_experts,
-                    config.moe_intermediate_size * 2 // self.tp_size,
-                    config.hidden_size,
-                )
-            )
-        )
+            torch.empty((
+                config.moe_num_experts,
+                config.moe_intermediate_size * 2 // self.tp_size,
+                config.hidden_size,
+            )))
         self.w2 = nn.Parameter(
-            torch.empty(
-                (
-                    config.moe_num_experts,
-                    config.hidden_size,
-                    config.moe_intermediate_size // self.tp_size,
-                )
-            )
-        )
-        set_weight_attrs(
-            self.router_weight, {"weight_loader": self._weight_loader_for_router}
-        )
-        set_weight_attrs(self.w1, {"weight_loader": self._weight_loader_for_w1})
-        set_weight_attrs(self.w2, {"weight_loader": self._weight_loader_for_w2})
-
-    def _weight_loader_for_router(
-        self, param: nn.Parameter, loaded_weight: torch.Tensor
-    ):
+            torch.empty((
+                config.moe_num_experts,
+                config.hidden_size,
+                config.moe_intermediate_size // self.tp_size,
+            )))
+        set_weight_attrs(self.router_weight,
+                         {"weight_loader": self._weight_loader_for_router})
+        set_weight_attrs(self.w1,
+                         {"weight_loader": self._weight_loader_for_w1})
+        set_weight_attrs(self.w2,
+                         {"weight_loader": self._weight_loader_for_w2})
+
+    def _weight_loader_for_router(self, param: nn.Parameter,
+                                  loaded_weight: torch.Tensor):
         param.data.copy_(loaded_weight)
 
-    def _weight_loader_for_w1(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+    def _weight_loader_for_w1(self, param: nn.Parameter,
+                              loaded_weight: torch.Tensor):
         # the shape of loaded_weight is (num_experts, hidden_size, 2 * moe_intermediate_size)
         if self.tp_size > 1:
             up, gate = loaded_weight.chunk(2, dim=-1)
             up_current_rank = up.chunk(self.tp_size, dim=-1)[self.tp_rank]
             gate_current_rank = gate.chunk(self.tp_size, dim=-1)[self.tp_rank]
-            up_and_gate = torch.cat(
-                [up_current_rank, gate_current_rank], dim=-1
-            ).transpose(1, 2)
+            up_and_gate = torch.cat([up_current_rank, gate_current_rank],
+                                    dim=-1).transpose(1, 2)
             param.data.copy_(up_and_gate)
         else:
             param.data.copy_(loaded_weight.transpose(1, 2))
 
-    def _weight_loader_for_w2(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+    def _weight_loader_for_w2(self, param: nn.Parameter,
+                              loaded_weight: torch.Tensor):
         # the shape of loaded_weight is (num_experts, moe_intermediate_size, hidden_size)
         if self.tp_size > 1:
-            down_current_rank = loaded_weight.chunk(self.tp_size, dim=1)[self.tp_rank]
+            down_current_rank = loaded_weight.chunk(self.tp_size,
+                                                    dim=1)[self.tp_rank]
             param.data.copy_(down_current_rank.transpose(1, 2))
         else:
             param.data.copy_(loaded_weight.transpose(1, 2))
 
     def forward(self, hidden_states):
-        router_output = torch.nn.functional.linear(hidden_states, self.router_weight)
-
-        def custom_routing_function(hidden_states, router_output, topk, renormalize):
-            top_logits, top_indices = torch.topk(
-                router_output, k=self.config.moe_topk, dim=1
-            )
+        router_output = torch.nn.functional.linear(hidden_states,
+                                                   self.router_weight)
+
+        def custom_routing_function(hidden_states, router_output, topk,
+                                    renormalize):
+            top_logits, top_indices = torch.topk(router_output,
+                                                 k=self.config.moe_topk,
+                                                 dim=1)
             scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32)
             return scores, top_indices.to(torch.int32)
 
@@ -608,7 +612,8 @@ def custom_routing_function(hidden_states, router_output, topk, renormalize):
             custom_routing_function=custom_routing_function,
         )
         final_hidden_states = final_hidden_states.view(hidden_states_shape)
-        final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        final_hidden_states = tensor_model_parallel_all_reduce(
+            final_hidden_states)
         return final_hidden_states
 
 
@@ -674,24 +679,21 @@ def __init__(
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         if rope_scaling is not None and getattr(
-            config, "original_max_position_embeddings", None
-        ):
+                config, "original_max_position_embeddings", None):
             rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings
-            )
-        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+                config.original_max_position_embeddings)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
         # Support abacusai/Smaug-72B-v0.1 with attention_bias
         # Support internlm/internlm-7b with bias
         attention_bias = getattr(config, "attention_bias", False) or getattr(
-            config, "bias", False
-        )
+            config, "bias", False)
         self.self_attn = LlamaAttention(
             config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
-            num_kv_heads=getattr(
-                config, "num_key_value_heads", config.num_attention_heads
-            ),
+            num_kv_heads=getattr(config, "num_key_value_heads",
+                                 config.num_attention_heads),
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
@@ -700,11 +702,13 @@ def __init__(
             cache_config=cache_config,
             prefix=f"{prefix}.self_attn",
         )
-        self.mlp = MoELayer(config, quant_config=quant_config, lora_config=lora_config)
-        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(
-            config.hidden_size, eps=config.rms_norm_eps
-        )
+        self.mlp = MoELayer(config,
+                            quant_config=quant_config,
+                            lora_config=lora_config)
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
 
 
 class AriaMoELMModel(LlamaModel):
@@ -757,9 +761,8 @@ def build_mm_projector(config):
     )
 
 
-def _select_best_resolution(
-    img_width: int, img_height: int, target_ratios: List[List[int]], patch_size: int
-):
+def _select_best_resolution(img_width: int, img_height: int,
+                            target_ratios: List[List[int]], patch_size: int):
     """
     Selects the best resolution from a list of possible resolutions based on the original size.
 
@@ -783,10 +786,8 @@ def _select_best_resolution(
         if ratio_diff < best_ratio_diff:
             best_ratio_diff = ratio_diff
             best_ratio_w, best_ratio_h = ratio[0], ratio[1]
-        elif (
-            ratio_diff == best_ratio_diff
-            and area > 0.5 * patch_size * patch_size * ratio[0] * ratio[1]
-        ):
+        elif (ratio_diff == best_ratio_diff
+              and area > 0.5 * patch_size * patch_size * ratio[0] * ratio[1]):
             best_ratio_w, best_ratio_h = ratio[0], ratio[1]
 
     return best_ratio_w, best_ratio_h
@@ -832,8 +833,7 @@ def split_image(
     """
     if split_image:
         ratio_width, ratio_height = _select_best_resolution(
-            image.width, image.height, split_ratio, patch_size
-        )
+            image.width, image.height, split_ratio, patch_size)
         resize_width = patch_size * ratio_width
         resize_height = patch_size * ratio_height
         blocks = ratio_width * ratio_height
@@ -870,21 +870,20 @@ def input_mapper_for_aria(ctx, data):
     The only different is we would like to support runtime max_image_size adjustment.
     """
     model_config = ctx.model_config
-    max_image_size = getattr(model_config.multimodal_config, "max_image_size", 980)
+    max_image_size = getattr(model_config.multimodal_config, "max_image_size",
+                             980)
 
     # PIL image
     if isinstance(data, Image.Image) or is_list_of(data, Image.Image):
         image_processor = cached_get_image_processor(
-            model_config.model, trust_remote_code=model_config.trust_remote_code
-        )
+            model_config.model,
+            trust_remote_code=model_config.trust_remote_code)
         if image_processor is None:
-            raise RuntimeError(
-                "No HuggingFace processor is available " "to process the image object"
-            )
+            raise RuntimeError("No HuggingFace processor is available "
+                               "to process the image object")
         try:
             batch_data = image_processor.preprocess(
-                data, max_image_size=max_image_size, return_tensors="pt"
-            ).data
+                data, max_image_size=max_image_size, return_tensors="pt").data
             batch_data.pop("num_crops")
         except Exception:
             logger.error("Failed to process image (%s)", data)
@@ -915,17 +914,15 @@ def input_processor(ctx, llm_inputs):
     _split_image = multi_modal_data.pop("split_image", False)
 
     assert isinstance(max_image_size, int) or isinstance(
-        max_image_size, float
-    ), "max_image_size should be float or int"
-    images = (
-        multi_modal_data["image"]
-        if isinstance(multi_modal_data["image"], list)
-        else [multi_modal_data["image"]]
-    )
+        max_image_size, float), "max_image_size should be float or int"
+    images = (multi_modal_data["image"] if isinstance(
+        multi_modal_data["image"], list) else [multi_modal_data["image"]])
     num_crops = []
     splitted_images = []
    for image in images:
-        splitted_image = split_image(image, _split_image, patch_size=max_image_size)
+        splitted_image = split_image(image,
+                                     _split_image,
+                                     patch_size=max_image_size)
         splitted_images.extend(splitted_image)
         num_crops.append(len(splitted_image))
     max_image_size = [max_image_size] * len(images)
@@ -938,11 +935,13 @@ def input_processor(ctx, llm_inputs):
         assert (
             image_size in hf_config.image_size2tokens
         ), f"Invalid image size: {image_size}, available options: {list(hf_config.image_size2tokens.keys())}"
-        image_feature_sizes.append(hf_config.image_size2tokens[image_size] * num_crop)
+        image_feature_sizes.append(hf_config.image_size2tokens[image_size] *
+                                   num_crop)
 
     # Set up the max_image_size and split_image in the RuntimeContext for the image processor
     # TODO: Supports dynamic image size support
-    setattr(model_config.multimodal_config, "max_image_size", max(max_image_size))
+    setattr(model_config.multimodal_config, "max_image_size",
+            max(max_image_size))
 
     new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
         tokenizer,
@@ -997,9 +996,8 @@ def __init__(
             vllm_config=vllm_config.with_hf_config(config.text_config),
             prefix=maybe_prefix(prefix, "language_model.model"),
        )
-        self.pad_token_id = (
-            self.config.pad_token_id if self.config.pad_token_id is not None else -1
-        )
+        self.pad_token_id = (self.config.pad_token_id
+                             if self.config.pad_token_id is not None else -1)
         self.unpadded_vocab_size = config.text_config.vocab_size
         self.lm_head = ParallelLMHead(
             self.unpadded_vocab_size,
@@ -1008,9 +1006,8 @@ def __init__(
             quant_config=quant_config,
         )
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(
-            self.unpadded_vocab_size, self.vocab_size, logit_scale
-        )
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                self.vocab_size, logit_scale)
         self.sampler = Sampler()
 
     def forward(
@@ -1030,8 +1027,7 @@ def forward(
         # 2. Merge text and images
         if pixel_values is not None:
             pixel_values = pixel_values.view(-1, *pixel_values.shape[-3:]).to(
-                torch.bfloat16
-            )
+                torch.bfloat16)
             pixel_mask = pixel_mask.view(-1, *pixel_mask.shape[-2:])
             selected_image_feature, image_attn_mask = self.vision_tower(
                 pixel_values,
@@ -1039,13 +1035,12 @@ def forward(
             )
 
             image_features = self.multi_modal_projector(
-                selected_image_feature, attn_mask=image_attn_mask
-            )
+                selected_image_feature, attn_mask=image_attn_mask)
 
             inputs_embeds = inputs_embeds.to(image_features.dtype)
             inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, image_features, self.config.image_token_index
-            )
+                input_ids, inputs_embeds, image_features,
+                self.config.image_token_index)
 
         hidden_states = self.language_model(
             input_ids,
@@ -1058,10 +1053,10 @@ def forward(
 
         return hidden_states
 
-    def compute_logits(
-        self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata
-    ) -> torch.Tensor:
-        logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
         return logits
 
     def sample(
@@ -1086,4 +1081,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         )
 
         loader = AutoWeightsLoader(self)
-        loader.load_weights(weights, mapper=hf_to_vllm_mapper)
\ No newline at end of file
+        loader.load_weights(weights, mapper=hf_to_vllm_mapper)