diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 49328aa2818ca..e8a69f29f999f 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -557,6 +557,7 @@ class ParallelConfig: if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. + from vllm.v1.executor import ray_utils backend: DistributedExecutorBackend = "mp" diff --git a/vllm/distributed/afd_transfer/afd_connector/p2p_connector.py b/vllm/distributed/afd_transfer/afd_connector/p2p_connector.py index 0655db693bce4..6dc1f317b87d5 100644 --- a/vllm/distributed/afd_transfer/afd_connector/p2p_connector.py +++ b/vllm/distributed/afd_transfer/afd_connector/p2p_connector.py @@ -317,4 +317,5 @@ class P2PAFDConnector(AFDConnectorBase): ) self._current_afd_connector_metadata.recv_handle_list = work_list self._current_afd_connector_metadata.layer_idx = layer_idx + self._current_afd_connector_metadata.stage_idx = stage_idx return hidden_states, self._current_afd_connector_metadata diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dfcc601a1c530..f4bc875cf10b3 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -563,7 +563,6 @@ class ColumnParallelLinear(LinearBase): # Matrix multiply. assert self.quant_method is not None output_parallel = self.quant_method.apply(self, input_, bias) - if self.gather_output and self.tp_size > 1: # All-gather across the partitions. output = tensor_model_parallel_all_gather(output_parallel) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index eb9d85cfc55fd..7938cff98c354 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -387,7 +387,6 @@ class DeepseekV2MoE(nn.Module): final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( final_hidden_states ) - return final_hidden_states.view(num_tokens, hidden_dim) @@ -1415,9 +1414,6 @@ class DeepseekV2Model(nn.Module): for layer in islice(self.layers, self.start_layer, self.end_layer): for stage_i in range(forward_conext.afd_metadata.num_of_stages): - logger.info( - f"jcz deepseekv2 forward_with_afd_v2 layer_idx: {layer.layer_idx}, stage_i: {stage_i}" - ) afd_connector = afd_metadata.afd_connector forward_conext.attn_metadata = afd_metadata.attn_metadata_list[stage_i] forward_conext.dp_metadata = afd_metadata.dp_metadata_list[stage_i] @@ -1436,10 +1432,6 @@ class DeepseekV2Model(nn.Module): work.wait() current_positions = afd_metadata.positions_list[stage_i] - logger.info( - f"jcz deepseekv2 forward_with_afd_v2 hidden_states: {hidden_states.shape}" - f" positions:{positions.shape}" - ) hidden_states, residual = layer( current_positions, hidden_states, residual, llama_4_scaling ) @@ -1458,15 +1450,11 @@ class DeepseekV2Model(nn.Module): ) afd_connector.send_attn_output(hidden_states, metadata) - # Recv last layer and last stage FFN output. - ubatch_hidden_states[afd_metadata.num_of_stages - 1], recv_metadata = ( - afd_connector.recv_ffn_output() - ) - if recv_metadata.recv_handle_list is not None: - recv_handle = recv_metadata.recv_handle_list - if recv_handle is not None: - for work in recv_handle: - work.wait() + # Recv last layer FFN output. + for stage_i in range(afd_metadata.num_of_stages): + ubatch_hidden_states[stage_i], recv_metadata = ( + afd_connector.recv_ffn_output() + ) # Re-assemble the batch hidden_states = torch.cat(ubatch_hidden_states, dim=0) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 257e097401b10..41b617843ac11 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -105,7 +105,6 @@ class EngineCore: self.afd_config = vllm_config.afd_config if self.afd_config and self.afd_config.afd_role == "ffn": - logger.info("jcz EngineCore ffn role") return self.available_gpu_memory_for_kv_cache = -1 diff --git a/vllm/v1/worker/gpu_ffn_model_runner.py b/vllm/v1/worker/gpu_ffn_model_runner.py index 51dd62255acc7..cb08c9c05ae58 100644 --- a/vllm/v1/worker/gpu_ffn_model_runner.py +++ b/vllm/v1/worker/gpu_ffn_model_runner.py @@ -137,6 +137,7 @@ class GPUFFNModelRunner(LoRAModelRunnerMixin): current_layer_idx = recv_metadata.layer_idx logger.info( f"layer {current_layer_idx} moe recv hidden states type:{type(hidden_states)}, shape:{hidden_states.shape}" + f" dp_metadata: {dp_metadata}" ) num_tokens = hidden_states.shape[0] if recv_metadata is not None and recv_metadata.recv_handle_list is not None: