[doc] use MkDocs collapsible blocks - supplement (#19973)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
2026-07-06 17:27:14 +08:00 · 2025-06-23 18:54:16 +08:00 · 2025-06-23 18:54:16 +08:00 · b82e0f82cb
commit b82e0f82cb
parent 5111642a6f
3 changed files with 286 additions and 260 deletions
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@ -61,23 +61,25 @@ To address the above issues, I have designed and developed a local Tensor memory
 # Install vLLM
-```shell
+??? Commands
 # Enter the home directory or your working directory.
 cd /home
-# Download the installation package, and I will update the commit-id in time. You can directly copy the command.
+    ```shell
-wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+    # Enter the home directory or your working directory.
    cd /home
-# Download the code repository.
+    # Download the installation package, and I will update the commit-id in time. You can directly copy the command.
-git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
+    wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 cd vllm
-# Set the installation package path.
+    # Download the code repository.
-export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+    git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
    cd vllm
-# installation
+    # Set the installation package path.
-pip install -e . -v
+    export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-```
+
    # installation
    pip install -e . -v
    ```
 # Run xPyD
@ -104,83 +106,91 @@ python3 disagg_prefill_proxy_xpyd.py &
 ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20005 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20005 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.9 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.9 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 ### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20009 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20009 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.7 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.7 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 ### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20003 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20003 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.7 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.7 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 ### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20008 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20008 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.7 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.7 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 ## Run 3P1D
@ -193,83 +203,91 @@ python3 disagg_prefill_proxy_xpyd.py &
 ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20005 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20005 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.9 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.9 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 ### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20009 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20009 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.9 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.9 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 ### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20003 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20003 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.9 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.9 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 ### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
-```shell
+??? Command
-VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
+
-    --host 0.0.0.0 \
+    ```shell
-    --port 20008 \
+    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
-    --tensor-parallel-size 1 \
+        --host 0.0.0.0 \
-    --seed 1024 \
+        --port 20008 \
-    --served-model-name base_model \
+        --tensor-parallel-size 1 \
-    --dtype float16 \
+        --seed 1024 \
-    --max-model-len 10000 \
+        --served-model-name base_model \
-    --max-num-batched-tokens 10000 \
+        --dtype float16 \
-    --max-num-seqs 256 \
+        --max-model-len 10000 \
-    --trust-remote-code \
+        --max-num-batched-tokens 10000 \
-    --gpu-memory-utilization 0.7 \
+        --max-num-seqs 256 \
-    --disable-log-request \
+        --trust-remote-code \
-    --kv-transfer-config \
+        --gpu-memory-utilization 0.7 \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
+        --disable-log-request \
-```
+        --kv-transfer-config \
        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
    ```
 # Single request
@ -286,25 +304,27 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \
 # Benchmark
-```shell
+??? Command
-python3 benchmark_serving.py \
+
-    --backend vllm \
+    ```shell
-    --model base_model \
+    python3 benchmark_serving.py \
-    --tokenizer meta-llama/Llama-3.1-8B-Instruct \
+        --backend vllm \
-    --dataset-name "random" \
+        --model base_model \
-    --host 10.0.1.1 \
+        --tokenizer meta-llama/Llama-3.1-8B-Instruct \
-    --port 10001 \
+        --dataset-name "random" \
-    --random-input-len 1024 \
+        --host 10.0.1.1 \
-    --random-output-len 1024 \
+        --port 10001 \
-    --ignore-eos \
+        --random-input-len 1024 \
-    --burstiness 100 \
+        --random-output-len 1024 \
-    --percentile-metrics "ttft,tpot,itl,e2el" \
+        --ignore-eos \
-    --metric-percentiles "90,95,99" \
+        --burstiness 100 \
-    --seed $(date +%s) \
+        --percentile-metrics "ttft,tpot,itl,e2el" \
-    --trust-remote-code \
+        --metric-percentiles "90,95,99" \
-    --request-rate 3 \
+        --seed $(date +%s) \
-    --num-prompts 1000
+        --trust-remote-code \
-```
+        --request-rate 3 \
        --num-prompts 1000
    ```
 # Shut down
--- a/docs/design/v1/torch_compile.md
+++ b/docs/design/v1/torch_compile.md
@ -28,27 +28,29 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
 In the very verbose logs, we can see:
-```
+??? Logs
 DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
-DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
+      ```text
-DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
+      DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
 DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
-DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
+      DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
-DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
+      DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
-```
+      DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
      DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
      DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
      DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
      ```
 This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function with code `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, there are also other functions called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access will trigger a function call), and some communication / attention / activation functions from vLLM. All the traced files will be considered when we decide the cache directory to use. This way, any code change in the above files will trigger a compilation cache miss, and therefore recompilation.
@ -99,28 +101,31 @@ This time, Inductor compilation is completely bypassed, and we will load from di
 The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:
-```
+```bash
-vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
+vllm serve meta-llama/Llama-3.2-1B \
  --compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
 ```
 Then it will also compile a specific kernel just for batch sizes `1, 2, 4, 8`. In this case, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
 When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log:
-```
+??? Logs
-AUTOTUNE mm(8x2048, 2048x3072)
+
-  triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
+    ```
-  triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
+    AUTOTUNE mm(8x2048, 2048x3072)
-  triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
+      triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
-  mm 0.0160 ms 81.6% 
+      triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
-  triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
+      triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
-  triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
+      mm 0.0160 ms 81.6% 
-  triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
+      triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
-  triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
+      triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
-  triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
+      triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
-  triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
+      triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
-SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
+      triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
-```
+      triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
    SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
    ```
 It means that, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries the Triton template with various configs, and finds one that is much faster than the default code (which dispatches to the cuBLAS library).
@ -136,8 +141,9 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
 By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:
-```
+```bash
-vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
+vllm serve meta-llama/Llama-3.2-1B \
  --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
 ```
 Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
--- a/examples/others/logging_configuration.md
+++ b/examples/others/logging_configuration.md
@ -55,33 +55,33 @@ STDOUT of the console in JSON format with a log level of `INFO`.
 To begin, first, create an appropriate JSON logging configuration file:
-**/path/to/logging_config.json:**
+??? note "/path/to/logging_config.json"
-```json
+    ```json
-{
+    {
-  "formatters": {
+      "formatters": {
-    "json": {
+        "json": {
-      "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
+          "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
        }
      },
      "handlers": {
        "console": {
          "class" : "logging.StreamHandler",
          "formatter": "json",
          "level": "INFO",
          "stream": "ext://sys.stdout"
        }
      },
      "loggers": {
        "vllm": {
          "handlers": ["console"],
          "level": "INFO",
          "propagate": false
        }
      },
      "version": 1
    }
-  },
+    ```
  "handlers": {
    "console": {
      "class" : "logging.StreamHandler",
      "formatter": "json",
      "level": "INFO",
      "stream": "ext://sys.stdout"
    }
  },
  "loggers": {
    "vllm": {
      "handlers": ["console"],
      "level": "INFO",
      "propagate": false
    }
  },
  "version": 1
 }
 ```
 Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
 to the path of the custom logging configuration JSON file:
@ -104,38 +104,38 @@ configuration overrides the built-in default logging configuration used by vLLM.
 First, create an appropriate JSON logging configuration file that includes
 configuration for the root vLLM logger and for the logger you wish to silence:
-**/path/to/logging_config.json:**
+??? note "/path/to/logging_config.json"
-```json
+    ```json
-{
+    {
-  "formatters": {
+      "formatters": {
-    "vllm": {
+        "vllm": {
-      "class": "vllm.logging_utils.NewLineFormatter",
+          "class": "vllm.logging_utils.NewLineFormatter",
-      "datefmt": "%m-%d %H:%M:%S",
+          "datefmt": "%m-%d %H:%M:%S",
-      "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
+          "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
        }
      },
      "handlers": {
        "vllm": {
          "class" : "logging.StreamHandler",
          "formatter": "vllm",
          "level": "INFO",
          "stream": "ext://sys.stdout"
        }
      },
      "loggers": {
        "vllm": {
          "handlers": ["vllm"],
          "level": "DEBUG",
          "propagate": false
        },
        "vllm.example_noisy_logger": {
          "propagate": false
        }
      },
      "version": 1
    }
-  },
+    ```
  "handlers": {
    "vllm": {
      "class" : "logging.StreamHandler",
      "formatter": "vllm",
      "level": "INFO",
      "stream": "ext://sys.stdout"
    }
  },
  "loggers": {
    "vllm": {
      "handlers": ["vllm"],
      "level": "DEBUG",
      "propagate": false
    },
    "vllm.example_noisy_logger": {
      "propagate": false
    }
  },
  "version": 1
 }
 ```
 Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
 to the path of the custom logging configuration JSON file: