#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# vLLM Embedding Server with Enhanced Chunked Processing
# This script starts a vLLM server with chunked processing enabled for long text embedding.
# Now supports proper pooling type validation and model-specific configurations.
#
# All settings below are overridable via environment variables of the same name.

set -euo pipefail

# Configuration
MODEL_NAME=${MODEL_NAME:-"intfloat/multilingual-e5-large"}
MODEL_CODE=${MODEL_CODE:-"multilingual-e5-large"}

PORT=${PORT:-31090}
GPU_COUNT=${GPU_COUNT:-1}
MAX_EMBED_LEN=${MAX_EMBED_LEN:-3072000}
API_KEY=${API_KEY:-"your-api-key"}

# Enhanced pooling configuration with model-specific defaults
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST

export VLLM_ENABLE_CHUNKED_PROCESSING=true
# FIX: respect a caller-provided CUDA_VISIBLE_DEVICES instead of unconditionally
# clobbering it (previously hard-coded to 2,3,4,5 regardless of the environment).
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"2,3,4,5"}

echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
echo "=================================================================="

# Environment variables for optimization
export VLLM_WORKER_MULTIPROC_METHOD=spawn

# Determine the optimal native pooling type for known embedding model families.
# Arguments: $1 - model name (e.g. HuggingFace repo id)
# Outputs:   pooling type (MEAN | CLS | LAST) on stdout
get_optimal_pooling_type() {
  local model="$1"
  case "$model" in
    *"e5-"* | *"multilingual-e5"*)
      echo "MEAN" # E5 series native pooling
      ;;
    *"bge-"*)
      echo "CLS" # BGE series native pooling
      ;;
    *"gte-"*)
      echo "LAST" # GTE series native pooling
      ;;
    *"sentence-t5"* | *"st5"*)
      echo "MEAN" # Sentence-T5 native pooling
      ;;
    *"jina-embeddings"*)
      echo "MEAN" # Jina embeddings native pooling
      ;;
    *"Qwen"*"Embedding"*)
      echo "LAST" # Qwen embeddings native pooling
      ;;
    *)
      echo "MEAN" # Default native pooling for unknown models
      ;;
  esac
}

# Auto-detect pooling type if not explicitly set
if [ "$POOLING_TYPE" = "auto" ]; then
  POOLING_TYPE=$(get_optimal_pooling_type "$MODEL_NAME")
  echo "🔍 Auto-detected pooling type: $POOLING_TYPE for model $MODEL_NAME"
fi

# Display configuration
echo "📋 Configuration:"
echo " - Model: $MODEL_NAME"
echo " - Port: $PORT"
echo " - GPU Count: $GPU_COUNT"
echo " - Enhanced Chunked Processing: ${VLLM_ENABLE_CHUNKED_PROCESSING}"
echo " - Max Embed Length: ${MAX_EMBED_LEN} tokens"
echo " - Native Pooling Type: $POOLING_TYPE + Normalization"
echo " - Cross-chunk Aggregation: MEAN (automatic)"
echo ""

# Validate GPU availability; clamp GPU_COUNT to what the machine actually has.
if command -v nvidia-smi &> /dev/null; then
  gpu_count=$(nvidia-smi --list-gpus | wc -l)
  echo "🖥️ Available GPUs: $gpu_count"
  if [ "$GPU_COUNT" -gt "$gpu_count" ]; then
    echo "⚠️ Warning: Requested $GPU_COUNT GPUs but only $gpu_count available"
    echo " Adjusting to use $gpu_count GPUs"
    GPU_COUNT=$gpu_count
  fi
else
  echo "⚠️ Warning: nvidia-smi not found. GPU detection skipped."
fi

# Chunked processing uses unified MEAN aggregation
echo "ℹ️ Chunked Processing: Using $POOLING_TYPE pooling within chunks, MEAN aggregation across chunks"
echo " - All chunks processed for complete semantic coverage"
echo " - Weighted averaging based on chunk token counts"
echo ""

echo "🔧 Starting server with enhanced chunked processing configuration..."

# Build pooler config JSON
POOLER_CONFIG="{\"pooling_type\": \"$POOLING_TYPE\", \"normalize\": true, \"enable_chunked_processing\": ${VLLM_ENABLE_CHUNKED_PROCESSING}, \"max_embed_len\": ${MAX_EMBED_LEN}}"

# Start vLLM server with enhanced chunked processing.
# NOTE(review): `vllm serve` runs in the foreground, so everything after this
# command only executes once the server process exits.
vllm serve "$MODEL_NAME" \
  --tensor-parallel-size "$GPU_COUNT" \
  --enforce-eager \
  --pooler-config "$POOLER_CONFIG" \
  --served-model-name "${MODEL_CODE}" \
  --api-key "$API_KEY" \
  --trust-remote-code \
  --port "$PORT" \
  --host 0.0.0.0

echo ""
echo "✅ vLLM Embedding Server started successfully!"
# Post-start usage summary: same text as before, emitted via a single
# here-doc (unquoted delimiter, so $var expansion matches the old echo calls).
cat <<EOF

📡 Server Information:
 - Base URL: http://localhost:$PORT
 - Model Code: ${MODEL_CODE}
 - API Key: $API_KEY
 - Native Pooling: $POOLING_TYPE | Cross-chunk: MEAN

🧪 Test the server with:
 python examples/online_serving/openai_embedding_long_text/client.py

📚 Enhanced features enabled:
 ✅ Intelligent native pooling type detection
 ✅ Unified MEAN aggregation for chunked processing
 ✅ Model-specific native pooling optimization
 ✅ Enhanced max embedding length (${MAX_EMBED_LEN} tokens)
 ✅ Complete semantic coverage for all pooling types
 ✅ OpenAI-compatible API
 ✅ GPU acceleration

🔧 Advanced usage:
 - Set POOLING_TYPE=MEAN|CLS|LAST to override auto-detection
 - Set MAX_EMBED_LEN to adjust maximum input length
 - All pooling types use MEAN aggregation across chunks
EOF