fix all commit
Signed-off-by: inkcherry <mingzhi.liu@amd.com>
commit 857d93cbfb
parent 795a305b1b
@@ -16,12 +16,11 @@ from quart import Quart, make_response, request
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
-prefill_instances = []
-decode_instances = []
+prefill_instances: list[dict] = []
+decode_instances: list[dict] = []
 request_nums = 0
 app = Quart(__name__)
 
-yield_chunk = set()
 
 IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)")
 
 
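The first hunk annotates the two module-level registries as list[dict] and drops the unused yield_chunk set. The annotation is what lets a static type checker reject malformed entries; a small sketch of that, with hypothetical entries that are not from the proxy itself:

    prefill_instances: list[dict] = []

    prefill_instances.append({"url": "http://10.0.0.1:8000"})  # accepted: a dict entry
    prefill_instances.append("http://10.0.0.1:8000")  # flagged by mypy/pyright: str is not a dict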
@@ -200,7 +199,10 @@ async def handle_request():
     request_nums += 1
 
     def extract_ip_port_fast(url):
-        return IP_PORT_PATTERN.search(url).groups()
+        match = IP_PORT_PATTERN.search(url)
+        if not match:
+            raise ValueError(f"Invalid URL format: {url}")
+        return match.groups()
 
     req_data = await request.get_json()
     request_id = str(uuid.uuid4())
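The rewritten helper replaces a bare .groups() call, which raises AttributeError: 'NoneType' object has no attribute 'groups' whenever the pattern does not match, with an explicit ValueError that names the offending URL. A standalone sketch of the new behavior, reusing the same IP_PORT_PATTERN; the example calls are illustrative, not from the diff:

    import re

    IP_PORT_PATTERN = re.compile(r"//(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d+)")

    def extract_ip_port_fast(url):
        match = IP_PORT_PATTERN.search(url)
        if not match:
            raise ValueError(f"Invalid URL format: {url}")
        return match.groups()

    print(extract_ip_port_fast("http://127.0.0.1:8000/v1"))  # ('127.0.0.1', '8000')
    extract_ip_port_fast("http://localhost:8000")  # ValueError: hostname is not a dotted quad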
@@ -4,7 +4,6 @@ import contextlib
 import logging
 import math
 import os
-import pickle
 import queue
 import threading
 import time
@@ -1673,7 +1672,7 @@ class MoRIIOConnectorWorker:
                 )  # send local mori io engine meta data
                 logger.debug("MoRIIO handshake listener sent metadata")
                 # now we send tensor meta data for each block
-                buf = pickle.dumps(layer_name_to_local_kv_cache_metadata)
+                buf = msgpack.dumps(layer_name_to_local_kv_cache_metadata)
                 sock.send_multipart((identity, b"", buf))
             elif msg == MoRIIOConstants.POP_DONE_RECV:
                 _, req_id = sock.recv_multipart()
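The (identity, b"", buf) triple is standard ZMQ ROUTER framing: the peer's routing identity, an empty delimiter frame, then the payload, mirrored by recv_multipart on the other end. A minimal pyzmq round trip showing that framing, on a hypothetical local port rather than the connector's real handshake endpoint:

    import zmq

    ctx = zmq.Context()
    router = ctx.socket(zmq.ROUTER)
    router.bind("tcp://127.0.0.1:5555")  # hypothetical handshake port

    dealer = ctx.socket(zmq.DEALER)
    dealer.connect("tcp://127.0.0.1:5555")
    dealer.send_multipart((b"", b"hello"))  # DEALER sends delimiter + payload

    identity, _, payload = router.recv_multipart()  # ROUTER prepends the peer identity
    router.send_multipart((identity, b"", payload))  # identity frame routes the reply back
    _, echoed = dealer.recv_multipart()
    assert echoed == b"hello"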
@@ -1752,7 +1751,7 @@ class MoRIIOConnectorWorker:
                 assert 0, f"Unexpected frame! {received_frame = }"
             buf = received_frame[1]
             self.layer_name_to_remote_kv_cache_metadata[expected_engine_id] = (
-                pickle.loads(buf)
+                msgpack.loads(buf)
             )
 
             setup_agent_time = time.perf_counter()
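Both MoRIIO hunks swap pickle for msgpack when serializing and deserializing the KV-cache metadata exchanged over the handshake socket. The security motivation: pickle.loads on bytes from a remote peer can execute arbitrary code, while msgpack only reconstructs plain data (ints, strings, bytes, lists, maps). The trade-off is that the metadata must stay plain-data on both sides, and tuples come back as lists. A minimal round trip, assuming the msgpack package and a hypothetical metadata layout:

    import msgpack

    # hypothetical per-layer entry: [base address, block size, stride]
    layer_meta = {"model.layers.0.self_attn": [140234, 4096, 16]}

    buf = msgpack.dumps(layer_meta)  # packs to bytes for sock.send_multipart
    restored = msgpack.loads(buf)    # plain-data decode; no code execution on load
    assert restored == layer_meta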
@@ -328,6 +328,7 @@ class OpenAIServingChat(OpenAIServing):
             lora_request=lora_request,
             trace_headers=trace_headers,
             priority=request.priority,
+            data_parallel_rank=data_parallel_rank,
         )
 
         generator = self.engine_client.generate(
@@ -1172,7 +1172,7 @@ class OpenAIServing:
         lora_request: LoRARequest | None,
         trace_headers: Mapping[str, str] | None,
         priority: int,
-        data_parallel_rank: int,
+        data_parallel_rank: int | None,
     ) -> tuple[EngineCoreRequest, dict[str, Any]]:
         """Use the Processor to process inputs for AsyncLLM."""
         tokenization_kwargs: dict[str, Any] = {}
@@ -1220,6 +1220,7 @@ class OpenAIServing:
             lora_request=lora_request,
             trace_headers=trace_headers,
             priority=priority,
+            data_parallel_rank=None,
         )
 
         generator = self.engine_client.generate(
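The last three hunks thread a data_parallel_rank argument through the OpenAI serving layer: serving_chat.py forwards its resolved rank, the shared _process_inputs helper widens the parameter to int | None, and the other call site passes None explicitly so the new keyword is always supplied. A sketch of the same optional-plumbing pattern, with hypothetical names standing in for the vLLM internals:

    from typing import Any

    def process_inputs(
        request_id: str,
        priority: int,
        data_parallel_rank: int | None,  # None = do not pin the request to a DP rank
    ) -> dict[str, Any]:
        # Stand-in for OpenAIServing._process_inputs, not the real signature.
        params: dict[str, Any] = {"request_id": request_id, "priority": priority}
        if data_parallel_rank is not None:
            params["data_parallel_rank"] = data_parallel_rank  # pin to one engine rank
        return params

    print(process_inputs("req-1", 0, data_parallel_rank=2))     # chat path: pinned
    print(process_inputs("req-2", 0, data_parallel_rank=None))  # other callers: unpinned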