mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-10 04:15:01 +08:00)
[Hardware][AMD] Improve OAM device ID + llama4 Maverick MOE tuning (#16263)

Signed-off-by: Lu Fang <lufang@fb.com>
Co-authored-by: Lu Fang <lufang@fb.com>

parent 182f40ea8b
commit 9352cdb56d
@@ -442,8 +442,14 @@ class BenchmarkWorker:
                                                    hidden_size, search_space,
                                                    is_fp16, topk)
 
-        with torch.cuda.device(self.device_id) if current_platform.is_rocm(
-        ) else nullcontext():
+        need_device_guard = False
+        if current_platform.is_rocm():
+            visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
+            if visible_device != f"{self.device_id}":
+                need_device_guard = True
+
+        with torch.cuda.device(
+                self.device_id) if need_device_guard else nullcontext():
             for config in tqdm(search_space):
                 try:
                     kernel_time = benchmark_config(
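The rewritten guard only enters an explicit torch.cuda.device(...) context when ROCR_VISIBLE_DEVICES does not already pin the Ray worker to the target GPU; when Ray has restricted visibility to exactly that device, a no-op context is enough. A minimal standalone sketch of the same decision (the device_guard helper and its arguments are illustrative, not part of the commit):

import os
from contextlib import nullcontext

import torch


def device_guard(device_id: int, is_rocm: bool):
    # Decide whether an explicit device context is needed. On ROCm, when
    # ROCR_VISIBLE_DEVICES already equals the target device id, the process
    # is restricted to that GPU and a no-op context suffices; otherwise
    # switch devices explicitly.
    need_device_guard = False
    if is_rocm:
        visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None)
        if visible_device != f"{device_id}":
            need_device_guard = True
    return torch.cuda.device(device_id) if need_device_guard else nullcontext()


# Usage inside a benchmark loop:
# with device_guard(device_id=0, is_rocm=True):
#     ...  # run kernels on the selected GPU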
@@ -578,6 +584,15 @@ def main(args: argparse.Namespace):
 
     use_deep_gemm = bool(args.use_deep_gemm)
 
+    if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ:
+        # Ray will set ROCR_VISIBLE_DEVICES for device visibility
+        logger.warning(
+            "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility."
+            "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.")
+        val = os.environ["HIP_VISIBLE_DEVICES"]
+        os.environ["ROCR_VISIBLE_DEVICES"] = val
+        del os.environ["HIP_VISIBLE_DEVICES"]
+
     ray.init()
     num_gpus = int(ray.available_resources()["GPU"])
     workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
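The variable swap has to happen before ray.init() so that Ray's GPU scheduling, which per the warning above keys off ROCR_VISIBLE_DEVICES on ROCm, sees the restriction the user expressed via HIP_VISIBLE_DEVICES. A self-contained sketch of the same promotion step (the helper name is illustrative, not part of the commit):

import logging
import os

logger = logging.getLogger(__name__)


def promote_hip_visible_devices() -> None:
    # Mirror HIP_VISIBLE_DEVICES into ROCR_VISIBLE_DEVICES and drop the
    # original variable, so Ray workers inherit the intended device set.
    if "HIP_VISIBLE_DEVICES" not in os.environ:
        return
    val = os.environ["HIP_VISIBLE_DEVICES"]
    logger.warning(
        "Replacing HIP_VISIBLE_DEVICES=%s with ROCR_VISIBLE_DEVICES for Ray.",
        val)
    os.environ["ROCR_VISIBLE_DEVICES"] = val
    del os.environ["HIP_VISIBLE_DEVICES"]


# e.g. a tuning run launched with HIP_VISIBLE_DEVICES=0,1 would now behave
# as if ROCR_VISIBLE_DEVICES=0,1 had been set before ray.init().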
@@ -0,0 +1,200 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 1,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2,
+        "waves_per_eu": 0,
+        "matrix_instr_nonkdim": 16,
+        "kpack": 2
+    }
+}
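The new JSON maps a token count (the dictionary keys) to a Triton launch configuration for the fused-MoE kernel: BLOCK_SIZE_M/N/K and GROUP_SIZE_M are tiling parameters, num_warps and num_stages are standard Triton launch knobs, and waves_per_eu, matrix_instr_nonkdim and kpack are ROCm-specific tuning options. At run time the fused-MoE path picks the entry whose key is nearest to the actual batch size; a rough sketch of that lookup (file path and helper names are placeholders, not the exact loader vLLM uses):

import json
from typing import Any, Dict


def load_moe_configs(path: str) -> Dict[int, Dict[str, Any]]:
    # Tuned-config JSONs store the token count as a string key; convert to int.
    with open(path) as f:
        raw = json.load(f)
    return {int(num_tokens): cfg for num_tokens, cfg in raw.items()}


def pick_config(configs: Dict[int, Dict[str, Any]], m: int) -> Dict[str, Any]:
    # Use the tuned entry whose token-count key is closest to the actual M.
    best_key = min(configs, key=lambda k: abs(k - m))
    return configs[best_key]


# configs = load_moe_configs("path/to/tuned_moe_config.json")
# pick_config(configs, m=300)  # -> the "256" entry above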
@@ -58,6 +58,15 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
         "excessive use of shared memory. If this happens, disable Triton FA "
         "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
 }
+_ROCM_DEVICE_ID_NAME_MAP: Dict[str, str] = {
+    "0x74a0": "AMD_Instinct_MI300A",
+    "0x74a1": "AMD_Instinct_MI300X",
+    "0x74b5": "AMD_Instinct_MI300X",  # MI300X VF
+    "0x74a5": "AMD_Instinct_MI325X",
+    "0x74b9": "AMD_Instinct_MI325X",  # MI325X VF
+    "0x74a9": "AMD_Instinct_MI300X_HF",
+    "0x74bd": "AMD_Instinct_MI300X_HF",
+}
 
 # Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES``
 if "HIP_VISIBLE_DEVICES" in os.environ:
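The keys in _ROCM_DEVICE_ID_NAME_MAP are PCI device IDs of the MI300-series parts (including the SR-IOV virtual functions marked "VF"). On Linux the same IDs can be read from sysfs, which makes the table easy to sanity-check without amdsmi; a small illustrative sketch, assuming the standard /sys/class/drm/card*/device/device layout (not part of the commit):

import glob

# Subset of the mapping added above; extend with the remaining entries as needed.
ROCM_DEVICE_ID_NAME_MAP = {
    "0x74a0": "AMD_Instinct_MI300A",
    "0x74a1": "AMD_Instinct_MI300X",
    "0x74a5": "AMD_Instinct_MI325X",
}

for dev_file in sorted(glob.glob("/sys/class/drm/card*/device/device")):
    # Each file holds the PCI device ID of that DRM device, e.g. "0x74a1".
    with open(dev_file) as f:
        pci_id = f.read().strip().lower()
    print(dev_file, ROCM_DEVICE_ID_NAME_MAP.get(pci_id, f"unmapped ({pci_id})"))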
@@ -225,7 +234,11 @@ class RocmPlatform(Platform):
     def get_device_name(cls, device_id: int = 0) -> str:
         physical_device_id = device_id_to_physical_device_id(device_id)
         handle = amdsmi_get_processor_handles()[physical_device_id]
-        return amdsmi_get_gpu_asic_info(handle)["market_name"]
+        asic_info = amdsmi_get_gpu_asic_info(handle)
+        device_name: str = asic_info["device_id"]
+        if device_name in _ROCM_DEVICE_ID_NAME_MAP:
+            return _ROCM_DEVICE_ID_NAME_MAP[device_name]
+        return asic_info["market_name"]
 
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
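get_device_name() now prefers the stable PCI device ID over the amdsmi market name, so differently branded OAM boards of the same ASIC all report one canonical name, which is what the lookup of tuned kernel configs (such as the MI300X MoE JSON added in this commit) keys on. A standalone sketch of the fallback logic (the normalize_device_name helper is illustrative; asic_info stands for the dict returned by amdsmi_get_gpu_asic_info):

from typing import Dict

# Mirrors the mapping added to vllm/platforms/rocm.py above.
_DEVICE_ID_NAME_MAP: Dict[str, str] = {
    "0x74a0": "AMD_Instinct_MI300A",
    "0x74a1": "AMD_Instinct_MI300X",
    "0x74b5": "AMD_Instinct_MI300X",  # MI300X VF
    "0x74a5": "AMD_Instinct_MI325X",
    "0x74b9": "AMD_Instinct_MI325X",  # MI325X VF
    "0x74a9": "AMD_Instinct_MI300X_HF",
    "0x74bd": "AMD_Instinct_MI300X_HF",
}


def normalize_device_name(asic_info: Dict[str, str]) -> str:
    # Prefer the fixed PCI device ID; fall back to the reported market name
    # for parts that are not in the table.
    device_id = asic_info.get("device_id", "")
    return _DEVICE_ID_NAME_MAP.get(device_id,
                                   asic_info.get("market_name", "unknown"))


# normalize_device_name({"device_id": "0x74b5", "market_name": "Some OAM name"})
# -> "AMD_Instinct_MI300X"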