Mirror of https://git.datalinker.icu/vllm-project/vllm.git, synced 2025-12-09 21:55:50 +08:00

[CI] update typos config for CI pre-commit and fix some spells (#20919)

Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>

parent 6ebf313790
commit 1eb2b9c102

The diff bumps the crate-ci/typos pre-commit hook to v1.34.0, moves the typos configuration out of the standalone typos.toml into [tool.typos] tables in pyproject.toml, and fixes assorted misspellings in code comments, docs, and the Dockerfile.
@@ -21,7 +21,7 @@ repos:
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.32.0
+  rev: v1.34.0
   hooks:
   - id: typos
 - repo: https://github.com/PyCQA/isort
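To exercise the bumped hook locally, the sketch below (assuming pre-commit is installed) runs the same typos entry that CI runs:

```python
import subprocess

# Run only the typos hook against the whole tree, exactly as CI does.
subprocess.run(["pre-commit", "run", "typos", "--all-files"], check=True)
```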
@@ -58,7 +58,7 @@ namespace {

 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_LAST_DIM_CONTIGUOUS(x) \
-  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimention")
+  TORCH_CHECK(x.strides()[x.strides().size() - 1] == 1, #x "must be contiguous at last dimension")

 #define CHECK_INPUT(x) \
   CHECK_CPU(x); \
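Not part of the diff, but for readers who want the semantics of CHECK_LAST_DIM_CONTIGUOUS outside C++, a minimal PyTorch sketch of the same predicate:

```python
import torch

def check_last_dim_contiguous(x: torch.Tensor) -> None:
    # Mirrors the macro: the innermost stride must be 1, i.e. elements
    # along the last dimension sit next to each other in memory.
    assert x.stride(-1) == 1, "must be contiguous at last dimension"

check_last_dim_contiguous(torch.randn(4, 8))  # passes
# A transposed view has stride(-1) == 8, so this would raise:
# check_last_dim_contiguous(torch.randn(4, 8).t())
```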
@@ -126,7 +126,7 @@ void fused_experts_int4_w4a16_kernel_impl(
     int64_t topk,
     int64_t num_tokens_post_pad);

-// shared expert implememntation for int8 w8a8
+// shared expert implementation for int8 w8a8
 template <typename scalar_t>
 void shared_expert_int8_kernel_impl(
     scalar_t* __restrict__ output,
@@ -41,7 +41,7 @@ struct tinygemm_kernel_nn<at::BFloat16, has_bias, BLOCK_M, BLOCK_N> {
   __m512 vd0;
   __m512 vd1[COLS];

-  // oops! 4x4 spills but luckly we use 4x2
+  // oops! 4x4 spills but luckily we use 4x2
   __m512 vbias[COLS];

   // [NOTE]: s8s8 igemm compensation in avx512-vnni
@@ -37,7 +37,7 @@ inline Vectorized<at::BFloat16> convert_from_float_ext<at::BFloat16>(const Vecto
 #define CVT_FP16_TO_FP32(a) \
   _mm512_cvtps_ph(a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))

-// this doesn't hanel NaN.
+// this doesn't handle NaN.
 inline __m512bh cvt_e4m3_bf16_intrinsic_no_nan(__m256i fp8_vec) {
   const __m512i x = _mm512_cvtepu8_epi16(fp8_vec);

@@ -63,7 +63,7 @@ ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
 ARG PIP_KEYRING_PROVIDER=disabled
 ARG UV_KEYRING_PROVIDER=${PIP_KEYRING_PROVIDER}

-# Flag enables build-in KV-connector dependency libs into docker images
+# Flag enables built-in KV-connector dependency libs into docker images
 ARG INSTALL_KV_CONNECTORS=false

 #################### BASE BUILD IMAGE ####################
@@ -106,7 +106,7 @@ to enable simultaneous generation and embedding using the same engine instance i

 Models using selective state-space mechanisms instead of standard transformer attention are partially supported.
 Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers
-(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet suported. Please note that these models currently require
+(e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require
 enforcing eager mode and disabling prefix caching in V1.

 Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
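As a concrete illustration of the requirement stated in that doc paragraph, a minimal sketch of running a Mamba-2 model under these constraints (the checkpoint name is just one example of a `Mamba2ForCausalLM` model):

```python
from vllm import LLM

# Mamba-2 based models currently require eager mode and disabled prefix
# caching in V1, per the note above.
llm = LLM(
    model="mistralai/Mamba-Codestral-7B-v0.1",
    enforce_eager=True,
    enable_prefix_caching=False,
)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```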
pyproject.toml (+183 lines)

@@ -174,3 +174,186 @@ respect-ignore-files = true

 [tool.ty.environment]
 python = "./.venv"
+
+[tool.typos.files]
+# these files may be written in non english words
+extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
+    "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
+    "vllm/third_party/*"]
+ignore-hidden = true
+ignore-files = true
+ignore-dot = true
+ignore-vcs = true
+ignore-global = true
+ignore-parent = true
+
+[tool.typos.default]
+binary = false
+check-filename = false
+check-file = true
+unicode = true
+ignore-hex = true
+identifier-leading-digits = false
+locale = "en"
+extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
+    ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
+    ".*[Tt]h[rR].*"]
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.default.extend-identifiers]
+bbc5b7ede = "bbc5b7ede"
+womens_doubles = "womens_doubles"
+v_2nd = "v_2nd"
+# splitted_input = "splitted_input"
+NOOPs = "NOOPs"
+typ = "typ"
+nin_shortcut = "nin_shortcut"
+UperNetDecoder = "UperNetDecoder"
+subtile = "subtile"
+cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
+SFOuput = "SFOuput"
+# huggingface transformers repo uses these words
+depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
+DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
+depthwise_seperable_CNN = "depthwise_seperable_CNN"
+
+[tool.typos.default.extend-words]
+iy = "iy"
+tendencias = "tendencias"
+# intel cpu features
+tme = "tme"
+dout = "dout"
+Pn = "Pn"
+arange = "arange"
+
+[tool.typos.type.py]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.py.extend-identifiers]
+arange = "arange"
+NDArray = "NDArray"
+EOFError = "EOFError"
+fo = "fo"
+ba = "ba"
+
+[tool.typos.type.py.extend-words]
+
+[tool.typos.type.cpp]
+extend-glob = ["*.cu"]
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.cpp.extend-identifiers]
+countr_one = "countr_one"
+k_ot = "k_ot"
+ot = "ot"
+
+[tool.typos.type.cpp.extend-words]
+
+[tool.typos.type.rust]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.rust.extend-identifiers]
+flate2 = "flate2"
+
+[tool.typos.type.rust.extend-words]
+ser = "ser"
+
+[tool.typos.type.lock]
+extend-glob = []
+check-file = false
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.lock.extend-identifiers]
+
+[tool.typos.type.lock.extend-words]
+
+[tool.typos.type.jl]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.jl.extend-identifiers]
+
+[tool.typos.type.jl.extend-words]
+modul = "modul"
+egals = "egals"
+usig = "usig"
+egal = "egal"
+
+[tool.typos.type.go]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.go.extend-identifiers]
+flate = "flate"
+
+[tool.typos.type.go.extend-words]
+
+[tool.typos.type.css]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.css.extend-identifiers]
+nd = "nd"
+
+[tool.typos.type.css.extend-words]
+
+[tool.typos.type.man]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.man.extend-identifiers]
+Nd = "Nd"
+
+[tool.typos.type.man.extend-words]
+
+[tool.typos.type.cert]
+extend-glob = []
+check-file = false
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.cert.extend-identifiers]
+
+[tool.typos.type.cert.extend-words]
+
+[tool.typos.type.sh]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.sh.extend-identifiers]
+ot = "ot"
+
+[tool.typos.type.sh.extend-words]
+
+[tool.typos.type.vimscript]
+extend-glob = []
+extend-ignore-identifiers-re = []
+extend-ignore-words-re = []
+extend-ignore-re = []
+
+[tool.typos.type.vimscript.extend-identifiers]
+windo = "windo"
+
+[tool.typos.type.vimscript.extend-words]
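With the configuration now consolidated under [tool.typos], the checker can also be run directly. A small sketch, assuming the typos CLI is installed (v1.34.0 and other recent releases read [tool.typos] from pyproject.toml automatically):

```python
import subprocess

# From the repository root; typos discovers the [tool.typos] tables in
# pyproject.toml, so the old standalone typos.toml is no longer needed.
subprocess.run(["typos"], check=True)
```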
@@ -416,7 +416,7 @@ class RankTensors:
         # We dequant and use that as hidden_states so the tests are stable.
         # quantizing and dequantizing yield slightly different results
         # depending on the hardware. Here we, quantize and dequantize
-        # first - so further quantize and dequantize will yeild the same
+        # first - so further quantize and dequantize will yield the same
         # values.
         if config.is_per_tensor_act_quant:
             a_q, a_scales = ops.scaled_fp8_quant(
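The reasoning in that comment can be shown standalone. A minimal plain-PyTorch sketch of per-tensor FP8 round-tripping (illustrative only, not vLLM's ops.scaled_fp8_quant; 448 is the float8_e4m3fn finite max):

```python
import torch

def fp8_round_trip(x: torch.Tensor) -> torch.Tensor:
    # Per-tensor scale so the largest magnitude maps to the e4m3 max (448).
    scale = x.abs().max() / 448.0
    q = (x / scale).clamp(-448.0, 448.0).to(torch.float8_e4m3fn)  # quantize
    return q.to(torch.float32) * scale                            # dequantize

x = torch.randn(4, 8)
once = fp8_round_trip(x)
twice = fp8_round_trip(once)
# Round-tripping an already round-tripped tensor reproduces it exactly,
# which is why quantizing and dequantizing first makes the tests stable.
print(torch.equal(once, twice))  # True
```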
@@ -95,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
     topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
     topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)

-    # triton referrence
+    # triton reference
     out_triton = fused_experts(
         hidden_states=tokens_bf16,
         w1=w1,
@@ -43,7 +43,7 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     text_config = hf_config.get_text_config()

     # Ensure at least 2 expert per group
-    # Since `grouped_topk` assums top-2
+    # Since `grouped_topk` assumes top-2
     n_group = getattr(text_config, 'n_group', None)
     num_experts = n_group * 2 if n_group is not None else 2

@@ -17,7 +17,7 @@ MODEL_NAME = "ibm-research/PowerMoE-3b"

 # Number of data parallel ranks for external LB testing
 DP_SIZE = int(os.getenv("DP_SIZE", "2"))
-# Default tensor parallell size to use
+# Default tensor parallel size to use
 TP_SIZE = int(os.getenv("TP_SIZE", "1"))


typos.toml (179 deletions; file removed)

@@ -1,179 +0,0 @@
-[files]
-# these files may be written in non english words
-extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
-    "benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
-    "vllm/third_party/*"]
-ignore-hidden = true
-ignore-files = true
-ignore-dot = true
-ignore-vcs = true
-ignore-global = true
-ignore-parent = true
-
-[default]
-binary = false
-check-filename = false
-check-file = true
-unicode = true
-ignore-hex = true
-identifier-leading-digits = false
-locale = "en"
-extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
-    ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
-    ".*ot.*", ".*[Tt]h[rR].*"]
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[default.extend-identifiers]
-bbc5b7ede = "bbc5b7ede"
-womens_doubles = "womens_doubles"
-v_2nd = "v_2nd"
-splitted_input = "splitted_input"
-NOOPs = "NOOPs"
-typ = "typ"
-nin_shortcut = "nin_shortcut"
-UperNetDecoder = "UperNetDecoder"
-subtile = "subtile"
-cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
-SFOuput = "SFOuput"
-# huggingface transformers repo uses these words
-depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
-DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
-depthwise_seperable_CNN = "depthwise_seperable_CNN"
-
-[default.extend-words]
-iy = "iy"
-tendencias = "tendencias"
-# intel cpu features
-tme = "tme"
-dout = "dout"
-Pn = "Pn"
-arange = "arange"
-
-[type.py]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.py.extend-identifiers]
-arange = "arange"
-NDArray = "NDArray"
-EOFError = "EOFError"
-
-[type.py.extend-words]
-
-[type.cpp]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.cpp.extend-identifiers]
-countr_one = "countr_one"
-
-[type.cpp.extend-words]
-
-[type.rust]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.rust.extend-identifiers]
-flate2 = "flate2"
-
-[type.rust.extend-words]
-ser = "ser"
-
-[type.lock]
-extend-glob = []
-check-file = false
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.lock.extend-identifiers]
-
-[type.lock.extend-words]
-
-[type.jl]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.jl.extend-identifiers]
-
-[type.jl.extend-words]
-modul = "modul"
-egals = "egals"
-usig = "usig"
-egal = "egal"
-
-[type.go]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.go.extend-identifiers]
-flate = "flate"
-
-[type.go.extend-words]
-
-[type.css]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.css.extend-identifiers]
-nd = "nd"
-
-[type.css.extend-words]
-
-[type.man]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.man.extend-identifiers]
-Nd = "Nd"
-
-[type.man.extend-words]
-
-[type.cert]
-extend-glob = []
-check-file = false
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.cert.extend-identifiers]
-
-[type.cert.extend-words]
-
-[type.sh]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.sh.extend-identifiers]
-stap = "stap"
-ot = "ot"
-
-[type.sh.extend-words]
-
-[type.vimscript]
-extend-glob = []
-extend-ignore-identifiers-re = []
-extend-ignore-words-re = []
-extend-ignore-re = []
-
-[type.vimscript.extend-identifiers]
-windo = "windo"
-
-[type.vimscript.extend-words]
@@ -961,7 +961,7 @@ class DifferentialFlashAttentionImpl(AttentionImpl):
                                    "... H (two D) -> ... (H two) D",
                                    two=2)

-        else:  # re-use the kv cache, full attention
+        else:  # reuse the kv cache, full attention
             q = q.view(-1, self.num_heads, self.head_size)
             q1, q2 = self.split_heads(q)
             # kv_cache shape is (2, num_blocks, block_size, num_kv_heads, head_size) # noqa: E501
@@ -372,7 +372,7 @@ class OpenAIServingResponses(OpenAIServing):
             })

         # Append the new input.
-        # Reponses API supports simple text inputs without chat format.
+        # Responses API supports simple text inputs without chat format.
         if isinstance(request.input, str):
             messages.append({"role": "user", "content": request.input})
         else:
@@ -1172,7 +1172,7 @@ def fused_experts(
         allow_cutlass_block_scaled_grouped_gemm: bool = False) -> torch.Tensor:
     # For now, disable DeepGemm for small N (<= 512) until better
     # permute/unpermute ops are available.
-    # However, on B200, we use DeepGemm for all cases becuase they only support
+    # However, on B200, we use DeepGemm for all cases because they only support
     # E8M0 scale, which means we requantize the weight and input to the specific
     # scale. Fallen back to cutlass or triton for some cases would cause
     # accuracy issue.
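Condensed into code, the dispatch policy that comment describes would look roughly like the following. This is a hypothetical sketch; the helper name, the is_b200 flag, and the threshold wiring are invented for illustration and are not vLLM's actual implementation:

```python
def should_use_deep_gemm(n: int, is_b200: bool) -> bool:
    # B200 always uses DeepGemm: it only supports E8M0 scales, so weights and
    # inputs are requantized to that scale, and falling back to cutlass or
    # triton for some shapes would introduce accuracy mismatches.
    if is_b200:
        return True
    # Elsewhere, skip DeepGemm for small N (<= 512) until better
    # permute/unpermute ops are available.
    return n > 512
```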
@@ -193,7 +193,7 @@ class SambaYAttention(nn.Module):
                 ],
                 dim=-1)
             attn_output = self.attn(q, k, v)
-        else:  # re-use the kv cache, full attention
+        else:  # reuse the kv cache, full attention
             q = self.Wqkv(hidden_states)
             attn_output = self.attn(q, None, None)
             attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
@@ -394,7 +394,7 @@ def use_cudnn_prefill() -> bool:


 # Currently 394MB, this can be tuned based on GEMM sizes used.
-# Choosen to be the same as sglang:
+# Chosen to be the same as sglang:
 # https://github.com/sgl-project/sglang/blob/766392c6bda2558b61ce6d1c1bfd8081a549e1f1/python/sglang/global_config.py#L37
 FLASHINFER_WORKSPACE_BUFFER_SIZE = 394 * 1024 * 1024

@@ -969,7 +969,7 @@ class TPUModelRunner(LoRAModelRunnerMixin):
         else:
             mm_embeds = []
         xm.mark_step()
-        # Prepare inputs, the requests might be splitted into multiple
+        # Prepare inputs, the requests might be split into multiple
         # executions, combine the result of each execution.
         start_index = 0
         combined_selected_tokens: list[torch.Tensor] = []