mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 03:26:12 +08:00
[AMD][Build] Porting dockerfiles from the ROCm/vllm fork (#11777)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
This commit is contained in:
parent
ecf67814f1
commit
d4b62d4641
258
Dockerfile.rocm
258
Dockerfile.rocm
@ -1,174 +1,118 @@
|
|||||||
# Default ROCm 6.2 base image
|
# default base image
|
||||||
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
|
ARG REMOTE_VLLM="0"
|
||||||
|
ARG USE_CYTHON="0"
|
||||||
|
ARG BUILD_RPD="1"
|
||||||
|
ARG COMMON_WORKDIR=/app
|
||||||
|
ARG BASE_IMAGE=rocm/vllm-dev:base
|
||||||
|
|
||||||
# Default ROCm ARCHes to build vLLM for.
|
FROM ${BASE_IMAGE} AS base
|
||||||
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
|
|
||||||
|
|
||||||
# Whether to install CK-based flash-attention
|
ARG ARG_PYTORCH_ROCM_ARCH
|
||||||
# If 0, will not install flash-attention
|
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
|
||||||
ARG BUILD_FA="1"
|
|
||||||
ARG FA_GFX_ARCHS="gfx90a;gfx942"
|
|
||||||
ARG FA_BRANCH="3cea2fb"
|
|
||||||
|
|
||||||
# Whether to build triton on rocm
|
|
||||||
ARG BUILD_TRITON="1"
|
|
||||||
ARG TRITON_BRANCH="e192dba"
|
|
||||||
|
|
||||||
### Base image build stage
|
|
||||||
FROM $BASE_IMAGE AS base
|
|
||||||
|
|
||||||
# Import arg(s) defined before this build stage
|
|
||||||
ARG PYTORCH_ROCM_ARCH
|
|
||||||
|
|
||||||
# Install some basic utilities
|
# Install some basic utilities
|
||||||
RUN apt-get update && apt-get install python3 python3-pip -y
|
RUN apt-get update -q -y && apt-get install -q -y \
|
||||||
RUN apt-get update && apt-get install -y \
|
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
|
||||||
curl \
|
# Remove sccache
|
||||||
ca-certificates \
|
RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
|
||||||
sudo \
|
|
||||||
git \
|
|
||||||
bzip2 \
|
|
||||||
libx11-6 \
|
|
||||||
build-essential \
|
|
||||||
wget \
|
|
||||||
unzip \
|
|
||||||
tmux \
|
|
||||||
ccache \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# When launching the container, mount the code directory to /vllm-workspace
|
|
||||||
ARG APP_MOUNT=/vllm-workspace
|
|
||||||
WORKDIR ${APP_MOUNT}
|
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
|
||||||
# Remove sccache so it doesn't interfere with ccache
|
|
||||||
# TODO: implement sccache support across components
|
|
||||||
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
|
||||||
|
ARG COMMON_WORKDIR
|
||||||
# Install torch == 2.6.0 on ROCm
|
WORKDIR ${COMMON_WORKDIR}
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
|
|
||||||
*"rocm-6.2"*) \
|
|
||||||
python3 -m pip uninstall -y torch torchvision \
|
|
||||||
&& python3 -m pip install --pre \
|
|
||||||
torch \
|
|
||||||
'setuptools-scm>=8' \
|
|
||||||
torchvision \
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/rocm6.2;; \
|
|
||||||
*) ;; esac
|
|
||||||
|
|
||||||
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
|
|
||||||
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
|
|
||||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
|
|
||||||
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
|
|
||||||
|
|
||||||
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
|
||||||
|
|
||||||
|
|
||||||
### AMD-SMI build stage
|
# -----------------------
|
||||||
FROM base AS build_amdsmi
|
# vLLM fetch stages
|
||||||
# Build amdsmi wheel always
|
FROM base AS fetch_vllm_0
|
||||||
RUN cd /opt/rocm/share/amd_smi \
|
ONBUILD COPY ./ vllm/
|
||||||
&& python3 -m pip wheel . --wheel-dir=/install
|
FROM base AS fetch_vllm_1
|
||||||
|
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
|
||||||
|
ARG VLLM_BRANCH="main"
|
||||||
|
ONBUILD RUN git clone ${VLLM_REPO} \
|
||||||
|
&& cd vllm \
|
||||||
|
&& git checkout ${VLLM_BRANCH}
|
||||||
|
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||||
|
|
||||||
|
# -----------------------
|
||||||
### Flash-Attention wheel build stage
|
# vLLM build stages
|
||||||
FROM base AS build_fa
|
FROM fetch_vllm AS build_vllm
|
||||||
ARG BUILD_FA
|
ARG USE_CYTHON
|
||||||
ARG FA_GFX_ARCHS
|
# Build vLLM
|
||||||
ARG FA_BRANCH
|
RUN cd vllm \
|
||||||
# Build ROCm flash-attention wheel if `BUILD_FA = 1`
|
&& python3 -m pip install -r requirements-rocm.txt \
|
||||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
|
||||||
if [ "$BUILD_FA" = "1" ]; then \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& cd libs \
|
|
||||||
&& git clone https://github.com/ROCm/flash-attention.git \
|
|
||||||
&& cd flash-attention \
|
|
||||||
&& git checkout "${FA_BRANCH}" \
|
|
||||||
&& git submodule update --init \
|
|
||||||
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
|
|
||||||
# Create an empty directory otherwise as later build stages expect one
|
|
||||||
else mkdir -p /install; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
### Triton wheel build stage
|
|
||||||
FROM base AS build_triton
|
|
||||||
ARG BUILD_TRITON
|
|
||||||
ARG TRITON_BRANCH
|
|
||||||
# Build triton wheel if `BUILD_TRITON = 1`
|
|
||||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
|
||||||
if [ "$BUILD_TRITON" = "1" ]; then \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& cd libs \
|
|
||||||
&& python3 -m pip install ninja cmake wheel pybind11 \
|
|
||||||
&& git clone https://github.com/OpenAI/triton.git \
|
|
||||||
&& cd triton \
|
|
||||||
&& git checkout "${TRITON_BRANCH}" \
|
|
||||||
&& cd python \
|
|
||||||
&& python3 setup.py bdist_wheel --dist-dir=/install; \
|
|
||||||
# Create an empty directory otherwise as later build stages expect one
|
|
||||||
else mkdir -p /install; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
### Final vLLM build stage
|
|
||||||
FROM base AS final
|
|
||||||
# Import the vLLM development directory from the build context
|
|
||||||
COPY . .
|
|
||||||
ARG GIT_REPO_CHECK=0
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
|
||||||
|
|
||||||
# Package upgrades for useful functionality or to avoid dependency issues
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
|
|
||||||
|
|
||||||
|
|
||||||
# Workaround for ray >= 2.10.0
|
|
||||||
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
|
||||||
# Silences the HF Tokenizers warning
|
|
||||||
ENV TOKENIZERS_PARALLELISM=false
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=${CCACHE_DIR} \
|
|
||||||
--mount=type=bind,source=.git,target=.git \
|
|
||||||
--mount=type=cache,target=/root/.cache/pip \
|
|
||||||
python3 -m pip install -Ur requirements-rocm.txt \
|
|
||||||
&& python3 setup.py clean --all \
|
&& python3 setup.py clean --all \
|
||||||
&& python3 setup.py develop
|
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||||
|
FROM scratch AS export_vllm
|
||||||
|
ARG COMMON_WORKDIR
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
|
||||||
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
|
||||||
|
|
||||||
# Copy amdsmi wheel into final image
|
# -----------------------
|
||||||
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
|
# Test vLLM image
|
||||||
mkdir -p libs \
|
FROM base AS test
|
||||||
&& cp /install/*.whl libs \
|
|
||||||
# Preemptively uninstall to avoid same-version no-installs
|
|
||||||
&& python3 -m pip uninstall -y amdsmi;
|
|
||||||
|
|
||||||
# Copy triton wheel(s) into final image if they were built
|
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
|
||||||
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
|
|
||||||
mkdir -p libs \
|
|
||||||
&& if ls /install/*.whl; then \
|
|
||||||
cp /install/*.whl libs \
|
|
||||||
# Preemptively uninstall to avoid same-version no-installs
|
|
||||||
&& python3 -m pip uninstall -y triton; fi
|
|
||||||
|
|
||||||
# Copy flash-attn wheel(s) into final image if they were built
|
# Install vLLM
|
||||||
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
|
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||||
mkdir -p libs \
|
cd /install \
|
||||||
&& if ls /install/*.whl; then \
|
&& pip install -U -r requirements-rocm.txt \
|
||||||
cp /install/*.whl libs \
|
&& pip uninstall -y vllm \
|
||||||
# Preemptively uninstall to avoid same-version no-installs
|
&& pip install *.whl
|
||||||
&& python3 -m pip uninstall -y flash-attn; fi
|
|
||||||
|
|
||||||
# Install wheels that were built to the final image
|
WORKDIR /vllm-workspace
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
ARG COMMON_WORKDIR
|
||||||
if ls libs/*.whl; then \
|
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
|
||||||
python3 -m pip install libs/*.whl; fi
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
# install development dependencies (for testing)
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
RUN cd /vllm-workspace \
|
||||||
|
&& rm -rf vllm \
|
||||||
|
&& python3 -m pip install -e tests/vllm_test_utils \
|
||||||
|
&& python3 -m pip install lm-eval[api]==0.4.4
|
||||||
|
|
||||||
|
# -----------------------
|
||||||
|
# Final vLLM image
|
||||||
|
FROM base AS final
|
||||||
|
|
||||||
|
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
|
||||||
|
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
|
||||||
|
# Manually remove it so that later steps of numpy upgrade can continue
|
||||||
|
RUN case "$(which python3)" in \
|
||||||
|
*"/opt/conda/envs/py_3.9"*) \
|
||||||
|
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
|
||||||
|
*) ;; esac
|
||||||
|
|
||||||
|
RUN python3 -m pip install --upgrade huggingface-hub[cli]
|
||||||
|
ARG BUILD_RPD
|
||||||
|
RUN if [ ${BUILD_RPD} -eq "1" ]; then \
|
||||||
|
git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
|
||||||
|
&& cd rocmProfileData/rpd_tracer \
|
||||||
|
&& pip install -r requirements.txt && cd ../ \
|
||||||
|
&& make && make install \
|
||||||
|
&& cd hipMarker && python3 setup.py install ; fi
|
||||||
|
|
||||||
|
# Install vLLM
|
||||||
|
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
|
||||||
|
cd /install \
|
||||||
|
&& pip install -U -r requirements-rocm.txt \
|
||||||
|
&& pip uninstall -y vllm \
|
||||||
|
&& pip install *.whl
|
||||||
|
|
||||||
|
ARG COMMON_WORKDIR
|
||||||
|
|
||||||
|
# Copy over the benchmark scripts as well
|
||||||
|
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
|
||||||
|
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
|
||||||
|
|
||||||
|
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
|
||||||
|
ENV TOKENIZERS_PARALLELISM=false
|
||||||
|
|
||||||
|
# Performance environment variable.
|
||||||
|
ENV HIP_FORCE_DEV_KERNARG=1
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
CMD ["/bin/bash"]
|
||||||
|
|
||||||
|
|||||||
158
Dockerfile.rocm_base
Normal file
158
Dockerfile.rocm_base
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
|
||||||
|
ARG HIPBLASLT_BRANCH="4d40e36"
|
||||||
|
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION=
|
||||||
|
ARG RCCL_BRANCH="648a58d"
|
||||||
|
ARG RCCL_REPO="https://github.com/ROCm/rccl"
|
||||||
|
ARG TRITON_BRANCH="e5be006"
|
||||||
|
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
|
||||||
|
ARG PYTORCH_BRANCH="8d4926e"
|
||||||
|
ARG PYTORCH_VISION_BRANCH="v0.19.1"
|
||||||
|
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
|
||||||
|
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
|
||||||
|
ARG FA_BRANCH="b7d29fb"
|
||||||
|
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
|
||||||
|
|
||||||
|
FROM ${BASE_IMAGE} AS base
|
||||||
|
|
||||||
|
ENV PATH=/opt/rocm/llvm/bin:$PATH
|
||||||
|
ENV ROCM_PATH=/opt/rocm
|
||||||
|
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
|
||||||
|
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
|
||||||
|
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
|
||||||
|
|
||||||
|
ARG PYTHON_VERSION=3.12
|
||||||
|
|
||||||
|
RUN mkdir -p /app
|
||||||
|
WORKDIR /app
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
|
||||||
|
# Install Python and other dependencies
|
||||||
|
RUN apt-get update -y \
|
||||||
|
&& apt-get install -y software-properties-common git curl sudo vim less \
|
||||||
|
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||||
|
python${PYTHON_VERSION}-lib2to3 python-is-python3 \
|
||||||
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||||
|
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||||
|
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||||
|
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
|
||||||
|
&& python3 --version && python3 -m pip --version
|
||||||
|
|
||||||
|
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
|
||||||
|
|
||||||
|
FROM base AS build_hipblaslt
|
||||||
|
ARG HIPBLASLT_BRANCH
|
||||||
|
ARG HIPBLAS_COMMON_BRANCH
|
||||||
|
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION
|
||||||
|
RUN git clone https://github.com/ROCm/hipBLAS-common.git
|
||||||
|
RUN cd hipBLAS-common \
|
||||||
|
&& git checkout ${HIPBLAS_COMMON_BRANCH} \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build \
|
||||||
|
&& cmake .. \
|
||||||
|
&& make package \
|
||||||
|
&& dpkg -i ./*.deb
|
||||||
|
RUN git clone https://github.com/ROCm/hipBLASLt
|
||||||
|
RUN cd hipBLASLt \
|
||||||
|
&& git checkout ${HIPBLASLT_BRANCH} \
|
||||||
|
&& ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
|
||||||
|
&& cd build/release \
|
||||||
|
&& make package
|
||||||
|
RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
|
||||||
|
|
||||||
|
FROM base AS build_rccl
|
||||||
|
ARG RCCL_BRANCH
|
||||||
|
ARG RCCL_REPO
|
||||||
|
RUN git clone ${RCCL_REPO}
|
||||||
|
RUN cd rccl \
|
||||||
|
&& git checkout ${RCCL_BRANCH} \
|
||||||
|
&& ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
|
||||||
|
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
|
||||||
|
|
||||||
|
FROM base AS build_triton
|
||||||
|
ARG TRITON_BRANCH
|
||||||
|
ARG TRITON_REPO
|
||||||
|
RUN git clone ${TRITON_REPO}
|
||||||
|
RUN cd triton \
|
||||||
|
&& git checkout ${TRITON_BRANCH} \
|
||||||
|
&& cd python \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=dist
|
||||||
|
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
|
||||||
|
|
||||||
|
FROM base AS build_amdsmi
|
||||||
|
RUN cd /opt/rocm/share/amd_smi \
|
||||||
|
&& pip wheel . --wheel-dir=dist
|
||||||
|
RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
|
||||||
|
|
||||||
|
FROM base AS build_pytorch
|
||||||
|
ARG PYTORCH_BRANCH
|
||||||
|
ARG PYTORCH_VISION_BRANCH
|
||||||
|
ARG PYTORCH_REPO
|
||||||
|
ARG PYTORCH_VISION_REPO
|
||||||
|
ARG FA_BRANCH
|
||||||
|
ARG FA_REPO
|
||||||
|
RUN git clone ${PYTORCH_REPO} pytorch
|
||||||
|
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
|
||||||
|
pip install -r requirements.txt && git submodule update --init --recursive \
|
||||||
|
&& python3 tools/amd_build/build_amd.py \
|
||||||
|
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
|
||||||
|
&& pip install dist/*.whl
|
||||||
|
RUN git clone ${PYTORCH_VISION_REPO} vision
|
||||||
|
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=dist \
|
||||||
|
&& pip install dist/*.whl
|
||||||
|
RUN git clone ${FA_REPO}
|
||||||
|
RUN cd flash-attention \
|
||||||
|
&& git checkout ${FA_BRANCH} \
|
||||||
|
&& git submodule update --init \
|
||||||
|
&& MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
|
||||||
|
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
|
||||||
|
&& cp /app/vision/dist/*.whl /app/install \
|
||||||
|
&& cp /app/flash-attention/dist/*.whl /app/install
|
||||||
|
|
||||||
|
FROM base AS final
|
||||||
|
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
|
||||||
|
dpkg -i /install/*deb \
|
||||||
|
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
|
||||||
|
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
|
||||||
|
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
|
||||||
|
dpkg -i /install/*deb \
|
||||||
|
&& sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
|
||||||
|
&& sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
|
||||||
|
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
|
||||||
|
pip install /install/*.whl
|
||||||
|
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG HIPBLASLT_BRANCH
|
||||||
|
ARG LEGACY_HIPBLASLT_OPTION
|
||||||
|
ARG RCCL_BRANCH
|
||||||
|
ARG RCCL_REPO
|
||||||
|
ARG TRITON_BRANCH
|
||||||
|
ARG TRITON_REPO
|
||||||
|
ARG PYTORCH_BRANCH
|
||||||
|
ARG PYTORCH_VISION_BRANCH
|
||||||
|
ARG PYTORCH_REPO
|
||||||
|
ARG PYTORCH_VISION_REPO
|
||||||
|
ARG FA_BRANCH
|
||||||
|
ARG FA_REPO
|
||||||
|
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
|
||||||
|
&& echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
|
||||||
|
&& echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
|
||||||
|
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
|
||||||
|
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
|
||||||
@ -123,11 +123,10 @@ It is important that the user kicks off the docker build using buildkit. Either
|
|||||||
<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
|
<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
|
||||||
It provides flexibility to customize the build of docker image using the following arguments:
|
It provides flexibility to customize the build of docker image using the following arguments:
|
||||||
|
|
||||||
- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image.
|
- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using <gh-file:Dockerfile.rocm_base>
|
||||||
- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target.
|
- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
|
||||||
- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
|
- `BUILD_RPD`: Include RocmProfileData profiling tool in the image
|
||||||
- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c`
|
- `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image
|
||||||
- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
|
|
||||||
|
|
||||||
Their values can be passed in when running `docker build` with `--build-arg` options.
|
Their values can be passed in when running `docker build` with `--build-arg` options.
|
||||||
|
|
||||||
@ -137,10 +136,10 @@ To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
|
|||||||
DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
|
DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
|
||||||
```
|
```
|
||||||
|
|
||||||
To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below:
|
To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
|
DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
|
||||||
```
|
```
|
||||||
|
|
||||||
To run the above docker image `vllm-rocm`, use the below command:
|
To run the above docker image `vllm-rocm`, use the below command:
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -16,7 +16,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -27,7 +27,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -38,7 +38,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 1,
|
"num_warps": 1,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -49,7 +49,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -60,7 +60,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 1,
|
"num_warps": 1,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -71,7 +71,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -82,7 +82,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -93,7 +93,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -104,7 +104,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -115,7 +115,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -126,7 +126,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -137,7 +137,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -148,7 +148,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 32,
|
"matrix_instr_nonkdim": 32,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -159,7 +159,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -170,7 +170,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -181,7 +181,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -192,7 +192,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -16,7 +16,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -27,7 +27,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -38,7 +38,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -49,7 +49,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -60,7 +60,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -71,7 +71,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -82,7 +82,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -93,7 +93,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -104,7 +104,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -115,7 +115,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -126,7 +126,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -137,7 +137,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -148,7 +148,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -159,7 +159,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -170,7 +170,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -181,7 +181,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -192,7 +192,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -16,7 +16,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -27,7 +27,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -38,7 +38,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -49,7 +49,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -60,7 +60,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -71,7 +71,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -82,7 +82,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -93,7 +93,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -104,7 +104,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -115,7 +115,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -126,7 +126,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -137,7 +137,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 32,
|
"matrix_instr_nonkdim": 32,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -148,7 +148,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -159,7 +159,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -170,7 +170,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -181,7 +181,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -192,7 +192,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -16,7 +16,7 @@
|
|||||||
"BLOCK_SIZE_K": 32,
|
"BLOCK_SIZE_K": 32,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -27,7 +27,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -38,7 +38,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -49,7 +49,7 @@
|
|||||||
"BLOCK_SIZE_K": 256,
|
"BLOCK_SIZE_K": 256,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -60,7 +60,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -71,7 +71,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -82,7 +82,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 2,
|
"num_warps": 2,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -93,7 +93,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -104,7 +104,7 @@
|
|||||||
"BLOCK_SIZE_K": 128,
|
"BLOCK_SIZE_K": 128,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 4,
|
"num_warps": 4,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -115,7 +115,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -126,7 +126,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 4,
|
"GROUP_SIZE_M": 4,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 32,
|
"matrix_instr_nonkdim": 32,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -137,7 +137,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -148,7 +148,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -159,7 +159,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -170,7 +170,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
@ -181,7 +181,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 2
|
"kpack": 2
|
||||||
@ -192,7 +192,7 @@
|
|||||||
"BLOCK_SIZE_K": 64,
|
"BLOCK_SIZE_K": 64,
|
||||||
"GROUP_SIZE_M": 1,
|
"GROUP_SIZE_M": 1,
|
||||||
"num_warps": 8,
|
"num_warps": 8,
|
||||||
"num_stages": 0,
|
"num_stages": 2,
|
||||||
"waves_per_eu": 0,
|
"waves_per_eu": 0,
|
||||||
"matrix_instr_nonkdim": 16,
|
"matrix_instr_nonkdim": 16,
|
||||||
"kpack": 1
|
"kpack": 1
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user