[AMD][Build] Porting dockerfiles from the ROCm/vllm fork (#11777)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
This commit is contained in:
Gregory Shtrasberg 2025-01-20 23:22:23 -05:00 committed by GitHub
parent ecf67814f1
commit d4b62d4641
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 338 additions and 237 deletions

View File

@ -1,174 +1,118 @@
# Default ROCm 6.2 base image # default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" ARG REMOTE_VLLM="0"
ARG USE_CYTHON="0"
ARG BUILD_RPD="1"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
# Default ROCm ARCHes to build vLLM for. FROM ${BASE_IMAGE} AS base
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
# Whether to install CK-based flash-attention ARG ARG_PYTORCH_ROCM_ARCH
# If 0, will not install flash-attention ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
ARG BUILD_FA="1"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FA_BRANCH="3cea2fb"
# Whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="e192dba"
### Base image build stage
FROM $BASE_IMAGE AS base
# Import arg(s) defined before this build stage
ARG PYTORCH_ROCM_ARCH
# Install some basic utilities # Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update -q -y && apt-get install -q -y \
RUN apt-get update && apt-get install -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
curl \ # Remove sccache
ca-certificates \ RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
tmux \
ccache \
&& rm -rf /var/lib/apt/lists/*
# When launching the container, mount the code directory to /vllm-workspace
ARG APP_MOUNT=/vllm-workspace
WORKDIR ${APP_MOUNT}
RUN python3 -m pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR
# Install torch == 2.6.0 on ROCm WORKDIR ${COMMON_WORKDIR}
RUN --mount=type=cache,target=/root/.cache/pip \
case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.2"*) \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
torch \
'setuptools-scm>=8' \
torchvision \
--extra-index-url https://download.pytorch.org/whl/rocm6.2;; \
*) ;; esac
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV CCACHE_DIR=/root/.cache/ccache
### AMD-SMI build stage # -----------------------
FROM base AS build_amdsmi # vLLM fetch stages
# Build amdsmi wheel always FROM base AS fetch_vllm_0
RUN cd /opt/rocm/share/amd_smi \ ONBUILD COPY ./ vllm/
&& python3 -m pip wheel . --wheel-dir=/install FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main"
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git checkout ${VLLM_BRANCH}
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------
### Flash-Attention wheel build stage # vLLM build stages
FROM base AS build_fa FROM fetch_vllm AS build_vllm
ARG BUILD_FA ARG USE_CYTHON
ARG FA_GFX_ARCHS # Build vLLM
ARG FA_BRANCH RUN cd vllm \
# Build ROCm flash-attention wheel if `BUILD_FA = 1` && python3 -m pip install -r requirements-rocm.txt \
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_FA" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout "${FA_BRANCH}" \
&& git submodule update --init \
&& GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
### Triton wheel build stage
FROM base AS build_triton
ARG BUILD_TRITON
ARG TRITON_BRANCH
# Build triton wheel if `BUILD_TRITON = 1`
RUN --mount=type=cache,target=${CCACHE_DIR} \
if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& python3 -m pip install ninja cmake wheel pybind11 \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout "${TRITON_BRANCH}" \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=/install; \
# Create an empty directory otherwise as later build stages expect one
else mkdir -p /install; \
fi
### Final vLLM build stage
FROM base AS final
# Import the vLLM development directory from the build context
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN python3 -m pip install --upgrade pip
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
# Silences the HF Tokenizers warning
ENV TOKENIZERS_PARALLELISM=false
RUN --mount=type=cache,target=${CCACHE_DIR} \
--mount=type=bind,source=.git,target=.git \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -Ur requirements-rocm.txt \
&& python3 setup.py clean --all \ && python3 setup.py clean --all \
&& python3 setup.py develop && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
# Artifact-only stage: starts from an empty image and collects what later
# stages consume from the build_vllm stage (wheel, requirements files and a
# few repository directories).
FROM scratch AS export_vllm
# Re-declare the global ARG so it can be expanded in the COPY sources below.
ARG COMMON_WORKDIR
# Built wheel(s) and requirements*.txt land in the image root.
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
# Repository directories copied verbatim under matching top-level names.
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
# Copy amdsmi wheel into final image # -----------------------
RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ # Test vLLM image
mkdir -p libs \ FROM base AS test
&& cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y amdsmi;
# Copy triton wheel(s) into final image if they were built RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
mkdir -p libs \
&& if ls /install/*.whl; then \
cp /install/*.whl libs \
# Preemptively uninstall to avoid same-version no-installs
&& python3 -m pip uninstall -y triton; fi
# Copy flash-attn wheel(s) into final image if they were built # Install vLLM
RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
mkdir -p libs \ cd /install \
&& if ls /install/*.whl; then \ && pip install -U -r requirements-rocm.txt \
cp /install/*.whl libs \ && pip uninstall -y vllm \
# Preemptively uninstall to avoid same-version no-installs && pip install *.whl
&& python3 -m pip uninstall -y flash-attn; fi
# Install wheels that were built to the final image WORKDIR /vllm-workspace
RUN --mount=type=cache,target=/root/.cache/pip \ ARG COMMON_WORKDIR
if ls libs/*.whl; then \ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
python3 -m pip install libs/*.whl; fi
# install development dependencies (for testing) # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4
# -----------------------
# Final vLLM image
FROM base AS final

# Refresh pip and drop any apt package lists left in the image.
RUN python3 -m pip install --upgrade pip \
    && rm -rf /var/lib/apt/lists/*

# numpy 1.20.3 can be left in an odd state inside the conda py_3.9 environment:
# its dist-info has no METADATA but carries an extra LICENSES_bundled.txt.
# Delete that dist-info directory so later numpy upgrades can proceed.
RUN case "$(which python3)" in \
        *"/opt/conda/envs/py_3.9"*) \
            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
        *) ;; \
    esac

RUN python3 -m pip install --upgrade huggingface-hub[cli]

# Optional profiling tool: built from the nvtx_enabled branch of rocmProfileData
# only when BUILD_RPD is 1.
ARG BUILD_RPD
RUN if [ ${BUILD_RPD} -eq "1" ]; then \
        git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
        && cd rocmProfileData/rpd_tracer \
        && pip install -r requirements.txt && cd ../ \
        && make && make install \
        && cd hipMarker && python3 setup.py install ; \
    fi

# Install vLLM from the artifacts exported by the export_vllm stage; the stage
# root is bind-mounted at /install for the duration of this step only.
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    cd /install \
    && pip install -U -r requirements-rocm.txt \
    && pip uninstall -y vllm \
    && pip install *.whl

# Ship the benchmark scripts and examples alongside the installed package.
ARG COMMON_WORKDIR
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples

# Runtime environment; HIP_FORCE_DEV_KERNARG is a performance setting.
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
    TOKENIZERS_PARALLELISM=false \
    HIP_FORCE_DEV_KERNARG=1
CMD ["/bin/bash"] CMD ["/bin/bash"]

158
Dockerfile.rocm_base Normal file
View File

@ -0,0 +1,158 @@
# Global build arguments. They are declared before the first FROM, so a stage
# must re-declare one (bare `ARG NAME`) before it can read its value.

# Image every stage of this file is built on.
ARG BASE_IMAGE="rocm/dev-ubuntu-22.04:6.3.1-complete"

# hipBLAS-common / hipBLASLt revisions, plus an optional extra flag that is
# passed to hipBLASLt's install.sh (empty by default).
ARG HIPBLAS_COMMON_BRANCH="7c1566b"
ARG HIPBLASLT_BRANCH="4d40e36"
ARG LEGACY_HIPBLASLT_OPTION=""

# RCCL source and revision.
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG RCCL_BRANCH="648a58d"

# Triton source and revision.
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG TRITON_BRANCH="e5be006"

# PyTorch and torchvision sources and revisions.
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_BRANCH="8d4926e"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_VISION_BRANCH="v0.19.1"

# flash-attention source and revision.
ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
ARG FA_BRANCH="b7d29fb"
# Common foundation for every build stage and for the final image: ROCm
# environment variables, the requested Python version and basic build tooling.
FROM ${BASE_IMAGE} AS base

ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV ROCM_PATH=/opt/rocm
# No trailing ':' here: an empty LD_LIBRARY_PATH entry is interpreted by the
# dynamic loader as the current working directory, which would let libraries
# be picked up from wherever a process happens to be started.
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib
# GPU architectures to build for; exported as ENV so that derived stages and
# images see the same value.
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}

ARG PYTHON_VERSION=3.12

# WORKDIR creates the directory when it does not exist, so no separate
# `mkdir -p /app` step is needed.
WORKDIR /app

ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies.
# python${PYTHON_VERSION} comes from the deadsnakes PPA and is registered as the
# python3 alternative; pip is bootstrapped with get-pip.py. The trailing
# version checks fail the step if either python3 or pip is not usable.
RUN apt-get update -y \
    && apt-get install -y software-properties-common git curl sudo vim less \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
    && python3 --version && python3 -m pip --version

# Python-side build tooling shared by the component build stages.
RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
# Builds .deb packages for hipBLAS-common and hipBLASLt and gathers them in
# /app/install.
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH
ARG HIPBLAS_COMMON_BRANCH
# Set to "--legacy_hipblas_direct" for ROCm<=6.2
ARG LEGACY_HIPBLASLT_OPTION

# hipBLAS-common: check out the pinned revision, package it and install the
# resulting .deb in this stage before hipBLASLt is built.
RUN git clone https://github.com/ROCm/hipBLAS-common.git /app/hipBLAS-common
WORKDIR /app/hipBLAS-common
RUN git checkout ${HIPBLAS_COMMON_BRANCH} \
    && mkdir build \
    && cd build \
    && cmake .. \
    && make package \
    && dpkg -i ./*.deb

# hipBLASLt: build for the architectures in PYTORCH_ROCM_ARCH, then package.
# LEGACY_HIPBLASLT_OPTION is deliberately left unquoted so that an empty value
# contributes no argument at all.
RUN git clone https://github.com/ROCm/hipBLASLt /app/hipBLASLt
WORKDIR /app/hipBLASLt
RUN git checkout ${HIPBLASLT_BRANCH} \
    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
    && cd build/release \
    && make package

# Collect both sets of packages where a later stage can bind-mount them.
WORKDIR /app
RUN mkdir -p /app/install \
    && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
# Builds RCCL .deb packages and gathers them in /app/install.
FROM base AS build_rccl
ARG RCCL_BRANCH
ARG RCCL_REPO
# Clone into an explicit directory: without it git derives the directory name
# from the URL, and the build step below would break as soon as RCCL_REPO is
# overridden with a repository whose basename is not "rccl".
RUN git clone ${RCCL_REPO} rccl
# Check out the pinned revision and build for the architectures in
# PYTORCH_ROCM_ARCH (presumably `-p` makes install.sh produce packages; the
# next step expects .deb files under build/release).
RUN cd rccl \
    && git checkout ${RCCL_BRANCH} \
    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
# Builds a Triton wheel and places it in /app/install.
FROM base AS build_triton
ARG TRITON_BRANCH
ARG TRITON_REPO
# Clone into an explicit directory: without it git derives the directory name
# from the URL, and the build step below would break as soon as TRITON_REPO is
# overridden with a repository whose basename is not "triton".
RUN git clone ${TRITON_REPO} triton
# Check out the pinned revision and build the wheel from the python/ subtree.
RUN cd triton \
    && git checkout ${TRITON_BRANCH} \
    && cd python \
    && python3 setup.py bdist_wheel --dist-dir=dist
RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
# Builds the amdsmi wheel from the sources under /opt/rocm/share/amd_smi and
# copies it to /app/install.
FROM base AS build_amdsmi
RUN cd /opt/rocm/share/amd_smi \
    && pip wheel . --wheel-dir=dist \
    && mkdir -p /app/install \
    && cp dist/*.whl /app/install
# Builds PyTorch, torchvision and flash-attention wheels and gathers them in
# /app/install. PyTorch and torchvision are also installed inside this stage,
# ahead of the builds that follow them.
FROM base AS build_pytorch
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO

# PyTorch: pinned revision, submodules, tools/amd_build/build_amd.py, then the
# wheel build.
RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
    pip install -r requirements.txt && git submodule update --init --recursive \
    && python3 tools/amd_build/build_amd.py \
    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl

# torchvision.
RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && pip install dist/*.whl

# flash-attention. Clone into an explicit directory, as done for pytorch and
# vision above: otherwise git derives the directory name from the URL and the
# build step would break when FA_REPO is overridden with a repository whose
# basename is not "flash-attention".
RUN git clone ${FA_REPO} flash-attention
RUN cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist

# Collect all wheels where the final stage can bind-mount them.
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
    && cp /app/vision/dist/*.whl /app/install \
    && cp /app/flash-attention/dist/*.whl /app/install
# Final base image: installs the artifacts produced by the build stages and
# records the revisions they were built from in /app/versions.txt.
FROM base AS final

# Each build stage's /app/install directory is bind-mounted only for the step
# that consumes it, so the packages themselves are not stored in a layer.

# hipBLASLt / hipBLAS-common packages. The sed edits rewrite dependency lists
# in dpkg's status file, dropping the hipblaslt-dev / hipblaslt entries that
# sit in front of hipcub-dev / hipfft.
# NOTE(review): presumably this stops other ROCm packages from requiring the
# stock hipblaslt version - confirm.
RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status

# RCCL packages, with the same kind of status-file edit for the rccl-dev / rccl
# entries that precede rocalution.
RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
    dpkg -i /install/*deb \
    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status

# Python wheels: Triton, amdsmi, then PyTorch / torchvision / flash-attention.
RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
    pip install /install/*.whl
RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
    pip install /install/*.whl

# Re-declare every global ARG that is written to versions.txt below; an ARG
# that is not re-declared in this stage expands to an empty string.
ARG BASE_IMAGE
ARG HIPBLAS_COMMON_BRANCH
ARG HIPBLASLT_BRANCH
ARG LEGACY_HIPBLASLT_OPTION
ARG RCCL_BRANCH
ARG RCCL_REPO
ARG TRITON_BRANCH
ARG TRITON_REPO
ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO
ARG FA_BRANCH
ARG FA_REPO
RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
    && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
    && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
    && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt

View File

@ -123,11 +123,10 @@ It is important that the user kicks off the docker build using buildkit. Either
<gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. <gh-file:Dockerfile.rocm> uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
It provides flexibility to customize the build of docker image using the following arguments: It provides flexibility to customize the build of docker image using the following arguments:
- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is built using <gh-file:Dockerfile.rocm_base>.
- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. - `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build
- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` - `BUILD_RPD`: Include RocmProfileData profiling tool in the image
- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` - `ARG_PYTORCH_ROCM_ARCH`: Allows overriding the gfx architecture values from the base docker image
- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
Their values can be passed in when running `docker build` with `--build-arg` options. Their values can be passed in when running `docker build` with `--build-arg` options.
@ -137,10 +136,10 @@ To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
``` ```
To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
```console ```console
DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm .
``` ```
To run the above docker image `vllm-rocm`, use the below command: To run the above docker image `vllm-rocm`, use the below command:

View File

@ -5,7 +5,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -16,7 +16,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -27,7 +27,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -38,7 +38,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 1, "num_warps": 1,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -49,7 +49,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -60,7 +60,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 1, "num_warps": 1,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -71,7 +71,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -82,7 +82,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -93,7 +93,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -104,7 +104,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -115,7 +115,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -126,7 +126,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -137,7 +137,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -148,7 +148,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 32, "matrix_instr_nonkdim": 32,
"kpack": 2 "kpack": 2
@ -159,7 +159,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -170,7 +170,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -181,7 +181,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -192,7 +192,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1

View File

@ -5,7 +5,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -16,7 +16,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -27,7 +27,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -38,7 +38,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -49,7 +49,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -60,7 +60,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -71,7 +71,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -82,7 +82,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -93,7 +93,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -104,7 +104,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -115,7 +115,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -126,7 +126,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -137,7 +137,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -148,7 +148,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -159,7 +159,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -170,7 +170,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -181,7 +181,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -192,7 +192,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1

View File

@ -5,7 +5,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -16,7 +16,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -27,7 +27,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -38,7 +38,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -49,7 +49,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -60,7 +60,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -71,7 +71,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -82,7 +82,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -93,7 +93,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -104,7 +104,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -115,7 +115,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -126,7 +126,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -137,7 +137,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 32, "matrix_instr_nonkdim": 32,
"kpack": 2 "kpack": 2
@ -148,7 +148,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -159,7 +159,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -170,7 +170,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -181,7 +181,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -192,7 +192,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1

View File

@ -5,7 +5,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -16,7 +16,7 @@
"BLOCK_SIZE_K": 32, "BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -27,7 +27,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -38,7 +38,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -49,7 +49,7 @@
"BLOCK_SIZE_K": 256, "BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -60,7 +60,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -71,7 +71,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -82,7 +82,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 2, "num_warps": 2,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -93,7 +93,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -104,7 +104,7 @@
"BLOCK_SIZE_K": 128, "BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 4, "num_warps": 4,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -115,7 +115,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -126,7 +126,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 4, "GROUP_SIZE_M": 4,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 32, "matrix_instr_nonkdim": 32,
"kpack": 2 "kpack": 2
@ -137,7 +137,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -148,7 +148,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -159,7 +159,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -170,7 +170,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1
@ -181,7 +181,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 2 "kpack": 2
@ -192,7 +192,7 @@
"BLOCK_SIZE_K": 64, "BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1, "GROUP_SIZE_M": 1,
"num_warps": 8, "num_warps": 8,
"num_stages": 0, "num_stages": 2,
"waves_per_eu": 0, "waves_per_eu": 0,
"matrix_instr_nonkdim": 16, "matrix_instr_nonkdim": 16,
"kpack": 1 "kpack": 1