[CI] Shard tests for LoRA and Kernels to speed up (#3445)

2025-12-10 08:34:56 +08:00 · 2024-03-17 14:56:30 -07:00 · 2024-03-17 14:56:30 -07:00 · 93348d9458
commit 93348d9458
parent abfc4f3387
3 changed files with 10 additions and 5 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -33,9 +33,9 @@ steps:
 - label: Entrypoints Test
  command: pytest -v -s entrypoints
-- label: Kernels Test
+- label: Kernels Test %N
-  command: pytest -v -s kernels
+  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  soft_fail: true
+  parallelism: 4
 - label: Models Test
  commands:
@@ -55,8 +55,9 @@ steps:
 - label: Speculative decoding tests
  command: pytest -v -s spec_decode
-- label: LoRA Test
+- label: LoRA Test %N
-  command: pytest -v -s lora --forked
+  command: pytest -v -s lora --forked --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4
 - label: Metrics Test
  command: pytest -v -s metrics
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -20,6 +20,9 @@ steps:
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -16,6 +16,7 @@ pytest
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures
+pytest-shard
 httpx
 einops # required for MPT
 openai