Commit 85f79d1

Merge branch 'main' into addMoreTorchNightlyTest0429

2 parents: 3a2f273 + f2e7af9

130 files changed: +1,796 −741 lines

.buildkite/test-pipeline.yaml

+44 −48

@@ -393,13 +393,14 @@ steps:
   commands:
   - pytest -v -s benchmarks/
 
-- label: Quantization Test # 33min
+- label: Quantization Test
   torch_nightly: true
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   - tests/quantization
-  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  commands:
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"

@@ -446,82 +447,78 @@ steps:
   commands:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
+  - pytest -v -s models/test_utils.py
+  - pytest -v -s models/test_vision.py
   # V1 Test: https://github.com/vllm-project/vllm/issues/14531
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
 
-- label: Language Models Test (Standard) # 32min
+- label: Language Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
   - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/language -m core_model
+  - pytest -v -s models/language -m core_model
 
-- label: Language Models Test (Extended) # 1h10min
+- label: Language Models Test (Extended)
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-  - pip install causal-conv1d
-  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/language -m 'not core_model'
+  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pytest -v -s models/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 40min
+- label: Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/audio_language
-  - tests/models/encoder_decoder/vision_language
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal
-  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/vision_language -m core_model
-  - pytest -v -s models/encoder_decoder/audio_language -m core_model
-  - pytest -v -s models/encoder_decoder/language -m core_model
-  - pytest -v -s models/encoder_decoder/vision_language -m core_model
-  - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
-
-- label: Multi-Modal Models Test (Extended) 1 # 48m
+  - pytest -v -s models/multimodal/processing
+  - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
+  - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Models Test (Extended) 1
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/vision_language
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/vision_language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
-
-- label: Multi-Modal Models Test (Extended) 2 # 38m
+  - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+
+- label: Multi-Modal Models Test (Extended) 2
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/vision_language
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models Test (Extended) 3
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+- label: Quantized Models Test
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+  - pytest -v -s models/quantization
 
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test

@@ -591,9 +588,8 @@ steps:
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
   # test sequence parallel
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.

CMakeLists.txt

+1
@@ -241,6 +241,7 @@ set(VLLM_EXT_SRC
   "csrc/quantization/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
+  "csrc/quantization/activation_kernels.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/prepare_inputs/advance_step.cu"
   "csrc/custom_all_reduce.cu"

benchmarks/backend_request_func.py

+1
@@ -201,6 +201,7 @@ async def async_request_deepspeed_mii(
                                      timeout=AIOHTTP_TIMEOUT) as session:
 
         payload = {
+            "model": request_func_input.model,
             "prompt": request_func_input.prompt,
             "max_tokens": request_func_input.output_len,
             "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.

csrc/core/math.hpp

+19
@@ -7,3 +7,22 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
   if (num <= 1) return num;
   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
 }
+
+template <typename A, typename B>
+static inline constexpr auto div_ceil(A a, B b) {
+  return (a + b - 1) / b;
+}
+
+// Round a down to the previous multiple of b. The caller is responsible for
+// making sure that b is non-zero.
+template <typename T>
+inline constexpr T round_to_previous_multiple_of(T a, T b) {
+  return a % b == 0 ? a : (a / b) * b;
+}
+
+// Round a up to the next multiple of b. The caller is responsible for making
+// sure that b is non-zero.
+template <typename T>
+inline constexpr T round_to_next_multiple_of(T a, T b) {
+  return a % b == 0 ? a : ((a / b) + 1) * b;
+}
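
These helpers are what the fused activation/quantization kernel added later in this commit uses to split one token's hidden dimension across thread blocks while keeping each block's range aligned to a 128-bit load. A minimal, self-contained host-side sketch with hypothetical sizes (the helpers are restated from math.hpp so the snippet compiles on its own):

#include <cassert>
#include <cstdint>

// Restated from csrc/core/math.hpp so this example stands alone.
template <typename A, typename B>
static inline constexpr auto div_ceil(A a, B b) {
  return (a + b - 1) / b;
}

template <typename T>
inline constexpr T round_to_next_multiple_of(T a, T b) {
  return a % b == 0 ? a : ((a / b) + 1) * b;
}

int main() {
  // Hypothetical example: hidden size d = 4000 fp16 elements, 4 blocks per
  // token, and one 128-bit load holding 16 bytes / 2 bytes per fp16 = 8 elems.
  const int32_t d = 4000;
  const int32_t blocks_per_token = 4;
  const int32_t elems_per_128bit_load = (128 / 8) / 2;

  const int32_t tgt_elems_per_block = div_ceil(d, blocks_per_token);  // 1000
  const int32_t elems_per_block =
      round_to_next_multiple_of(tgt_elems_per_block, elems_per_128bit_load);
  assert(elems_per_block == 1000);  // already a multiple of 8

  // A size that does not divide evenly: 4100 / 4 -> 1025, rounded up to 1032.
  assert(round_to_next_multiple_of(div_ceil(4100, 4), 8) == 1032);
  return 0;
}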

csrc/ops.h

+3
@@ -97,6 +97,9 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
 
 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 
+void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
+                        torch::Tensor& scale);
+
 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
 
 void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
csrc/quantization/activation_kernels.cu

+120
@@ -0,0 +1,120 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cmath>
+#include "core/math.hpp"
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+
+#include "quantization/fp8/common.cuh"
+
+namespace vllm {
+
+template <typename T>
+__device__ __forceinline__ T silu_kernel(const T& x) {
+  // x * sigmoid(x)
+  return (T)(((float)x) / (1.0f + expf((float)-x)));
+}
+
+// Activation and gating kernel template.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&),
+          typename fp8_type>
+__global__ void act_and_mul_quant_kernel(
+    fp8_type* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const float* scale, const int d) {
+  const int32_t blocks_per_token = gridDim.y;
+
+  const int32_t elems_per_128bit_load = (128 / 8) / sizeof(scalar_t);
+
+  // We don't expect the hidden dimension to exceed 32 bits so int32 should
+  // be safe here.
+  const int32_t tgt_elems_per_block = div_ceil(d, blocks_per_token);
+  const int32_t elems_per_block =
+      round_to_next_multiple_of(tgt_elems_per_block, elems_per_128bit_load);
+  const int32_t block_start = blockIdx.y * elems_per_block;
+  int32_t block_end = block_start + elems_per_block;
+  block_end = block_end > d ? d : block_end;
+
+  // token_idx is 64 bit to prevent 32 bit overflow when the number of tokens
+  // is very large
+  const int64_t token_idx = blockIdx.x;
+  const scalar_t* __restrict__ x_ptr = input + token_idx * 2 * d;
+  const scalar_t* __restrict__ y_ptr = input + token_idx * 2 * d + d;
+  fp8_type* __restrict__ out_ptr = out + token_idx * d;
+
+  // 128-bit vectorized code
+  const int32_t vec_loop_end =
+      round_to_previous_multiple_of(elems_per_128bit_load, block_end);
+  const int32_t vec_end_idx = vec_loop_end / elems_per_128bit_load;
+  const int32_t vec_start_idx = block_start / elems_per_128bit_load;
+
+  const int4* __restrict__ x_128bit_ptr = reinterpret_cast<const int4*>(x_ptr);
+  const int4* __restrict__ y_128bit_ptr = reinterpret_cast<const int4*>(y_ptr);
+  int2* __restrict__ out_128bit_ptr = reinterpret_cast<int2*>(out_ptr);
+
+  float inverted_scale = 1 / *scale;
+#pragma unroll
+  for (int32_t vec_idx = vec_start_idx + threadIdx.x; vec_idx < vec_end_idx;
+       vec_idx += blockDim.x) {
+    const int4 x_128bit = VLLM_LDG(&x_128bit_ptr[vec_idx]);
+    const int4 y_128bit = VLLM_LDG(&y_128bit_ptr[vec_idx]);
+    using scalar_128bit_vec_t = std::array<scalar_t, elems_per_128bit_load>;
+    using scalar_64bit_vec_t = std::array<fp8_type, elems_per_128bit_load>;
+
+    scalar_64bit_vec_t out_vec;
+    const auto x_vec = reinterpret_cast<scalar_128bit_vec_t const&>(x_128bit);
+    const auto y_vec = reinterpret_cast<scalar_128bit_vec_t const&>(y_128bit);
+
+#pragma unroll
+    for (int i = 0; i < elems_per_128bit_load; i++) {
+      out_vec[i] = scaled_fp8_conversion<true, fp8_type>(
+          ACT_FN(x_vec[i]) * y_vec[i], inverted_scale);
+    }
+
+    out_128bit_ptr[vec_idx] = reinterpret_cast<const int2&>(out_vec);
+  }
+
+  // Scalar cleanup code
+  if (block_end > vec_loop_end) {
+    for (int64_t idx = vec_loop_end + threadIdx.x; idx < block_end;
+         idx += blockDim.x) {
+      const scalar_t x = VLLM_LDG(&x_ptr[idx]);
+      const scalar_t y = VLLM_LDG(&y_ptr[idx]);
+      out_ptr[idx] =
+          scaled_fp8_conversion<true, fp8_type>(ACT_FN(x) * y, inverted_scale);
+    }
+  }
+}
+}  // namespace vllm
+
+// Launch activation, gating, and quantize kernel.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                               \
+  int d = input.size(-1) / 2;                                               \
+  int64_t num_tokens = input.numel() / input.size(-1);                      \
+  dim3 grid(num_tokens, num_tokens > 16 ? num_tokens > 32 ? 1 : 2 : 4);     \
+  dim3 block(std::min(d, 512));                                             \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));         \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();             \
+  VLLM_DISPATCH_FLOATING_TYPES(                                             \
+      input.scalar_type(), "act_and_mul_kernel", [&] {                      \
+        VLLM_DISPATCH_FP8_TYPES(                                            \
+            out.scalar_type(), "fused_add_rms_norm_kernel_fp8_type", [&] {  \
+              vllm::act_and_mul_quant_kernel<scalar_t, KERNEL<scalar_t>,    \
+                                             fp8_t>                         \
+                  <<<grid, block, 0, stream>>>(out.data_ptr<fp8_t>(),       \
+                                               input.data_ptr<scalar_t>(),  \
+                                               scale.data_ptr<float>(), d); \
+            });                                                             \
+      });
+
+void silu_and_mul_quant(torch::Tensor& out,    // [..., d]
+                        torch::Tensor& input,  // [..., 2 * d]
+                        torch::Tensor& scale) {
+  TORCH_CHECK(out.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(input.dtype() == torch::kFloat16 ||
+              input.dtype() == torch::kBFloat16);
+  TORCH_CHECK(input.size(-1) % 2 == 0);
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
+}
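
In short, the kernel fuses SiLU, the gating multiply, and scaled FP8 quantization into a single pass over the [x | y] halves of each row, and the launch macro spreads each token over gridDim.y blocks (4 blocks per token for up to 16 tokens, 2 for 17-32, 1 beyond that). As a reading aid, here is a plain scalar C++ sketch of the per-element math; the clamp to ±448 is only a stand-in for the e4m3 saturation performed by scaled_fp8_conversion in vLLM's fp8 common header, not the actual device code:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one token row: `input` holds [x | y] of length 2*d and
// `out` receives d quantized values, kept as float here for illustration.
void silu_and_mul_quant_reference(const std::vector<float>& input, float scale,
                                  std::vector<float>& out) {
  const std::size_t d = input.size() / 2;
  out.resize(d);
  const float inverted_scale = 1.0f / scale;
  constexpr float kFp8E4M3Max = 448.0f;  // stand-in for fp8 e4m3 saturation
  for (std::size_t i = 0; i < d; ++i) {
    const float x = input[i];
    const float y = input[d + i];
    const float silu = x / (1.0f + std::exp(-x));    // x * sigmoid(x)
    const float scaled = silu * y * inverted_scale;  // apply 1/scale
    out[i] = std::clamp(scaled, -kFp8E4M3Max, kFp8E4M3Max);  // kernel casts to fp8 here
  }
}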

csrc/torch_bindings.cpp

+5 −1

@@ -81,9 +81,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   // Activation ops
   // Activation function used in SwiGLU.
-  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.def("silu_and_mul(Tensor! result, Tensor input) -> ()");
   ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
 
+  ops.def(
+      "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
+  ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
+
   ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
   ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
 
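
Once registered, the op is reachable from Python in vLLM via torch.ops._C.silu_and_mul_quant(out, x, scale). Below is a small C++-side sketch of the expected shapes and dtypes, assuming the translation unit includes csrc/ops.h and links against the built extension; the sizes are hypothetical:

#include <torch/torch.h>

#include "ops.h"  // declares silu_and_mul_quant (include path as used under csrc/)

int main() {
  const int64_t num_tokens = 4, d = 128;
  auto opts = torch::TensorOptions().device(torch::kCUDA);
  // Gate/up projection output laid out as [num_tokens, 2 * d], fp16 or bf16.
  auto input = torch::randn({num_tokens, 2 * d}, opts.dtype(torch::kFloat16));
  // Fused SiLU-and-mul output, quantized to fp8 e4m3: [num_tokens, d].
  auto out = torch::empty({num_tokens, d}, opts.dtype(torch::kFloat8_e4m3fn));
  // Single-element fp32 quantization scale.
  auto scale = torch::ones({1}, opts.dtype(torch::kFloat32));
  silu_and_mul_quant(out, input, scale);
  return 0;
}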

docker/Dockerfile.rocm

+9 −1

@@ -114,8 +114,16 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false
 
+# Env var that can improve safetensors loading and end-to-end time
+ENV SAFETENSORS_FAST_GPU=1
+
+# User-friendly environment setting for multiprocessing, to avoid the RuntimeError below.
+# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
+# you must use the 'spawn' start method
+# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
+ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
+
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 
 CMD ["/bin/bash"]
-

docs/source/deployment/frameworks/index.md

+1
@@ -12,5 +12,6 @@ lws
 modal
 open-webui
 skypilot
+streamlit
 triton
 :::
