-
-
Notifications
You must be signed in to change notification settings - Fork 5.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Feature] Prototype of vLLM execution on CPU-only devices #1028
Closed
Closed
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# Development stage: Python base image with runtime and dev dependencies.
FROM python:3.10 AS dev

RUN apt-get update -y \
    && apt-get install -y python3-pip

WORKDIR /workspace

# install build and runtime dependencies
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-cpu.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt
|
||
# image to build pytorch extensions
FROM dev AS build

# install build dependencies
COPY requirements-build-cpu.txt requirements-build-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build-cpu.txt

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY requirements-cpu.txt requirements-cpu.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# max jobs used by Ninja to build extensions.
# NOTE: the ARG declaration is required — without it $max_jobs is never
# populated from --build-arg and MAX_JOBS is silently set to empty.
ARG max_jobs
ENV MAX_JOBS=$max_jobs
RUN python3 setup.py build_ext --inplace
|
||
# Test stage: runs the unit test suite against the built extensions.
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "pytest", "tests"]
|
||
# Runtime base stage. Uses a plain Python image (not a CUDA base) since this
# is the CPU-only build and all runtime dependencies are installed via pip.
# NOTE: this stage was previously named `AS dev`, which (a) duplicated the
# dev stage name above — rejected by BuildKit — and (b) left the
# `vllm-base` stage referenced by the `vllm` and `vllm-openai` stages
# below undefined.
FROM python:3.10 AS vllm-base

RUN apt-get update -y \
    && apt-get install -y python3-pip

WORKDIR /workspace
COPY requirements-cpu.txt requirements-cpu.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-cpu.txt
|
||
# Serving stage: ships only the compiled extensions and the python package.
FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
|
||
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate fschat

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#include "cpu_types.hpp" | ||
|
||
namespace { | ||
template <typename scalar_t> | ||
void silu_and_mul_cpu_impl(int num_tokens, int d, scalar_t *__restrict__ input, | ||
scalar_t *__restrict__ output) { | ||
using scalar_vec_t = vec_op::vec_t<scalar_t>; | ||
constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num(); | ||
|
||
TORCH_CHECK(d % VEC_ELEM_NUM == 0); | ||
|
||
const vec_op::FP32Vec8 zeros(0.0); | ||
const vec_op::FP32Vec8 ones(1.0); | ||
|
||
#pragma omp parallel for | ||
for (int i = 0; i < num_tokens; ++i) { | ||
for (int j = 0; j < d; j += VEC_ELEM_NUM) { | ||
const int start = i * 2 * d; | ||
const scalar_vec_t x(input + start + j); | ||
const scalar_vec_t y(input + start + d + j); | ||
|
||
const vec_op::FP32Vec8 f32_x(x.reg); | ||
const vec_op::FP32Vec8 f32_y(y.reg); | ||
|
||
const vec_op::FP32Vec8 f32_ans = | ||
f32_y * (f32_x / (ones + (zeros - f32_x).exp())); | ||
|
||
const scalar_vec_t ans(f32_ans.reg); | ||
ans.save(output + i * d + j); | ||
} | ||
} | ||
} | ||
}; // namespace | ||
|
||
/// Dispatch entry point for the fused SiLU-and-multiply activation on CPU.
/// `input` has last dimension 2*d (activation half then gate half);
/// `out` receives the result with last dimension d.
void silu_and_mul_cpu(torch::Tensor &out, torch::Tensor &input) {
  const int last_dim = input.size(-1);
  const int num_tokens = input.numel() / last_dim;
  const int d = last_dim / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "silu_and_mul_cpu_impl", [&] {
        CPU_KERNEL_GUARD_IN(silu_and_mul_cpu_impl)
        silu_and_mul_cpu_impl(num_tokens, d, input.data_ptr<scalar_t>(),
                              out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(silu_and_mul_cpu_impl)
      });
}
|
||
/// Stub: the gelu_new activation is not implemented for the CPU backend;
/// always raises a c10::Error.
void gelu_new_cpu(torch::Tensor &out, torch::Tensor &input) {
  // Explicit ';' so the statement does not rely on the macro expansion
  // being self-terminating.
  TORCH_CHECK(false, "gelu_new is unsupported on CPU.");
}
|
||
/// Stub: the gelu_fast activation is not implemented for the CPU backend;
/// always raises a c10::Error.
void gelu_fast_cpu(torch::Tensor &out, torch::Tensor &input) {
  // Explicit ';' so the statement does not rely on the macro expansion
  // being self-terminating.
  TORCH_CHECK(false, "gelu_fast is unsupported on CPU.");
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `vllm-base` stage is not defined anywhere in this Dockerfile, so the `FROM vllm-base` stages fail when building cpu.Dockerfile on its own — the second `FROM python:3.10 AS dev` should presumably be named `AS vllm-base`.