diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index e2f5e1e8e5062..0000000000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -cmake_minimum_required(VERSION 3.15) -message(STATUS "Building using CMake version: ${CMAKE_VERSION}") - -project(PageAttCPU) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED True) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -add_compile_options(-mfma -mavx512f -mavx512bf16 -mavx512vl) - -set(PYTHON_LIB_PATH "/usr/local/lib/python3.10/dist-packages/") -set(PYTHON_INCLUDE_PATH "/usr/include/python3.10") - -set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${PYTHON_LIB_PATH}) -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/FindModules") - -include_directories(${PYTHON_INCLUDE_PATH}) -include_directories("/usr/local/lib/python3.10/dist-packages/torch/include/") -include_directories("/usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include/") - -option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF) - -add_subdirectory(csrc/cpu) \ No newline at end of file diff --git a/FindModules/FindMKL.cmake b/FindModules/FindMKL.cmake deleted file mode 100644 index 01594a5b66e05..0000000000000 --- a/FindModules/FindMKL.cmake +++ /dev/null @@ -1,436 +0,0 @@ -# - Find INTEL MKL library -# -# This module sets the following variables: -# MKL_FOUND - set to true if a library implementing the CBLAS interface is found -# MKL_VERSION - best guess of the found mkl version -# MKL_INCLUDE_DIR - path to include dir. -# MKL_LIBRARIES - list of libraries for base mkl -# MKL_OPENMP_TYPE - OpenMP flavor that the found mkl uses: GNU or Intel -# MKL_OPENMP_LIBRARY - path to the OpenMP library the found mkl uses -# MKL_LAPACK_LIBRARIES - list of libraries to add for lapack -# MKL_SCALAPACK_LIBRARIES - list of libraries to add for scalapack -# MKL_SOLVER_LIBRARIES - list of libraries to add for the solvers -# MKL_CDFT_LIBRARIES - list of libraries to add for the solvers - -# Do nothing if MKL_FOUND was set before! -IF (NOT MKL_FOUND) - -SET(MKL_VERSION) -SET(MKL_INCLUDE_DIR) -SET(MKL_LIBRARIES) -SET(MKL_OPENMP_TYPE) -SET(MKL_OPENMP_LIBRARY) -SET(MKL_LAPACK_LIBRARIES) -SET(MKL_SCALAPACK_LIBRARIES) -SET(MKL_SOLVER_LIBRARIES) -SET(MKL_CDFT_LIBRARIES) - -# Includes -INCLUDE(CheckTypeSize) -INCLUDE(CheckFunctionExists) - -# Set default value of INTEL_COMPILER_DIR and INTEL_MKL_DIR -IF (WIN32) - IF(DEFINED ENV{MKLProductDir}) - SET(DEFAULT_INTEL_COMPILER_DIR $ENV{MKLProductDir}) - ELSE() - SET(DEFAULT_INTEL_COMPILER_DIR - "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries/windows") - ENDIF() - SET(DEFAULT_INTEL_MKL_DIR "${INTEL_COMPILER_DIR}/mkl") -ELSE (WIN32) - SET(DEFAULT_INTEL_COMPILER_DIR "/opt/intel") - SET(DEFAULT_INTEL_MKL_DIR "/opt/intel/mkl") -ENDIF (WIN32) - -# Intel Compiler Suite -SET(INTEL_COMPILER_DIR "${DEFAULT_INTEL_COMPILER_DIR}" CACHE STRING - "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)") -SET(INTEL_MKL_DIR "${DEFAULT_INTEL_MKL_DIR}" CACHE STRING - "Root directory of the Intel MKL (standalone)") -SET(INTEL_OMP_DIR "${DEFAULT_INTEL_MKL_DIR}" CACHE STRING - "Root directory of the Intel OpenMP (standalone)") -SET(MKL_THREADING "OMP" CACHE STRING "MKL flavor: SEQ, TBB or OMP (default)") - -IF (NOT "${MKL_THREADING}" STREQUAL "SEQ" AND - NOT "${MKL_THREADING}" STREQUAL "TBB" AND - NOT "${MKL_THREADING}" STREQUAL "OMP") - MESSAGE(FATAL_ERROR "Invalid MKL_THREADING (${MKL_THREADING}), should be one of: SEQ, TBB, OMP") -ENDIF() - -IF ("${MKL_THREADING}" STREQUAL "TBB" AND NOT USE_TBB) - MESSAGE(FATAL_ERROR "MKL_THREADING is TBB but USE_TBB is turned off") -ENDIF() - -MESSAGE(STATUS "MKL_THREADING = ${MKL_THREADING}") - -# Checks -CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP) -IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(mklvers "intel64") - SET(iccvers "intel64") - SET(mkl64s "_lp64") -ELSE ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(mklvers "32") - SET(iccvers "ia32") - SET(mkl64s) -ENDIF ("${SIZE_OF_VOIDP}" EQUAL 8) -IF(CMAKE_COMPILER_IS_GNUCC) - IF ("${MKL_THREADING}" STREQUAL "TBB") - SET(mklthreads "mkl_tbb_thread") - SET(mklrtls "tbb") - ELSE() - SET(mklthreads "mkl_gnu_thread" "mkl_intel_thread") - SET(mklrtls "gomp" "iomp5") - ENDIF() - SET(mklifaces "intel" "gf") -ELSE(CMAKE_COMPILER_IS_GNUCC) - IF ("${MKL_THREADING}" STREQUAL "TBB") - SET(mklthreads "mkl_tbb_thread") - SET(mklrtls "tbb") - ELSE() - SET(mklthreads "mkl_intel_thread") - SET(mklrtls "iomp5" "guide") - IF (MSVC) - SET(mklrtls "libiomp5md") - ENDIF (MSVC) - ENDIF() - SET(mklifaces "intel") -ENDIF (CMAKE_COMPILER_IS_GNUCC) - -# Kernel libraries dynamically loaded -SET(mklkerlibs "mc" "mc3" "nc" "p4n" "p4m" "p4m3" "p4p" "def") -SET(mklseq) - -# Paths -SET(saved_CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}) -SET(saved_CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH}) -IF(WIN32) - # Change mklvers and iccvers when we are using MSVC instead of ICC - IF(MSVC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - SET(mklvers "${mklvers}_win") - SET(iccvers "${iccvers}_win") - ENDIF() -ENDIF(WIN32) -IF (EXISTS ${INTEL_COMPILER_DIR}) - # TODO: diagnostic if dir does not exist - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_COMPILER_DIR}/lib/${iccvers}") - IF(MSVC) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_COMPILER_DIR}/compiler/lib/${iccvers}") - ENDIF() - IF (APPLE) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_COMPILER_DIR}/lib") - ENDIF() - IF (NOT EXISTS ${INTEL_MKL_DIR}) - SET(INTEL_MKL_DIR "${INTEL_COMPILER_DIR}/mkl") - ENDIF() -ENDIF() -IF (EXISTS ${INTEL_MKL_DIR}) - # TODO: diagnostic if dir does not exist - SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} - "${INTEL_MKL_DIR}/include") - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_MKL_DIR}/lib/${mklvers}") - IF (MSVC) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_MKL_DIR}/lib/${iccvers}") - IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_MKL_DIR}/win-x64") - ENDIF () - ENDIF() - IF (APPLE) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_MKL_DIR}/lib") - ENDIF() -ENDIF() - -IF (EXISTS ${INTEL_OMP_DIR}) - # TODO: diagnostic if dir does not exist - SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} - "${INTEL_OMP_DIR}/include") - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_OMP_DIR}/lib/${mklvers}") - IF (MSVC) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_OMP_DIR}/lib/${iccvers}") - IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_OMP_DIR}/win-x64") - ENDIF () - ENDIF() - IF (APPLE) - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_OMP_DIR}/lib") - ENDIF() -ENDIF() - -MACRO(GET_MKL_LIB_NAMES LIBRARIES INTERFACE MKL64) - cmake_parse_arguments("" "" "THREAD" "" ${ARGN}) - SET(${LIBRARIES} mkl_${INTERFACE}${MKL64} mkl_core) - IF(_THREAD) - LIST(INSERT ${LIBRARIES} 1 ${_THREAD}) - IF(UNIX AND ${USE_STATIC_MKL}) - # The thread library defines symbols required by the other MKL libraries so also add it last - LIST(APPEND ${LIBRARIES} ${_THREAD}) - ENDIF() - ENDIF() - IF(${USE_STATIC_MKL}) - IF(UNIX) - list(TRANSFORM ${LIBRARIES} PREPEND "lib") - list(TRANSFORM ${LIBRARIES} APPEND ".a") - ELSE() - message(WARNING "Ignoring USE_STATIC_MKL") - ENDIF() - ENDIF() -ENDMACRO() - -# Try linking multiple libs -MACRO(CHECK_ALL_LIBRARIES LIBRARIES OPENMP_TYPE OPENMP_LIBRARY _name _list _flags) - # This macro checks for the existence of the combination of libraries given by _list. - # If the combination is found, this macro checks whether we can link against that library - # combination using the name of a routine given by _name using the linker - # flags given by _flags. If the combination of libraries is found and passes - # the link test, LIBRARIES is set to the list of complete library paths that - # have been found. Otherwise, LIBRARIES is set to FALSE. - # N.B. _prefix is the prefix applied to the names of all cached variables that - # are generated internally and marked advanced by this macro. - SET(_prefix "${LIBRARIES}") - # start checking - SET(_libraries_work TRUE) - SET(${LIBRARIES}) - SET(${OPENMP_TYPE}) - SET(${OPENMP_LIBRARY}) - SET(_combined_name) - SET(_openmp_type) - SET(_openmp_library) - SET(_paths) - IF (NOT MKL_FIND_QUIETLY) - set(_str_list) - foreach(_elem ${_list}) - if(_str_list) - set(_str_list "${_str_list} - ${_elem}") - else() - set(_str_list "${_elem}") - endif() - endforeach(_elem) - message(STATUS "Checking for [${_str_list}]") - ENDIF () - SET(_found_tbb FALSE) - FOREACH(_library ${_list}) - SET(_combined_name ${_combined_name}_${_library}) - UNSET(${_prefix}_${_library}_LIBRARY) - IF(_libraries_work) - IF(${_library} MATCHES "omp") - IF(_openmp_type) - MESSAGE(FATAL_ERROR "More than one OpenMP libraries appear in the MKL test: ${_list}") - ELSEIF(${_library} MATCHES "gomp") - SET(_openmp_type "GNU") - # Use FindOpenMP to find gomp - FIND_PACKAGE(OpenMP QUIET) - IF(OPENMP_FOUND) - # Test that none of the found library names contains "iomp" (Intel - # OpenMP). This doesn't necessarily mean that we have gomp... but it - # is probably good enough since on gcc we should already have - # OpenMP_CXX_FLAGS="-fopenmp" and OpenMP_CXX_LIB_NAMES="". - SET(_found_gomp true) - FOREACH(_lib_name ${OpenMP_CXX_LIB_NAMES}) - IF (_found_gomp AND "${_lib_name}" MATCHES "iomp") - SET(_found_gomp false) - ENDIF() - ENDFOREACH() - IF(_found_gomp) - SET(${_prefix}_${_library}_LIBRARY ${OpenMP_CXX_FLAGS}) - SET(_openmp_library "${${_prefix}_${_library}_LIBRARY}") - ENDIF() - ENDIF(OPENMP_FOUND) - ELSEIF(${_library} MATCHES "iomp") - SET(_openmp_type "Intel") - FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library}) - SET(_openmp_library "${${_prefix}_${_library}_LIBRARY}") - ELSE() - MESSAGE(FATAL_ERROR "Unknown OpenMP flavor: ${_library}") - ENDIF() - ELSEIF(${_library} STREQUAL "tbb") - # Separately handling compiled TBB - SET(_found_tbb TRUE) - ELSE() - SET(lib_names ${_library}) - FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${lib_names}) - ENDIF() - MARK_AS_ADVANCED(${_prefix}_${_library}_LIBRARY) - IF(NOT (${_library} STREQUAL "tbb")) - SET(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - SET(_libraries_work ${${_prefix}_${_library}_LIBRARY}) - IF (NOT MKL_FIND_QUIETLY) - IF(${_prefix}_${_library}_LIBRARY) - MESSAGE(STATUS " Library ${_library}: ${${_prefix}_${_library}_LIBRARY}") - ELSE(${_prefix}_${_library}_LIBRARY) - MESSAGE(STATUS " Library ${_library}: not found") - ENDIF(${_prefix}_${_library}_LIBRARY) - ENDIF () - ENDIF() - ENDIF(_libraries_work) - ENDFOREACH(_library ${_list}) - # Test this combination of libraries. - IF(_libraries_work) - IF (NOT _found_tbb) - SET(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}}) - SET(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES};${CMAKE_REQUIRED_LIBRARIES}") - CHECK_FUNCTION_EXISTS(${_name} ${_prefix}${_combined_name}_WORKS) - SET(CMAKE_REQUIRED_LIBRARIES) - MARK_AS_ADVANCED(${_prefix}${_combined_name}_WORKS) - SET(_libraries_work ${${_prefix}${_combined_name}_WORKS}) - ENDIF() - ENDIF(_libraries_work) - # Fin - IF(_libraries_work) - SET(${OPENMP_TYPE} ${_openmp_type}) - MARK_AS_ADVANCED(${OPENMP_TYPE}) - SET(${OPENMP_LIBRARY} ${_openmp_library}) - MARK_AS_ADVANCED(${OPENMP_LIBRARY}) - ELSE (_libraries_work) - SET(${LIBRARIES}) - MARK_AS_ADVANCED(${LIBRARIES}) - ENDIF(_libraries_work) -ENDMACRO(CHECK_ALL_LIBRARIES) - -IF(WIN32) - SET(mkl_m "") - SET(mkl_pthread "") -ELSE(WIN32) - SET(mkl_m "m") - SET(mkl_pthread "pthread") -ENDIF(WIN32) - -IF(UNIX AND NOT APPLE) - SET(mkl_dl "${CMAKE_DL_LIBS}") -ELSE(UNIX AND NOT APPLE) - SET(mkl_dl "") -ENDIF(UNIX AND NOT APPLE) - -# Check for version 10/11 -IF (NOT MKL_LIBRARIES) - SET(MKL_VERSION 1011) -ENDIF (NOT MKL_LIBRARIES) - -# First: search for parallelized ones with intel thread lib -IF (NOT "${MKL_THREADING}" STREQUAL "SEQ") - FOREACH(mklrtl ${mklrtls} "") - FOREACH(mkliface ${mklifaces}) - FOREACH(mkl64 ${mkl64s} "") - FOREACH(mklthread ${mklthreads}) - IF (NOT MKL_LIBRARIES) - GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}") - CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "${mkl_lib_names};${mklrtl};${mkl_pthread};${mkl_m};${mkl_dl}" "") - ENDIF (NOT MKL_LIBRARIES) - ENDFOREACH(mklthread) - ENDFOREACH(mkl64) - ENDFOREACH(mkliface) - ENDFOREACH(mklrtl) -ENDIF (NOT "${MKL_THREADING}" STREQUAL "SEQ") - -# Second: search for sequential ones -FOREACH(mkliface ${mklifaces}) - FOREACH(mkl64 ${mkl64s} "") - IF (NOT MKL_LIBRARIES) - GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "mkl_sequential") - CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "${mkl_lib_names};${mkl_m};${mkl_dl}" "") - IF (MKL_LIBRARIES) - SET(mklseq "_sequential") - ENDIF (MKL_LIBRARIES) - ENDIF (NOT MKL_LIBRARIES) - ENDFOREACH(mkl64) -ENDFOREACH(mkliface) - -# First: search for parallelized ones with native pthread lib -FOREACH(mklrtl ${mklrtls} "") - FOREACH(mkliface ${mklifaces}) - FOREACH(mkl64 ${mkl64s} "") - IF (NOT MKL_LIBRARIES) - GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}") - CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "${mkl_lib_names};${mklrtl};pthread;${mkl_m};${mkl_dl}" "") - ENDIF (NOT MKL_LIBRARIES) - ENDFOREACH(mkl64) - ENDFOREACH(mkliface) -ENDFOREACH(mklrtl) - -# Check for older versions -IF (NOT MKL_LIBRARIES) - SET(MKL_VERSION 900) - if (USE_STATIC_MKL) - message(WARNING "Ignoring USE_STATIC_MKL") - endif() - CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm - "mkl;guide;pthread;m" "") -ENDIF (NOT MKL_LIBRARIES) - -# Include files -IF (MKL_LIBRARIES) - FIND_PATH(MKL_INCLUDE_DIR "mkl_cblas.h") - MARK_AS_ADVANCED(MKL_INCLUDE_DIR) -ENDIF (MKL_LIBRARIES) - -# Other libraries -IF (MKL_LIBRARIES) - FOREACH(mkl64 ${mkl64s} "_core" "") - FOREACH(mkls ${mklseq} "") - IF (NOT MKL_LAPACK_LIBRARIES) - FIND_LIBRARY(MKL_LAPACK_LIBRARIES NAMES "mkl_lapack${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_LAPACK_LIBRARIES) - ENDIF (NOT MKL_LAPACK_LIBRARIES) - IF (NOT MKL_SCALAPACK_LIBRARIES) - FIND_LIBRARY(MKL_SCALAPACK_LIBRARIES NAMES "mkl_scalapack${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_SCALAPACK_LIBRARIES) - ENDIF (NOT MKL_SCALAPACK_LIBRARIES) - IF (NOT MKL_SOLVER_LIBRARIES) - FIND_LIBRARY(MKL_SOLVER_LIBRARIES NAMES "mkl_solver${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_SOLVER_LIBRARIES) - ENDIF (NOT MKL_SOLVER_LIBRARIES) - IF (NOT MKL_CDFT_LIBRARIES) - FIND_LIBRARY(MKL_CDFT_LIBRARIES NAMES "mkl_cdft${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_CDFT_LIBRARIES) - ENDIF (NOT MKL_CDFT_LIBRARIES) - ENDFOREACH(mkls) - ENDFOREACH(mkl64) -ENDIF (MKL_LIBRARIES) - -# Final -SET(CMAKE_LIBRARY_PATH ${saved_CMAKE_LIBRARY_PATH}) -SET(CMAKE_INCLUDE_PATH ${saved_CMAKE_INCLUDE_PATH}) -IF (MKL_LIBRARIES AND MKL_INCLUDE_DIR) - SET(MKL_FOUND TRUE) -ELSE (MKL_LIBRARIES AND MKL_INCLUDE_DIR) - if (MKL_LIBRARIES AND NOT MKL_INCLUDE_DIR) - MESSAGE(WARNING "MKL libraries files are found, but MKL header files are \ - not. You can get them by `conda install mkl-include` if using conda (if \ - it is missing, run `conda upgrade -n root conda` first), and \ - `pip install mkl-devel` if using pip. If build fails with header files \ - available in the system, please make sure that CMake will search the \ - directory containing them, e.g., by setting CMAKE_INCLUDE_PATH.") - endif() - SET(MKL_FOUND FALSE) - SET(MKL_VERSION) # clear MKL_VERSION -ENDIF (MKL_LIBRARIES AND MKL_INCLUDE_DIR) - -# Standard termination -IF(NOT MKL_FOUND AND MKL_FIND_REQUIRED) - MESSAGE(FATAL_ERROR "MKL library not found. Please specify library location \ - by appending the root directory of the MKL installation to the environment variable CMAKE_PREFIX_PATH.") -ENDIF(NOT MKL_FOUND AND MKL_FIND_REQUIRED) -IF(NOT MKL_FIND_QUIETLY) - IF(MKL_FOUND) - MESSAGE(STATUS "MKL library found") - ELSE(MKL_FOUND) - MESSAGE(STATUS "MKL library not found") - ENDIF(MKL_FOUND) -ENDIF(NOT MKL_FIND_QUIETLY) - -# Do nothing if MKL_FOUND was set before! -ENDIF (NOT MKL_FOUND) diff --git a/Makefile b/Makefile deleted file mode 100644 index 35ae0731c050d..0000000000000 --- a/Makefile +++ /dev/null @@ -1,57 +0,0 @@ -JOBS?=$(bash getconf _NPROCESSORS_CONF) - -.PHONY: clean build - -clean: - @ls | grep '^build-\(Debug\|Release\)' | xargs -r rm -r - -build: - @mkdir -p build-$(BUILD_TYPE) && \ - cmake -B build-$(BUILD_TYPE) -GNinja -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) - cmake --build build-$(BUILD_TYPE) -j $(JOBS) - -debug: - $(MAKE) build BUILD_TYPE=Debug ENABLE_SANITIZER=OFF - -debug-asn: - $(MAKE) build BUILD_TYPE=Debug ENABLE_SANITIZER=ON - -release: - $(MAKE) build BUILD_TYPE=Release ENABLE_SANITIZER=OFF - -release-debug: - $(MAKE) build BUILD_TYPE=RelWithDebInfo ENABLE_SANITIZER=OFF - -sanitizer: - echo 1 > /proc/sys/vm/overcommit_memory - -py_install: - VLLM_BUILD_CPU_OPS=1 MAX_JOBS=JOBS pip install --no-build-isolation -v -e . - -py_install_cpu: - VLLM_BUILD_CPU_ONLY=1 MAX_JOBS=JOBS pip install --no-build-isolation -v -e . - -install_vllm: - MAX_JOBS=JOBS pip install -v git+https://github.com/intel-sandbox/vllm-xpu.git@dev -f https://download.pytorch.org/whl/torch_stable.html - -package: - VLLM_BUILD_CPU_OPS=1 MAX_JOBS=JOBS python setup.py bdist_wheel - echo "Wheel package is saved in ./dist/" - -HF_TP_bench: - cd benchmarks && python benchmark_throughput.py --backend=hf --dataset=../ShareGPT_V3_unfiltered_cleaned_split.json --model=/root/frameworks.bigdata.dev-ops/vicuna-7b-v1.5/ --n=1 --num-prompts=1 --hf-max-batch-size=1 --trust-remote-code --device=cpu - -VLLM_TP_bench: - cd benchmarks && python benchmark_throughput.py --backend=vllm --dataset=/root/HF_models/ShareGPT_V3_unfiltered_cleaned_split.json --model=/root/HF_models/vicuna-7b-v1.5/ --n=1 --num-prompts=1 --dtype=float32 --trust-remote-code --device=cpu --swap-space=4 - -VLLM_LT_bench: - cd benchmarks && python benchmark_latency.py --model=/root/frameworks.bigdata.dev-ops/vicuna-7b-v1.5/ --n=1 --batch-size=48 --input-len=128 --output-len=128 --num-iters=8 --dtype=bfloat16 --trust-remote-code --device=cpu - -VLLM_SERVE_bench: - cd benchmarks && python -m vllm.entrypoints.api_server \ - --model /root/HF_models/vicuna-7b-v1.5/ --swap-space 40 \ - --disable-log-requests --dtype=bfloat16 --device cpu & \ - cd benchmarks && sleep 30 && python benchmark_serving.py \ - --backend vllm \ - --tokenizer /root/HF_models/vicuna-7b-v1.5/ --dataset /root/HF_models/ShareGPT_V3_unfiltered_cleaned_split.json \ - --request-rate 10 \ No newline at end of file diff --git a/README.md b/README.md index a03f0b978c1c9..622e49e6adaa4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -*** ***This is the development branch for vLLM CPU support.*** *** -

diff --git a/benchmarks/kernels/activation.py b/benchmarks/kernels/activation.py deleted file mode 100644 index f646ca1679806..0000000000000 --- a/benchmarks/kernels/activation.py +++ /dev/null @@ -1,32 +0,0 @@ -from threadpoolctl import threadpool_info -from pprint import pprint - -import torch -from benchmark import KernelBenchmark -from vllm.activation_ops import silu_and_mul - - -class ActivationBench(KernelBenchmark): - - def __init__(self, loop_time, num_tokens, d, dtype: torch.dtype, - device: torch.device) -> None: - super().__init__(loop_time) - self.num_tokens = num_tokens - self.d = d - self.input = torch.randn(num_tokens, 2 * d, dtype=dtype, device=device) - self.output = torch.empty(num_tokens, d, dtype=dtype, device=device) - - def _run(self): - for i in range(self.loop_time): - silu_and_mul(self.output, self.input) - - -bench = ActivationBench(10, 4096, 512, torch.float32, torch.device("cpu")) -bench.execute() - -pprint(threadpool_info()) - -# RMSNormBench(10, 4096, 4096, torch.float32, torch.device("cpu")) -# Scalar: 282420151.5 ns -# token parallel: 36635991.875 ns 7.7x -# FMA: 36517116.125 ns diff --git a/benchmarks/kernels/attention.py b/benchmarks/kernels/attention.py deleted file mode 100644 index 55a1c2af461fa..0000000000000 --- a/benchmarks/kernels/attention.py +++ /dev/null @@ -1,108 +0,0 @@ -from threadpoolctl import threadpool_info -from pprint import pprint - -import random -import torch -from benchmark import KernelBenchmark -from vllm.attention_ops import single_query_cached_kv_attention - - -class SingleCachedAttentionBench(KernelBenchmark): - - def __init__( - self, - loop_time, - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - device: torch.device, - num_kv_heads: int = None, - ) -> None: - super().__init__(loop_time) - self.block_size = block_size - qkv = torch.empty(num_tokens, - 3, - num_heads, - head_size, - dtype=dtype, - device=device) - qkv.uniform_(-1e-3, 1e-3) - self.query, _, _ = qkv.unbind(dim=1) - - x = 16 // torch.tensor([], dtype=dtype).element_size() - key_block_shape = (num_heads, head_size // x, block_size, x) - self.key_cache = torch.empty(size=(num_blocks, *key_block_shape), - dtype=dtype, - device=device) - self.key_cache.uniform_(-1e-3, 1e-3) - value_block_shape = (num_heads, head_size, block_size) - self.value_cache = torch.empty(size=(num_blocks, *value_block_shape), - dtype=dtype, - device=device) - self.value_cache.uniform_(-1e-3, 1e-3) - - context_lens = [random.randint(1, 4096) for _ in range(num_tokens)] - self.max_context_len = max(context_lens) - self.context_lens = torch.tensor(context_lens, - dtype=torch.int, - device=device) - - self.max_num_blocks_per_seq = (self.max_context_len + block_size - - 1) // block_size - block_tables = [] - for _ in range(num_tokens): - block_table = [ - random.randint(0, num_blocks - 1) - for _ in range(self.max_num_blocks_per_seq) - ] - block_tables.append(block_table) - self.block_tables = torch.tensor(block_tables, - dtype=torch.int, - device=device) - head_mapping = torch.arange(num_heads, - dtype=torch.int32, - device=device) - - self.scale = float(1.0 / (head_size**0.5)) - - num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - assert num_heads % num_kv_heads == 0 - self.num_queries_per_kv = num_heads // num_kv_heads - self.head_mapping = torch.repeat_interleave( - torch.arange(num_kv_heads, dtype=torch.int32, device=device), - self.num_queries_per_kv) - - self.output = torch.empty(num_tokens, - num_heads, - head_size, - dtype=dtype, - device=device) - - def _run(self): - single_query_cached_kv_attention( - self.output, - self.query, - self.key_cache, - self.value_cache, - self.head_mapping, - self.scale, - self.block_tables, - self.context_lens, - self.block_size, - self.max_context_len, - None, # ALiBi slopes. - ) - - -bench = SingleCachedAttentionBench(10, 32, 32, 256, 16, 1024, torch.float32, - torch.device('cpu'), 16) -bench.execute() - -# SingleCachedAttentionBench(10, 32, 32, 256, 16, 1024, torch.float32, torch.device('cpu'), 16) -# Scalar: 851373304 ns -# Parallel: 70520607.25 ns 10x - -pprint(threadpool_info()) diff --git a/benchmarks/kernels/benchmark.py b/benchmarks/kernels/benchmark.py deleted file mode 100644 index 20e53b4ee21b7..0000000000000 --- a/benchmarks/kernels/benchmark.py +++ /dev/null @@ -1,28 +0,0 @@ -from abc import ABC, abstractmethod -from time import perf_counter_ns -from statistics import mean - - -class KernelBenchmark(ABC): - - def __init__(self, loop_time) -> None: - super().__init__() - loop_time = loop_time if loop_time > 2 else 3 - - self.loop_time = loop_time - self.time = [] - - def execute(self): - for i in range(self.loop_time): - start = perf_counter_ns() - self._run() - end = perf_counter_ns() - self.time.append(end - start) - - self.time.sort() - avg = mean(self.time[1:-1]) - print("Execution time: {} ns".format(avg)) - - @abstractmethod - def _run(self): - pass diff --git a/benchmarks/kernels/cache_op.py b/benchmarks/kernels/cache_op.py deleted file mode 100644 index b4992bae804e8..0000000000000 --- a/benchmarks/kernels/cache_op.py +++ /dev/null @@ -1,119 +0,0 @@ -from threadpoolctl import threadpool_info -from pprint import pprint - -import torch -import random -from benchmark import KernelBenchmark -from vllm.cache_ops import copy_blocks, reshape_and_cache - - -class CacheCopyBench(KernelBenchmark): - - def __init__( - self, - loop_time, - num_mappings: int, - num_layers: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - device: torch.device, - ) -> None: - super().__init__(loop_time) - # Generate random block mappings. - src_blocks = random.sample(range(num_blocks), num_mappings) - remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) - dst_blocks = random.sample(remainig_blocks, num_mappings) - self.block_mapping = { - src: [dst] - for src, dst in zip(src_blocks, dst_blocks) - } - - # Create the KV cache. - x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, - x) - self.key_caches = [] - for _ in range(num_layers): - key_cache = torch.randn(size=key_cache_shape, - dtype=dtype, - device=device) - self.key_caches.append(key_cache) - - value_cache_shape = (num_blocks, num_heads, head_size, block_size) - self.value_caches = [] - for _ in range(num_layers): - value_cache = torch.randn(size=value_cache_shape, - dtype=dtype, - device=device) - self.value_caches.append(value_cache) - - def _run(self): - for i in range(self.loop_time): - copy_blocks(self.key_caches, self.value_caches, self.block_mapping) - - -class CacheReshapeBench(KernelBenchmark): - - def __init__( - self, - loop_time, - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - device: torch.device, - ) -> None: - super().__init__(loop_time) - num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) - self.slot_mapping = torch.tensor(slot_mapping, - dtype=torch.int, - device=device) - - qkv = torch.randn(num_tokens, - 3, - num_heads, - head_size, - dtype=dtype, - device=device) - _, self.key, self.value = qkv.unbind(dim=1) - - self.x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // self.x, - block_size, self.x) - self.key_cache = torch.randn(size=key_cache_shape, - dtype=dtype, - device=device) - - value_cache_shape = (num_blocks, num_heads, head_size, block_size) - self.value_cache = torch.randn(size=value_cache_shape, - dtype=dtype, - device=device) - - def _run(self): - reshape_and_cache(self.key, self.value, self.key_cache, - self.value_cache, self.slot_mapping) - - -# bench = CacheCopyBench(10, 256, 8, 16, 256, 16, 1024, torch.float32, torch.device("cpu")) -# bench.execute() - -# CacheCopyBench(10, 256, 8, 16, 256, 16, 1024, torch.float32, torch.device("cpu")) -# Scalar: 2731509071.375 ns -# Layer parallel: 510428213.5 ns 5.35x -# nested parallel: 434456796.5 ns 6.05x -# section parallel: 442927758 ns - -# bench = CacheReshapeBench(10, 128, 64, 256, 16, 1024, torch.float32, torch.device("cpu")) -# bench.execute() - -# CacheReshapeBench(10, 128, 64, 256, 16, 1024, torch.float32, torch.device("cpu")) -# Scalar: 77548817.875 ns -# Parallel: 7257660.75 ns 10x - -pprint(threadpool_info()) diff --git a/benchmarks/kernels/pos_encoding.py b/benchmarks/kernels/pos_encoding.py deleted file mode 100644 index 3c69ab806787a..0000000000000 --- a/benchmarks/kernels/pos_encoding.py +++ /dev/null @@ -1,58 +0,0 @@ -from threadpoolctl import threadpool_info -from pprint import pprint - -import torch -from benchmark import KernelBenchmark -from vllm import pos_encoding_ops - - -class PosEncodingBench(KernelBenchmark): - - def __init__(self, loop_time, num_tokens: int, num_heads: int, - head_size: int, max_position: int, rotary_dim: int, - dtype: torch.dtype, device: torch.device) -> None: - super().__init__(loop_time) - base: int = 10000 - self.positions = torch.randint(0, - max_position, (num_tokens, ), - device=device) - query = torch.randn(num_tokens, - num_heads * head_size, - dtype=dtype, - device=device) - key = torch.randn(num_tokens, - num_heads * head_size, - dtype=dtype, - device=device) - # Create the rotary embedding. - inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim)) - t = torch.arange(max_position).float() - freqs = torch.einsum('i,j -> ij', t, inv_freq.float()) - cos = freqs.cos() - sin = freqs.sin() - self.head_size = head_size - self.cos_sin_cache = torch.cat((cos, sin), dim=-1) - self.cos_sin_cache = self.cos_sin_cache.to(dtype=dtype, device=device) - self.out_query = query.clone() - self.out_key = key.clone() - - def _run(self): - for i in range(self.loop_time): - pos_encoding_ops.rotary_embedding_neox(self.positions, - self.out_query, - self.out_key, - self.head_size, - self.cos_sin_cache) - - -bench = PosEncodingBench(10, - num_tokens=4096, - num_heads=5, - head_size=128, - max_position=8192, - rotary_dim=128, - dtype=torch.float32, - device=torch.device("cpu")) -bench.execute() - -pprint(threadpool_info()) diff --git a/benchmarks/kernels/rmsnorm.py b/benchmarks/kernels/rmsnorm.py deleted file mode 100644 index 3140aea984d70..0000000000000 --- a/benchmarks/kernels/rmsnorm.py +++ /dev/null @@ -1,34 +0,0 @@ -from threadpoolctl import threadpool_info -from pprint import pprint - -import torch -from benchmark import KernelBenchmark -from vllm.layernorm_ops import rms_norm - - -class RMSNormBench(KernelBenchmark): - - def __init__(self, loop_time, token_num, hidden_size, dtype: torch.dtype, - device: torch.device) -> None: - super().__init__(loop_time) - self.x = torch.randn(token_num, - hidden_size, - dtype=dtype, - device=device) - self.out = torch.empty_like(self.x) - self.weight = torch.empty(hidden_size) - - def _run(self): - for i in range(self.loop_time): - rms_norm(self.out, self.x, self.weight, 1e-6) - - -bench = RMSNormBench(10, 4096, 4096, torch.float32, torch.device("cpu")) -bench.execute() - -pprint(threadpool_info()) - -# RMSNormBench(10, 4096, 4096, torch.float32, torch.device("cpu")) -# Scalar: 282420151.5 ns -# token parallel: 36635991.875 ns 7.7x -# FMA: 36517116.125 ns diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt deleted file mode 100644 index 8a5d40a6540bb..0000000000000 --- a/csrc/cpu/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ - -# find_package(Torch REQUIRED) -# message("Torch CXX FLAGS: ${TORCH_CXX_FLAGS}") -# message("Torch LIBS: ${TORCH_LIBRARIES}") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") - -# set(USE_STATIC_MKL OFF) -# find_package(MKL REQUIRED) -# message("MKL found: ${MKL_FOUND}, -# version: ${MKL_VERSION}, -# include dir: ${MKL_INCLUDE_DIR}, -# libs: ${MKL_LIBRARIES}, -# omp type: ${MKL_OPENMP_TYPE}, -# omp path: ${MKL_OPENMP_LIBRARY}") - -macro(add_operator_target op_name source_file) -add_library(${op_name} SHARED ${source_file}) -# target_link_libraries(${op_name} ${TORCH_LIBRARIES} ${MKL_LIBRARIES}) -endmacro(add_operator_target) - -add_operator_target(layernorm layernorm_impl.cpp) -add_operator_target(pos_encoding pos_encoding_impl.cpp) -add_operator_target(cache_op "cache_impl.cpp") -add_operator_target(activation activation_impl.cpp) -add_operator_target(attention attention_impl.cpp) diff --git a/utils/tensor_dump.py b/utils/tensor_dump.py deleted file mode 100644 index dedfffcfb1727..0000000000000 --- a/utils/tensor_dump.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import argparse -import torch - -DATA_PATH = "/root/vllm-xpu/data" - - -class TensorDumper: - - def __init__(self, ident: str, iter_limit: int = 3) -> None: - self.ident = ident - self.device = None - self.limit = iter_limit - self.iter_num = 0 - - def append(self, data: torch.Tensor): - if self.device is None: - self.device = data.device.type - - if self.iter_num < self.limit: - self.iter_num += 1 - torch.save( - data.cpu(), - os.path.join( - DATA_PATH, "{}_{}_{}".format(self.device, self.ident, - self.iter_num))) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Tensor dumper reader.") - parser.add_argument("--ident", type=str) - parser.add_argument("--iter-num", type=int) - parser.add_argument("--full_print", action='store_true') - - args = parser.parse_args() - ident = args.ident - iter_num = args.iter_num - full_print = args.full_print - - if full_print: - torch.set_printoptions(profile=full_print) - - cuda_t = torch.load( - os.path.join(DATA_PATH, "{}_{}_{}".format("cuda", ident, iter_num))) - cpu_t = torch.load( - os.path.join(DATA_PATH, "{}_{}_{}".format("cpu", ident, iter_num))) - - print("---", ident, "---", iter_num, "---") - print("Max_diff: ", (cuda_t - cpu_t).abs().max()) - print("Mean_diff: ", (cuda_t - cpu_t).abs().mean()) - print("cuda:", cuda_t.size()) - print(cuda_t) - print("---------------------------------") - print("cpu:", cpu_t.size()) - print(cpu_t) - print("---------------------------------")