diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index e2f5e1e8e5062..0000000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-cmake_minimum_required(VERSION 3.15)
-message(STATUS "Building using CMake version: ${CMAKE_VERSION}")
-
-project(PageAttCPU)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-
-add_compile_options(-mfma -mavx512f -mavx512bf16 -mavx512vl)  # NOTE(review): unconditional ISA flags for every target
-
-set(PYTHON_LIB_PATH "/usr/local/lib/python3.10/dist-packages/" CACHE PATH "Python site-packages directory that contains torch")
-set(PYTHON_INCLUDE_PATH "/usr/include/python3.10" CACHE PATH "Directory that contains the Python headers")
-
-list(APPEND CMAKE_PREFIX_PATH "${PYTHON_LIB_PATH}")
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/FindModules")
-
-include_directories("${PYTHON_INCLUDE_PATH}")
-include_directories("${PYTHON_LIB_PATH}/torch/include/")  # derived from PYTHON_LIB_PATH so an override applies here too
-include_directories("${PYTHON_LIB_PATH}/torch/include/torch/csrc/api/include/")
-
-option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF)
-
-add_subdirectory(csrc/cpu)
\ No newline at end of file
diff --git a/FindModules/FindMKL.cmake b/FindModules/FindMKL.cmake
deleted file mode 100644
index 01594a5b66e05..0000000000000
--- a/FindModules/FindMKL.cmake
+++ /dev/null
@@ -1,436 +0,0 @@
-# - Find INTEL MKL library
-#
-# This module sets the following variables:
-#  MKL_FOUND - set to true if a library implementing the CBLAS interface is found
-#  MKL_VERSION - best guess of the found mkl version
-#  MKL_INCLUDE_DIR - path to include dir.
-#  MKL_LIBRARIES - list of libraries for base mkl
-#  MKL_OPENMP_TYPE - OpenMP flavor that the found mkl uses: GNU or Intel
-#  MKL_OPENMP_LIBRARY - path to the OpenMP library the found mkl uses
-#  MKL_LAPACK_LIBRARIES - list of libraries to add for lapack
-#  MKL_SCALAPACK_LIBRARIES - list of libraries to add for scalapack
-#  MKL_SOLVER_LIBRARIES - list of libraries to add for the solvers
-#  MKL_CDFT_LIBRARIES - list of libraries to add for the cluster FFT (CDFT) interface
-
-# Do nothing if MKL_FOUND was set before!
-IF (NOT MKL_FOUND)
-
-SET(MKL_VERSION)
-SET(MKL_INCLUDE_DIR)
-SET(MKL_LIBRARIES)
-SET(MKL_OPENMP_TYPE)
-SET(MKL_OPENMP_LIBRARY)
-SET(MKL_LAPACK_LIBRARIES)
-SET(MKL_SCALAPACK_LIBRARIES)
-SET(MKL_SOLVER_LIBRARIES)
-SET(MKL_CDFT_LIBRARIES)
-
-# Includes
-INCLUDE(CheckTypeSize)
-INCLUDE(CheckFunctionExists)
-
-# Set default value of INTEL_COMPILER_DIR and INTEL_MKL_DIR
-IF (WIN32 AND DEFINED ENV{MKLProductDir})
-  SET(DEFAULT_INTEL_COMPILER_DIR "$ENV{MKLProductDir}")
-ELSEIF (WIN32)
-  SET(DEFAULT_INTEL_COMPILER_DIR "C:/Program Files (x86)/IntelSWTools/compilers_and_libraries/windows")
-ELSE ()
-  SET(DEFAULT_INTEL_COMPILER_DIR "/opt/intel")
-ENDIF ()
-IF (WIN32 AND INTEL_COMPILER_DIR)
-  SET(DEFAULT_INTEL_MKL_DIR "${INTEL_COMPILER_DIR}/mkl")  # keep honoring a user-provided or cached compiler root
-ELSE ()
-  SET(DEFAULT_INTEL_MKL_DIR "${DEFAULT_INTEL_COMPILER_DIR}/mkl")  # INTEL_COMPILER_DIR is not defined yet on a first configure
-ENDIF ()
-
-# Intel Compiler Suite
-SET(INTEL_COMPILER_DIR "${DEFAULT_INTEL_COMPILER_DIR}" CACHE STRING
-  "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)")
-SET(INTEL_MKL_DIR "${DEFAULT_INTEL_MKL_DIR}" CACHE STRING
-  "Root directory of the Intel MKL (standalone)")
-SET(INTEL_OMP_DIR "${DEFAULT_INTEL_MKL_DIR}" CACHE STRING
-  "Root directory of the Intel OpenMP (standalone)")
-SET(MKL_THREADING "OMP" CACHE STRING "MKL flavor: SEQ, TBB or OMP (default)")
-
-IF (NOT "${MKL_THREADING}" STREQUAL "SEQ" AND
-    NOT "${MKL_THREADING}" STREQUAL "TBB" AND
-    NOT "${MKL_THREADING}" STREQUAL "OMP")
-  MESSAGE(FATAL_ERROR "Invalid MKL_THREADING (${MKL_THREADING}), should be one of: SEQ, TBB, OMP")
-ENDIF()
-
-IF ("${MKL_THREADING}" STREQUAL "TBB" AND NOT USE_TBB)
-  MESSAGE(FATAL_ERROR "MKL_THREADING is TBB but USE_TBB is turned off")
-ENDIF()
-
-MESSAGE(STATUS "MKL_THREADING = ${MKL_THREADING}")
-
-# Checks
-CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP)
-IF ("${SIZE_OF_VOIDP}" EQUAL 8)
-  SET(mklvers "intel64")
-  SET(iccvers "intel64")
-  SET(mkl64s "_lp64")
-ELSE ("${SIZE_OF_VOIDP}" EQUAL 8)
-  SET(mklvers "32")
-  SET(iccvers "ia32")
-  SET(mkl64s)
-ENDIF ("${SIZE_OF_VOIDP}" EQUAL 8)
-IF(CMAKE_COMPILER_IS_GNUCC)
-  IF ("${MKL_THREADING}" STREQUAL "TBB")
-    SET(mklthreads "mkl_tbb_thread")
-    SET(mklrtls "tbb")
-  ELSE()
-    SET(mklthreads "mkl_gnu_thread" "mkl_intel_thread")
-    SET(mklrtls "gomp" "iomp5")
-  ENDIF()
-  SET(mklifaces  "intel" "gf")
-ELSE(CMAKE_COMPILER_IS_GNUCC)
-  IF ("${MKL_THREADING}" STREQUAL "TBB")
-    SET(mklthreads "mkl_tbb_thread")
-    SET(mklrtls "tbb")
-  ELSE()
-    SET(mklthreads "mkl_intel_thread")
-    SET(mklrtls "iomp5" "guide")
-    IF (MSVC)
-      SET(mklrtls "libiomp5md")
-    ENDIF (MSVC)
-  ENDIF()
-  SET(mklifaces  "intel")
-ENDIF (CMAKE_COMPILER_IS_GNUCC)
-
-# Kernel libraries dynamically loaded
-SET(mklkerlibs "mc" "mc3" "nc" "p4n" "p4m" "p4m3" "p4p" "def")
-SET(mklseq)
-
-# Paths
-SET(saved_CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH})
-SET(saved_CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH})
-IF(WIN32)
-  # Change mklvers and iccvers when we are using MSVC instead of ICC
-  IF(MSVC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-    SET(mklvers "${mklvers}_win")
-    SET(iccvers "${iccvers}_win")
-  ENDIF()
-ENDIF(WIN32)
-IF (EXISTS "${INTEL_COMPILER_DIR}")  # quoted: an empty value is false instead of a malformed if()
-  # TODO: diagnostic if dir does not exist
-  SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-    "${INTEL_COMPILER_DIR}/lib/${iccvers}")
-  IF(MSVC)
-    SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-      "${INTEL_COMPILER_DIR}/compiler/lib/${iccvers}")
-  ENDIF()
-  IF (APPLE)
-    SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-      "${INTEL_COMPILER_DIR}/lib")
-  ENDIF()
-  IF (NOT EXISTS "${INTEL_MKL_DIR}")  # fall back to the MKL bundled under the compiler suite root
-    SET(INTEL_MKL_DIR "${INTEL_COMPILER_DIR}/mkl")
-  ENDIF()
-ENDIF()
-IF (EXISTS "${INTEL_MKL_DIR}")  # quoted: an empty value is false instead of a malformed if()
-  # TODO: diagnostic if dir does not exist
-  SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH}
-    "${INTEL_MKL_DIR}/include")
-  SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-    "${INTEL_MKL_DIR}/lib/${mklvers}")
-  IF (MSVC)
-    SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-      "${INTEL_MKL_DIR}/lib/${iccvers}")
-    IF ("${SIZE_OF_VOIDP}" EQUAL 8)
-      SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-        "${INTEL_MKL_DIR}/win-x64")
-    ENDIF ()
-  ENDIF()
-  IF (APPLE)
-    SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-      "${INTEL_MKL_DIR}/lib")
-  ENDIF()
-ENDIF()
-
-IF (EXISTS "${INTEL_OMP_DIR}")  # quoted: an empty value is false instead of a malformed if()
-  # TODO: diagnostic if dir does not exist
-  SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH}
-    "${INTEL_OMP_DIR}/include")
-  SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-    "${INTEL_OMP_DIR}/lib/${mklvers}")
-  IF (MSVC)
-    SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-      "${INTEL_OMP_DIR}/lib/${iccvers}")
-    IF ("${SIZE_OF_VOIDP}" EQUAL 8)
-      SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-        "${INTEL_OMP_DIR}/win-x64")
-    ENDIF ()
-  ENDIF()
-  IF (APPLE)
-    SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}
-      "${INTEL_OMP_DIR}/lib")
-  ENDIF()
-ENDIF()
-
-MACRO(GET_MKL_LIB_NAMES LIBRARIES INTERFACE MKL64)  # usage: GET_MKL_LIB_NAMES(<out-var> <iface> <suffix> [THREAD <lib>])
-  cmake_parse_arguments("" "" "THREAD" "" ${ARGN})
-  SET(${LIBRARIES} mkl_${INTERFACE}${MKL64} mkl_core)
-  IF(_THREAD)
-    LIST(INSERT ${LIBRARIES} 1 ${_THREAD})
-    IF(UNIX AND USE_STATIC_MKL)
-      # The thread library defines symbols required by the other MKL libraries so also add it last
-      LIST(APPEND ${LIBRARIES} ${_THREAD})
-    ENDIF()
-  ENDIF()
-  IF(USE_STATIC_MKL)  # tested by name so an undefined USE_STATIC_MKL is simply false
-    IF(UNIX)
-      list(TRANSFORM ${LIBRARIES} PREPEND "lib")
-      list(TRANSFORM ${LIBRARIES} APPEND ".a")
-    ELSE()
-      message(WARNING "Ignoring USE_STATIC_MKL")
-    ENDIF()
-  ENDIF()
-ENDMACRO()
-
-# Try linking multiple libs
-MACRO(CHECK_ALL_LIBRARIES LIBRARIES OPENMP_TYPE OPENMP_LIBRARY _name _list _flags)
-  # This macro checks for the existence of the combination of libraries given by _list.
-  # If the combination is found, this macro checks whether we can link against that library
-  # combination using the name of a routine given by _name using the linker
-  # flags given by _flags.  If the combination of libraries is found and passes
-  # the link test, LIBRARIES is set to the list of complete library paths that
-  # have been found.  Otherwise, LIBRARIES is left unset.
-  # N.B. _prefix is the prefix applied to the names of all cached variables that
-  # are generated internally and marked advanced by this macro.
-  SET(_prefix "${LIBRARIES}")
-  # start checking
-  SET(_libraries_work TRUE)
-  SET(${LIBRARIES})
-  SET(${OPENMP_TYPE})
-  SET(${OPENMP_LIBRARY})
-  SET(_combined_name)
-  SET(_openmp_type)
-  SET(_openmp_library)
-  SET(_paths)
-  IF (NOT MKL_FIND_QUIETLY)
-    set(_str_list)
-    foreach(_elem ${_list})
-      if(_str_list)
-        set(_str_list "${_str_list} - ${_elem}")
-      else()
-        set(_str_list "${_elem}")
-      endif()
-    endforeach(_elem)
-    message(STATUS "Checking for [${_str_list}]")
-  ENDIF ()
-  SET(_found_tbb FALSE)
-  FOREACH(_library ${_list})
-    SET(_combined_name ${_combined_name}_${_library})
-    UNSET(${_prefix}_${_library}_LIBRARY)
-    IF(_libraries_work)
-      IF(${_library} MATCHES "omp")
-        IF(_openmp_type)
-          MESSAGE(FATAL_ERROR "More than one OpenMP libraries appear in the MKL test: ${_list}")
-        ELSEIF(${_library} MATCHES "gomp")
-          SET(_openmp_type "GNU")
-          # Use FindOpenMP to find gomp
-          FIND_PACKAGE(OpenMP QUIET)
-          IF(OPENMP_FOUND)
-            # Test that none of the found library names contains "iomp" (Intel
-            # OpenMP). This doesn't necessarily mean that we have gomp... but it
-            # is probably good enough since on gcc we should already have
-            # OpenMP_CXX_FLAGS="-fopenmp" and OpenMP_CXX_LIB_NAMES="".
-            SET(_found_gomp true)
-            FOREACH(_lib_name ${OpenMP_CXX_LIB_NAMES})
-              IF (_found_gomp AND "${_lib_name}" MATCHES "iomp")
-                SET(_found_gomp false)
-              ENDIF()
-            ENDFOREACH()
-            IF(_found_gomp)
-              SET(${_prefix}_${_library}_LIBRARY ${OpenMP_CXX_FLAGS})
-              SET(_openmp_library "${${_prefix}_${_library}_LIBRARY}")
-            ENDIF()
-          ENDIF(OPENMP_FOUND)
-        ELSEIF(${_library} MATCHES "iomp")
-          SET(_openmp_type "Intel")
-          FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library})
-          SET(_openmp_library "${${_prefix}_${_library}_LIBRARY}")
-        ELSE()
-          MESSAGE(FATAL_ERROR "Unknown OpenMP flavor: ${_library}")
-        ENDIF()
-      ELSEIF(${_library} STREQUAL "tbb")
-        # Separately handling compiled TBB
-        SET(_found_tbb TRUE)
-      ELSE()
-        SET(lib_names ${_library})
-        FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${lib_names})
-      ENDIF()
-      MARK_AS_ADVANCED(${_prefix}_${_library}_LIBRARY)
-      IF(NOT (${_library} STREQUAL "tbb"))
-        SET(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY})
-        SET(_libraries_work ${${_prefix}_${_library}_LIBRARY})
-        IF (NOT MKL_FIND_QUIETLY)
-          IF(${_prefix}_${_library}_LIBRARY)
-            MESSAGE(STATUS "  Library ${_library}: ${${_prefix}_${_library}_LIBRARY}")
-          ELSE(${_prefix}_${_library}_LIBRARY)
-            MESSAGE(STATUS "  Library ${_library}: not found")
-          ENDIF(${_prefix}_${_library}_LIBRARY)
-        ENDIF ()
-      ENDIF()
-    ENDIF(_libraries_work)
-  ENDFOREACH(_library ${_list})
-  # Test this combination of libraries.
-  IF(_libraries_work)
-    IF (NOT _found_tbb)
-      SET(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}})
-      SET(CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES};${CMAKE_REQUIRED_LIBRARIES}")
-      CHECK_FUNCTION_EXISTS(${_name} ${_prefix}${_combined_name}_WORKS)
-      SET(CMAKE_REQUIRED_LIBRARIES)
-      MARK_AS_ADVANCED(${_prefix}${_combined_name}_WORKS)
-      SET(_libraries_work ${${_prefix}${_combined_name}_WORKS})
-    ENDIF()
-  ENDIF(_libraries_work)
-  # Fin
-  IF(_libraries_work)
-    SET(${OPENMP_TYPE} ${_openmp_type})
-    MARK_AS_ADVANCED(${OPENMP_TYPE})
-    SET(${OPENMP_LIBRARY} ${_openmp_library})
-    MARK_AS_ADVANCED(${OPENMP_LIBRARY})
-  ELSE (_libraries_work)
-    SET(${LIBRARIES})
-    MARK_AS_ADVANCED(${LIBRARIES})
-  ENDIF(_libraries_work)
-ENDMACRO(CHECK_ALL_LIBRARIES)
-
-IF(WIN32)
-  SET(mkl_m "")
-  SET(mkl_pthread "")
-ELSE(WIN32)
-  SET(mkl_m "m")
-  SET(mkl_pthread "pthread")
-ENDIF(WIN32)
-
-IF(UNIX AND NOT APPLE)
-  SET(mkl_dl "${CMAKE_DL_LIBS}")
-ELSE(UNIX AND NOT APPLE)
-  SET(mkl_dl "")
-ENDIF(UNIX AND NOT APPLE)
-
-# Check for version 10/11
-IF (NOT MKL_LIBRARIES)
-  SET(MKL_VERSION 1011)
-ENDIF (NOT MKL_LIBRARIES)
-
-# First: search for parallelized ones with intel thread lib
-IF (NOT "${MKL_THREADING}" STREQUAL "SEQ")
-  FOREACH(mklrtl ${mklrtls} "")
-    FOREACH(mkliface ${mklifaces})
-      FOREACH(mkl64 ${mkl64s} "")
-        FOREACH(mklthread ${mklthreads})
-          IF (NOT MKL_LIBRARIES)
-            GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}")
-            CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm
-              "${mkl_lib_names};${mklrtl};${mkl_pthread};${mkl_m};${mkl_dl}" "")
-          ENDIF (NOT MKL_LIBRARIES)
-        ENDFOREACH(mklthread)
-      ENDFOREACH(mkl64)
-    ENDFOREACH(mkliface)
-  ENDFOREACH(mklrtl)
-ENDIF (NOT "${MKL_THREADING}" STREQUAL "SEQ")
-
-# Second: search for sequential ones
-FOREACH(mkliface ${mklifaces})
-  FOREACH(mkl64 ${mkl64s} "")
-    IF (NOT MKL_LIBRARIES)
-      GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "mkl_sequential")
-      CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm
-        "${mkl_lib_names};${mkl_m};${mkl_dl}" "")
-      IF (MKL_LIBRARIES)
-        SET(mklseq "_sequential")
-      ENDIF (MKL_LIBRARIES)
-    ENDIF (NOT MKL_LIBRARIES)
-  ENDFOREACH(mkl64)
-ENDFOREACH(mkliface)
-
-# Third: search for parallelized ones with native pthread lib
-FOREACH(mklrtl ${mklrtls} "")
-  FOREACH(mkliface ${mklifaces})
-    FOREACH(mkl64 ${mkl64s} "")
-      IF (NOT MKL_LIBRARIES)
-        GET_MKL_LIB_NAMES(mkl_lib_names "${mkliface}" "${mkl64}" THREAD "${mklthread}")
-        CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm
-          "${mkl_lib_names};${mklrtl};pthread;${mkl_m};${mkl_dl}" "")
-      ENDIF (NOT MKL_LIBRARIES)
-    ENDFOREACH(mkl64)
-  ENDFOREACH(mkliface)
-ENDFOREACH(mklrtl)
-
-# Check for older versions
-IF (NOT MKL_LIBRARIES)
-  SET(MKL_VERSION 900)
-  if (USE_STATIC_MKL)
-      message(WARNING "Ignoring USE_STATIC_MKL")
-  endif()
-  CHECK_ALL_LIBRARIES(MKL_LIBRARIES MKL_OPENMP_TYPE MKL_OPENMP_LIBRARY cblas_sgemm
-    "mkl;guide;pthread;m" "")
-ENDIF (NOT MKL_LIBRARIES)
-
-# Include files
-IF (MKL_LIBRARIES)
-  FIND_PATH(MKL_INCLUDE_DIR "mkl_cblas.h")
-  MARK_AS_ADVANCED(MKL_INCLUDE_DIR)
-ENDIF (MKL_LIBRARIES)
-
-# Other libraries
-IF (MKL_LIBRARIES)
-  FOREACH(mkl64 ${mkl64s} "_core" "")
-    FOREACH(mkls ${mklseq} "")
-      IF (NOT MKL_LAPACK_LIBRARIES)
-        FIND_LIBRARY(MKL_LAPACK_LIBRARIES NAMES "mkl_lapack${mkl64}${mkls}")
-        MARK_AS_ADVANCED(MKL_LAPACK_LIBRARIES)
-      ENDIF (NOT MKL_LAPACK_LIBRARIES)
-      IF (NOT MKL_SCALAPACK_LIBRARIES)
-        FIND_LIBRARY(MKL_SCALAPACK_LIBRARIES NAMES "mkl_scalapack${mkl64}${mkls}")
-        MARK_AS_ADVANCED(MKL_SCALAPACK_LIBRARIES)
-      ENDIF (NOT MKL_SCALAPACK_LIBRARIES)
-      IF (NOT MKL_SOLVER_LIBRARIES)
-        FIND_LIBRARY(MKL_SOLVER_LIBRARIES NAMES "mkl_solver${mkl64}${mkls}")
-        MARK_AS_ADVANCED(MKL_SOLVER_LIBRARIES)
-      ENDIF (NOT MKL_SOLVER_LIBRARIES)
-      IF (NOT MKL_CDFT_LIBRARIES)
-        FIND_LIBRARY(MKL_CDFT_LIBRARIES NAMES "mkl_cdft${mkl64}${mkls}")
-        MARK_AS_ADVANCED(MKL_CDFT_LIBRARIES)
-      ENDIF (NOT MKL_CDFT_LIBRARIES)
-    ENDFOREACH(mkls)
-  ENDFOREACH(mkl64)
-ENDIF (MKL_LIBRARIES)
-
-# Final
-SET(CMAKE_LIBRARY_PATH ${saved_CMAKE_LIBRARY_PATH})
-SET(CMAKE_INCLUDE_PATH ${saved_CMAKE_INCLUDE_PATH})
-IF (MKL_LIBRARIES AND MKL_INCLUDE_DIR)
-  SET(MKL_FOUND TRUE)
-ELSE (MKL_LIBRARIES AND MKL_INCLUDE_DIR)
-  if (MKL_LIBRARIES AND NOT MKL_INCLUDE_DIR)
-    MESSAGE(WARNING "MKL libraries files are found, but MKL header files are \
-      not. You can get them by `conda install mkl-include` if using conda (if \
-      it is missing, run `conda upgrade -n root conda` first), and \
-      `pip install mkl-devel` if using pip. If build fails with header files \
-      available in the system, please make sure that CMake will search the \
-      directory containing them, e.g., by setting CMAKE_INCLUDE_PATH.")
-  endif()
-  SET(MKL_FOUND FALSE)
-  SET(MKL_VERSION)  # clear MKL_VERSION
-ENDIF (MKL_LIBRARIES AND MKL_INCLUDE_DIR)
-
-# Standard termination
-IF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
-  MESSAGE(FATAL_ERROR "MKL library not found. Please specify library location \
-    by appending the root directory of the MKL installation to the environment variable CMAKE_PREFIX_PATH.")
-ENDIF(NOT MKL_FOUND AND MKL_FIND_REQUIRED)
-IF(NOT MKL_FIND_QUIETLY)
-  IF(MKL_FOUND)
-    MESSAGE(STATUS "MKL library found")
-  ELSE(MKL_FOUND)
-    MESSAGE(STATUS "MKL library not found")
-  ENDIF(MKL_FOUND)
-ENDIF(NOT MKL_FIND_QUIETLY)
-
-# Do nothing if MKL_FOUND was set before!
-ENDIF (NOT MKL_FOUND)
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 35ae0731c050d..0000000000000
--- a/Makefile
+++ /dev/null
@@ -1,57 +0,0 @@
-JOBS?=$(shell getconf _NPROCESSORS_CONF)
-# JOBS defaults to the configured processor count; every target listed below is a command, not a file.
-.PHONY: clean build debug debug-asn release release-debug sanitizer py_install py_install_cpu install_vllm package HF_TP_bench VLLM_TP_bench VLLM_LT_bench VLLM_SERVE_bench
-
-clean:  # removes every build-<type> directory, including the RelWithDebInfo one made by release-debug
-	@ls | grep '^build-\(Debug\|Release\|RelWithDebInfo\)' | xargs -r rm -r
-
-build:
-	@mkdir -p build-$(BUILD_TYPE) && \
-	cmake -B build-$(BUILD_TYPE) -GNinja -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) 
-	cmake --build build-$(BUILD_TYPE) -j $(JOBS) 
-
-debug:
-	$(MAKE) build BUILD_TYPE=Debug ENABLE_SANITIZER=OFF
-
-debug-asn:
-	$(MAKE) build BUILD_TYPE=Debug ENABLE_SANITIZER=ON
-
-release:
-	$(MAKE) build BUILD_TYPE=Release ENABLE_SANITIZER=OFF
-
-release-debug:
-	$(MAKE) build BUILD_TYPE=RelWithDebInfo ENABLE_SANITIZER=OFF
-
-sanitizer:
-	echo 1 > /proc/sys/vm/overcommit_memory
-
-py_install:  # editable install with VLLM_BUILD_CPU_OPS=1; MAX_JOBS carries the value of JOBS
-	VLLM_BUILD_CPU_OPS=1 MAX_JOBS=$(JOBS) pip install --no-build-isolation  -v -e .
-
-py_install_cpu:  # editable install with VLLM_BUILD_CPU_ONLY=1; MAX_JOBS carries the value of JOBS
-	VLLM_BUILD_CPU_ONLY=1 MAX_JOBS=$(JOBS) pip install --no-build-isolation  -v -e .
-
-install_vllm:  # installs the dev branch from git; MAX_JOBS carries the value of JOBS
-	MAX_JOBS=$(JOBS) pip install -v git+https://github.com/intel-sandbox/vllm-xpu.git@dev -f https://download.pytorch.org/whl/torch_stable.html
-
-package:  # builds a wheel with VLLM_BUILD_CPU_OPS=1; MAX_JOBS carries the value of JOBS
-	VLLM_BUILD_CPU_OPS=1 MAX_JOBS=$(JOBS) python setup.py bdist_wheel
-	echo "Wheel package is saved in ./dist/"
-
-HF_TP_bench:
-	cd benchmarks && python benchmark_throughput.py --backend=hf --dataset=../ShareGPT_V3_unfiltered_cleaned_split.json --model=/root/frameworks.bigdata.dev-ops/vicuna-7b-v1.5/ --n=1 --num-prompts=1 --hf-max-batch-size=1 --trust-remote-code --device=cpu
-
-VLLM_TP_bench:
-	cd benchmarks && python benchmark_throughput.py --backend=vllm --dataset=/root/HF_models/ShareGPT_V3_unfiltered_cleaned_split.json --model=/root/HF_models/vicuna-7b-v1.5/ --n=1 --num-prompts=1 --dtype=float32 --trust-remote-code --device=cpu --swap-space=4
-
-VLLM_LT_bench:
-	cd benchmarks && python benchmark_latency.py --model=/root/frameworks.bigdata.dev-ops/vicuna-7b-v1.5/ --n=1 --batch-size=48 --input-len=128 --output-len=128 --num-iters=8 --dtype=bfloat16 --trust-remote-code --device=cpu
-
-VLLM_SERVE_bench:
-	cd benchmarks && python -m vllm.entrypoints.api_server \
-        --model /root/HF_models/vicuna-7b-v1.5/ --swap-space 40 \
-        --disable-log-requests --dtype=bfloat16 --device cpu & \
-	cd benchmarks && sleep 30 && python benchmark_serving.py \
-        --backend vllm \
-        --tokenizer /root/HF_models/vicuna-7b-v1.5/ --dataset /root/HF_models/ShareGPT_V3_unfiltered_cleaned_split.json \
-        --request-rate 10
\ No newline at end of file
diff --git a/README.md b/README.md
index a03f0b978c1c9..622e49e6adaa4 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,3 @@
-*** ***This is the development branch for vLLM CPU support.*** ***
-   
 <p align="center">
   <picture>
     <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
diff --git a/benchmarks/kernels/activation.py b/benchmarks/kernels/activation.py
deleted file mode 100644
index f646ca1679806..0000000000000
--- a/benchmarks/kernels/activation.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from threadpoolctl import threadpool_info
-from pprint import pprint
-
-import torch
-from benchmark import KernelBenchmark
-from vllm.activation_ops import silu_and_mul
-
-
-class ActivationBench(KernelBenchmark):
-
-    def __init__(self, loop_time, num_tokens, d, dtype: torch.dtype,
-                 device: torch.device) -> None:
-        super().__init__(loop_time)
-        self.num_tokens = num_tokens
-        self.d = d
-        self.input = torch.randn(num_tokens, 2 * d, dtype=dtype, device=device)
-        self.output = torch.empty(num_tokens, d, dtype=dtype, device=device)
-
-    def _run(self):
-        for i in range(self.loop_time):
-            silu_and_mul(self.output, self.input)
-
-
-bench = ActivationBench(10, 4096, 512, torch.float32, torch.device("cpu"))
-bench.execute()
-
-pprint(threadpool_info())
-
-# RMSNormBench(10, 4096, 4096, torch.float32, torch.device("cpu"))
-# Scalar: 282420151.5 ns
-# token parallel: 36635991.875 ns 7.7x
-# FMA: 36517116.125 ns
diff --git a/benchmarks/kernels/attention.py b/benchmarks/kernels/attention.py
deleted file mode 100644
index 55a1c2af461fa..0000000000000
--- a/benchmarks/kernels/attention.py
+++ /dev/null
@@ -1,108 +0,0 @@
-from threadpoolctl import threadpool_info
-from pprint import pprint
-
-import random
-import torch
-from benchmark import KernelBenchmark
-from vllm.attention_ops import single_query_cached_kv_attention
-
-
-class SingleCachedAttentionBench(KernelBenchmark):
-
-    def __init__(
-        self,
-        loop_time,
-        num_tokens: int,
-        num_heads: int,
-        head_size: int,
-        block_size: int,
-        num_blocks: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        num_kv_heads: int = None,
-    ) -> None:
-        super().__init__(loop_time)
-        self.block_size = block_size
-        qkv = torch.empty(num_tokens,
-                          3,
-                          num_heads,
-                          head_size,
-                          dtype=dtype,
-                          device=device)
-        qkv.uniform_(-1e-3, 1e-3)
-        self.query, _, _ = qkv.unbind(dim=1)
-
-        x = 16 // torch.tensor([], dtype=dtype).element_size()
-        key_block_shape = (num_heads, head_size // x, block_size, x)
-        self.key_cache = torch.empty(size=(num_blocks, *key_block_shape),
-                                     dtype=dtype,
-                                     device=device)
-        self.key_cache.uniform_(-1e-3, 1e-3)
-        value_block_shape = (num_heads, head_size, block_size)
-        self.value_cache = torch.empty(size=(num_blocks, *value_block_shape),
-                                       dtype=dtype,
-                                       device=device)
-        self.value_cache.uniform_(-1e-3, 1e-3)
-
-        context_lens = [random.randint(1, 4096) for _ in range(num_tokens)]
-        self.max_context_len = max(context_lens)
-        self.context_lens = torch.tensor(context_lens,
-                                         dtype=torch.int,
-                                         device=device)
-
-        self.max_num_blocks_per_seq = (self.max_context_len + block_size -
-                                       1) // block_size
-        block_tables = []
-        for _ in range(num_tokens):
-            block_table = [
-                random.randint(0, num_blocks - 1)
-                for _ in range(self.max_num_blocks_per_seq)
-            ]
-            block_tables.append(block_table)
-        self.block_tables = torch.tensor(block_tables,
-                                         dtype=torch.int,
-                                         device=device)
-        head_mapping = torch.arange(num_heads,
-                                    dtype=torch.int32,
-                                    device=device)
-
-        self.scale = float(1.0 / (head_size**0.5))
-
-        num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
-        assert num_heads % num_kv_heads == 0
-        self.num_queries_per_kv = num_heads // num_kv_heads
-        self.head_mapping = torch.repeat_interleave(
-            torch.arange(num_kv_heads, dtype=torch.int32, device=device),
-            self.num_queries_per_kv)
-
-        self.output = torch.empty(num_tokens,
-                                  num_heads,
-                                  head_size,
-                                  dtype=dtype,
-                                  device=device)
-
-    def _run(self):
-        single_query_cached_kv_attention(
-            self.output,
-            self.query,
-            self.key_cache,
-            self.value_cache,
-            self.head_mapping,
-            self.scale,
-            self.block_tables,
-            self.context_lens,
-            self.block_size,
-            self.max_context_len,
-            None,  # ALiBi slopes.
-        )
-
-
-bench = SingleCachedAttentionBench(10, 32, 32, 256, 16, 1024, torch.float32,
-                                   torch.device('cpu'), 16)
-bench.execute()
-
-# SingleCachedAttentionBench(10, 32, 32, 256, 16, 1024, torch.float32, torch.device('cpu'), 16)
-# Scalar: 851373304 ns
-# Parallel: 70520607.25 ns 10x
-
-pprint(threadpool_info())
diff --git a/benchmarks/kernels/benchmark.py b/benchmarks/kernels/benchmark.py
deleted file mode 100644
index 20e53b4ee21b7..0000000000000
--- a/benchmarks/kernels/benchmark.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from abc import ABC, abstractmethod
-from time import perf_counter_ns
-from statistics import mean
-
-
-class KernelBenchmark(ABC):
-
-    def __init__(self, loop_time) -> None:
-        super().__init__()
-        loop_time = loop_time if loop_time > 2 else 3
-
-        self.loop_time = loop_time
-        self.time = []
-
-    def execute(self):
-        for i in range(self.loop_time):
-            start = perf_counter_ns()
-            self._run()
-            end = perf_counter_ns()
-            self.time.append(end - start)
-
-        self.time.sort()
-        avg = mean(self.time[1:-1])
-        print("Execution time: {} ns".format(avg))
-
-    @abstractmethod
-    def _run(self):
-        pass
diff --git a/benchmarks/kernels/cache_op.py b/benchmarks/kernels/cache_op.py
deleted file mode 100644
index b4992bae804e8..0000000000000
--- a/benchmarks/kernels/cache_op.py
+++ /dev/null
@@ -1,119 +0,0 @@
-from threadpoolctl import threadpool_info
-from pprint import pprint
-
-import torch
-import random
-from benchmark import KernelBenchmark
-from vllm.cache_ops import copy_blocks, reshape_and_cache
-
-
-class CacheCopyBench(KernelBenchmark):
-
-    def __init__(
-        self,
-        loop_time,
-        num_mappings: int,
-        num_layers: int,
-        num_heads: int,
-        head_size: int,
-        block_size: int,
-        num_blocks: int,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> None:
-        super().__init__(loop_time)
-        # Generate random block mappings.
-        src_blocks = random.sample(range(num_blocks), num_mappings)
-        remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
-        dst_blocks = random.sample(remainig_blocks, num_mappings)
-        self.block_mapping = {
-            src: [dst]
-            for src, dst in zip(src_blocks, dst_blocks)
-        }
-
-        # Create the KV cache.
-        x = 16 // torch.tensor([], dtype=dtype).element_size()
-        key_cache_shape = (num_blocks, num_heads, head_size // x, block_size,
-                           x)
-        self.key_caches = []
-        for _ in range(num_layers):
-            key_cache = torch.randn(size=key_cache_shape,
-                                    dtype=dtype,
-                                    device=device)
-            self.key_caches.append(key_cache)
-
-        value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-        self.value_caches = []
-        for _ in range(num_layers):
-            value_cache = torch.randn(size=value_cache_shape,
-                                      dtype=dtype,
-                                      device=device)
-            self.value_caches.append(value_cache)
-
-    def _run(self):
-        for i in range(self.loop_time):
-            copy_blocks(self.key_caches, self.value_caches, self.block_mapping)
-
-
-class CacheReshapeBench(KernelBenchmark):
-
-    def __init__(
-        self,
-        loop_time,
-        num_tokens: int,
-        num_heads: int,
-        head_size: int,
-        block_size: int,
-        num_blocks: int,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> None:
-        super().__init__(loop_time)
-        num_slots = block_size * num_blocks
-        slot_mapping = random.sample(range(num_slots), num_tokens)
-        self.slot_mapping = torch.tensor(slot_mapping,
-                                         dtype=torch.int,
-                                         device=device)
-
-        qkv = torch.randn(num_tokens,
-                          3,
-                          num_heads,
-                          head_size,
-                          dtype=dtype,
-                          device=device)
-        _, self.key, self.value = qkv.unbind(dim=1)
-
-        self.x = 16 // torch.tensor([], dtype=dtype).element_size()
-        key_cache_shape = (num_blocks, num_heads, head_size // self.x,
-                           block_size, self.x)
-        self.key_cache = torch.randn(size=key_cache_shape,
-                                     dtype=dtype,
-                                     device=device)
-
-        value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-        self.value_cache = torch.randn(size=value_cache_shape,
-                                       dtype=dtype,
-                                       device=device)
-
-    def _run(self):
-        reshape_and_cache(self.key, self.value, self.key_cache,
-                          self.value_cache, self.slot_mapping)
-
-
-# bench = CacheCopyBench(10, 256, 8, 16, 256, 16, 1024, torch.float32, torch.device("cpu"))
-# bench.execute()
-
-# CacheCopyBench(10, 256, 8, 16, 256, 16, 1024, torch.float32, torch.device("cpu"))
-# Scalar: 2731509071.375 ns
-# Layer parallel: 510428213.5 ns 5.35x
-# nested parallel: 434456796.5 ns 6.05x
-# section parallel: 442927758 ns
-
-# bench = CacheReshapeBench(10, 128, 64, 256, 16, 1024, torch.float32, torch.device("cpu"))
-# bench.execute()
-
-# CacheReshapeBench(10, 128, 64, 256, 16, 1024, torch.float32, torch.device("cpu"))
-# Scalar: 77548817.875 ns
-# Parallel: 7257660.75 ns 10x
-
-pprint(threadpool_info())
diff --git a/benchmarks/kernels/pos_encoding.py b/benchmarks/kernels/pos_encoding.py
deleted file mode 100644
index 3c69ab806787a..0000000000000
--- a/benchmarks/kernels/pos_encoding.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from threadpoolctl import threadpool_info
-from pprint import pprint
-
-import torch
-from benchmark import KernelBenchmark
-from vllm import pos_encoding_ops
-
-
-class PosEncodingBench(KernelBenchmark):
-
-    def __init__(self, loop_time, num_tokens: int, num_heads: int,
-                 head_size: int, max_position: int, rotary_dim: int,
-                 dtype: torch.dtype, device: torch.device) -> None:
-        super().__init__(loop_time)
-        base: int = 10000
-        self.positions = torch.randint(0,
-                                       max_position, (num_tokens, ),
-                                       device=device)
-        query = torch.randn(num_tokens,
-                            num_heads * head_size,
-                            dtype=dtype,
-                            device=device)
-        key = torch.randn(num_tokens,
-                          num_heads * head_size,
-                          dtype=dtype,
-                          device=device)
-        # Create the rotary embedding.
-        inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
-        t = torch.arange(max_position).float()
-        freqs = torch.einsum('i,j -> ij', t, inv_freq.float())
-        cos = freqs.cos()
-        sin = freqs.sin()
-        self.head_size = head_size
-        self.cos_sin_cache = torch.cat((cos, sin), dim=-1)
-        self.cos_sin_cache = self.cos_sin_cache.to(dtype=dtype, device=device)
-        self.out_query = query.clone()
-        self.out_key = key.clone()
-
-    def _run(self):
-        for i in range(self.loop_time):
-            pos_encoding_ops.rotary_embedding_neox(self.positions,
-                                                   self.out_query,
-                                                   self.out_key,
-                                                   self.head_size,
-                                                   self.cos_sin_cache)
-
-
-bench = PosEncodingBench(10,
-                         num_tokens=4096,
-                         num_heads=5,
-                         head_size=128,
-                         max_position=8192,
-                         rotary_dim=128,
-                         dtype=torch.float32,
-                         device=torch.device("cpu"))
-bench.execute()
-
-pprint(threadpool_info())
diff --git a/benchmarks/kernels/rmsnorm.py b/benchmarks/kernels/rmsnorm.py
deleted file mode 100644
index 3140aea984d70..0000000000000
--- a/benchmarks/kernels/rmsnorm.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from threadpoolctl import threadpool_info
-from pprint import pprint
-
-import torch
-from benchmark import KernelBenchmark
-from vllm.layernorm_ops import rms_norm
-
-
-class RMSNormBench(KernelBenchmark):
-
-    def __init__(self, loop_time, token_num, hidden_size, dtype: torch.dtype,
-                 device: torch.device) -> None:
-        super().__init__(loop_time)
-        self.x = torch.randn(token_num,
-                             hidden_size,
-                             dtype=dtype,
-                             device=device)
-        self.out = torch.empty_like(self.x)
-        self.weight = torch.empty(hidden_size)
-
-    def _run(self):
-        for i in range(self.loop_time):
-            rms_norm(self.out, self.x, self.weight, 1e-6)
-
-
-bench = RMSNormBench(10, 4096, 4096, torch.float32, torch.device("cpu"))
-bench.execute()
-
-pprint(threadpool_info())
-
-# RMSNormBench(10, 4096, 4096, torch.float32, torch.device("cpu"))
-# Scalar: 282420151.5 ns
-# token parallel: 36635991.875 ns 7.7x
-# FMA: 36517116.125 ns
diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt
deleted file mode 100644
index 8a5d40a6540bb..0000000000000
--- a/csrc/cpu/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-
-# find_package(Torch REQUIRED)
-# message("Torch CXX FLAGS: ${TORCH_CXX_FLAGS}")
-# message("Torch LIBS: ${TORCH_LIBRARIES}")
-# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
-
-# set(USE_STATIC_MKL OFF)
-# find_package(MKL REQUIRED)
-# message("MKL found: ${MKL_FOUND}, 
-#         version: ${MKL_VERSION}, 
-#         include dir: ${MKL_INCLUDE_DIR}, 
-#         libs: ${MKL_LIBRARIES},
-#         omp type: ${MKL_OPENMP_TYPE},
-#         omp path: ${MKL_OPENMP_LIBRARY}")
-
-macro(add_operator_target op_name source_file)
-add_library(${op_name} SHARED ${source_file})
-# target_link_libraries(${op_name} ${TORCH_LIBRARIES} ${MKL_LIBRARIES})
-endmacro(add_operator_target)
-
-add_operator_target(layernorm layernorm_impl.cpp)
-add_operator_target(pos_encoding pos_encoding_impl.cpp)
-add_operator_target(cache_op "cache_impl.cpp")
-add_operator_target(activation activation_impl.cpp)
-add_operator_target(attention attention_impl.cpp)
diff --git a/utils/tensor_dump.py b/utils/tensor_dump.py
deleted file mode 100644
index dedfffcfb1727..0000000000000
--- a/utils/tensor_dump.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import os
-import argparse
-import torch
-
-DATA_PATH = "/root/vllm-xpu/data"
-
-
-class TensorDumper:
-
-    def __init__(self, ident: str, iter_limit: int = 3) -> None:
-        self.ident = ident
-        self.device = None
-        self.limit = iter_limit
-        self.iter_num = 0
-
-    def append(self, data: torch.Tensor):
-        if self.device is None:
-            self.device = data.device.type
-
-        if self.iter_num < self.limit:
-            self.iter_num += 1
-            torch.save(
-                data.cpu(),
-                os.path.join(
-                    DATA_PATH, "{}_{}_{}".format(self.device, self.ident,
-                                                 self.iter_num)))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Tensor dumper reader.")
-    parser.add_argument("--ident", type=str)
-    parser.add_argument("--iter-num", type=int)
-    parser.add_argument("--full_print", action='store_true')
-
-    args = parser.parse_args()
-    ident = args.ident
-    iter_num = args.iter_num
-    full_print = args.full_print
-
-    if full_print:
-        torch.set_printoptions(profile=full_print)
-
-    cuda_t = torch.load(
-        os.path.join(DATA_PATH, "{}_{}_{}".format("cuda", ident, iter_num)))
-    cpu_t = torch.load(
-        os.path.join(DATA_PATH, "{}_{}_{}".format("cpu", ident, iter_num)))
-
-    print("---", ident, "---", iter_num, "---")
-    print("Max_diff: ", (cuda_t - cpu_t).abs().max())
-    print("Mean_diff: ", (cuda_t - cpu_t).abs().mean())
-    print("cuda:", cuda_t.size())
-    print(cuda_t)
-    print("---------------------------------")
-    print("cpu:", cpu_t.size())
-    print(cpu_t)
-    print("---------------------------------")