diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index 22a3f741f..47857a321 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -1,89 +1,95 @@
-if(Codegen_GPU_cmake_included)
+if(Codegen_XPU_cmake_included)
   return()
 endif()
-set(Codegen_GPU_cmake_included true)
+set(Codegen_XPU_cmake_included true)

-set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen/")
+set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen")
+set(BUILD_TORCH_ATEN_GENERATED "${CMAKE_BINARY_DIR}/aten/src/ATen")
 file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})

-set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
-set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
-set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
-set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
-set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
+set(RegisterXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
+set(RegisterSparseXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
+set(RegisterSparseCsrXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
+set(RegisterNestedTensorXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
+set(XPUFallback_TEMPLATE ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
+set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
+set(XPU_AOTI_SHIM_HEADER ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h)
+set(XPU_AOTI_SHIM_SOURCE ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)

 if(WIN32)
   set(FILE_DISPLAY_CMD type)
   # replace forward slash with back slash for compatibility with 'type' command on Windows
-  string(REPLACE "/" "\\" RegisterXPU_PATH_BACKSLASH "${RegisterXPU_PATH}")
-  string(REPLACE "/" "\\" XPUFallback_PATH_BACKSLASH "${XPUFallback_PATH}")
-  set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH_BACKSLASH} ">>" ${RegisterXPU_PATH_BACKSLASH})
+  string(REPLACE "/" "\\" RegisterXPU_GENERATED_BACKSLASH "${RegisterXPU_GENERATED}")
+  string(REPLACE "/" "\\" XPUFallback_TEMPLATE_BACKSLASH "${XPUFallback_TEMPLATE}")
+  set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_TEMPLATE_BACKSLASH} ">>" ${RegisterXPU_GENERATED_BACKSLASH})
 else()
   set(FILE_DISPLAY_CMD cat)
-  set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH} ">>" ${RegisterXPU_PATH})
+  set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_TEMPLATE} ">>" ${RegisterXPU_GENERATED})
 endif()

-function(GEN_BACKEND file_yaml)
-  set(generated_files "")
-  foreach(f ${ARGN})
-    list(APPEND generated_files "${BUILD_TORCH_XPU_ATEN_GENERATED}/${f}")
-  endforeach()
-  file(GLOB_RECURSE depended_files ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml})
-  add_custom_command(
-    OUTPUT ${generated_files}
-    COMMAND
-    "${PYTHON_EXECUTABLE}" -m torchgen.gen_backend_stubs
-    --output_dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
-    --source_yaml ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml}
-    COMMAND
-    ${REGISTER_FALLBACK_CMD}
-    ${SIMPLE_TRACE}
-    WORKING_DIRECTORY ${TORCH_ROOT}
-    DEPENDS
-    ${depended_files}
-    ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml}
-    ${XPUFallback_PATH}
-  )
-endfunction(GEN_BACKEND)
-
-
-set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
-set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
-set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
-set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
-set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
-set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)

 function(GEN_XPU file_yaml)
   set(generated_files "")
   foreach(f ${ARGN})
     list(APPEND generated_files "${f}")
   endforeach()
-  file(GLOB_RECURSE depend_files ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml})
-  set(CODEGEN_TEMPLATE ${TORCH_XPU_OPS_ROOT}/yaml/)
+  set(CODEGEN_XPU_YAML_DIR ${TORCH_XPU_OPS_ROOT}/yaml)

   # Codegen prepare process
   if(WIN32)
-    string(REPLACE "/" "\\" DestPATH "${CODEGEN_TEMPLATE}templates")
+    string(REPLACE "/" "\\" DestPATH "${CODEGEN_XPU_YAML_DIR}/templates")
     string(REPLACE "/" "\\" SrcPATH "${CMAKE_SOURCE_DIR}/aten/src/ATen/templates")
     execute_process(COMMAND cmd /c xcopy ${SrcPATH} ${DestPATH} /E /H /C /I /Y > nul)
-    string(REPLACE "/" "\\" RegisterXPU_PATH_BACKSLASH "${RegisterXPU_PATH}")
-    string(REPLACE "/" "\\" XPUFallback_PATH_BACKSLASH "${XPUFallback_PATH}")
-    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH_BACKSLASH} ">>" ${RegisterXPU_PATH_BACKSLASH})
   else()
-    execute_process(COMMAND ln -s ${CMAKE_SOURCE_DIR}/aten/src/ATen/templates ${CODEGEN_TEMPLATE}) # soft link to pytorch templates
-    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH} ">>" ${RegisterXPU_PATH})
+    execute_process(COMMAND ln -s ${CMAKE_SOURCE_DIR}/aten/src/ATen/templates ${CODEGEN_XPU_YAML_DIR}) # soft link to pytorch templates
   endif()
-  add_custom_command(
-    OUTPUT ${generated_files}
-    COMMAND
+
+  set(XPU_CODEGEN_COMMAND
     "${PYTHON_EXECUTABLE}" -m torchgen.gen
-    --source-path ${TORCH_XPU_OPS_ROOT}/yaml/
+    --source-path ${CODEGEN_XPU_YAML_DIR}
     --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
     --per-operator-headers
-    --static-dispatch-backend
     --backend-whitelist XPU SparseXPU SparseCsrXPU NestedTensorXPU
-    # --xpu: generate in-tree RegisterXPU_0.cpp for in-tree OPs
     --xpu
+  )
+
+  set(XPU_INSTALL_HEADER_COMMAND
+    "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/install_xpu_headers.py
+    --src-header-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
+    --dst-header-dir ${BUILD_TORCH_ATEN_GENERATED}
+  )
+
+  execute_process(
+    COMMAND
+      ${XPU_CODEGEN_COMMAND}
+      --generate headers
+      --dry-run
+      --output-dependencies ${BUILD_TORCH_XPU_ATEN_GENERATED}/generated_headers.cmake
+    RESULT_VARIABLE RETURN_VALUE
+    WORKING_DIRECTORY ${TORCH_ROOT}
+  )
+
+  if(NOT RETURN_VALUE EQUAL 0)
+    message(FATAL_ERROR "Failed to get generated_headers list")
+  endif()
+
+  execute_process(
+    COMMAND
+      ${XPU_INSTALL_HEADER_COMMAND}
+      --dry-run
+    RESULT_VARIABLE RETURN_VALUE
+    WORKING_DIRECTORY ${TORCH_ROOT}
+  )
+
+  if(NOT RETURN_VALUE EQUAL 0)
+    message(FATAL_ERROR "Failed to get XPU header list to install")
+  endif()
+
+  add_custom_command(
+    COMMENT "Generating XPU ATen Codegen..."
+    OUTPUT ${generated_files}
+    COMMAND
+    ${XPU_CODEGEN_COMMAND}
+    --static-dispatch-backend
     # --update-aoti-c-shim: generate extend/c_shim_xpu.h
     --update-aoti-c-shim
     # --exten-aoti-c-shim: specify the extend/c_shim_xpu
@@ -94,17 +100,14 @@ function(GEN_XPU file_yaml)
     --aoti-install-dir=${XPU_AOTI_INSTALL_DIR}
     COMMAND
     ${REGISTER_FALLBACK_CMD}
-    # Codegen post-process
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_PATH}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_PATH}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_PATH}
-    ${SIMPLE_TRACE}
+    # # Codegen post-process
+    COMMAND
+    ${XPU_INSTALL_HEADER_COMMAND}
     WORKING_DIRECTORY ${TORCH_ROOT}
     DEPENDS
-    ${depended_files}
-    ${TORCH_XPU_OPS_ROOT}/yaml/native/${file_yaml}
-    ${XPUFallback_PATH}
+    ${CODEGEN_XPU_YAML_DIR}/native/${file_yaml}
+    ${XPUFallback_TEMPLATE}
+    ${TORCH_XPU_OPS_ROOT}/tools/codegen/install_xpu_headers.py
   )

   # Post codegen delete the copied templates folder only on Windows.
@@ -118,30 +121,25 @@
   endif()
 endfunction(GEN_XPU)

-# GEN_BACKEND(
-#   xpu_functions.yaml
-#   XPUNativeFunctions.h
-#   RegisterXPU_0.cpp)
-
 GEN_XPU(
   native_functions.yaml
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/XPUFunctions.h
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp
-  ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h
-  ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp
+  ${BUILD_TORCH_XPU_ATEN_GENERATED}/XPUFunctions_inl.h
+  ${RegisterXPU_GENERATED}
+  ${RegisterSparseXPU_GENERATED}
+  ${RegisterSparseCsrXPU_GENERATED}
+  ${RegisterNestedTensorXPU_GENERATED}
+  ${XPU_AOTI_SHIM_HEADER}
+  ${XPU_AOTI_SHIM_SOURCE}
 )

+include(${BUILD_TORCH_XPU_ATEN_GENERATED}/xpu_ops_generated_headers.cmake)

-# The c_shim_xpu.cpp needs include files in ${CMAKE_BINARY_DIR}/xpu/ATen/ops/*.h)
-# The include path is auto generated as "#include <ATen/ops/*.h>"
-# To follow the design of aoti codegen, here ${CMAKE_BINARY_DIR}/xpu is added to
-# $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
-list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
-
-list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterSparseCsrXPU_PATH} ${RegisterNestedTensorXPU_PATH})
-list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
-add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src})
+list(APPEND xpu_generated_src
+  ${RegisterXPU_GENERATED}
+  ${RegisterSparseXPU_GENERATED}
+  ${RegisterSparseCsrXPU_GENERATED}
+  ${RegisterNestedTensorXPU_GENERATED}
+  ${XPU_AOTI_SHIM_SOURCE}
+)
 set(ATen_XPU_GEN_SRCS ${xpu_generated_src})
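Note on the hunks above: torchgen is now invoked twice. At configure time, execute_process runs it with --generate headers --dry-run --output-dependencies, so the list of headers it would produce is known before the build starts; the real codegen runs later as a custom command, and the manifest is consumed via include(.../xpu_ops_generated_headers.cmake). Below is a minimal Python sketch of that dry-run contract for orientation only; the function, header name, and manifest layout are placeholders, not torchgen's actual implementation or output.

# Sketch of the "dry run reports, real run writes" contract (assumptions, not torchgen):
from pathlib import Path

def codegen(out_dir: Path, dry_run: bool = False) -> None:
    out_dir.mkdir(parents=True, exist_ok=True)
    headers = [out_dir / "ops" / "cat_xpu_dispatch.h"]  # stand-in for the real list
    if dry_run:
        # Emit a CMake-includable manifest, in the spirit of the
        # xpu_ops_generated_headers.cmake file include()d above.
        body = "".join(f'    "{h.as_posix()}"\n' for h in headers)
        (out_dir / "xpu_ops_generated_headers.cmake").write_text(
            "set(xpu_ops_generated_headers\n" + body + ")\n"
        )
        return
    for h in headers:  # real run: actually materialize the headers
        h.parent.mkdir(parents=True, exist_ok=True)
        h.write_text("// generated\n")

Knowing the output list at configure time is what lets the install rules in src/ATen/CMakeLists.txt (next hunk) iterate over ${xpu_ops_generated_headers} without running codegen first.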
-list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu) - -list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterSparseCsrXPU_PATH} ${RegisterNestedTensorXPU_PATH}) -list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp) -add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src}) +list(APPEND xpu_generated_src + ${RegisterXPU_GENERATED} + ${RegisterSparseXPU_GENERATED} + ${RegisterSparseCsrXPU_GENERATED} + ${RegisterNestedTensorXPU_GENERATED} + ${XPU_AOTI_SHIM_SOURCE} +) set(ATen_XPU_GEN_SRCS ${xpu_generated_src}) diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt index 22e060111..ad936acb8 100644 --- a/src/ATen/CMakeLists.txt +++ b/src/ATen/CMakeLists.txt @@ -19,3 +19,7 @@ set(ATen_XPU_SYCL_SRCS ${ATen_XPU_SYCL_SRCS} PARENT_SCOPE) foreach(HEADER ${xpu_h}) install(FILES ${HEADER} DESTINATION "${AT_INSTALL_INCLUDE_DIR}/ATen/xpu") endforeach() + +foreach(HEADER ${xpu_ops_generated_headers}) + install(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/ops) +endforeach() diff --git a/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp b/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp index 38564914b..965de7b48 100644 --- a/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp +++ b/src/ATen/native/sparse/xpu/SparseCsrTensorMath.cpp @@ -1,7 +1,7 @@ #include #include -#include -#include +#include +#include namespace at::native { diff --git a/src/ATen/native/xpu/Activation.cpp b/src/ATen/native/xpu/Activation.cpp index 87cac9c36..a19249c19 100644 --- a/src/ATen/native/xpu/Activation.cpp +++ b/src/ATen/native/xpu/Activation.cpp @@ -7,9 +7,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include #include diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp index 4a34e70d1..4b3efcebe 100644 --- a/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveAveragePooling2d.cpp @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include #include diff --git a/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp b/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp index 1a445b8ec..86ffe0f57 100644 --- a/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp +++ b/src/ATen/native/xpu/AdaptiveAveragePooling3d.cpp @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include namespace at::native { diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp index 6098072ac..c587cde35 100644 --- a/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp +++ b/src/ATen/native/xpu/AdaptiveMaxPooling2d.cpp @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include namespace at { namespace native { diff --git a/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp b/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp index 7610dbd45..3bca6156b 100644 --- a/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp +++ b/src/ATen/native/xpu/AdaptiveMaxPooling3d.cpp @@ -4,8 +4,8 @@ #include #include -#include -#include +#include +#include namespace at { namespace native { diff --git a/src/ATen/native/xpu/AveragePool2d.cpp b/src/ATen/native/xpu/AveragePool2d.cpp index 326ad8a51..7647aa562 100644 --- a/src/ATen/native/xpu/AveragePool2d.cpp +++ b/src/ATen/native/xpu/AveragePool2d.cpp @@ -5,8 +5,8 @@ #include #include -#include -#include +#include +#include namespace at { namespace native { diff --git a/src/ATen/native/xpu/AveragePool3d.cpp b/src/ATen/native/xpu/AveragePool3d.cpp index 
diff --git a/src/ATen/native/xpu/AveragePool3d.cpp b/src/ATen/native/xpu/AveragePool3d.cpp
index 471e98a27..97eda5ae1 100644
--- a/src/ATen/native/xpu/AveragePool3d.cpp
+++ b/src/ATen/native/xpu/AveragePool3d.cpp
@@ -1,8 +1,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/BinaryOps.cpp b/src/ATen/native/xpu/BinaryOps.cpp
index 53a8e56d2..18654eda1 100644
--- a/src/ATen/native/xpu/BinaryOps.cpp
+++ b/src/ATen/native/xpu/BinaryOps.cpp
@@ -4,7 +4,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/Col2Im.cpp b/src/ATen/native/xpu/Col2Im.cpp
index 2a6742e5e..71c42fd6b 100644
--- a/src/ATen/native/xpu/Col2Im.cpp
+++ b/src/ATen/native/xpu/Col2Im.cpp
@@ -7,7 +7,7 @@
 #include
 #include
-#include
+#include

 namespace at::native {
diff --git a/src/ATen/native/xpu/DilatedMaxPool2d.cpp b/src/ATen/native/xpu/DilatedMaxPool2d.cpp
index a08227b47..c13e76bb5 100644
--- a/src/ATen/native/xpu/DilatedMaxPool2d.cpp
+++ b/src/ATen/native/xpu/DilatedMaxPool2d.cpp
@@ -4,9 +4,9 @@
 #include
 #include
-#include
-#include
-#include
+#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/DilatedMaxPool3d.cpp b/src/ATen/native/xpu/DilatedMaxPool3d.cpp
index f19e8c530..56d9ba0dc 100644
--- a/src/ATen/native/xpu/DilatedMaxPool3d.cpp
+++ b/src/ATen/native/xpu/DilatedMaxPool3d.cpp
@@ -2,8 +2,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/Dropout.cpp b/src/ATen/native/xpu/Dropout.cpp
index bfb704e5f..5cc9ded92 100644
--- a/src/ATen/native/xpu/Dropout.cpp
+++ b/src/ATen/native/xpu/Dropout.cpp
@@ -3,8 +3,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 #include
diff --git a/src/ATen/native/xpu/Embedding.cpp b/src/ATen/native/xpu/Embedding.cpp
index 204a324fd..42b8cd67d 100644
--- a/src/ATen/native/xpu/Embedding.cpp
+++ b/src/ATen/native/xpu/Embedding.cpp
@@ -1,6 +1,6 @@
 #include
-#include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/EmbeddingBag.cpp b/src/ATen/native/xpu/EmbeddingBag.cpp
index 25e9e8d1e..120370d6b 100644
--- a/src/ATen/native/xpu/EmbeddingBag.cpp
+++ b/src/ATen/native/xpu/EmbeddingBag.cpp
@@ -1,5 +1,5 @@
-#include
-#include
+#include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/Equal.cpp b/src/ATen/native/xpu/Equal.cpp
index dcee9b380..bc9126843 100644
--- a/src/ATen/native/xpu/Equal.cpp
+++ b/src/ATen/native/xpu/Equal.cpp
@@ -1,6 +1,6 @@
 #include
-#include
+#include

 namespace at {
 namespace xpu {
diff --git a/src/ATen/native/xpu/ForeachOpScalarList.cpp b/src/ATen/native/xpu/ForeachOpScalarList.cpp
index 87c1f0ce3..2ec48cf0f 100644
--- a/src/ATen/native/xpu/ForeachOpScalarList.cpp
+++ b/src/ATen/native/xpu/ForeachOpScalarList.cpp
@@ -16,8 +16,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/ForeachReduceOp.cpp b/src/ATen/native/xpu/ForeachReduceOp.cpp
index a9ef1ff44..6b104dda2 100644
--- a/src/ATen/native/xpu/ForeachReduceOp.cpp
+++ b/src/ATen/native/xpu/ForeachReduceOp.cpp
@@ -1,8 +1,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/FractionalMaxPool2d.cpp b/src/ATen/native/xpu/FractionalMaxPool2d.cpp
index e0ddea64b..2586fe17c 100644
--- a/src/ATen/native/xpu/FractionalMaxPool2d.cpp
+++ b/src/ATen/native/xpu/FractionalMaxPool2d.cpp
@@ -3,8 +3,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at::native {
diff --git a/src/ATen/native/xpu/FractionalMaxPool3d.cpp b/src/ATen/native/xpu/FractionalMaxPool3d.cpp
index 29d6acf8c..a0ac54b1b 100644
--- a/src/ATen/native/xpu/FractionalMaxPool3d.cpp
+++ b/src/ATen/native/xpu/FractionalMaxPool3d.cpp
@@ -4,8 +4,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at::native {
diff --git a/src/ATen/native/xpu/Im2Col.cpp b/src/ATen/native/xpu/Im2Col.cpp
index eb9f4077a..200b56831 100644
--- a/src/ATen/native/xpu/Im2Col.cpp
+++ b/src/ATen/native/xpu/Im2Col.cpp
@@ -4,7 +4,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/Indexing.cpp b/src/ATen/native/xpu/Indexing.cpp
index bb8c07a92..fe4dc79fc 100644
--- a/src/ATen/native/xpu/Indexing.cpp
+++ b/src/ATen/native/xpu/Indexing.cpp
@@ -10,7 +10,7 @@
 #include
 #include
-#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/LossMultiMargin.cpp b/src/ATen/native/xpu/LossMultiMargin.cpp
index 2db427135..6f8076a0f 100644
--- a/src/ATen/native/xpu/LossMultiMargin.cpp
+++ b/src/ATen/native/xpu/LossMultiMargin.cpp
@@ -2,8 +2,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at::native {
diff --git a/src/ATen/native/xpu/LossNLL.cpp b/src/ATen/native/xpu/LossNLL.cpp
index d80fef746..28cceca99 100644
--- a/src/ATen/native/xpu/LossNLL.cpp
+++ b/src/ATen/native/xpu/LossNLL.cpp
@@ -5,8 +5,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
index a12b686b2..88c9e46c2 100644
--- a/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
+++ b/src/ATen/native/xpu/PinnedMemoryAllocator.cpp
@@ -3,7 +3,7 @@
 #include
 #include
-#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/RangeFactories.cpp b/src/ATen/native/xpu/RangeFactories.cpp
index bfa0f1545..4cbed8c73 100644
--- a/src/ATen/native/xpu/RangeFactories.cpp
+++ b/src/ATen/native/xpu/RangeFactories.cpp
@@ -10,10 +10,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include

 namespace at {
diff --git a/src/ATen/native/xpu/ReflectionPad.cpp b/src/ATen/native/xpu/ReflectionPad.cpp
index a88151914..0c9ee7da4 100644
--- a/src/ATen/native/xpu/ReflectionPad.cpp
+++ b/src/ATen/native/xpu/ReflectionPad.cpp
@@ -6,12 +6,12 @@
 #include
 #include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
 #include "ATen/TensorMeta.h"

 namespace at {
diff --git a/src/ATen/native/xpu/ReplicationPadding.cpp b/src/ATen/native/xpu/ReplicationPadding.cpp
index 3f0093845..e72ff0a4c 100644
--- a/src/ATen/native/xpu/ReplicationPadding.cpp
+++ b/src/ATen/native/xpu/ReplicationPadding.cpp
@@ -6,12 +6,12 @@
 #include

-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/Resize.cpp b/src/ATen/native/xpu/Resize.cpp
index 66c95302b..19b37ff0c 100644
--- a/src/ATen/native/xpu/Resize.cpp
+++ b/src/ATen/native/xpu/Resize.cpp
@@ -6,20 +6,13 @@
 #include
 #include
-#include
-#include
-#include
+#include
+#include
+#include
 #include

 namespace at {
-
-namespace native {
-const at::Tensor& resize_(
-    const at::Tensor& self,
-    at::IntArrayRef size,
-    ::std::optional<at::MemoryFormat> memory_format = ::std::nullopt);
-}
 namespace native::xpu {

 const Tensor& resize_xpu_(
diff --git a/src/ATen/native/xpu/SoftMax.cpp b/src/ATen/native/xpu/SoftMax.cpp
index f155165ce..052a3d9cf 100644
--- a/src/ATen/native/xpu/SoftMax.cpp
+++ b/src/ATen/native/xpu/SoftMax.cpp
@@ -5,10 +5,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include

 namespace at::native {
 TORCH_IMPL_FUNC(softmax_xpu_out)
diff --git a/src/ATen/native/xpu/SummaryOps.cpp b/src/ATen/native/xpu/SummaryOps.cpp
index 953004227..22a52e4e2 100644
--- a/src/ATen/native/xpu/SummaryOps.cpp
+++ b/src/ATen/native/xpu/SummaryOps.cpp
@@ -2,7 +2,7 @@
 #include
 #include
-#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp
index bd24aa3a0..a1ce1fad5 100644
--- a/src/ATen/native/xpu/TensorAdvancedIndexing.cpp
+++ b/src/ATen/native/xpu/TensorAdvancedIndexing.cpp
@@ -22,9 +22,9 @@
 #include
 #include
-#include
-#include //generated
-//#include //generated
+#include
+#include //generated
+//#include //generated

 namespace at {
diff --git a/src/ATen/native/xpu/TensorFactories.cpp b/src/ATen/native/xpu/TensorFactories.cpp
index 3caef39ba..2c0faa535 100644
--- a/src/ATen/native/xpu/TensorFactories.cpp
+++ b/src/ATen/native/xpu/TensorFactories.cpp
@@ -5,7 +5,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/src/ATen/native/xpu/TensorShape.cpp b/src/ATen/native/xpu/TensorShape.cpp
index b237b4336..aae14c1b6 100644
--- a/src/ATen/native/xpu/TensorShape.cpp
+++ b/src/ATen/native/xpu/TensorShape.cpp
@@ -9,9 +9,9 @@
 #include
 #include
 #include
-#include
-#include
-#include
+#include
+#include
+#include

 namespace at {
diff --git a/src/ATen/native/xpu/TensorTopK.cpp b/src/ATen/native/xpu/TensorTopK.cpp
index ab3fc5250..0c79610cd 100644
--- a/src/ATen/native/xpu/TensorTopK.cpp
+++ b/src/ATen/native/xpu/TensorTopK.cpp
@@ -5,7 +5,7 @@
 #include

-#include
+#include

 namespace at {
diff --git a/src/ATen/native/xpu/TriangluarOps.cpp b/src/ATen/native/xpu/TriangluarOps.cpp
index 3db5e967b..39213b00f 100644
--- a/src/ATen/native/xpu/TriangluarOps.cpp
+++ b/src/ATen/native/xpu/TriangluarOps.cpp
@@ -5,8 +5,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at::native {
diff --git a/src/ATen/native/xpu/UpSampleBicubic2d.cpp b/src/ATen/native/xpu/UpSampleBicubic2d.cpp
index 7e0e4de40..388c6d0e4 100644
--- a/src/ATen/native/xpu/UpSampleBicubic2d.cpp
+++ b/src/ATen/native/xpu/UpSampleBicubic2d.cpp
@@ -5,10 +5,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include

 namespace at {
 namespace native {
 TORCH_IMPL_FUNC(upsample_bicubic2d_out_xpu)
diff --git a/src/ATen/native/xpu/UpSampleBilinear2d.cpp b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
index aec707193..91bc5219b 100644
--- a/src/ATen/native/xpu/UpSampleBilinear2d.cpp
+++ b/src/ATen/native/xpu/UpSampleBilinear2d.cpp
@@ -4,10 +4,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleLinear1d.cpp b/src/ATen/native/xpu/UpSampleLinear1d.cpp
index 13dfa33de..388f6c257 100644
--- a/src/ATen/native/xpu/UpSampleLinear1d.cpp
+++ b/src/ATen/native/xpu/UpSampleLinear1d.cpp
@@ -5,8 +5,8 @@
 #include
 #include "ATen/core/ATen_fwd.h"
-#include
-#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleNearest1d.cpp b/src/ATen/native/xpu/UpSampleNearest1d.cpp
index 30287e4b2..7603a43e9 100644
--- a/src/ATen/native/xpu/UpSampleNearest1d.cpp
+++ b/src/ATen/native/xpu/UpSampleNearest1d.cpp
@@ -2,10 +2,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleNearest2d.cpp b/src/ATen/native/xpu/UpSampleNearest2d.cpp
index 9ebbd74b1..c906a703f 100644
--- a/src/ATen/native/xpu/UpSampleNearest2d.cpp
+++ b/src/ATen/native/xpu/UpSampleNearest2d.cpp
@@ -2,10 +2,10 @@
 #include
 #include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/UpSampleNearest3d.cpp b/src/ATen/native/xpu/UpSampleNearest3d.cpp
index 5528b0ac2..8cc0bb9f8 100644
--- a/src/ATen/native/xpu/UpSampleNearest3d.cpp
+++ b/src/ATen/native/xpu/UpSampleNearest3d.cpp
@@ -1,14 +1,14 @@
 #include
 #include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include

 namespace at::native {
diff --git a/src/ATen/native/xpu/UpSampleTrilinear3d.cpp b/src/ATen/native/xpu/UpSampleTrilinear3d.cpp
index 4c46a07c6..e6a28ca84 100644
--- a/src/ATen/native/xpu/UpSampleTrilinear3d.cpp
+++ b/src/ATen/native/xpu/UpSampleTrilinear3d.cpp
@@ -2,8 +2,8 @@
 #include
 #include
-#include
-#include
+#include
+#include

 namespace at {
 namespace native {
diff --git a/src/ATen/native/xpu/XPUScalar.cpp b/src/ATen/native/xpu/XPUScalar.cpp
index d47dd7871..25acb44d1 100644
--- a/src/ATen/native/xpu/XPUScalar.cpp
+++ b/src/ATen/native/xpu/XPUScalar.cpp
@@ -3,7 +3,7 @@
 #include
 #include
 #include
-#include
+#include

 namespace at::native {
diff --git a/test/regressions/test_xpu_ops_header.py b/test/regressions/test_xpu_ops_header.py
new file mode 100644
index 000000000..844c9b25a
--- /dev/null
+++ b/test/regressions/test_xpu_ops_header.py
@@ -0,0 +1,21 @@
+# Owner(s): ["module: intel"]
+import os
+
+import torch
+from torch.testing._internal.common_utils import TestCase
+
+
+class TestXpuOpsHeader(TestCase):
+    def test_xpu_ops_header(self):
+        include_dir = os.path.join(os.path.dirname(torch.__file__), "include")
+        aten_ops_dir = os.path.join(include_dir, "ATen/ops")
+        self.assertTrue(
+            os.path.exists(os.path.join(aten_ops_dir, "cat_xpu_dispatch.h"))
+        )
+        self.assertTrue(
+            os.path.exists(os.path.join(aten_ops_dir, "index_fill_xpu_dispatch.h"))
+        )
+        self.assertTrue(os.path.exists(os.path.join(aten_ops_dir, "col2im_native.h")))
+        with open(os.path.join(aten_ops_dir, "col2im_native.h")) as fr:
+            text = fr.read()
+        self.assertTrue("col2im_xpu" in text)
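The regression test above keys on three representative generated headers landing in the installed include tree. The same spot-check can be run by hand from a Python prompt, assuming an importable XPU-enabled torch build; the paths below mirror the test itself, nothing else is assumed:

# Manual spot-check mirroring test_xpu_ops_header (assumes torch is importable):
import os
import torch

aten_ops_dir = os.path.join(os.path.dirname(torch.__file__), "include", "ATen", "ops")
for name in ("cat_xpu_dispatch.h", "index_fill_xpu_dispatch.h", "col2im_native.h"):
    # Each of these should exist once the install_xpu_headers step has run.
    print(name, os.path.exists(os.path.join(aten_ops_dir, name)))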
+ """ + if args.dry_run: + return + + with open(dst) as fr: + lines = fr.readlines() + while lines and lines[-1].strip() == "": + lines.pop() + with open(dst, "w") as fw: + fw.writelines(lines) + + with open(src) as fr, open(dst, "a") as fa: + src_lines = fr.readlines() + for line in src_lines: + if re.match(r"^#include " not in ln: - fw.write(ln) - -def replace_op_headers(): - with open(args.register_xpu_path) as fr: - lines = fr.readlines() - patt = r'#include