Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into final_state_intermediate

jim19930609 committed Feb 22, 2022
2 parents 1b61ab1 + 574f340 commit c95d4b7
Showing 276 changed files with 5,336 additions and 3,130 deletions.
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()

 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
22 changes: 11 additions & 11 deletions cmake/pten.cmake
@@ -58,26 +58,26 @@ endfunction()
 function(kernel_declare TARGET_LIST)
   foreach(kernel_path ${TARGET_LIST})
     file(READ ${kernel_path} kernel_impl)
-    # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL
+    # TODO(chenweihang): rename PD_REGISTER_KERNEL to PD_REGISTER_KERNEL
     # NOTE(chenweihang): now we don't recommend to use digit in kernel name
-    string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
+    string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}")
     if (NOT first_registry STREQUAL "")
       # parse the first kernel name
-      string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
-      string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}")
+      string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
+      string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}")
       string(REPLACE "," "" kernel_name "${kernel_name}")
       string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
       # append kernel declare into declarations.h
       # TODO(chenweihang): default declare ALL_LAYOUT for each kernel
       if (${kernel_path} MATCHES "./cpu\/")
-        file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
+        file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
       elseif (${kernel_path} MATCHES "./gpu\/")
-        file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
+        file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
       elseif (${kernel_path} MATCHES "./xpu\/")
-        file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
+        file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
       else ()
         # deal with device independent kernel, now we use CPU temporary
-        file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
+        file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
       endif()
     endif()
   endforeach()
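For context, kernel_declare scans each kernel source for the first registration macro it finds and emits a matching declaration into the generated declarations.h. A minimal sketch of the C++ pair involved, with a hypothetical kernel name and dtype list:

    // Hypothetical registration in a kernels/cpu/*.cc file; the kernel
    // function and dtype list are illustrative only.
    PD_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, phi::ScaleKernel, float, double) {}

    // What kernel_declare would append to the generated declarations.h:
    PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);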
@@ -285,9 +285,9 @@ endfunction()

 function(append_op_util_declare TARGET)
   file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content)
-  string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
-  string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}")
-  string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}")
+  string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}")
+  string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}")
+  string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}")
   string(APPEND util_declare ");\n")
   file(APPEND ${op_utils_header} "${util_declare}")
 endfunction()
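Similarly, append_op_util_declare turns a registration found in an op-utils source file into a declaration for the generated header. A sketch with a hypothetical op name and mapping function:

    // Hypothetical registration in a compat/*_sig.cc file...
    PD_REGISTER_ARG_MAPPING_FN(scale, phi::ScaleOpArgumentMapping);
    // ...which the function above rewrites and appends as:
    PD_DECLARE_ARG_MAPPING_FN(scale);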
10 changes: 9 additions & 1 deletion paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -52,6 +52,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
     input_tensor_ptr = input_tensor->mutable_data<float>(dims, place);
   } else if (input_data.dtype == DistModelDataType::INT32) {
     input_tensor_ptr = input_tensor->mutable_data<int32_t>(dims, place);
+  } else if (input_data.dtype == DistModelDataType::FLOAT16) {
+    input_tensor_ptr = input_tensor->mutable_data<float16>(dims, place);
   } else {
     LOG(ERROR) << "unsupported feed type " << input_data.dtype;
     return false;
@@ -412,6 +414,8 @@ bool DistModel::PrepareFeedAndFetch() {
       feeds_to_dtype_.insert({var_name, DistModelDataType::INT32});
     } else if (real_var->GetDataType() == framework::proto::VarType::INT64) {
       feeds_to_dtype_.insert({var_name, DistModelDataType::INT64});
+    } else if (real_var->GetDataType() == framework::proto::VarType::FP16) {
+      feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT16});
     } else {
       LOG(ERROR) << "Don't support feed var dtype for: "
                  << real_var->GetDataType();
@@ -503,9 +507,13 @@ bool DistModel::FetchResults(std::vector<DistModelTensor> *output_data,
     } else if (type == framework::proto::VarType::INT32) {
       rst = FetchResult<int32_t>(fetch, output);
       output->dtype = DistModelDataType::INT32;
+    } else if (type == framework::proto::VarType::FP16) {
+      rst = FetchResult<float16>(fetch, output);
+      output->dtype = DistModelDataType::FLOAT16;
     } else {
       LOG(ERROR) << "DistModel meets unknown fetch data type. DistModel only "
-                    "supports float32, int64 and int32 fetch type for now.";
+                    "supports float32, float16, int64 and int32 fetch type "
+                    "for now.";
     }
     if (!rst) {
       LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx];
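With the three branches above, DistModel can feed and fetch float16 tensors end to end. A hedged sketch of a float16 feed; the DistModelTensor field names follow the header changed below, while the input name, shape, and the (data, length) buffer constructor are assumptions:

    // Sketch only: model setup and the Run() call are elided.
    using paddle::platform::float16;
    std::vector<float16> host(1 * 3 * 224 * 224, float16(0.5f));
    paddle::distributed::DistModelTensor feed;
    feed.name = "image";                                  // hypothetical input name
    feed.shape = {1, 3, 224, 224};
    feed.dtype = paddle::distributed::DistModelDataType::FLOAT16;
    feed.data = paddle::distributed::DistModelDataBuf(
        host.data(), host.size() * sizeof(float16));      // assumed (data, length) ctor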
@@ -15,6 +15,7 @@
 #pragma once
 #include <string>
 #include <vector>
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
@@ -40,6 +41,11 @@ constexpr DistModelDataType DistModelGetDtype<float>() {
   return DistModelDataType::FLOAT32;
 }
 
+template <>
+constexpr DistModelDataType DistModelGetDtype<platform::float16>() {
+  return DistModelDataType::FLOAT16;
+}
+
 class DistModelDataBuf {
  public:
   explicit DistModelDataBuf(size_t length)
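Since DistModelGetDtype is constexpr, the new specialization can be exercised at compile time. A small sketch (the namespace qualification is an assumption):

    // Hedged sketch: verifies the type-to-enum mapping added above.
    static_assert(
        paddle::distributed::DistModelGetDtype<paddle::platform::float16>() ==
            paddle::distributed::DistModelDataType::FLOAT16,
        "float16 should map to DistModelDataType::FLOAT16");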
@@ -147,6 +147,15 @@ def ParseIntermediate(string):
     return [v.strip() for v in string.split(",")]
 
 
+def ParseNoNeedBuffer(string):
+    # string: "x, y"
+    no_need_buffer_set = set()
+    for name in string.split(","):
+        no_need_buffer_set.add(name.strip())
+
+    return no_need_buffer_set
+
+
 def ParseYamlArgs(string):
     # Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y
 
@@ -417,7 +426,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list,


 def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
-                            backward_attrs_list):
+                            backward_attrs_list, no_need_buffer_set):
     # Inputs:
     # fwd_api_name = ""
     # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...}
@@ -430,15 +439,20 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
     set_tensor_wrapper_methods_str = ""
     tensor_wrapper_members_str = ""
     for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
+        if tname in no_need_buffer_set:
+            no_need_buffer = "true"
+        else:
+            no_need_buffer = "false"
+
         tensor_wrapper_name = GetSavedName(tname)
         if IsPlainTensorType(ttype):
             SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """
    void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{
-     {} = egr::TensorWrapper({}, full_reserved);
+     {} = egr::TensorWrapper({}, full_reserved, {});
    }}
 """
             set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format(
-                tname, tname, tensor_wrapper_name, tname)
+                tname, tname, tensor_wrapper_name, tname, no_need_buffer)
 
             PLAIN_TENSOR_MEMBER_TEMPLATE = """
    egr::TensorWrapper {};
@@ -450,12 +464,12 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
             SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """
    void SetTensorWrapper{}(const std::vector<paddle::experimental::Tensor>& {}, bool full_reserved) {{
      for(const auto& eager_tensor : {}) {{
-        {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved) );
+        {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) );
      }};
    }}
 """
             set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format(
-                tname, tname, tname, tensor_wrapper_name)
+                tname, tname, tname, tensor_wrapper_name, no_need_buffer)
 
             VECTOR_TENSOR_MEMBER_TEMPLATE = """
    std::vector<egr::TensorWrapper> {};
@@ -1024,6 +1038,10 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
     assert 'output' in fwd_api.keys()
     assert 'backward' in fwd_api.keys()
 
+    no_need_buffer_set = set()
+    if 'no_need_buffer' in fwd_api.keys():
+        no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer'])
+
     fwd_api_name = fwd_api['api']
     fwd_args_str = fwd_api['args']
     fwd_returns_str = fwd_api['output']
Expand Down Expand Up @@ -1095,7 +1113,8 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):

     # Node Declaration Generation
     node_declaration_str += GenerateNodeDeclaration(
-        fwd_api_name, backward_fwd_input_map, backward_attrs_list)
+        fwd_api_name, backward_fwd_input_map, backward_attrs_list,
+        no_need_buffer_set)
     print("Generated Node Declaration: ", node_declaration_str)
 
     node_definition_str += GenerateNodeDefinition(
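The net effect of these generator changes: any forward input listed under a no_need_buffer key in the API yaml gets wrapped with its buffer dropped. A sketch of the C++ the updated plain-tensor template would emit for a hypothetical input x; the enclosing GradNode class and the exact saved-member name produced by GetSavedName() are assumptions:

    // Illustrative generator output only, for an input "x" that appears
    // in no_need_buffer (hence the trailing "true").
    void SetTensorWrapperx(const paddle::experimental::Tensor& x,
                           bool full_reserved) {
      x_ = egr::TensorWrapper(x, full_reserved, true);
    }
    egr::TensorWrapper x_;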
20 changes: 18 additions & 2 deletions paddle/fluid/eager/tensor_wrapper.h
@@ -34,7 +34,8 @@ class TensorWrapper {
  public:
   TensorWrapper() = default;
   explicit TensorWrapper(const paddle::experimental::Tensor& tensor,
-                         bool full_reserved = false) {
+                         bool full_reserved = false,
+                         bool no_need_buffer = false) {
     /**
      * Normally, we should fully reserved all non-output or non-leaf fwd tensor
      * here. And for fwd output tensor, we should not reserve its autogradmeta,
@@ -48,7 +49,22 @@ class TensorWrapper {
   }
 
   // shallow copy tensor_impl here
-  intermidiate_tensor_.set_impl(tensor.impl());
+  if (no_need_buffer) {
+    if (phi::DenseTensor::classof(tensor.impl().get())) {
+      // Only Copy Meta
+      phi::DenseTensor* dense_tensor =
+          static_cast<phi::DenseTensor*>(tensor.impl().get());
+      auto tw_dense_tensor = std::make_shared<phi::DenseTensor>();
+      tw_dense_tensor->set_meta(dense_tensor->meta());
+      intermidiate_tensor_.set_impl(tw_dense_tensor);
+    } else {
+      PADDLE_THROW(paddle::platform::errors::Fatal(
+          "Unrecognized tensor type for no_need_buffer feature"));
+    }
+  } else {
+    intermidiate_tensor_.set_impl(tensor.impl());
+  }
+
   intermidiate_tensor_.set_name(tensor.name() + "@Saved");
   PADDLE_ENFORCE_NOT_NULL(
       EagerUtils::unsafe_autograd_meta(tensor),
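In short, a wrapper built with no_need_buffer keeps only the saved tensor's metadata rather than sharing its allocation. A hedged usage sketch:

    // Sketch: save a DenseTensor-backed eager tensor for backward without
    // keeping its buffer alive; "t" stands in for a real forward input.
    paddle::experimental::Tensor t = /* some DenseTensor-backed input */;
    egr::TensorWrapper saved(t, /*full_reserved=*/false, /*no_need_buffer=*/true);
    // "saved" now holds a fresh phi::DenseTensor carrying only t's meta
    // (dims, dtype, layout); t's underlying allocation can be released early.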
@@ -176,4 +176,4 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
 USE_OP_ITSELF(scale);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
-USE_OP(reduce_sum);
+USE_OP_ITSELF(reduce_sum);
@@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {

 USE_OP_ITSELF(scale);
 USE_OP_ITSELF(matmul_v2);
-USE_OP(reduce_sum);
+USE_OP_ITSELF(reduce_sum);
 USE_OP(reduce_sum_grad);
 USE_OP_ITSELF(elementwise_add);

@@ -214,4 +214,4 @@ TEST(Benchmark, FluidMLPCPU) {
 USE_OP_ITSELF(scale);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(matmul_v2);
-USE_OP(reduce_sum);
+USE_OP_ITSELF(reduce_sum);
@@ -247,7 +247,7 @@ TEST(Benchmark, FluidMLPCUDA) {

 USE_OP_ITSELF(scale);
 USE_OP_ITSELF(matmul_v2);
-USE_OP(reduce_sum);
+USE_OP_ITSELF(reduce_sum);
 USE_OP(reduce_sum_grad);
 USE_OP_ITSELF(elementwise_add);

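Context for these four test changes, as a hedged reading not stated in the commit: in fluid's op_registry.h, USE_OP(op) references both the operator registration and its fluid kernel symbols, while USE_OP_ITSELF(op) references only the operator itself. Since reduce_sum's kernel has moved out of fluid, the tests keep linking like this:

    // Hedged sketch of the distinction as used in these benchmark files.
    USE_OP_ITSELF(reduce_sum);   // op registration only; the kernel is declared elsewhere
    USE_OP(reduce_sum_grad);     // grad kernel is still registered in fluid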
4 changes: 1 addition & 3 deletions paddle/fluid/framework/CMakeLists.txt
@@ -437,8 +437,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
 configure_file(commit.h.in commit.h)
 
 cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api)
-cc_library(custom_kernel SRCS custom_kernel.cc DEPS
-  tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api)
+cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry pten_custom_kernel pten_tensor_raw)
 #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
 #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)

@@ -459,4 +458,3 @@ else()
   cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place)
 endif()
 cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils)
-cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor)

1 comment on commit c95d4b7

@paddle-bot-old

Congratulations! Your pull request passed all required CI. You could ask reviewer(s) to approve and merge. 🎉
