fix mat_mul graph creation
chraac committed Oct 3, 2024
1 parent 07fc1e6 commit fc8b521
Showing 5 changed files with 123 additions and 81 deletions.
108 changes: 67 additions & 41 deletions ggml/src/ggml-qnn/op-config.cpp
@@ -109,7 +109,7 @@ void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) {
_qnn_parameters.push_back(param);
}

- bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const ggml_qnn_dimension_array_t &dimensions,
+ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const ggml_dimension_array_t &dimensions,
int rank, const uint8_t *data, const ggml_type data_type,
QNNBackend device, Qnn_GraphHandle_t graph_handle) {
auto param_tensor = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::PARAMETER, name, dimensions, data_type, rank,
@@ -230,7 +230,8 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
const ggml_tensor_array_t &tensor_inputs,
const ggml_tensor_array_t &tensor_outputs) {
/*
- * First, both the ggml and qnn tensor in memory are stored as row-major format.
+ * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, see:
+ * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix)
* But the dimensions of the tensor are stored in different order.
* For example, a 2x3 matrix:
* [
@@ -262,83 +263,108 @@ bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle,
* Here, the B.T is the transpose of B.
*
* So here we need to create graph like:
- * src0 ------------------------------------> | mat_mul0 | -> | transpose1 | -> intermediate1 -> dst0
+ * src0 ------------------------------------> | mat_mul0 | -> intermediate1 -> | transpose1 | -> dst0
* src1 -> | transpose0 | -> intermediate0 -> | mat_mul0 |
*/

// TODO: Fix this function
const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs);
tensor_common_params params = { "src", tensor_rank, device, graph_handle, _qnn_instance };
create_tensors_from_ggml_tensor(params, true, tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);

- // create intermediate tensor
- auto *first_ggml_tensor = tensor_inputs.front();
+ // create intermed0 tensor
+ auto *src1 = tensor_inputs.back();
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value");
- ggml_qnn_dimension_array_t dimensions = {
- first_ggml_tensor->ne[1],
- first_ggml_tensor->ne[0],
- first_ggml_tensor->ne[2],
- first_ggml_tensor->ne[3],
+ ggml_dimension_array_t dimensions = {
+ src1->ne[1],
+ src1->ne[0],
+ src1->ne[2],
+ src1->ne[3],
};
- auto intermediate_tensor =
- std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "intermediate", dimensions,
- first_ggml_tensor->type, tensor_rank, device, graph_handle, _qnn_instance);
+ auto intermed0 = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "intermed0", dimensions,
+ src1->type, tensor_rank, device, graph_handle, _qnn_instance);

+ // create intermed1 tensor
+ auto *src0 = tensor_inputs.front();
+ dimensions[0] = src1->ne[1];
+ dimensions[1] = src0->ne[1];
+ dimensions[2] = src0->ne[2];
+ dimensions[3] = src0->ne[3];
+ auto intermed1 = std::make_shared<ggml_qnn_tensor>(ggml_qnn_tensor::INTERMEDIATE, "intermed1", dimensions,
+ src0->type, tensor_rank, device, graph_handle, _qnn_instance);

+ // create transpose0
+ auto transpose0 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW,
+ QNN_OP_TRANSPOSE, _qnn_instance);
+
+ // create transpose1
+ auto transpose1 = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW,
+ QNN_OP_TRANSPOSE, _qnn_instance);

// create mat_mul
auto mat_mul = std::make_shared<ggml_qnn_connectable_op_config>(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL,
_qnn_instance);

// create output tensor of mat_mul
params.name_prefix = "dst";
- create_tensors_from_ggml_tensor(params, false, tensor_outputs, mat_mul->get_output_tensors(),
- mat_mul->get_qnn_output_tensors());

- // create transpose
- auto transpose = std::make_shared<ggml_qnn_connectable_op_config>(_name + "_trans", QNN_OP_PACKAGE_NAME_QTI_AISW,
- QNN_OP_TRANSPOSE, _qnn_instance);

- // set transpose parameters
- const ggml_qnn_dimension_array_t param_dims = { tensor_rank, 1, 1, 1 };
- transpose->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1,
- reinterpret_cast<const uint8_t *>(_transpose_param_data.data()), GGML_TYPE_I32, device,
- graph_handle);

- // set tensor to transpose and mat_mul
- // the graph here will look like:
- // src0 -> | transpose | -> intermediate -> | mat_mul | -> dst0
- // src1 -> | mat_mul |
- ggml_qnn_tensor_array_t tensors = { _tensor_inputs.front() };
- transpose->set_input_tensors(tensors);
- tensors = { intermediate_tensor };
- transpose->set_output_tensors(tensors);
- tensors = { intermediate_tensor, _tensor_inputs.back() };
+ create_tensors_from_ggml_tensor(params, false, tensor_outputs, transpose1->get_output_tensors(),
+ transpose1->get_qnn_output_tensors());

+ // set transpose0 parameters
+ const ggml_dimension_array_t param_dims = { tensor_rank, 1, 1, 1 };
+ transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1,
+ reinterpret_cast<const uint8_t *>(_transpose_param_data.data()), GGML_TYPE_I32, device,
+ graph_handle);
+
+ // set transpose1 parameters
+ transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1,
+ reinterpret_cast<const uint8_t *>(_transpose_param_data.data()), GGML_TYPE_I32, device,
+ graph_handle);

+ // set tensor to transpose0
+ ggml_qnn_tensor_array_t tensors = { _tensor_inputs.back() };
+ transpose0->set_input_tensors(tensors);
+ tensors = { intermed0 };
+ transpose0->set_output_tensors(tensors);

+ // set tensor to mat_mul
+ tensors = { _tensor_inputs.front(), intermed0 };
mat_mul->set_input_tensors(tensors);
+ tensors = { intermed1 };
+ mat_mul->set_output_tensors(tensors);

+ // set tensor to transpose1
+ tensors = { intermed1 };
+ transpose1->set_input_tensors(tensors);

_mat_mul = mat_mul;
- _transpose = transpose;
+ _transpose0 = transpose0;
+ _transpose1 = transpose1;
return true;
}
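As a sanity check on the rewired graph: ggml_mul_mat with src0->ne = [K, M] and src1->ne = [K, N] produces dst->ne = [M, N], i.e. the row-major matrix src1 * src0^T. The sketch below (plain C++, not part of the commit; the concrete shapes and the ne_t alias are hypothetical) traces a rank-2 case through transpose0 -> mat_mul0 -> transpose1 and checks that the result matches that contract:

```cpp
// Standalone shape trace for the rewired graph (hypothetical shapes).
// "ne" follows ggml's innermost-first convention: ne[0] = columns, ne[1] = rows.
#include <array>
#include <cassert>
#include <cstdint>

using ne_t = std::array<int64_t, 2>; // ggml-style ne for a rank-2 tensor

int main() {
    // ggml_mul_mat operands: src0.ne = [K, M], src1.ne = [K, N].
    const ne_t src0 = {4, 3}; // K = 4, M = 3 -> a 3x4 row-major matrix
    const ne_t src1 = {4, 2}; // K = 4, N = 2 -> a 2x4 row-major matrix

    // transpose0: intermed0 = src1^T, a 4x2 matrix.
    const ne_t intermed0 = {src1[1], src1[0]}; // ne = [N, K] = [2, 4]

    // mat_mul0: intermed1 = src0 * src1^T, a 3x2 (M x N) matrix.
    const ne_t intermed1 = {src1[1], src0[1]}; // ne = [N, M] = [2, 3]

    // transpose1: dst = (src0 * src1^T)^T = src1 * src0^T, a 2x3 (N x M) matrix.
    const ne_t dst = {intermed1[1], intermed1[0]}; // ne = [M, N] = [3, 2]

    // Matches ggml_mul_mat's contract: dst.ne = [src0.ne[1], src1.ne[1]].
    assert(dst[0] == src0[1] && dst[1] == src1[1]);
    return 0;
}
```

The old graph transposed src0 and took mat_mul's output as dst directly, which is the shape mismatch this commit fixes.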

bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) {
- return _transpose->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle);
+ return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) &&
+ _transpose1->add_op_to_graph(graph_handle);
}

bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) {
return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs);
}

bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) {
- return _mat_mul->bind_output_tensors(tensor_outputs);
+ return _transpose1->bind_output_tensors(tensor_outputs);
}

void ggml_qnn_matmul_op_config::unbind_input_tensors() {
- _transpose->unbind_input_tensors();
+ _transpose1->unbind_input_tensors();
_mat_mul->unbind_input_tensors();
+ _transpose0->unbind_input_tensors();
}

void ggml_qnn_matmul_op_config::unbind_output_tensors() {
- _transpose->unbind_output_tensors();
+ _transpose1->unbind_output_tensors();
_mat_mul->unbind_output_tensors();
+ _transpose0->unbind_output_tensors();
}

ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
7 changes: 4 additions & 3 deletions ggml/src/ggml-qnn/op-config.hpp
@@ -38,7 +38,7 @@ class ggml_qnn_op_config_base : public ggml_qnn_op_config {
_name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {}

void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar);
- bool add_tensor_param(const std::string &name, const ggml_qnn_dimension_array_t &dimensions, int rank,
+ bool add_tensor_param(const std::string &name, const ggml_dimension_array_t &dimensions, int rank,
const uint8_t *data, const ggml_type data_type, QNNBackend device,
Qnn_GraphHandle_t graph_handle);
bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override;
@@ -95,12 +95,13 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config {
void unbind_input_tensors() override;
void unbind_output_tensors() override;
std::vector<Qnn_Tensor_t> &get_qnn_input_tensors() override { return _qnn_tensor_inputs; }
- std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _mat_mul->get_qnn_output_tensors(); }
+ std::vector<Qnn_Tensor_t> &get_qnn_output_tensors() override { return _transpose1->get_qnn_output_tensors(); }

private:
std::string _name;
std::shared_ptr<qnn_instance> _qnn_instance;
- std::shared_ptr<ggml_qnn_op_config> _transpose;
+ std::shared_ptr<ggml_qnn_op_config> _transpose0;
+ std::shared_ptr<ggml_qnn_op_config> _transpose1;
std::shared_ptr<ggml_qnn_op_config> _mat_mul;
ggml_qnn_tensor_array_t _tensor_inputs;
std::vector<Qnn_Tensor_t> _qnn_tensor_inputs;
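Apart from forwarding the output side to _transpose1 instead of _mat_mul, the public interface of this class is unchanged, so existing callers keep working. A hedged sketch of how a caller might drive the config (the sequence is inferred from the method names in this header; the constructor arguments and surrounding setup are guesses, not part of the commit):

```cpp
// Inferred usage sketch (hypothetical caller). The point: callers drive one
// object, and the internal trans0 -> mat_mul -> trans1 chain stays hidden.
void run_mat_mul(QNNBackend device, Qnn_GraphHandle_t graph_handle,
                 std::shared_ptr<qnn_instance> instance,
                 ggml_tensor *src0, ggml_tensor *src1, ggml_tensor *dst) {
    qnn::ggml_qnn_matmul_op_config config("mat_mul", instance); // guessed ctor
    config.create_tensors(device, graph_handle, {src0, src1}, {dst});
    config.add_op_to_graph(graph_handle); // registers trans0, mat_mul, trans1
    config.bind_input_tensors({src0, src1});
    config.bind_output_tensors({dst});
    // ... execute the QNN graph here ...
    config.unbind_input_tensors();
    config.unbind_output_tensors();
}
```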
44 changes: 7 additions & 37 deletions ggml/src/ggml-qnn/tensor.hpp
@@ -19,16 +19,14 @@
namespace qnn {

static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
- using ggml_qnn_dimension_array_t = int64_t[GGML_MAX_DIMS];

class ggml_qnn_tensor {
public:
typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t;

explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name,
- const ggml_qnn_dimension_array_t &dimensions, ggml_type data_type, int rank,
- QNNBackend device, Qnn_GraphHandle_t graph_handle,
- std::shared_ptr<qnn_instance> qnn_instance) :
+ const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device,
+ Qnn_GraphHandle_t graph_handle, std::shared_ptr<qnn_instance> qnn_instance) :
_tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) {
QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str());
QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data());
@@ -198,41 +196,13 @@ class ggml_qnn_tensor {
return true;
}

- void update_params_from_ggml_tensor(tensor_type_t tensor_type, const ggml_qnn_dimension_array_t &dimensions,
+ void update_params_from_ggml_tensor(tensor_type_t tensor_type, const ggml_dimension_array_t &dimensions,
ggml_type data_type, int rank) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0);
- /*
- * Both the ggml and qnn tensor in memory are stored as row-major format.
- * But the dimensions of the tensor are stored in different order.
- * For example, a 2x3 matrix:
- * [
- * [1, 2, 3],
- * [4, 5, 6],
- * ]
- * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
- */
- switch (rank) {
- case 4:
- _dimensions[3] = std::max<uint32_t>(dimensions[0], 1);
- _dimensions[2] = std::max<uint32_t>(dimensions[1], 1);
- _dimensions[1] = std::max<uint32_t>(dimensions[2], 1);
- _dimensions[0] = std::max<uint32_t>(dimensions[3], 1);
- break;
- case 3:
- _dimensions[2] = std::max<uint32_t>(dimensions[0], 1);
- _dimensions[1] = std::max<uint32_t>(dimensions[1], 1);
- _dimensions[0] = std::max<uint32_t>(dimensions[2], 1);
- break;
- case 2:
- _dimensions[1] = std::max<uint32_t>(dimensions[0], 1);
- _dimensions[0] = std::max<uint32_t>(dimensions[1], 1);
- break;
- case 1:
- _dimensions[0] = (uint32_t)dimensions[0];
- break;
- }
+ _dimensions = get_internal_dimension(dimensions, rank);
QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(data_type));
QNN_LOG_DEBUG("tensor %s, rank: %d, dims: [%d, %d, %d, %d], data type: %d", _tensor_name.c_str(), rank,
(int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], (int)_dimensions[3],
(int)data_type);

// TODO: set the quantizeParams base on the tensor type

39 changes: 39 additions & 0 deletions ggml/src/ggml-qnn/utils.cpp
@@ -9,6 +9,45 @@

namespace qnn {

+ qnn_internal_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4");
+ GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0);
+
+ qnn_internal_dimension_array_t internal_dims = {};
+ /*
+ * Both the ggml and qnn tensor in memory are stored as row-major format.
+ * But the dimensions of the tensor are stored in different order.
+ * For example, a 2x3 matrix:
+ * [
+ * [1, 2, 3],
+ * [4, 5, 6],
+ * ]
+ * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3].
+ */
+ switch (rank) {
+ case 4:
+ internal_dims[3] = std::max<uint32_t>(dims[0], 1);
+ internal_dims[2] = std::max<uint32_t>(dims[1], 1);
+ internal_dims[1] = std::max<uint32_t>(dims[2], 1);
+ internal_dims[0] = std::max<uint32_t>(dims[3], 1);
+ break;
+ case 3:
+ internal_dims[2] = std::max<uint32_t>(dims[0], 1);
+ internal_dims[1] = std::max<uint32_t>(dims[1], 1);
+ internal_dims[0] = std::max<uint32_t>(dims[2], 1);
+ break;
+ case 2:
+ internal_dims[1] = std::max<uint32_t>(dims[0], 1);
+ internal_dims[0] = std::max<uint32_t>(dims[1], 1);
+ break;
+ case 1:
+ internal_dims[0] = (uint32_t)dims[0];
+ break;
+ }
+
+ return internal_dims;
+ }

// TODO: mapping more ggml data type to QNN data type
// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684
Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) {
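Taken on its own, the new get_internal_dimension helper reverses ggml's innermost-first ne order into QNN's outermost-first order, clamping each mapped dimension to at least 1; entries beyond the rank stay zero-initialized. A minimal usage sketch (hypothetical values; the include path is an assumption), relying on the aliases declared in utils.hpp below:

```cpp
// Sketch of the dimension-order conversion (not part of the commit).
#include <cstdio>

#include "ggml-qnn/utils.hpp" // assumed include path for the declarations

int main() {
    // A 2x3 row-major matrix: ggml stores ne innermost-first -> [3, 2, 1, 1].
    qnn::ggml_dimension_array_t ne = {3, 2, 1, 1};
    // QNN wants outermost-first -> [2, 3]; entries past `rank` stay zero.
    auto qnn_dims = qnn::get_internal_dimension(ne, /*rank=*/2);
    std::printf("%u x %u\n", qnn_dims[0], qnn_dims[1]); // prints "2 x 3"
    return 0;
}
```

Extracting the switch into a free function removes the duplicated comment-and-switch from tensor.hpp and makes the conversion reusable outside the tensor class.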
6 changes: 6 additions & 0 deletions ggml/src/ggml-qnn/utils.hpp
@@ -6,6 +6,7 @@
#include <stddef.h>
#include <stdint.h>

+ #include <array>
#include <string>

#include "ggml.h"
@@ -17,6 +18,11 @@

namespace qnn {

+ using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS];
+ using qnn_internal_dimension_array_t = std::array<uint32_t, GGML_MAX_DIMS>;
+
+ qnn_internal_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank);

uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor);
const char *get_backend_name(int n_backend_type);
const char *get_chipset_desc(uint32_t chipset_id);
