/**
* @file kernel-loss-gradient.cc
*
* This file defines the kernel for the DynamicLossGradient op.
*
* @author Arne Hasselbring
*/
#include "tensorflow/core/framework/op_kernel.h"
#include <pinocchio/algorithm/rnea.hpp>
#include <pinocchio/algorithm/rnea-derivatives.hpp>
#include <pinocchio/parsers/urdf.hpp>
#include <algorithm>
#include <cmath>
#include <mutex>
using namespace tensorflow;
// Imported from LAPACK, solves a linear equation system with a real symmetric positive definite band matrix:
// see https://www.netlib.org/lapack/explore-html/de/d49/group__double_o_t_h_e_rsolve_ga9c26c8344bc125d78d6a33a22459169c.html
extern "C" void dpbsv_(const char* uplo, const int* n, const int* kd, const int* nrhs, double* ab, const int* ldab, double* b, const int* ldb, int* info);
class DynamicLossGradientOp : public OpKernel
{
public:
explicit DynamicLossGradientOp(OpKernelConstruction* context) :
OpKernel(context)
{
std::string model_path;
OP_REQUIRES_OK(context, context->GetAttr("model", &model_path));
pinocchio::urdf::buildModel(model_path, model);
data = pinocchio::Data(model);
OP_REQUIRES(context, model.nq == model.nv, errors::InvalidArgument("Invalid robot model: nq must equal nv."));
}
void Compute(OpKernelContext* context) override
{
OP_REQUIRES(context, context->num_inputs() == 14, errors::InvalidArgument("Wrong number of inputs."));
const Tensor& trajectory_tensor = context->input(0);
const Tensor& q_1_tensor = context->input(1);
const Tensor& q0_tensor = context->input(2);
const Tensor& dt_tensor = context->input(3);
const Tensor& max_torques_tensor = context->input(4);
const Tensor& max_velocities_tensor = context->input(5);
const Tensor& acceleration_factors_tensor = context->input(6);
const Tensor& torque_limit_ratio_tensor = context->input(7);
const Tensor& velocity_limit_ratio_tensor = context->input(8);
const Tensor& regularizer_tensor = context->input(9);
const Tensor& gradient_trajectory_tensor = context->input(10);
const Tensor& gradient_torque_cost_tensor = context->input(11);
const Tensor& gradient_torque_limit_tensor = context->input(12);
const Tensor& gradient_velocity_limit_tensor = context->input(13);
const auto D = model.nq;
OP_REQUIRES(context,
trajectory_tensor.shape().dims() == 3 &&
trajectory_tensor.shape().dim_size(2) == D &&
q_1_tensor.shape().dims() == 2 &&
q_1_tensor.shape().dim_size(1) == D &&
q0_tensor.shape().dims() == 2 &&
q0_tensor.shape().dim_size(1) == D &&
trajectory_tensor.shape().dim_size(0) == q_1_tensor.shape().dim_size(0) &&
trajectory_tensor.shape().dim_size(0) == q0_tensor.shape().dim_size(0) &&
dt_tensor.shape().dims() == 0 &&
max_torques_tensor.shape().dims() == 1 &&
max_torques_tensor.shape().dim_size(0) == D &&
max_velocities_tensor.shape().dims() == 1 &&
max_velocities_tensor.shape().dim_size(0) == D &&
(acceleration_factors_tensor.shape().dims() == 0 ||
(acceleration_factors_tensor.shape().dims() == 1 && acceleration_factors_tensor.shape().dim_size(0) == D)) &&
torque_limit_ratio_tensor.shape().dims() == 0 &&
velocity_limit_ratio_tensor.shape().dims() == 0 &&
regularizer_tensor.shape().dims() == 0 &&
(gradient_trajectory_tensor.shape().dims() == 0 || gradient_trajectory_tensor.shape() == trajectory_tensor.shape()) &&
gradient_torque_cost_tensor.shape().dims() == 0 &&
gradient_torque_limit_tensor.shape().dims() == 0 &&
gradient_velocity_limit_tensor.shape().dims() == 0,
errors::InvalidArgument("Wrong shape(s)."));
OP_REQUIRES(context, trajectory_tensor.shape().dim_size(1) >= 2, errors::InvalidArgument("Trajectory too short."));
// Allocate the gradient output which has the same shape as the trajectory.
Tensor* gradient_output_tensor = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, trajectory_tensor.shape(), &gradient_output_tensor));
// Get references to the Eigen objects.
auto trajectory = trajectory_tensor.tensor<double, 3>();
auto q_1 = q_1_tensor.matrix<double>();
auto q0 = q0_tensor.matrix<double>();
const double dt = *dt_tensor.scalar<double>().data();
Eigen::Map<const Eigen::ArrayXd> max_torques(max_torques_tensor.vec<double>().data(), D);
Eigen::Map<const Eigen::ArrayXd> max_velocities(max_velocities_tensor.vec<double>().data(), D);
const double* acceleration_factors = acceleration_factors_tensor.shape().dims() > 0 ? acceleration_factors_tensor.vec<double>().data() : nullptr;
const double torque_limit_ratio = *torque_limit_ratio_tensor.scalar<double>().data();
const double velocity_limit_ratio = *velocity_limit_ratio_tensor.scalar<double>().data();
const double regularizer = *regularizer_tensor.scalar<double>().data();
const double torque_cost_weight = *gradient_torque_cost_tensor.scalar<double>().data();
const double torque_limit_weight = *gradient_torque_limit_tensor.scalar<double>().data();
const double velocity_limit_weight = *gradient_velocity_limit_tensor.scalar<double>().data();
auto gradient_output = gradient_output_tensor->tensor<double, 3>();
const double dt_squared = dt * dt;
// B = batch size, T = length of the trajectory
const auto B = trajectory_tensor.shape().dim_size(0);
const auto T = trajectory_tensor.shape().dim_size(1);
// pinocchio writes the results of the RNEA and its derivatives here.
auto& tau = data.tau;
auto& Jq = data.dtau_dq;
auto& Jv = data.dtau_dv;
auto& Ja = data.M;
// Variables for calculating the Hessian (per batch element):
Eigen::MatrixXd banded_hessian(3 * D, T * D);
Eigen::MatrixXd torque_cost_jacobians((T + 1) * D, 3 * D);
Eigen::ArrayXd torque_limit_mask_full((T + 1) * D);
Eigen::ArrayXd velocity_limit_mask_full(T * D);
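// banded_hessian uses the LAPACK band layout described at dpbsv_ above: entry
// (r, c) holds Hessian element (c + r, c), i.e. row r is the r-th sub-diagonal.
// Row block idx of torque_cost_jacobians holds the Jacobian of tau at timestep
// idx w.r.t. (q_{idx-2}, q_{idx-1}, q_idx), the only points it depends on.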
// Variables for calculating the gradient (per batch element):
// Each loss component's gradient is accumulated separately and only weighted when they are combined below.
Eigen::ArrayXd torque_cost_grad(T * D);
Eigen::ArrayXd torque_limit_grad(T * D);
Eigen::ArrayXd velocity_limit_grad(T * D);
for(int64_t i = 0; i < B; ++i)
{
const auto* trajectory_ptr = trajectory.data() + i * T * D;
for(int64_t j = 0; j < T; ++j)
{
Eigen::Map<const Eigen::VectorXd> q(j ? (trajectory_ptr + (j - 1) * D) : q0.data() + i * D, D);
Eigen::Map<const Eigen::VectorXd> q_next(trajectory_ptr + j * D, D);
Eigen::Map<const Eigen::VectorXd> q_prev(j >= 2 ? (trajectory_ptr + (j - 2) * D) : (j == 1 ? q0.data() + i * D : q_1.data() + i * D), D);
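// Backward finite differences around q (= q_{j-1}):
// v = (q_j - q_{j-1}) / dt, a = (q_j - 2 * q_{j-1} + q_{j-2}) / dt^2,
// where indices below 0 refer to the q0 / q_1 inputs.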
const Eigen::VectorXd v = (q_next - q) / dt;
const Eigen::VectorXd a = (q_next + q_prev - 2.0 * q) / dt_squared;
std::lock_guard<std::mutex> lg(mutex);
Jq.setZero();
Jv.setZero();
Ja.setZero();
// computeRNEADerivatives overwrites tau, so it must be called before rnea recomputes it.
pinocchio::computeRNEADerivatives(model, data, q, v, a);
pinocchio::rnea(model, data, q, v, a);
// pinocchio only fills the upper triangle:
Ja.template triangularView<Eigen::StrictlyLower>() = Ja.transpose().template triangularView<Eigen::StrictlyLower>();
Jv /= dt;
if(acceleration_factors)
Ja.diagonal() += Eigen::Map<const Eigen::VectorXd>(acceleration_factors, D);
Ja /= dt_squared;
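// With these scalings, the chain rule through v and a yields the Jacobian of
// tau_j w.r.t. the trajectory points:
// d(tau_j)/d(q_{j-2}) = Ja, d(tau_j)/d(q_{j-1}) = Jq - Jv - 2 * Ja,
// d(tau_j)/d(q_j) = Jv + Ja.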
// This is going to be used by the Hessian calculation:
if(j >= 2)
torque_cost_jacobians.block(j * D, 0, D, D) = Ja;
if(j >= 1)
torque_cost_jacobians.block(j * D, D, D, D) = -Jv + Jq - 2.0 * Ja;
torque_cost_jacobians.block(j * D, 2 * D, D, D) = Jv + Ja;
if(acceleration_factors)
tau.array() += a.array() * Eigen::Map<const Eigen::ArrayXd>(acceleration_factors, D);
tau.array() /= max_torques;
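// tau now holds the torques normalized by their limits. The additional factor
// 1/max_torques applied to the Jacobians below is the remaining chain-rule
// term, consistent with a squared normalized-torque cost.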
// For the Jacobian of the torque limit loss, we need the Jacobians before the multiplication by tau.
auto Jq_limit = Jq;
auto Jv_limit = Jv;
auto Ja_limit = Ja;
Jq.array().colwise() *= tau.array() / max_torques;
Jv.array().colwise() *= tau.array() / max_torques;
Ja.array().colwise() *= tau.array() / max_torques;
Eigen::ArrayXd torque_limit_mask(D);
Eigen::ArrayXd velocity_limit_mask(D);
for(int k = 0; k < D; ++k)
{
torque_limit_mask(k) = std::abs(tau(k)) > torque_limit_ratio ? (tau(k) > 0.0 ? 1.0 : -1.0) : 0.0;
torque_limit_mask_full(j * D + k) = torque_limit_mask(k);
velocity_limit_mask(k) = std::abs(v(k)) > velocity_limit_ratio * max_velocities(k) ? (v(k) > 0.0 ? 1.0 : -1.0) : 0.0;
velocity_limit_mask_full(j * D + k) = velocity_limit_mask(k);
}
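// The masks are signed indicators of the hinge losses: +/-1 where the
// normalized torque (or velocity) exceeds its limit ratio, 0 where the
// corresponding limit loss is inactive.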
const auto tau_limit = (tau.array().abs() - torque_limit_ratio) * torque_limit_mask;
Jq_limit.array().colwise() *= tau_limit / max_torques;
Jv_limit.array().colwise() *= tau_limit / max_torques;
Ja_limit.array().colwise() *= tau_limit / max_torques;
if(j >= 2)
{
torque_cost_grad.segment((j - 2) * D, D) += Ja.colwise().sum().array();
torque_limit_grad.segment((j - 2) * D, D) += Ja_limit.colwise().sum().array();
}
if(j >= 1)
{
torque_cost_grad.segment((j - 1) * D, D) += -Jv.colwise().sum().array() + Jq.colwise().sum().array() - 2.0 * Ja.colwise().sum().array();
torque_limit_grad.segment((j - 1) * D, D) += -Jv_limit.colwise().sum().array() + Jq_limit.colwise().sum().array() - 2.0 * Ja_limit.colwise().sum().array();
velocity_limit_grad.segment((j - 1) * D, D) -= (v.array().abs() / max_velocities - velocity_limit_ratio) / max_velocities * velocity_limit_mask;
}
// This line is always the first one that visits the gradient of a particular timestep, so we can overwrite what was there before.
torque_cost_grad.segment(j * D, D) = Jv.colwise().sum().array() + Ja.colwise().sum().array();
torque_limit_grad.segment(j * D, D) = Jv_limit.colwise().sum().array() + Ja_limit.colwise().sum().array();
velocity_limit_grad.segment(j * D, D) = (v.array().abs() / max_velocities - velocity_limit_ratio) / max_velocities * velocity_limit_mask;
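// v_j depends on q_j with factor +1/dt and on q_{j-1} with factor -1/dt, hence
// the sign flip between this segment and the (j - 1) segment above; the common
// 1/dt is factored out and applied once after the loop.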
}
{
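// Final residual: the trajectory is assumed to come to rest at its last point,
// i.e. q_T = q_{T-1}, so v = 0 and a = (q_{T-2} - q_{T-1}) / dt^2.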
Eigen::Map<const Eigen::VectorXd> q(trajectory_ptr + (T - 1) * D, D);
const Eigen::VectorXd a = (Eigen::Map<const Eigen::VectorXd>(trajectory_ptr + (T - 2) * D, D) - q) / dt_squared;
std::lock_guard<std::mutex> lg(mutex);
Jq.setZero();
Jv.setZero(); // We don't need Jv in this case, but who knows what pinocchio does with the values there?
Ja.setZero();
// computeRNEADerivatives overwrites tau, so it must be called before rnea recomputes it.
pinocchio::computeRNEADerivatives(model, data, q, Eigen::VectorXd::Zero(D), a);
pinocchio::rnea(model, data, q, Eigen::VectorXd::Zero(D), a);
// pinocchio only fills the upper triangle:
Ja.template triangularView<Eigen::StrictlyLower>() = Ja.transpose().template triangularView<Eigen::StrictlyLower>();
if(acceleration_factors)
Ja.diagonal() += Eigen::Map<const Eigen::VectorXd>(acceleration_factors, D);
Ja /= dt_squared;
// This is going to be used by the Hessian calculation:
torque_cost_jacobians.block(T * D, 0, D, D) = Ja;
torque_cost_jacobians.block(T * D, D, D, D) = Jq - Ja;
if(acceleration_factors)
tau.array() += a.array() * Eigen::Map<const Eigen::ArrayXd>(acceleration_factors, D);
tau.array() /= max_torques;
// tau is divided by max_torques only once here: the singly normalized value is still needed for the limit comparison, and the second 1/max_torques factor goes into the Jacobians below.
// For the Jacobian of the torque limit loss, we need the Jacobians before the multiplication by tau.
auto Jq_limit = Jq;
auto Ja_limit = Ja;
Jq.array().colwise() *= tau.array() / max_torques;
Ja.array().colwise() *= tau.array() / max_torques;
Eigen::ArrayXd torque_limit_mask(D);
for(int k = 0; k < D; ++k)
{
torque_limit_mask(k) = std::abs(tau(k)) > torque_limit_ratio ? (tau(k) > 0.0 ? 1.0 : -1.0) : 0.0;
torque_limit_mask_full(T * D + k) = torque_limit_mask(k);
}
const auto tau_limit = (tau.array().abs() - torque_limit_ratio) * torque_limit_mask;
Jq_limit.array().colwise() *= tau_limit / max_torques;
Ja_limit.array().colwise() *= tau_limit / max_torques;
torque_cost_grad.segment((T - 2) * D, D) += Ja.colwise().sum().array();
torque_limit_grad.segment((T - 2) * D, D) += Ja_limit.colwise().sum().array();
torque_cost_grad.segment((T - 1) * D, D) += Jq.colwise().sum().array() - Ja.colwise().sum().array();
torque_limit_grad.segment((T - 1) * D, D) += Jq_limit.colwise().sum().array() - Ja_limit.colwise().sum().array();
}
// Apply the normalization factors such that the penalty reaches 1 exactly at the hard limit.
// The velocity limit gradient also picks up a 1/dt that was factored out of the loop.
torque_limit_grad /= (1.0 - torque_limit_ratio) * (1.0 - torque_limit_ratio);
velocity_limit_grad /= dt * (1.0 - velocity_limit_ratio) * (1.0 - velocity_limit_ratio);
// Combine the weighted loss components into the overall gradient; the division by B averages over the batch.
Eigen::Map<Eigen::ArrayXd>(gradient_output.data() + i * T * D, T * D) = (torque_cost_weight * torque_cost_grad + torque_limit_weight * torque_limit_grad + velocity_limit_weight * velocity_limit_grad) / B;
// If present, add the gradient from other loss components.
if(gradient_trajectory_tensor.shape().dims() > 0)
Eigen::Map<Eigen::ArrayXd>(gradient_output.data() + i * T * D, T * D) += Eigen::Map<const Eigen::ArrayXd>(gradient_trajectory_tensor.tensor<double, 3>().data() + i * T * D, T * D);
// Calculate the Hessian.
for(int64_t j = 0; j < 3 * D; ++j)
{
// j is the "off-diagonality".
for(int64_t k = 0; k < T * D - j; ++k)
{
// k counts along the (off-)diagonal.
// Each of these entries is a dot product of two columns of the Jacobian of the
// entire residual, which is never materialized in memory.
// Specifically, this is the dot product of columns k and (k + j).
double x = 0.0;
const auto start_block = (k + j) / D;
const auto end_block = std::min(k / D + 3, T + 1);
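// Residual block idx involves only q_{idx-2}..q_idx, so column m of the full
// Jacobian is nonzero only for blocks idx in [m/D, m/D + 3). The range below
// is the overlap of the supports of columns k and (k + j), clipped to the
// T + 1 existing row blocks.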
for(auto idx = start_block; idx < end_block; ++idx)
for(auto l = decltype(D){0}; l < D; ++l)
{
x += (torque_cost_weight + torque_limit_weight * torque_limit_mask_full(idx * D + l) * torque_limit_mask_full(idx * D + l) / ((1.0 - torque_limit_ratio) * (1.0 - torque_limit_ratio))) * torque_cost_jacobians(idx * D + l, k + j - (idx - 2) * D) * torque_cost_jacobians(idx * D + l, k - (idx - 2) * D) / (max_torques(l) * max_torques(l));
}
if(j == 0)
{
const auto l = k % D;
x += velocity_limit_weight * velocity_limit_mask_full(k) * velocity_limit_mask_full(k) / (dt * dt * (1.0 - velocity_limit_ratio) * (1.0 - velocity_limit_ratio) * max_velocities(l) * max_velocities(l));
if(k / D < T - 1)
x += velocity_limit_weight * velocity_limit_mask_full(k + D) * velocity_limit_mask_full(k + D) / (dt * dt * (1.0 - velocity_limit_ratio) * (1.0 - velocity_limit_ratio) * max_velocities(l) * max_velocities(l));
}
else if(j == D)
{
const auto l = k % D;
x -= velocity_limit_weight * velocity_limit_mask_full(k + j) * velocity_limit_mask_full(k + j) / (dt * dt * (1.0 - velocity_limit_ratio) * (1.0 - velocity_limit_ratio) * max_velocities(l) * max_velocities(l));
}
banded_hessian(j, k) = x / B;
}
}
// Add the regularizer to the diagonal of the Hessian (which, due to the banded representation, is the first row).
banded_hessian.row(0).array() += regularizer;
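// This Tikhonov damping keeps the (Gauss-Newton) Hessian positive definite,
// assuming nonnegative component weights, which dpbsv_ requires.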
// Let LAPACK solve the banded system in place: the output buffer then holds
// inv(H + regularizer * I) * g, i.e. a damped Newton step rather than the raw gradient.
const char uplo = 'L';
const int n = T * D;
const int kd = 3 * D - 1;
const int nrhs = 1;
const int ldab = 3 * D;
const int ldb = n;
int info = 0;
dpbsv_(&uplo, &n, &kd, &nrhs, banded_hessian.data(), &ldab, gradient_output.data() + i * T * D, &ldb, &info);
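// info > 0 indicates that the regularized Hessian is not positive definite.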
if(info != 0)
fprintf(stderr, "LAPACK error when decomposing the Hessian: %d\n", info);
}
}
private:
pinocchio::Model model; ///< The robot model.
pinocchio::Data data; ///< The data on which pinocchio operates.
std::mutex mutex; ///< Mutex to synchronize access to \c data (the TF docs say that \c Compute can be called from multiple threads simultaneously).
};
REGISTER_KERNEL_BUILDER(Name("DynamicLossGradient").Device(DEVICE_CPU), DynamicLossGradientOp);
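// For context, the op itself must be registered separately (typically in its
// own ops .cc file). A minimal sketch of what that registration could look
// like follows; the input/attr names are illustrative assumptions, and only
// the counts and the double dtype are fixed by Compute() above:
//
//   REGISTER_OP("DynamicLossGradient")
//       .Attr("model: string")
//       .Input("trajectory: double")
//       .Input("q_1: double")
//       .Input("q0: double")
//       .Input("dt: double")
//       .Input("max_torques: double")
//       .Input("max_velocities: double")
//       .Input("acceleration_factors: double")
//       .Input("torque_limit_ratio: double")
//       .Input("velocity_limit_ratio: double")
//       .Input("regularizer: double")
//       .Input("gradient_trajectory: double")
//       .Input("gradient_torque_cost: double")
//       .Input("gradient_torque_limit: double")
//       .Input("gradient_velocity_limit: double")
//       .Output("gradient: double");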