This repository has been archived by the owner on Sep 5, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 56
/
Copy pathmr.c
472 lines (382 loc) · 10.5 KB
/
mr.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
// SPDX-License-Identifier: BSD-3-Clause
/* Copyright 2020, Intel Corporation */
/*
* mr.c -- librpma memory region-related implementations
*/
#include <endian.h>
#include <inttypes.h>
#include <stdlib.h>
#include "librpma.h"
#include "log_internal.h"
#include "mr.h"
#include "peer.h"
#ifdef TEST_MOCK_ALLOC
#include "cmocka_alloc.h"
#endif
/*
 * The size (in bytes) of a serialized memory region descriptor:
 * two uint64_t fields (the address and the length), one uint32_t field
 * (the rkey) and one uint8_t field (the usage), packed back to back
 * in the order used by rpma_mr_get_descriptor().
 */
#define RPMA_MR_DESC_SIZE (2 * sizeof(uint64_t) + sizeof(uint32_t) \
+ sizeof(uint8_t))
/*
 * A bit-wise OR of all allowed usage values. rpma_mr_reg() rejects
 * any usage having a bit set outside of this mask.
 */
#define USAGE_ALL_ALLOWED (RPMA_MR_USAGE_READ_SRC | RPMA_MR_USAGE_READ_DST |\
RPMA_MR_USAGE_WRITE_SRC | RPMA_MR_USAGE_WRITE_DST |\
RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV |\
RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY |\
RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT)
/*
 * STATIC_ASSERT -- a compile-time assertion
 *
 * When cond is false the typedef declares an array of size -1, which is
 * a compilation error. msg is pasted into the typedef name, so it has to be
 * usable as a part of an identifier.
 */
#define STATIC_ASSERT(cond, msg)\
typedef char static_assertion_##msg[(cond) ? 1 : -1]
/*
 * Make sure the size of the usage field is big enough
 * to store all allowed values (the usage occupies a single uint8_t
 * in the memory region descriptor).
 */
STATIC_ASSERT(USAGE_ALL_ALLOWED < (1 << (8 * sizeof(uint8_t))),
usage_too_small);
/*
 * Generate operation completion on success: the bits of
 * RPMA_F_COMPLETION_ALWAYS which are not a part of RPMA_F_COMPLETION_ON_ERROR.
 * When any of them is set in the operation's flags, the work request
 * is posted with IBV_SEND_SIGNALED.
 */
#define RPMA_F_COMPLETION_ON_SUCCESS \
(RPMA_F_COMPLETION_ALWAYS & ~RPMA_F_COMPLETION_ON_ERROR)
/*
 * rpma_mr_local -- a local memory region: the IBV memory registration
 * together with the usage passed to rpma_mr_reg()
 */
struct rpma_mr_local {
struct ibv_mr *ibv_mr; /* an IBV memory registration object */
int usage; /* usage of the memory region */
};
/*
 * rpma_mr_remote -- a remote memory region as decoded from a descriptor
 * by rpma_mr_remote_from_descriptor(); the multi-byte fields are stored
 * in the host byte order
 */
struct rpma_mr_remote {
uint64_t raddr; /* the base virtual address of the memory region */
uint64_t size; /* the size of the memory being registered */
uint32_t rkey; /* remote key of the memory region */
int usage; /* usage of the memory region */
};
/* helper functions */
/*
 * usage_to_access -- translate RPMA_MR_USAGE_* bits into IBV_ACCESS_* bits
 *
 * Returns a bit-wise OR of the IBV_ACCESS_* values implied by usage
 * (0 when none of the bits below is set).
 *
 * Note: APM type of flush requires the same access as RPMA_MR_USAGE_READ_SRC
 */
static int
usage_to_access(int usage)
{
	/* usages which require the region to be readable remotely */
	const int needs_remote_read = RPMA_MR_USAGE_READ_SRC |
			RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY |
			RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;

	/*
	 * Usages which require the region to be writable locally.
	 * RPMA_MR_USAGE_WRITE_DST belongs here because if
	 * IBV_ACCESS_REMOTE_WRITE is set, then IBV_ACCESS_LOCAL_WRITE
	 * must be set too.
	 */
	const int needs_local_write = RPMA_MR_USAGE_READ_DST |
			RPMA_MR_USAGE_WRITE_SRC |
			RPMA_MR_USAGE_WRITE_DST |
			RPMA_MR_USAGE_RECV;

	int access = 0;

	if (usage & needs_remote_read)
		access |= IBV_ACCESS_REMOTE_READ;

	if (usage & needs_local_write)
		access |= IBV_ACCESS_LOCAL_WRITE;

	if (usage & RPMA_MR_USAGE_WRITE_DST)
		access |= IBV_ACCESS_REMOTE_WRITE;

	/*
	 * RPMA_MR_USAGE_SEND does not map to any IBV_ACCESS_* value.
	 */

	return access;
}
/* internal librpma API */
/*
* rpma_mr_read -- post an RDMA read from src to dst
*/
int
rpma_mr_read(struct ibv_qp *qp,
struct rpma_mr_local *dst, size_t dst_offset,
const struct rpma_mr_remote *src, size_t src_offset,
size_t len, int flags, const void *op_context)
{
struct ibv_send_wr wr;
struct ibv_sge sge;
/* source */
wr.wr.rdma.remote_addr = src->raddr + src_offset;
wr.wr.rdma.rkey = src->rkey;
/* destination */
sge.addr = (uint64_t)((uintptr_t)dst->ibv_mr->addr + dst_offset);
sge.length = (uint32_t)len;
sge.lkey = dst->ibv_mr->lkey;
wr.sg_list = &sge;
wr.num_sge = 1;
wr.wr_id = (uint64_t)op_context;
wr.next = NULL;
wr.opcode = IBV_WR_RDMA_READ;
wr.send_flags = (flags & RPMA_F_COMPLETION_ON_SUCCESS) ?
IBV_SEND_SIGNALED : 0;
struct ibv_send_wr *bad_wr;
int ret = ibv_post_send(qp, &wr, &bad_wr);
if (ret) {
RPMA_LOG_ERROR_WITH_ERRNO(ret,
"ibv_post_send(src_addr=0x%x, rkey=0x%x, dst_addr=0x%x, length=%u, lkey=0x%x, wr_id=0x%x, opcode=IBV_WR_RDMA_READ, send_flags=%s)",
wr.wr.rdma.remote_addr, wr.wr.rdma.rkey,
sge.addr, sge.length, sge.lkey, wr.wr_id,
(flags & RPMA_F_COMPLETION_ON_SUCCESS) ?
"IBV_SEND_SIGNALED" : "0");
return RPMA_E_PROVIDER;
}
return 0;
}
/*
* rpma_mr_write -- post an RDMA write from src to dst
*/
int
rpma_mr_write(struct ibv_qp *qp,
struct rpma_mr_remote *dst, size_t dst_offset,
const struct rpma_mr_local *src, size_t src_offset,
size_t len, int flags, const void *op_context, bool fence)
{
struct ibv_send_wr wr;
struct ibv_sge sge;
/* source */
sge.addr = (uint64_t)((uintptr_t)src->ibv_mr->addr + src_offset);
sge.length = (uint32_t)len;
sge.lkey = src->ibv_mr->lkey;
wr.sg_list = &sge;
wr.num_sge = 1;
/* destination */
wr.wr.rdma.remote_addr = dst->raddr + dst_offset;
wr.wr.rdma.rkey = dst->rkey;
wr.wr_id = (uint64_t)op_context;
wr.next = NULL;
wr.opcode = IBV_WR_RDMA_WRITE;
wr.send_flags = (flags & RPMA_F_COMPLETION_ON_SUCCESS) ?
IBV_SEND_SIGNALED : 0;
wr.send_flags |= fence ? IBV_SEND_FENCE : 0;
struct ibv_send_wr *bad_wr;
int ret = ibv_post_send(qp, &wr, &bad_wr);
if (ret) {
RPMA_LOG_ERROR_WITH_ERRNO(ret,
"ibv_post_send(src_addr=0x%x, rkey=0x%x, dst_addr=0x%x, length=%u, lkey=0x%x, wr_id=0x%x, opcode=IBV_WR_RDMA_WRITE, send_flags=%s)",
wr.wr.rdma.remote_addr, wr.wr.rdma.rkey,
sge.addr, sge.length, sge.lkey, wr.wr_id,
(flags & RPMA_F_COMPLETION_ON_SUCCESS) ?
"IBV_SEND_SIGNALED" : "0");
return RPMA_E_PROVIDER;
}
return 0;
}
/*
 * rpma_mr_send -- post an RDMA send from src
 *
 * Posts a single-SGE send of len bytes of the local registration src
 * starting at offset. Only IBV_WR_SEND and IBV_WR_SEND_WITH_IMM operations
 * are accepted; for the latter value is attached as the immediate data.
 * op_context is stored in the work request as wr_id.
 *
 * Returns 0 on success, RPMA_E_NOSUPP for any other operation
 * or RPMA_E_PROVIDER when ibv_post_send() fails.
 */
int
rpma_mr_send(struct ibv_qp *qp,
	const struct rpma_mr_local *src, size_t offset,
	size_t len, int flags, const void *op_context,
	enum ibv_wr_opcode operation, uint32_t value)
{
	struct ibv_sge sge;
	struct ibv_send_wr wr;
	struct ibv_send_wr *bad_wr;

	/* the only scatter/gather entry describes the message to be sent */
	sge.lkey = src->ibv_mr->lkey;
	sge.length = (uint32_t)len;
	sge.addr = (uint64_t)((uintptr_t)src->ibv_mr->addr + offset);

	wr.next = NULL;
	wr.num_sge = 1;
	wr.sg_list = &sge;
	wr.opcode = operation;

	if (operation == IBV_WR_SEND_WITH_IMM) {
		/* the immediate data is converted to big-endian */
		wr.imm_data = htobe32(value);
	} else if (operation != IBV_WR_SEND) {
		RPMA_LOG_ERROR("unsupported wr.opcode == %d", wr.opcode);
		return RPMA_E_NOSUPP;
	}

	wr.send_flags = 0;
	if (flags & RPMA_F_COMPLETION_ON_SUCCESS)
		wr.send_flags = IBV_SEND_SIGNALED;
	wr.wr_id = (uint64_t)op_context;

	int err = ibv_post_send(qp, &wr, &bad_wr);
	if (err == 0)
		return 0;

	RPMA_LOG_ERROR_WITH_ERRNO(err, "ibv_post_send");
	return RPMA_E_PROVIDER;
}
/*
 * rpma_mr_recv -- post an RDMA receive into dst
 *
 * Posts a single-SGE receive work request which makes len bytes of the local
 * registration dst starting at offset available for an incoming message.
 * op_context is stored in the work request as wr_id.
 *
 * Returns 0 on success or RPMA_E_PROVIDER when ibv_post_recv() fails.
 */
int
rpma_mr_recv(struct ibv_qp *qp,
	struct rpma_mr_local *dst, size_t offset,
	size_t len, const void *op_context)
{
	struct ibv_sge sge;
	struct ibv_recv_wr wr;
	struct ibv_recv_wr *bad_wr;

	/* the receive buffer */
	sge.lkey = dst->ibv_mr->lkey;
	sge.length = (uint32_t)len;
	sge.addr = (uint64_t)((uintptr_t)dst->ibv_mr->addr + offset);

	wr.wr_id = (uint64_t)op_context;
	wr.next = NULL;
	wr.num_sge = 1;
	wr.sg_list = &sge;

	int err = ibv_post_recv(qp, &wr, &bad_wr);
	if (err == 0)
		return 0;

	RPMA_LOG_ERROR_WITH_ERRNO(err, "ibv_post_recv");
	return RPMA_E_PROVIDER;
}
/* public librpma API */
/*
 * rpma_mr_reg -- create a local memory registration object
 *
 * Registers size bytes starting at ptr with peer using the access rights
 * derived from usage and, on success, stores a newly allocated object under
 * *mr_ptr (to be released with rpma_mr_dereg()).
 *
 * Returns 0 on success, RPMA_E_INVAL when peer, ptr or mr_ptr is NULL, size
 * is 0 or usage is 0 or contains a bit outside of USAGE_ALL_ALLOWED,
 * RPMA_E_NOMEM when the allocation fails or the error returned
 * by rpma_peer_mr_reg(). *mr_ptr is not modified on failure.
 */
int
rpma_mr_reg(struct rpma_peer *peer, void *ptr, size_t size, int usage,
	struct rpma_mr_local **mr_ptr)
{
	/* validate the pointers and the size first, then the usage bits */
	if (peer == NULL || ptr == NULL || size == 0 || mr_ptr == NULL ||
			usage == 0 || (usage & ~USAGE_ALL_ALLOWED) != 0)
		return RPMA_E_INVAL;

	struct rpma_mr_local *new_mr = malloc(sizeof(*new_mr));
	if (new_mr == NULL)
		return RPMA_E_NOMEM;

	struct ibv_mr *registration;
	int ret = rpma_peer_mr_reg(peer, &registration, ptr, size,
			usage_to_access(usage));
	if (ret != 0) {
		/* nothing has been registered - drop the object only */
		free(new_mr);
		return ret;
	}

	new_mr->usage = usage;
	new_mr->ibv_mr = registration;
	*mr_ptr = new_mr;

	return 0;
}
/*
 * rpma_mr_dereg -- delete a local memory registration object
 *
 * Deregisters the IBV memory registration, frees the object and sets
 * *mr_ptr to NULL. The object is freed even if the deregistration fails.
 * The value returned by ibv_dereg_mr() is stored in errno.
 *
 * Returns 0 on success (also when *mr_ptr is NULL), RPMA_E_INVAL when mr_ptr
 * is NULL or RPMA_E_PROVIDER when ibv_dereg_mr() fails.
 */
int
rpma_mr_dereg(struct rpma_mr_local **mr_ptr)
{
	if (mr_ptr == NULL)
		return RPMA_E_INVAL;

	struct rpma_mr_local *mr = *mr_ptr;
	if (mr == NULL)
		return 0; /* nothing to delete */

	int result = 0;

	errno = ibv_dereg_mr(mr->ibv_mr);
	if (errno != 0) {
		RPMA_LOG_ERROR_WITH_ERRNO(errno, "ibv_dereg_mr()");
		result = RPMA_E_PROVIDER;
	}

	/* release the object regardless of the deregistration result */
	*mr_ptr = NULL;
	free(mr);

	return result;
}
/*
* rpma_mr_get_descriptor -- get a descriptor of memory region
*/
int
rpma_mr_get_descriptor(const struct rpma_mr_local *mr, void *desc)
{
if (mr == NULL || desc == NULL)
return RPMA_E_INVAL;
char *buff = (char *)desc;
uint64_t addr = htole64((uint64_t)mr->ibv_mr->addr);
memcpy(buff, &addr, sizeof(uint64_t));
buff += sizeof(uint64_t);
uint64_t length = htole64((uint64_t)mr->ibv_mr->length);
memcpy(buff, &length, sizeof(uint64_t));
buff += sizeof(uint64_t);
uint32_t rkey = htole32(mr->ibv_mr->rkey);
memcpy(buff, &rkey, sizeof(uint32_t));
buff += sizeof(uint32_t);
*((uint8_t *)buff) = (uint8_t)mr->usage;
return 0;
}
/*
 * rpma_mr_remote_from_descriptor -- create a remote memory region from
 * a descriptor
 *
 * Decodes the layout produced by rpma_mr_get_descriptor(): the address
 * (uint64_t, little-endian), the size (uint64_t, little-endian), the rkey
 * (uint32_t, little-endian) and the usage (uint8_t). On success a newly
 * allocated object holding the values in the host byte order is stored
 * under *mr_ptr (to be released with rpma_mr_remote_delete()).
 *
 * Returns 0 on success, RPMA_E_INVAL when desc or mr_ptr is NULL, desc_size
 * is smaller than RPMA_MR_DESC_SIZE or the decoded usage is 0, or
 * RPMA_E_NOMEM when the allocation fails. *mr_ptr is not modified on failure.
 */
int
rpma_mr_remote_from_descriptor(const void *desc,
	size_t desc_size, struct rpma_mr_remote **mr_ptr)
{
	if (desc == NULL || mr_ptr == NULL)
		return RPMA_E_INVAL;

	if (desc_size < RPMA_MR_DESC_SIZE) {
		/* both values are of the size_t type hence %zu */
		RPMA_LOG_ERROR(
			"incorrect size of the descriptor: %zu bytes (should be at least: %zu bytes)",
			desc_size, RPMA_MR_DESC_SIZE);
		return RPMA_E_INVAL;
	}

	/* the descriptor is only read so the const qualifier is kept */
	const char *buff = (const char *)desc;

	uint64_t raddr;
	uint64_t size;
	uint32_t rkey;

	/* the fields are copied byte-wise so desc can be unaligned */
	memcpy(&raddr, buff, sizeof(uint64_t));
	buff += sizeof(uint64_t);

	memcpy(&size, buff, sizeof(uint64_t));
	buff += sizeof(uint64_t);

	memcpy(&rkey, buff, sizeof(uint32_t));
	buff += sizeof(uint32_t);

	uint8_t usage = *(const uint8_t *)buff;

	if (usage == 0) {
		RPMA_LOG_ERROR("usage type of memory is not set");
		return RPMA_E_INVAL;
	}

	struct rpma_mr_remote *mr = malloc(sizeof(struct rpma_mr_remote));
	if (mr == NULL)
		return RPMA_E_NOMEM;

	mr->raddr = le64toh(raddr);
	mr->size = le64toh(size);
	mr->rkey = le32toh(rkey);
	mr->usage = usage;
	*mr_ptr = mr;

	/* log the converted (host byte order) values, not the raw ones */
	RPMA_LOG_INFO("new rpma_mr_remote(raddr=0x%" PRIx64 ", size=%" PRIu64
			", rkey=0x%" PRIx32 ", usage=0x%" PRIx8 ")",
			mr->raddr, mr->size, mr->rkey, usage);

	return 0;
}
/*
 * rpma_mr_get_descriptor_size -- get size of a memory region descriptor
 *
 * Stores RPMA_MR_DESC_SIZE under *desc_size; mr is only checked against NULL.
 *
 * Returns 0 on success or RPMA_E_INVAL when mr or desc_size is NULL.
 */
int
rpma_mr_get_descriptor_size(const struct rpma_mr_local *mr, size_t *desc_size)
{
	if (mr != NULL && desc_size != NULL) {
		*desc_size = RPMA_MR_DESC_SIZE;
		return 0;
	}

	return RPMA_E_INVAL;
}
/*
 * rpma_mr_remote_get_size -- get a remote memory region size
 *
 * Stores the size kept in mr under *size.
 *
 * Returns 0 on success or RPMA_E_INVAL when mr or size is NULL.
 */
int
rpma_mr_remote_get_size(const struct rpma_mr_remote *mr, size_t *size)
{
	if (mr != NULL && size != NULL) {
		*size = mr->size;
		return 0;
	}

	return RPMA_E_INVAL;
}
/*
 * rpma_mr_remote_delete -- delete a remote memory region's structure
 *
 * Frees the object stored under *mr_ptr (if any) and sets *mr_ptr to NULL.
 *
 * Returns 0 on success (also when *mr_ptr is NULL) or RPMA_E_INVAL when
 * mr_ptr is NULL.
 */
int
rpma_mr_remote_delete(struct rpma_mr_remote **mr_ptr)
{
	if (mr_ptr == NULL)
		return RPMA_E_INVAL;

	struct rpma_mr_remote *mr = *mr_ptr;
	if (mr != NULL) {
		free(mr);
		*mr_ptr = NULL;
	}

	return 0;
}
/*
 * rpma_mr_remote_get_flush_type -- get a flush type supported
 * by the remote memory region
 *
 * Stores under *flush_type the RPMA_MR_USAGE_FLUSH_TYPE_* bits
 * set in the usage of mr (0 when none of them is set).
 *
 * Returns 0 on success or RPMA_E_INVAL when mr or flush_type is NULL.
 */
int
rpma_mr_remote_get_flush_type(const struct rpma_mr_remote *mr, int *flush_type)
{
	if (mr != NULL && flush_type != NULL) {
		/* keep only the flush-related bits of the usage */
		const int flush_mask = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT |
				RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY;

		*flush_type = mr->usage & flush_mask;
		return 0;
	}

	return RPMA_E_INVAL;
}