Merge lp:~semiosis/ubuntu/trusty/glusterfs/fix-for-1268064 into lp:ubuntu/trusty/glusterfs

Proposed by Louis Zuckerman
Status: Merged
Approved by: James Page
Approved revision: 31
Merge reported by: James Page
Merged at revision: not available
Proposed branch: lp:~semiosis/ubuntu/trusty/glusterfs/fix-for-1268064
Merge into: lp:ubuntu/trusty/glusterfs
Diff against target: 17170 lines (+7414/-7666)
93 files modified
.pc/.quilt_patches (+0/-1)
.pc/.quilt_series (+0/-1)
.pc/.version (+0/-1)
.pc/01-spelling-error.diff/rpc/rpc-transport/rdma/src/rdma.c (+0/-4578)
.pc/01-spelling-error.diff/xlators/mgmt/glusterd/src/glusterd-store.c (+0/-2530)
.pc/applied-patches (+0/-1)
ChangeLog (+1978/-4)
api/Makefile.am (+1/-1)
api/Makefile.in (+1/-1)
api/examples/Makefile.am (+6/-0)
api/examples/Makefile.in (+555/-0)
api/examples/README (+19/-0)
api/examples/gfapi.py (+402/-0)
api/examples/glfsxmp.c (+1601/-0)
api/examples/setup.py.in (+29/-0)
api/src/Makefile.am (+3/-2)
api/src/Makefile.in (+15/-3)
api/src/glfs-fops.c (+91/-32)
api/src/glfs-handleops.c (+1278/-0)
api/src/glfs-handles.h (+143/-0)
api/src/glfs-internal.h (+59/-0)
api/src/glfs-mem-types.h (+2/-1)
api/src/glfs-resolve.c (+70/-3)
api/src/glfs.c (+22/-4)
api/src/glfs.h (+26/-0)
cli/src/cli-cmd-parser.c (+36/-4)
cli/src/cli-cmd-volume.c (+68/-3)
cli/src/cli-rpc-ops.c (+22/-14)
cli/src/cli-xml-output.c (+81/-4)
configure (+13/-11)
configure.ac (+3/-1)
debian/changelog (+16/-0)
debian/glusterfs-client.mounting-glusterfs.upstart (+9/-0)
debian/glusterfs-server.mounting-glusterfs.upstart (+0/-7)
debian/rules (+3/-2)
debian/source.lintian-overrides (+1/-0)
doc/glusterd.vol (+1/-0)
glusterfs.spec (+41/-16)
glusterfs.spec.in (+33/-8)
glusterfsd/src/glusterfsd-mgmt.c (+5/-0)
libglusterfs/src/globals.c (+56/-1)
libglusterfs/src/globals.h (+4/-0)
libglusterfs/src/mem-types.h (+2/-1)
libglusterfs/src/syncop.c (+154/-0)
libglusterfs/src/syncop.h (+63/-19)
rpc/rpc-transport/rdma/src/rdma.c (+4/-4)
xlators/cluster/afr/src/afr-inode-read.c (+1/-1)
xlators/cluster/afr/src/afr.c (+10/-0)
xlators/cluster/dht/src/dht-common.c (+16/-6)
xlators/cluster/dht/src/dht-common.h (+1/-0)
xlators/cluster/dht/src/dht-inode-read.c (+4/-0)
xlators/cluster/dht/src/dht-inode-write.c (+2/-0)
xlators/cluster/dht/src/dht-layout.c (+11/-3)
xlators/cluster/dht/src/dht-rebalance.c (+29/-14)
xlators/cluster/dht/src/dht-selfheal.c (+7/-0)
xlators/mgmt/glusterd/src/glusterd-brick-ops.c (+9/-3)
xlators/mgmt/glusterd/src/glusterd-op-sm.c (+71/-4)
xlators/mgmt/glusterd/src/glusterd-pmap.c (+5/-5)
xlators/mgmt/glusterd/src/glusterd-replace-brick.c (+8/-3)
xlators/mgmt/glusterd/src/glusterd-store.c (+3/-4)
xlators/mgmt/glusterd/src/glusterd-utils.c (+148/-14)
xlators/mgmt/glusterd/src/glusterd-utils.h (+7/-2)
xlators/mgmt/glusterd/src/glusterd-volume-ops.c (+12/-4)
xlators/mgmt/glusterd/src/glusterd.c (+10/-2)
xlators/mgmt/glusterd/src/glusterd.h (+1/-0)
xlators/mount/fuse/src/fuse-bridge.h (+14/-7)
xlators/nfs/server/src/mount3.c (+5/-14)
xlators/nfs/server/src/mount3.h (+4/-13)
xlators/nfs/server/src/mount3udp_svc.c (+4/-13)
xlators/nfs/server/src/nfs-common.c (+4/-13)
xlators/nfs/server/src/nfs-common.h (+4/-13)
xlators/nfs/server/src/nfs-fops.c (+4/-13)
xlators/nfs/server/src/nfs-fops.h (+4/-13)
xlators/nfs/server/src/nfs-generics.c (+4/-13)
xlators/nfs/server/src/nfs-generics.h (+4/-13)
xlators/nfs/server/src/nfs-inodes.c (+4/-13)
xlators/nfs/server/src/nfs-inodes.h (+4/-13)
xlators/nfs/server/src/nfs-mem-types.h (+4/-13)
xlators/nfs/server/src/nfs.c (+4/-13)
xlators/nfs/server/src/nfs.h (+4/-13)
xlators/nfs/server/src/nfs3-fh.c (+4/-13)
xlators/nfs/server/src/nfs3-fh.h (+4/-13)
xlators/nfs/server/src/nfs3-helpers.c (+4/-13)
xlators/nfs/server/src/nfs3-helpers.h (+4/-13)
xlators/nfs/server/src/nfs3.c (+4/-13)
xlators/nfs/server/src/nfs3.h (+4/-13)
xlators/nfs/server/src/nlm4.c (+4/-13)
xlators/nfs/server/src/nlm4.h (+4/-13)
xlators/nfs/server/src/nlmcbk_svc.c (+4/-13)
xlators/protocol/auth/addr/src/addr.c (+4/-13)
xlators/protocol/auth/login/src/login.c (+4/-13)
xlators/protocol/client/src/client-handshake.c (+1/-0)
xlators/storage/posix/src/posix.c (+27/-10)
To merge this branch: bzr merge lp:~semiosis/ubuntu/trusty/glusterfs/fix-for-1268064
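For reviewers, a typical local test of this proposal looks like the following (a sketch; assumes a working bzr setup with Launchpad access):

    bzr branch lp:ubuntu/trusty/glusterfs glusterfs-trusty
    cd glusterfs-trusty
    bzr merge lp:~semiosis/ubuntu/trusty/glusterfs/fix-for-1268064
    bzr diff                                  # inspect the merged changes
    bzr commit -m "Merge fix for LP: #1268064"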
Reviewer: James Page
Status: Pending
Review via email: mp+201280@code.launchpad.net

Description of the change

Update glusterfs to a new upstream release and fix LP: #1268064 by moving the mounting-glusterfs upstart job from the glusterfs-server package to glusterfs-client, so that client-only installations also wait for the network before mounting glusterfs filesystems.

31. By Louis Zuckerman

Adding debian/glusterfs-client.mounting-glusterfs.upstart, which was missed in the last commit.
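The file in question is the upstart job that blocks the mounting event for glusterfs filesystems until the network interfaces are up. A minimal sketch of such a job (assuming the usual wait-for-state pattern for network filesystems; the actual 9-line file added by this branch may differ):

    # mounting-glusterfs - block the mounting event for glusterfs
    # filesystems until the network interfaces are running
    description "wait for the network before mounting glusterfs filesystems"
    instance $MOUNTPOINT
    start on mounting TYPE=glusterfs
    task
    exec start wait-for-state WAIT_FOR=static-network-up WAITER=mounting-glusterfs-$MOUNTPOINT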

Preview Diff

1=== removed directory '.pc'
2=== removed file '.pc/.quilt_patches'
3--- .pc/.quilt_patches 2013-05-29 09:55:03 +0000
4+++ .pc/.quilt_patches 1970-01-01 00:00:00 +0000
5@@ -1,1 +0,0 @@
6-debian/patches
7
8=== removed file '.pc/.quilt_series'
9--- .pc/.quilt_series 2013-05-29 09:55:03 +0000
10+++ .pc/.quilt_series 1970-01-01 00:00:00 +0000
11@@ -1,1 +0,0 @@
12-series
13
14=== removed file '.pc/.version'
15--- .pc/.version 2013-05-29 09:55:03 +0000
16+++ .pc/.version 1970-01-01 00:00:00 +0000
17@@ -1,1 +0,0 @@
18-2
19
20=== removed directory '.pc/01-spelling-error.diff'
21=== removed directory '.pc/01-spelling-error.diff/rpc'
22=== removed directory '.pc/01-spelling-error.diff/rpc/rpc-transport'
23=== removed directory '.pc/01-spelling-error.diff/rpc/rpc-transport/rdma'
24=== removed directory '.pc/01-spelling-error.diff/rpc/rpc-transport/rdma/src'
25=== removed file '.pc/01-spelling-error.diff/rpc/rpc-transport/rdma/src/rdma.c'
26--- .pc/01-spelling-error.diff/rpc/rpc-transport/rdma/src/rdma.c 2013-07-23 13:53:32 +0000
27+++ .pc/01-spelling-error.diff/rpc/rpc-transport/rdma/src/rdma.c 1970-01-01 00:00:00 +0000
28@@ -1,4578 +0,0 @@
29-/*
30- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
31- This file is part of GlusterFS.
32-
33- This file is licensed to you under your choice of the GNU Lesser
34- General Public License, version 3 or any later version (LGPLv3 or
35- later), or the GNU General Public License, version 2 (GPLv2), in all
36- cases as published by the Free Software Foundation.
37-*/
38-
39-#ifndef _CONFIG_H
40-#define _CONFIG_H
41-#include "config.h"
42-#endif
43-
44-#include "dict.h"
45-#include "glusterfs.h"
46-#include "logging.h"
47-#include "rdma.h"
48-#include "name.h"
49-#include "byte-order.h"
50-#include "xlator.h"
51-#include "xdr-rpc.h"
52-#include <signal.h>
53-
54-#define GF_RDMA_LOG_NAME "rpc-transport/rdma"
55-
56-static int32_t
57-__gf_rdma_ioq_churn (gf_rdma_peer_t *peer);
58-
59-gf_rdma_post_t *
60-gf_rdma_post_ref (gf_rdma_post_t *post);
61-
62-int
63-gf_rdma_post_unref (gf_rdma_post_t *post);
64-
65-static void *
66-gf_rdma_send_completion_proc (void *data);
67-
68-static void *
69-gf_rdma_recv_completion_proc (void *data);
70-
71-void *
72-gf_rdma_async_event_thread (void *context);
73-
74-static int32_t
75-gf_rdma_create_qp (rpc_transport_t *this);
76-
77-static int32_t
78-__gf_rdma_teardown (rpc_transport_t *this);
79-
80-static int32_t
81-gf_rdma_teardown (rpc_transport_t *this);
82-
83-static int32_t
84-gf_rdma_disconnect (rpc_transport_t *this);
85-
86-static void
87-gf_rdma_cm_handle_disconnect (rpc_transport_t *this);
88-
89-
90-static void
91-gf_rdma_put_post (gf_rdma_queue_t *queue, gf_rdma_post_t *post)
92-{
93- post->ctx.is_request = 0;
94-
95- pthread_mutex_lock (&queue->lock);
96- {
97- if (post->prev) {
98- queue->active_count--;
99- post->prev->next = post->next;
100- }
101-
102- if (post->next) {
103- post->next->prev = post->prev;
104- }
105-
106- post->prev = &queue->passive_posts;
107- post->next = post->prev->next;
108- post->prev->next = post;
109- post->next->prev = post;
110- queue->passive_count++;
111- }
112- pthread_mutex_unlock (&queue->lock);
113-}
114-
115-
116-static gf_rdma_post_t *
117-gf_rdma_new_post (rpc_transport_t *this, gf_rdma_device_t *device, int32_t len,
118- gf_rdma_post_type_t type)
119-{
120- gf_rdma_post_t *post = NULL;
121- int ret = -1;
122-
123- post = (gf_rdma_post_t *) GF_CALLOC (1, sizeof (*post),
124- gf_common_mt_rdma_post_t);
125- if (post == NULL) {
126- goto out;
127- }
128-
129- pthread_mutex_init (&post->lock, NULL);
130-
131- post->buf_size = len;
132-
133- post->buf = valloc (len);
134- if (!post->buf) {
135- gf_log_nomem (GF_RDMA_LOG_NAME, GF_LOG_ERROR, len);
136- goto out;
137- }
138-
139- post->mr = ibv_reg_mr (device->pd,
140- post->buf,
141- post->buf_size,
142- IBV_ACCESS_LOCAL_WRITE);
143- if (!post->mr) {
144- gf_log (this->name, GF_LOG_WARNING,
145- "memory registration failed (%s)",
146- strerror (errno));
147- goto out;
148- }
149-
150- post->device = device;
151- post->type = type;
152-
153- ret = 0;
154-out:
155- if (ret != 0) {
156- free (post->buf);
157-
158- GF_FREE (post);
159- post = NULL;
160- }
161-
162- return post;
163-}
164-
165-
166-static gf_rdma_post_t *
167-gf_rdma_get_post (gf_rdma_queue_t *queue)
168-{
169- gf_rdma_post_t *post = NULL;
170-
171- pthread_mutex_lock (&queue->lock);
172- {
173- post = queue->passive_posts.next;
174- if (post == &queue->passive_posts)
175- post = NULL;
176-
177- if (post) {
178- if (post->prev)
179- post->prev->next = post->next;
180- if (post->next)
181- post->next->prev = post->prev;
182- post->prev = &queue->active_posts;
183- post->next = post->prev->next;
184- post->prev->next = post;
185- post->next->prev = post;
186- post->reused++;
187- queue->active_count++;
188- }
189- }
190- pthread_mutex_unlock (&queue->lock);
191-
192- return post;
193-}
194-
195-void
196-gf_rdma_destroy_post (gf_rdma_post_t *post)
197-{
198- ibv_dereg_mr (post->mr);
199- free (post->buf);
200- GF_FREE (post);
201-}
202-
203-
204-static int32_t
205-__gf_rdma_quota_get (gf_rdma_peer_t *peer)
206-{
207- int32_t ret = -1;
208- gf_rdma_private_t *priv = NULL;
209-
210- priv = peer->trans->private;
211-
212- if (priv->connected && peer->quota > 0) {
213- ret = peer->quota--;
214- }
215-
216- return ret;
217-}
218-
219-
220-static void
221-__gf_rdma_ioq_entry_free (gf_rdma_ioq_t *entry)
222-{
223- list_del_init (&entry->list);
224-
225- if (entry->iobref) {
226- iobref_unref (entry->iobref);
227- entry->iobref = NULL;
228- }
229-
230- if (entry->msg.request.rsp_iobref) {
231- iobref_unref (entry->msg.request.rsp_iobref);
232- entry->msg.request.rsp_iobref = NULL;
233- }
234-
235- mem_put (entry);
236-}
237-
238-
239-static void
240-__gf_rdma_ioq_flush (gf_rdma_peer_t *peer)
241-{
242- gf_rdma_ioq_t *entry = NULL, *dummy = NULL;
243-
244- list_for_each_entry_safe (entry, dummy, &peer->ioq, list) {
245- __gf_rdma_ioq_entry_free (entry);
246- }
247-}
248-
249-
250-static int32_t
251-__gf_rdma_disconnect (rpc_transport_t *this)
252-{
253- gf_rdma_private_t *priv = NULL;
254-
255- priv = this->private;
256-
257- if (priv->connected) {
258- rdma_disconnect (priv->peer.cm_id);
259- }
260-
261- return 0;
262-}
263-
264-
265-static void
266-gf_rdma_queue_init (gf_rdma_queue_t *queue)
267-{
268- pthread_mutex_init (&queue->lock, NULL);
269-
270- queue->active_posts.next = &queue->active_posts;
271- queue->active_posts.prev = &queue->active_posts;
272- queue->passive_posts.next = &queue->passive_posts;
273- queue->passive_posts.prev = &queue->passive_posts;
274-}
275-
276-
277-static void
278-__gf_rdma_destroy_queue (gf_rdma_post_t *post)
279-{
280- gf_rdma_post_t *tmp = NULL;
281-
282- while (post->next != post) {
283- tmp = post->next;
284-
285- post->next = post->next->next;
286- post->next->prev = post;
287-
288- gf_rdma_destroy_post (tmp);
289- }
290-}
291-
292-
293-static void
294-gf_rdma_destroy_queue (gf_rdma_queue_t *queue)
295-{
296- if (queue == NULL) {
297- goto out;
298- }
299-
300- pthread_mutex_lock (&queue->lock);
301- {
302- if (queue->passive_count > 0) {
303- __gf_rdma_destroy_queue (&queue->passive_posts);
304- queue->passive_count = 0;
305- }
306-
307- if (queue->active_count > 0) {
308- __gf_rdma_destroy_queue (&queue->active_posts);
309- queue->active_count = 0;
310- }
311- }
312- pthread_mutex_unlock (&queue->lock);
313-
314-out:
315- return;
316-}
317-
318-
319-static void
320-gf_rdma_destroy_posts (rpc_transport_t *this)
321-{
322- gf_rdma_device_t *device = NULL;
323- gf_rdma_private_t *priv = NULL;
324-
325- if (this == NULL) {
326- goto out;
327- }
328-
329- priv = this->private;
330- device = priv->device;
331-
332- gf_rdma_destroy_queue (&device->sendq);
333- gf_rdma_destroy_queue (&device->recvq);
334-
335-out:
336- return;
337-}
338-
339-
340-static int32_t
341-__gf_rdma_create_posts (rpc_transport_t *this, int32_t count, int32_t size,
342- gf_rdma_queue_t *q, gf_rdma_post_type_t type)
343-{
344- int32_t i = 0;
345- int32_t ret = 0;
346- gf_rdma_private_t *priv = NULL;
347- gf_rdma_device_t *device = NULL;
348-
349- priv = this->private;
350- device = priv->device;
351-
352- for (i=0 ; i<count ; i++) {
353- gf_rdma_post_t *post = NULL;
354-
355- post = gf_rdma_new_post (this, device, size + 2048, type);
356- if (!post) {
357- gf_log (this->name, GF_LOG_ERROR,
358- "post creation failed");
359- ret = -1;
360- break;
361- }
362-
363- gf_rdma_put_post (q, post);
364- }
365- return ret;
366-}
367-
368-
369-static int32_t
370-gf_rdma_post_recv (struct ibv_srq *srq,
371- gf_rdma_post_t *post)
372-{
373- struct ibv_sge list = {
374- .addr = (unsigned long) post->buf,
375- .length = post->buf_size,
376- .lkey = post->mr->lkey
377- };
378-
379- struct ibv_recv_wr wr = {
380- .wr_id = (unsigned long) post,
381- .sg_list = &list,
382- .num_sge = 1,
383- }, *bad_wr;
384-
385- gf_rdma_post_ref (post);
386-
387- return ibv_post_srq_recv (srq, &wr, &bad_wr);
388-}
389-
390-
391-static int32_t
392-gf_rdma_create_posts (rpc_transport_t *this)
393-{
394- int32_t i = 0, ret = 0;
395- gf_rdma_post_t *post = NULL;
396- gf_rdma_private_t *priv = NULL;
397- gf_rdma_options_t *options = NULL;
398- gf_rdma_device_t *device = NULL;
399-
400- priv = this->private;
401- options = &priv->options;
402- device = priv->device;
403-
404- ret = __gf_rdma_create_posts (this, options->send_count,
405- options->send_size,
406- &device->sendq, GF_RDMA_SEND_POST);
407- if (!ret)
408- ret = __gf_rdma_create_posts (this, options->recv_count,
409- options->recv_size,
410- &device->recvq,
411- GF_RDMA_RECV_POST);
412-
413- if (!ret) {
414- for (i=0 ; i<options->recv_count ; i++) {
415- post = gf_rdma_get_post (&device->recvq);
416- if (gf_rdma_post_recv (device->srq, post) != 0) {
417- ret = -1;
418- break;
419- }
420- }
421- }
422-
423- if (ret)
424- gf_rdma_destroy_posts (this);
425-
426- return ret;
427-}
428-
429-
430-static void
431-gf_rdma_destroy_cq (rpc_transport_t *this)
432-{
433- gf_rdma_private_t *priv = NULL;
434- gf_rdma_device_t *device = NULL;
435-
436- priv = this->private;
437- device = priv->device;
438-
439- if (device->recv_cq)
440- ibv_destroy_cq (device->recv_cq);
441- device->recv_cq = NULL;
442-
443- if (device->send_cq)
444- ibv_destroy_cq (device->send_cq);
445- device->send_cq = NULL;
446-
447- return;
448-}
449-
450-
451-static int32_t
452-gf_rdma_create_cq (rpc_transport_t *this)
453-{
454- gf_rdma_private_t *priv = NULL;
455- gf_rdma_options_t *options = NULL;
456- gf_rdma_device_t *device = NULL;
457- uint64_t send_cqe = 0;
458- int32_t ret = 0;
459- struct ibv_device_attr device_attr = {{0}, };
460-
461- priv = this->private;
462- options = &priv->options;
463- device = priv->device;
464-
465- device->recv_cq = ibv_create_cq (priv->device->context,
466- options->recv_count * 2,
467- device,
468- device->recv_chan,
469- 0);
470- if (!device->recv_cq) {
471- gf_log (this->name, GF_LOG_ERROR,
472- "creation of CQ for device %s failed",
473- device->device_name);
474- ret = -1;
475- goto out;
476- } else if (ibv_req_notify_cq (device->recv_cq, 0)) {
477- gf_log (this->name, GF_LOG_ERROR,
478- "ibv_req_notify_cq on recv CQ of device %s failed",
479- device->device_name);
480- ret = -1;
481- goto out;
482- }
483-
484- do {
485- ret = ibv_query_device (priv->device->context, &device_attr);
486- if (ret != 0) {
487- gf_log (this->name, GF_LOG_ERROR,
488- "ibv_query_device on %s returned %d (%s)",
489- priv->device->device_name, ret,
490- (ret > 0) ? strerror (ret) : "");
491- ret = -1;
492- goto out;
493- }
494-
495- send_cqe = options->send_count * 128;
496- send_cqe = (send_cqe > device_attr.max_cqe)
497- ? device_attr.max_cqe : send_cqe;
498-
499- /* TODO: make send_cq size dynamically adaptive */
500- device->send_cq = ibv_create_cq (priv->device->context,
501- send_cqe, device,
502- device->send_chan, 0);
503- if (!device->send_cq) {
504- gf_log (this->name, GF_LOG_ERROR,
505- "creation of send_cq for device %s failed",
506- device->device_name);
507- ret = -1;
508- goto out;
509- }
510-
511- if (ibv_req_notify_cq (device->send_cq, 0)) {
512- gf_log (this->name, GF_LOG_ERROR,
513- "ibv_req_notify_cq on send_cq for device %s"
514- " failed", device->device_name);
515- ret = -1;
516- goto out;
517- }
518- } while (0);
519-
520-out:
521- if (ret != 0)
522- gf_rdma_destroy_cq (this);
523-
524- return ret;
525-}
526-
527-
528-static gf_rdma_device_t *
529-gf_rdma_get_device (rpc_transport_t *this, struct ibv_context *ibctx,
530- char *device_name)
531-{
532- glusterfs_ctx_t *ctx = NULL;
533- gf_rdma_private_t *priv = NULL;
534- gf_rdma_options_t *options = NULL;
535- int32_t ret = 0;
536- int32_t i = 0;
537- gf_rdma_device_t *trav = NULL, *device = NULL;
538- gf_rdma_ctx_t *rdma_ctx = NULL;
539-
540- priv = this->private;
541- options = &priv->options;
542- ctx = this->ctx;
543- rdma_ctx = ctx->ib;
544-
545- trav = rdma_ctx->device;
546-
547- while (trav) {
548- if (!strcmp (trav->device_name, device_name))
549- break;
550- trav = trav->next;
551- }
552-
553- if (!trav) {
554- trav = GF_CALLOC (1, sizeof (*trav),
555- gf_common_mt_rdma_device_t);
556- if (trav == NULL) {
557- goto out;
558- }
559-
560- priv->device = trav;
561- trav->context = ibctx;
562-
563- trav->request_ctx_pool
564- = mem_pool_new (gf_rdma_request_context_t,
565- GF_RDMA_POOL_SIZE);
566- if (trav->request_ctx_pool == NULL) {
567- goto out;
568- }
569-
570- trav->ioq_pool
571- = mem_pool_new (gf_rdma_ioq_t, GF_RDMA_POOL_SIZE);
572- if (trav->ioq_pool == NULL) {
573- goto out;
574- }
575-
576- trav->reply_info_pool = mem_pool_new (gf_rdma_reply_info_t,
577- GF_RDMA_POOL_SIZE);
578- if (trav->reply_info_pool == NULL) {
579- goto out;
580- }
581-
582- trav->device_name = gf_strdup (device_name);
583-
584- trav->next = rdma_ctx->device;
585- rdma_ctx->device = trav;
586-
587- trav->send_chan = ibv_create_comp_channel (trav->context);
588- if (!trav->send_chan) {
589- gf_log (this->name, GF_LOG_ERROR,
590- "could not create send completion channel for "
591- "device (%s)", device_name);
592- goto out;
593- }
594-
595- trav->recv_chan = ibv_create_comp_channel (trav->context);
596- if (!trav->recv_chan) {
597- gf_log (this->name, GF_LOG_ERROR,
598- "could not create recv completion channel for "
599- "device (%s)", device_name);
600-
601- /* TODO: cleanup current mess */
602- goto out;
603- }
604-
605- if (gf_rdma_create_cq (this) < 0) {
606- gf_log (this->name, GF_LOG_ERROR,
607- "could not create CQ for device (%s)",
608- device_name);
609- goto out;
610- }
611-
612- /* protection domain */
613- trav->pd = ibv_alloc_pd (trav->context);
614-
615- if (!trav->pd) {
616- gf_log (this->name, GF_LOG_ERROR,
617- "could not allocate protection domain for "
618- "device (%s)", device_name);
619- goto out;
620- }
621-
622- struct ibv_srq_init_attr attr = {
623- .attr = {
624- .max_wr = options->recv_count,
625- .max_sge = 1,
626- .srq_limit = 10
627- }
628- };
629- trav->srq = ibv_create_srq (trav->pd, &attr);
630-
631- if (!trav->srq) {
632- gf_log (this->name, GF_LOG_ERROR,
633- "could not create SRQ for device (%s)",
634- device_name);
635- goto out;
636- }
637-
638- /* queue init */
639- gf_rdma_queue_init (&trav->sendq);
640- gf_rdma_queue_init (&trav->recvq);
641-
642- if (gf_rdma_create_posts (this) < 0) {
643- gf_log (this->name, GF_LOG_ERROR,
644- "could not allocate posts for device (%s)",
645- device_name);
646- goto out;
647- }
648-
649- /* completion threads */
650- ret = pthread_create (&trav->send_thread,
651- NULL,
652- gf_rdma_send_completion_proc,
653- trav->send_chan);
654- if (ret) {
655- gf_log (this->name, GF_LOG_ERROR,
656- "could not create send completion thread for "
657- "device (%s)", device_name);
658- goto out;
659- }
660-
661- ret = pthread_create (&trav->recv_thread,
662- NULL,
663- gf_rdma_recv_completion_proc,
664- trav->recv_chan);
665- if (ret) {
666- gf_log (this->name, GF_LOG_ERROR,
667- "could not create recv completion thread "
668- "for device (%s)", device_name);
669- return NULL;
670- }
671-
672- ret = pthread_create (&trav->async_event_thread,
673- NULL,
674- gf_rdma_async_event_thread,
675- ibctx);
676- if (ret) {
677- gf_log (this->name, GF_LOG_ERROR,
678- "could not create async_event_thread");
679- return NULL;
680- }
681-
682- /* qpreg */
683- pthread_mutex_init (&trav->qpreg.lock, NULL);
684- for (i=0; i<42; i++) {
685- trav->qpreg.ents[i].next = &trav->qpreg.ents[i];
686- trav->qpreg.ents[i].prev = &trav->qpreg.ents[i];
687- }
688- }
689-
690- device = trav;
691- trav = NULL;
692-out:
693-
694- if (trav != NULL) {
695- gf_rdma_destroy_posts (this);
696- mem_pool_destroy (trav->ioq_pool);
697- mem_pool_destroy (trav->request_ctx_pool);
698- mem_pool_destroy (trav->reply_info_pool);
699- ibv_dealloc_pd (trav->pd);
700- gf_rdma_destroy_cq (this);
701- ibv_destroy_comp_channel (trav->recv_chan);
702- ibv_destroy_comp_channel (trav->send_chan);
703- GF_FREE ((char *)trav->device_name);
704- GF_FREE (trav);
705- }
706-
707- return device;
708-}
709-
710-
711-static rpc_transport_t *
712-gf_rdma_transport_new (rpc_transport_t *listener, struct rdma_cm_id *cm_id)
713-{
714- gf_rdma_private_t *listener_priv = NULL, *priv = NULL;
715- rpc_transport_t *this = NULL, *new = NULL;
716- gf_rdma_options_t *options = NULL;
717- char *device_name = NULL;
718-
719- listener_priv = listener->private;
720-
721- this = GF_CALLOC (1, sizeof (rpc_transport_t),
722- gf_common_mt_rpc_transport_t);
723- if (this == NULL) {
724- goto out;
725- }
726-
727- this->listener = listener;
728-
729- priv = GF_CALLOC (1, sizeof (gf_rdma_private_t),
730- gf_common_mt_rdma_private_t);
731- if (priv == NULL) {
732- goto out;
733- }
734-
735- this->private = priv;
736- priv->options = listener_priv->options;
737-
738- priv->listener = listener;
739- priv->entity = GF_RDMA_SERVER;
740-
741- options = &priv->options;
742-
743- this->ops = listener->ops;
744- this->init = listener->init;
745- this->fini = listener->fini;
746- this->ctx = listener->ctx;
747- this->name = gf_strdup (listener->name);
748- this->notify = listener->notify;
749- this->mydata = listener->mydata;
750-
751- this->myinfo.sockaddr_len = sizeof (cm_id->route.addr.src_addr);
752- memcpy (&this->myinfo.sockaddr, &cm_id->route.addr.src_addr,
753- this->myinfo.sockaddr_len);
754-
755- this->peerinfo.sockaddr_len = sizeof (cm_id->route.addr.dst_addr);
756- memcpy (&this->peerinfo.sockaddr, &cm_id->route.addr.dst_addr,
757- this->peerinfo.sockaddr_len);
758-
759- priv->peer.trans = this;
760- gf_rdma_get_transport_identifiers (this);
761-
762- device_name = (char *)ibv_get_device_name (cm_id->verbs->device);
763- if (device_name == NULL) {
764- gf_log (listener->name, GF_LOG_WARNING,
765- "cannot get device name (peer:%s me:%s)",
766- this->peerinfo.identifier, this->myinfo.identifier);
767- goto out;
768- }
769-
770- priv->device = gf_rdma_get_device (this, cm_id->verbs,
771- device_name);
772- if (priv->device == NULL) {
773- gf_log (listener->name, GF_LOG_WARNING,
774- "cannot get infiniband device %s (peer:%s me:%s)",
775- device_name, this->peerinfo.identifier,
776- this->myinfo.identifier);
777- goto out;
778- }
779-
780- priv->peer.send_count = options->send_count;
781- priv->peer.recv_count = options->recv_count;
782- priv->peer.send_size = options->send_size;
783- priv->peer.recv_size = options->recv_size;
784- priv->peer.cm_id = cm_id;
785- INIT_LIST_HEAD (&priv->peer.ioq);
786-
787- pthread_mutex_init (&priv->write_mutex, NULL);
788- pthread_mutex_init (&priv->recv_mutex, NULL);
789-
790- cm_id->context = this;
791-
792- new = rpc_transport_ref (this);
793- this = NULL;
794-out:
795- if (this != NULL) {
796- if (this->private != NULL) {
797- GF_FREE (this->private);
798- }
799-
800- if (this->name != NULL) {
801- GF_FREE (this->name);
802- }
803-
804- GF_FREE (this);
805- }
806-
807- return new;
808-}
809-
810-
811-static int
812-gf_rdma_cm_handle_connect_request (struct rdma_cm_event *event)
813-{
814- int ret = -1;
815- rpc_transport_t *this = NULL, *listener = NULL;
816- struct rdma_cm_id *child_cm_id = NULL, *listener_cm_id = NULL;
817- struct rdma_conn_param conn_param = {0, };
818- gf_rdma_private_t *priv = NULL;
819- gf_rdma_options_t *options = NULL;
820-
821- child_cm_id = event->id;
822- listener_cm_id = event->listen_id;
823-
824- listener = listener_cm_id->context;
825- priv = listener->private;
826- options = &priv->options;
827-
828- this = gf_rdma_transport_new (listener, child_cm_id);
829- if (this == NULL) {
830- gf_log (listener->name, GF_LOG_WARNING,
831- "could not create a transport for incoming connection"
832- " (me.name:%s me.identifier:%s)", listener->name,
833- listener->myinfo.identifier);
834- rdma_destroy_id (child_cm_id);
835- goto out;
836- }
837-
838- gf_log (listener->name, GF_LOG_TRACE,
839- "got a connect request (me:%s peer:%s)",
840- listener->myinfo.identifier, this->peerinfo.identifier);
841-
842- ret = gf_rdma_create_qp (this);
843- if (ret < 0) {
844- gf_log (listener->name, GF_LOG_WARNING,
845- "could not create QP (peer:%s me:%s)",
846- this->peerinfo.identifier, this->myinfo.identifier);
847- gf_rdma_cm_handle_disconnect (this);
848- goto out;
849- }
850-
851- conn_param.responder_resources = 1;
852- conn_param.initiator_depth = 1;
853- conn_param.retry_count = options->attr_retry_cnt;
854- conn_param.rnr_retry_count = options->attr_rnr_retry;
855-
856- ret = rdma_accept(child_cm_id, &conn_param);
857- if (ret < 0) {
858- gf_log (listener->name, GF_LOG_WARNING, "rdma_accept failed "
859- "peer:%s me:%s (%s)", this->peerinfo.identifier,
860- this->myinfo.identifier, strerror (errno));
861- gf_rdma_cm_handle_disconnect (this);
862- goto out;
863- }
864-
865- ret = 0;
866-
867-out:
868- return ret;
869-}
870-
871-
872-static int
873-gf_rdma_cm_handle_route_resolved (struct rdma_cm_event *event)
874-{
875- struct rdma_conn_param conn_param = {0, };
876- int ret = 0;
877- rpc_transport_t *this = NULL;
878- gf_rdma_private_t *priv = NULL;
879- gf_rdma_peer_t *peer = NULL;
880- gf_rdma_options_t *options = NULL;
881-
882- if (event == NULL) {
883- goto out;
884- }
885-
886- this = event->id->context;
887-
888- priv = this->private;
889- peer = &priv->peer;
890- options = &priv->options;
891-
892- ret = gf_rdma_create_qp (this);
893- if (ret != 0) {
894- gf_log (this->name, GF_LOG_WARNING,
895- "could not create QP (peer:%s me:%s)",
896- this->peerinfo.identifier, this->myinfo.identifier);
897- gf_rdma_cm_handle_disconnect (this);
898- goto out;
899- }
900-
901- memset(&conn_param, 0, sizeof conn_param);
902- conn_param.responder_resources = 1;
903- conn_param.initiator_depth = 1;
904- conn_param.retry_count = options->attr_retry_cnt;
905- conn_param.rnr_retry_count = options->attr_rnr_retry;
906-
907- ret = rdma_connect(peer->cm_id, &conn_param);
908- if (ret != 0) {
909- gf_log (this->name, GF_LOG_WARNING,
910- "rdma_connect failed (%s)", strerror (errno));
911- gf_rdma_cm_handle_disconnect (this);
912- goto out;
913- }
914-
915- gf_log (this->name, GF_LOG_TRACE, "route resolved (me:%s peer:%s)",
916- this->myinfo.identifier, this->peerinfo.identifier);
917-
918- ret = 0;
919-out:
920- return ret;
921-}
922-
923-
924-static int
925-gf_rdma_cm_handle_addr_resolved (struct rdma_cm_event *event)
926-{
927- rpc_transport_t *this = NULL;
928- gf_rdma_peer_t *peer = NULL;
929- gf_rdma_private_t *priv = NULL;
930- int ret = 0;
931-
932- this = event->id->context;
933-
934- priv = this->private;
935- peer = &priv->peer;
936-
937- GF_ASSERT (peer->cm_id == event->id);
938-
939- this->myinfo.sockaddr_len = sizeof (peer->cm_id->route.addr.src_addr);
940- memcpy (&this->myinfo.sockaddr, &peer->cm_id->route.addr.src_addr,
941- this->myinfo.sockaddr_len);
942-
943- this->peerinfo.sockaddr_len = sizeof (peer->cm_id->route.addr.dst_addr);
944- memcpy (&this->peerinfo.sockaddr, &peer->cm_id->route.addr.dst_addr,
945- this->peerinfo.sockaddr_len);
946-
947- gf_rdma_get_transport_identifiers (this);
948-
949- ret = rdma_resolve_route(peer->cm_id, 2000);
950- if (ret != 0) {
951- gf_log (this->name, GF_LOG_WARNING,
952- "rdma_resolve_route failed (me:%s peer:%s) (%s)",
953- this->myinfo.identifier, this->peerinfo.identifier,
954- strerror (errno));
955- gf_rdma_cm_handle_disconnect (this);
956- }
957-
958- gf_log (this->name, GF_LOG_TRACE, "Address resolved (me:%s peer:%s)",
959- this->myinfo.identifier, this->peerinfo.identifier);
960-
961- return ret;
962-}
963-
964-
965-static void
966-gf_rdma_cm_handle_disconnect (rpc_transport_t *this)
967-{
968- gf_rdma_private_t *priv = NULL;
969- char need_unref = 0, connected = 0;
970-
971- priv = this->private;
972- gf_log (this->name, GF_LOG_DEBUG,
973- "peer disconnected, cleaning up");
974-
975- pthread_mutex_lock (&priv->write_mutex);
976- {
977- if (priv->peer.cm_id != NULL) {
978- need_unref = 1;
979- connected = priv->connected;
980- priv->connected = 0;
981- }
982-
983- __gf_rdma_teardown (this);
984- }
985- pthread_mutex_unlock (&priv->write_mutex);
986-
987- if (connected) {
988- rpc_transport_notify (this, RPC_TRANSPORT_DISCONNECT, this);
989- }
990-
991- if (need_unref)
992- rpc_transport_unref (this);
993-
994-}
995-
996-
997-static int
998-gf_rdma_cm_handle_event_established (struct rdma_cm_event *event)
999-{
1000- rpc_transport_t *this = NULL;
1001- gf_rdma_private_t *priv = NULL;
1002- struct rdma_cm_id *cm_id = NULL;
1003- int ret = 0;
1004-
1005- cm_id = event->id;
1006- this = cm_id->context;
1007- priv = this->private;
1008-
1009- priv->connected = 1;
1010-
1011- pthread_mutex_lock (&priv->write_mutex);
1012- {
1013- priv->peer.quota = 1;
1014- priv->peer.quota_set = 0;
1015- }
1016- pthread_mutex_unlock (&priv->write_mutex);
1017-
1018- if (priv->entity == GF_RDMA_CLIENT) {
1019- ret = rpc_transport_notify (this, RPC_TRANSPORT_CONNECT, this);
1020-
1021- } else if (priv->entity == GF_RDMA_SERVER) {
1022- ret = rpc_transport_notify (priv->listener,
1023- RPC_TRANSPORT_ACCEPT, this);
1024- }
1025-
1026- if (ret < 0) {
1027- gf_rdma_disconnect (this);
1028- }
1029-
1030- gf_log (this->name, GF_LOG_TRACE,
1031- "recieved event RDMA_CM_EVENT_ESTABLISHED (me:%s peer:%s)",
1032- this->myinfo.identifier, this->peerinfo.identifier);
1033-
1034- return ret;
1035-}
1036-
1037-
1038-static int
1039-gf_rdma_cm_handle_event_error (rpc_transport_t *this)
1040-{
1041- gf_rdma_private_t *priv = NULL;
1042-
1043- priv = this->private;
1044-
1045- if (priv->entity != GF_RDMA_SERVER_LISTENER) {
1046- gf_rdma_cm_handle_disconnect (this);
1047- }
1048-
1049- return 0;
1050-}
1051-
1052-
1053-static int
1054-gf_rdma_cm_handle_device_removal (struct rdma_cm_event *event)
1055-{
1056- return 0;
1057-}
1058-
1059-
1060-static void *
1061-gf_rdma_cm_event_handler (void *data)
1062-{
1063- struct rdma_cm_event *event = NULL;
1064- int ret = 0;
1065- rpc_transport_t *this = NULL;
1066- struct rdma_event_channel *event_channel = NULL;
1067-
1068- event_channel = data;
1069-
1070- while (1) {
1071- ret = rdma_get_cm_event (event_channel, &event);
1072- if (ret != 0) {
1073- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1074- "rdma_cm_get_event failed (%s)",
1075- strerror (errno));
1076- break;
1077- }
1078-
1079- switch (event->event) {
1080- case RDMA_CM_EVENT_ADDR_RESOLVED:
1081- gf_rdma_cm_handle_addr_resolved (event);
1082- break;
1083-
1084- case RDMA_CM_EVENT_ROUTE_RESOLVED:
1085- gf_rdma_cm_handle_route_resolved (event);
1086- break;
1087-
1088- case RDMA_CM_EVENT_CONNECT_REQUEST:
1089- gf_rdma_cm_handle_connect_request (event);
1090- break;
1091-
1092- case RDMA_CM_EVENT_ESTABLISHED:
1093- gf_rdma_cm_handle_event_established (event);
1094- break;
1095-
1096- case RDMA_CM_EVENT_ADDR_ERROR:
1097- case RDMA_CM_EVENT_ROUTE_ERROR:
1098- case RDMA_CM_EVENT_CONNECT_ERROR:
1099- case RDMA_CM_EVENT_UNREACHABLE:
1100- case RDMA_CM_EVENT_REJECTED:
1101- this = event->id->context;
1102-
1103- gf_log (this->name, GF_LOG_WARNING,
1104- "cma event %s, error %d (me:%s peer:%s)\n",
1105- rdma_event_str(event->event), event->status,
1106- this->myinfo.identifier,
1107- this->peerinfo.identifier);
1108-
1109- rdma_ack_cm_event (event);
1110- event = NULL;
1111-
1112- gf_rdma_cm_handle_event_error (this);
1113- continue;
1114-
1115- case RDMA_CM_EVENT_DISCONNECTED:
1116- this = event->id->context;
1117-
1118- gf_log (this->name, GF_LOG_DEBUG,
1119- "recieved disconnect (me:%s peer:%s)\n",
1120- this->myinfo.identifier,
1121- this->peerinfo.identifier);
1122-
1123- rdma_ack_cm_event (event);
1124- event = NULL;
1125-
1126- gf_rdma_cm_handle_disconnect (this);
1127- continue;
1128-
1129- case RDMA_CM_EVENT_DEVICE_REMOVAL:
1130- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1131- "device removed");
1132- gf_rdma_cm_handle_device_removal (event);
1133- break;
1134-
1135- default:
1136- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1137- "unhandled event: %s, ignoring",
1138- rdma_event_str(event->event));
1139- break;
1140- }
1141-
1142- rdma_ack_cm_event (event);
1143- }
1144-
1145- return NULL;
1146-}
1147-
1148-
1149-static int32_t
1150-gf_rdma_post_send (struct ibv_qp *qp, gf_rdma_post_t *post, int32_t len)
1151-{
1152- struct ibv_sge list = {
1153- .addr = (unsigned long) post->buf,
1154- .length = len,
1155- .lkey = post->mr->lkey
1156- };
1157-
1158- struct ibv_send_wr wr = {
1159- .wr_id = (unsigned long) post,
1160- .sg_list = &list,
1161- .num_sge = 1,
1162- .opcode = IBV_WR_SEND,
1163- .send_flags = IBV_SEND_SIGNALED,
1164- }, *bad_wr;
1165-
1166- if (!qp)
1167- return EINVAL;
1168-
1169- return ibv_post_send (qp, &wr, &bad_wr);
1170-}
1171-
1172-int
1173-__gf_rdma_encode_error(gf_rdma_peer_t *peer, gf_rdma_reply_info_t *reply_info,
1174- struct iovec *rpchdr, gf_rdma_header_t *hdr,
1175- gf_rdma_errcode_t err)
1176-{
1177- struct rpc_msg *rpc_msg = NULL;
1178-
1179- if (reply_info != NULL) {
1180- hdr->rm_xid = hton32(reply_info->rm_xid);
1181- } else {
1182- rpc_msg = rpchdr[0].iov_base; /* assume rpchdr contains
1183- * only one vector.
1184- * (which is true)
1185- */
1186- hdr->rm_xid = rpc_msg->rm_xid;
1187- }
1188-
1189- hdr->rm_vers = hton32(GF_RDMA_VERSION);
1190- hdr->rm_credit = hton32(peer->send_count);
1191- hdr->rm_type = hton32(GF_RDMA_ERROR);
1192- hdr->rm_body.rm_error.rm_type = hton32(err);
1193- if (err == ERR_VERS) {
1194- hdr->rm_body.rm_error.rm_version.gf_rdma_vers_low
1195- = hton32(GF_RDMA_VERSION);
1196- hdr->rm_body.rm_error.rm_version.gf_rdma_vers_high
1197- = hton32(GF_RDMA_VERSION);
1198- }
1199-
1200- return sizeof (*hdr);
1201-}
1202-
1203-
1204-int32_t
1205-__gf_rdma_send_error (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
1206- gf_rdma_post_t *post, gf_rdma_reply_info_t *reply_info,
1207- gf_rdma_errcode_t err)
1208-{
1209- int32_t ret = -1, len = 0;
1210-
1211- len = __gf_rdma_encode_error (peer, reply_info, entry->rpchdr,
1212- (gf_rdma_header_t *)post->buf, err);
1213- if (len == -1) {
1214- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
1215- "encode error returned -1");
1216- goto out;
1217- }
1218-
1219- gf_rdma_post_ref (post);
1220-
1221- ret = gf_rdma_post_send (peer->qp, post, len);
1222- if (!ret) {
1223- ret = len;
1224- } else {
1225- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1226- "gf_rdma_post_send (to %s) failed with ret = %d (%s)",
1227- peer->trans->peerinfo.identifier, ret,
1228- (ret > 0) ? strerror (ret) : "");
1229- gf_rdma_post_unref (post);
1230- __gf_rdma_disconnect (peer->trans);
1231- ret = -1;
1232- }
1233-
1234-out:
1235- return ret;
1236-}
1237-
1238-
1239-int32_t
1240-__gf_rdma_create_read_chunks_from_vector (gf_rdma_peer_t *peer,
1241- gf_rdma_read_chunk_t **readch_ptr,
1242- int32_t *pos, struct iovec *vector,
1243- int count,
1244- gf_rdma_request_context_t *request_ctx)
1245-{
1246- int i = 0;
1247- gf_rdma_private_t *priv = NULL;
1248- gf_rdma_device_t *device = NULL;
1249- struct ibv_mr *mr = NULL;
1250- gf_rdma_read_chunk_t *readch = NULL;
1251- int32_t ret = -1;
1252-
1253- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, peer, out);
1254- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, readch_ptr, out);
1255- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, *readch_ptr, out);
1256- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, request_ctx, out);
1257- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, vector, out);
1258-
1259- priv = peer->trans->private;
1260- device = priv->device;
1261- readch = *readch_ptr;
1262-
1263- for (i = 0; i < count; i++) {
1264- readch->rc_discrim = hton32 (1);
1265- readch->rc_position = hton32 (*pos);
1266-
1267- mr = ibv_reg_mr (device->pd, vector[i].iov_base,
1268- vector[i].iov_len,
1269- IBV_ACCESS_REMOTE_READ);
1270- if (!mr) {
1271- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1272- "memory registration failed (%s) (peer:%s)",
1273- strerror (errno),
1274- peer->trans->peerinfo.identifier);
1275- goto out;
1276- }
1277-
1278- request_ctx->mr[request_ctx->mr_count++] = mr;
1279-
1280- readch->rc_target.rs_handle = hton32 (mr->rkey);
1281- readch->rc_target.rs_length
1282- = hton32 (vector[i].iov_len);
1283- readch->rc_target.rs_offset
1284- = hton64 ((uint64_t)(unsigned long)vector[i].iov_base);
1285-
1286- *pos = *pos + vector[i].iov_len;
1287- readch++;
1288- }
1289-
1290- *readch_ptr = readch;
1291-
1292- ret = 0;
1293-out:
1294- return ret;
1295-}
1296-
1297-
1298-int32_t
1299-__gf_rdma_create_read_chunks (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
1300- gf_rdma_chunktype_t type, uint32_t **ptr,
1301- gf_rdma_request_context_t *request_ctx)
1302-{
1303- int32_t ret = -1;
1304- int pos = 0;
1305-
1306- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, peer, out);
1307- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, entry, out);
1308- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, ptr, out);
1309- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, *ptr, out);
1310- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, request_ctx, out);
1311-
1312- request_ctx->iobref = iobref_ref (entry->iobref);
1313-
1314- if (type == gf_rdma_areadch) {
1315- pos = 0;
1316- ret = __gf_rdma_create_read_chunks_from_vector (peer,
1317- (gf_rdma_read_chunk_t **)ptr,
1318- &pos,
1319- entry->rpchdr,
1320- entry->rpchdr_count,
1321- request_ctx);
1322- if (ret == -1) {
1323- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1324- "cannot create read chunks from vector "
1325- "entry->rpchdr");
1326- goto out;
1327- }
1328-
1329- ret = __gf_rdma_create_read_chunks_from_vector (peer,
1330- (gf_rdma_read_chunk_t **)ptr,
1331- &pos,
1332- entry->proghdr,
1333- entry->proghdr_count,
1334- request_ctx);
1335- if (ret == -1) {
1336- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1337- "cannot create read chunks from vector "
1338- "entry->proghdr");
1339- }
1340-
1341- if (entry->prog_payload_count != 0) {
1342- ret = __gf_rdma_create_read_chunks_from_vector (peer,
1343- (gf_rdma_read_chunk_t **)ptr,
1344- &pos,
1345- entry->prog_payload,
1346- entry->prog_payload_count,
1347- request_ctx);
1348- if (ret == -1) {
1349- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1350- "cannot create read chunks from vector"
1351- " entry->prog_payload");
1352- }
1353- }
1354- } else {
1355- pos = iov_length (entry->rpchdr, entry->rpchdr_count);
1356- ret = __gf_rdma_create_read_chunks_from_vector (peer,
1357- (gf_rdma_read_chunk_t **)ptr,
1358- &pos,
1359- entry->prog_payload,
1360- entry->prog_payload_count,
1361- request_ctx);
1362- if (ret == -1) {
1363- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1364- "cannot create read chunks from vector "
1365- "entry->prog_payload");
1366- }
1367- }
1368-
1369- /* terminate read-chunk list*/
1370- **ptr = 0;
1371- *ptr = *ptr + 1;
1372-out:
1373- return ret;
1374-}
1375-
1376-
1377-int32_t
1378-__gf_rdma_create_write_chunks_from_vector (gf_rdma_peer_t *peer,
1379- gf_rdma_write_chunk_t **writech_ptr,
1380- struct iovec *vector, int count,
1381- gf_rdma_request_context_t *request_ctx)
1382-{
1383- int i = 0;
1384- gf_rdma_private_t *priv = NULL;
1385- gf_rdma_device_t *device = NULL;
1386- struct ibv_mr *mr = NULL;
1387- gf_rdma_write_chunk_t *writech = NULL;
1388- int32_t ret = -1;
1389-
1390- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, peer, out);
1391- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, writech_ptr, out);
1392- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, *writech_ptr, out);
1393- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, request_ctx, out);
1394- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, vector, out);
1395-
1396- writech = *writech_ptr;
1397-
1398- priv = peer->trans->private;
1399- device = priv->device;
1400-
1401- for (i = 0; i < count; i++) {
1402- mr = ibv_reg_mr (device->pd, vector[i].iov_base,
1403- vector[i].iov_len,
1404- IBV_ACCESS_REMOTE_WRITE
1405- | IBV_ACCESS_LOCAL_WRITE);
1406- if (!mr) {
1407- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1408- "memory registration failed (%s) (peer:%s)",
1409- strerror (errno),
1410- peer->trans->peerinfo.identifier);
1411- goto out;
1412- }
1413-
1414- request_ctx->mr[request_ctx->mr_count++] = mr;
1415-
1416- writech->wc_target.rs_handle = hton32 (mr->rkey);
1417- writech->wc_target.rs_length = hton32 (vector[i].iov_len);
1418- writech->wc_target.rs_offset
1419- = hton64 (((uint64_t)(unsigned long)vector[i].iov_base));
1420-
1421- writech++;
1422- }
1423-
1424- *writech_ptr = writech;
1425-
1426- ret = 0;
1427-out:
1428- return ret;
1429-}
1430-
1431-
1432-int32_t
1433-__gf_rdma_create_write_chunks (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
1434- gf_rdma_chunktype_t chunk_type, uint32_t **ptr,
1435- gf_rdma_request_context_t *request_ctx)
1436-{
1437- int32_t ret = -1;
1438- gf_rdma_write_array_t *warray = NULL;
1439-
1440- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, peer, out);
1441- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, ptr, out);
1442- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, *ptr, out);
1443- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, request_ctx, out);
1444- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, entry, out);
1445-
1446- if ((chunk_type == gf_rdma_replych)
1447- && ((entry->msg.request.rsphdr_count != 1) ||
1448- (entry->msg.request.rsphdr_vec[0].iov_base == NULL))) {
1449- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1450- (entry->msg.request.rsphdr_count == 1)
1451- ? "chunktype specified as reply chunk but the vector "
1452- "specifying the buffer to be used for holding reply"
1453- " header is not correct" :
1454- "chunktype specified as reply chunk, but more than one "
1455- "buffer provided for holding reply");
1456- goto out;
1457- }
1458-
1459-/*
1460- if ((chunk_type == gf_rdma_writech)
1461- && ((entry->msg.request.rsphdr_count == 0)
1462- || (entry->msg.request.rsphdr_vec[0].iov_base == NULL))) {
1463- gf_log (GF_RDMA_LOG_NAME, GF_LOG_DEBUG,
1464- "vector specifying buffer to hold the program's reply "
1465- "header should also be provided when buffers are "
1466- "provided for holding the program's payload in reply");
1467- goto out;
1468- }
1469-*/
1470-
1471- if (chunk_type == gf_rdma_writech) {
1472- warray = (gf_rdma_write_array_t *)*ptr;
1473- warray->wc_discrim = hton32 (1);
1474- warray->wc_nchunks
1475- = hton32 (entry->msg.request.rsp_payload_count);
1476-
1477- *ptr = (uint32_t *)&warray->wc_array[0];
1478-
1479- ret = __gf_rdma_create_write_chunks_from_vector (peer,
1480- (gf_rdma_write_chunk_t **)ptr,
1481- entry->msg.request.rsp_payload,
1482- entry->msg.request.rsp_payload_count,
1483- request_ctx);
1484- if (ret == -1) {
1485- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1486- "cannot create write chunks from vector "
1487- "entry->rpc_payload");
1488- goto out;
1489- }
1490-
1491- /* terminate write chunklist */
1492- **ptr = 0;
1493- *ptr = *ptr + 1;
1494-
1495- /* no reply chunklist */
1496- **ptr = 0;
1497- *ptr = *ptr + 1;
1498- } else {
1499- /* no write chunklist */
1500- **ptr = 0;
1501- *ptr = *ptr + 1;
1502-
1503- warray = (gf_rdma_write_array_t *)*ptr;
1504- warray->wc_discrim = hton32 (1);
1505- warray->wc_nchunks = hton32 (entry->msg.request.rsphdr_count);
1506-
1507- *ptr = (uint32_t *)&warray->wc_array[0];
1508-
1509- ret = __gf_rdma_create_write_chunks_from_vector (peer,
1510- (gf_rdma_write_chunk_t **)ptr,
1511- entry->msg.request.rsphdr_vec,
1512- entry->msg.request.rsphdr_count,
1513- request_ctx);
1514- if (ret == -1) {
1515- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1516- "cannot create write chunks from vector "
1517- "entry->rpchdr");
1518- goto out;
1519- }
1520-
1521- /* terminate reply chunklist */
1522- **ptr = 0;
1523- *ptr = *ptr + 1;
1524- }
1525-
1526-out:
1527- return ret;
1528-}
1529-
1530-
1531-static inline void
1532-__gf_rdma_deregister_mr (struct ibv_mr **mr, int count)
1533-{
1534- int i = 0;
1535-
1536- if (mr == NULL) {
1537- goto out;
1538- }
1539-
1540- for (i = 0; i < count; i++) {
1541- ibv_dereg_mr (mr[i]);
1542- }
1543-
1544-out:
1545- return;
1546-}
1547-
1548-
1549-static int32_t
1550-__gf_rdma_quota_put (gf_rdma_peer_t *peer)
1551-{
1552- int32_t ret = 0;
1553-
1554- peer->quota++;
1555- ret = peer->quota;
1556-
1557- if (!list_empty (&peer->ioq)) {
1558- ret = __gf_rdma_ioq_churn (peer);
1559- }
1560-
1561- return ret;
1562-}
1563-
1564-
1565-static int32_t
1566-gf_rdma_quota_put (gf_rdma_peer_t *peer)
1567-{
1568- int32_t ret = 0;
1569- gf_rdma_private_t *priv = NULL;
1570-
1571- priv = peer->trans->private;
1572- pthread_mutex_lock (&priv->write_mutex);
1573- {
1574- ret = __gf_rdma_quota_put (peer);
1575- }
1576- pthread_mutex_unlock (&priv->write_mutex);
1577-
1578- return ret;
1579-}
1580-
1581-
1582-/* to be called with priv->mutex held */
1583-void
1584-__gf_rdma_request_context_destroy (gf_rdma_request_context_t *context)
1585-{
1586- gf_rdma_peer_t *peer = NULL;
1587- gf_rdma_private_t *priv = NULL;
1588- int32_t ret = 0;
1589-
1590- if (context == NULL) {
1591- goto out;
1592- }
1593-
1594- peer = context->peer;
1595-
1596- __gf_rdma_deregister_mr (context->mr, context->mr_count);
1597-
1598- priv = peer->trans->private;
1599-
1600- if (priv->connected) {
1601- ret = __gf_rdma_quota_put (peer);
1602- if (ret < 0) {
1603- gf_log ("rdma", GF_LOG_DEBUG,
1604- "failed to send "
1605- "message");
1606- mem_put (context);
1607- __gf_rdma_disconnect (peer->trans);
1608- goto out;
1609- }
1610- }
1611-
1612- if (context->iobref != NULL) {
1613- iobref_unref (context->iobref);
1614- context->iobref = NULL;
1615- }
1616-
1617- if (context->rsp_iobref != NULL) {
1618- iobref_unref (context->rsp_iobref);
1619- context->rsp_iobref = NULL;
1620- }
1621-
1622- mem_put (context);
1623-
1624-out:
1625- return;
1626-}
1627-
1628-
1629-void
1630-gf_rdma_post_context_destroy (gf_rdma_post_context_t *ctx)
1631-{
1632- if (ctx == NULL) {
1633- goto out;
1634- }
1635-
1636- __gf_rdma_deregister_mr (ctx->mr, ctx->mr_count);
1637-
1638- if (ctx->iobref != NULL) {
1639- iobref_unref (ctx->iobref);
1640- }
1641-
1642- if (ctx->hdr_iobuf != NULL) {
1643- iobuf_unref (ctx->hdr_iobuf);
1644- }
1645-
1646- memset (ctx, 0, sizeof (*ctx));
1647-out:
1648- return;
1649-}
1650-
1651-
1652-int
1653-gf_rdma_post_unref (gf_rdma_post_t *post)
1654-{
1655- int refcount = -1;
1656-
1657- if (post == NULL) {
1658- goto out;
1659- }
1660-
1661- pthread_mutex_lock (&post->lock);
1662- {
1663- refcount = --post->refcount;
1664- }
1665- pthread_mutex_unlock (&post->lock);
1666-
1667- if (refcount == 0) {
1668- gf_rdma_post_context_destroy (&post->ctx);
1669- if (post->type == GF_RDMA_SEND_POST) {
1670- gf_rdma_put_post (&post->device->sendq, post);
1671- } else {
1672- gf_rdma_post_recv (post->device->srq, post);
1673- }
1674- }
1675-out:
1676- return refcount;
1677-}
1678-
1679-
1680-int
1681-gf_rdma_post_get_refcount (gf_rdma_post_t *post)
1682-{
1683- int refcount = -1;
1684-
1685- if (post == NULL) {
1686- goto out;
1687- }
1688-
1689- pthread_mutex_lock (&post->lock);
1690- {
1691- refcount = post->refcount;
1692- }
1693- pthread_mutex_unlock (&post->lock);
1694-
1695-out:
1696- return refcount;
1697-}
1698-
1699-gf_rdma_post_t *
1700-gf_rdma_post_ref (gf_rdma_post_t *post)
1701-{
1702- if (post == NULL) {
1703- goto out;
1704- }
1705-
1706- pthread_mutex_lock (&post->lock);
1707- {
1708- post->refcount++;
1709- }
1710- pthread_mutex_unlock (&post->lock);
1711-
1712-out:
1713- return post;
1714-}
1715-
1716-
1717-int32_t
1718-__gf_rdma_ioq_churn_request (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
1719- gf_rdma_post_t *post)
1720-{
1721- gf_rdma_chunktype_t rtype = gf_rdma_noch;
1722- gf_rdma_chunktype_t wtype = gf_rdma_noch;
1723- uint64_t send_size = 0;
1724- gf_rdma_header_t *hdr = NULL;
1725- struct rpc_msg *rpc_msg = NULL;
1726- uint32_t *chunkptr = NULL;
1727- char *buf = NULL;
1728- int32_t ret = 0;
1729- gf_rdma_private_t *priv = NULL;
1730- gf_rdma_device_t *device = NULL;
1731- int chunk_count = 0;
1732- gf_rdma_request_context_t *request_ctx = NULL;
1733- uint32_t prog_payload_length = 0, len = 0;
1734- struct rpc_req *rpc_req = NULL;
1735-
1736- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, peer, out);
1737- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, entry, out);
1738- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, post, out);
1739-
1740- if ((entry->msg.request.rsphdr_count != 0)
1741- && (entry->msg.request.rsp_payload_count != 0)) {
1742- ret = -1;
1743- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1744- "both write-chunklist and reply-chunk cannot be "
1745- "present");
1746- goto out;
1747- }
1748-
1749- post->ctx.is_request = 1;
1750- priv = peer->trans->private;
1751- device = priv->device;
1752-
1753- hdr = (gf_rdma_header_t *)post->buf;
1754-
1755- send_size = iov_length (entry->rpchdr, entry->rpchdr_count)
1756- + iov_length (entry->proghdr, entry->proghdr_count)
1757- + GLUSTERFS_RDMA_MAX_HEADER_SIZE;
1758-
1759- if (entry->prog_payload_count != 0) {
1760- prog_payload_length
1761- = iov_length (entry->prog_payload,
1762- entry->prog_payload_count);
1763- }
1764-
1765- if (send_size > GLUSTERFS_RDMA_INLINE_THRESHOLD) {
1766- rtype = gf_rdma_areadch;
1767- } else if ((send_size + prog_payload_length)
1768- < GLUSTERFS_RDMA_INLINE_THRESHOLD) {
1769- rtype = gf_rdma_noch;
1770- } else if (entry->prog_payload_count != 0) {
1771- rtype = gf_rdma_readch;
1772- }
1773-
1774- if (entry->msg.request.rsphdr_count != 0) {
1775- wtype = gf_rdma_replych;
1776- } else if (entry->msg.request.rsp_payload_count != 0) {
1777- wtype = gf_rdma_writech;
1778- }
1779-
1780- if (rtype == gf_rdma_readch) {
1781- chunk_count += entry->prog_payload_count;
1782- } else if (rtype == gf_rdma_areadch) {
1783- chunk_count += entry->rpchdr_count;
1784- chunk_count += entry->proghdr_count;
1785- }
1786-
1787- if (wtype == gf_rdma_writech) {
1788- chunk_count += entry->msg.request.rsp_payload_count;
1789- } else if (wtype == gf_rdma_replych) {
1790- chunk_count += entry->msg.request.rsphdr_count;
1791- }
1792-
1793- if (chunk_count > GF_RDMA_MAX_SEGMENTS) {
1794- ret = -1;
1795- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1796- "chunk count(%d) exceeding maximum allowed RDMA "
1797- "segment count(%d)", chunk_count, GF_RDMA_MAX_SEGMENTS);
1798- goto out;
1799- }
1800-
1801- if ((wtype != gf_rdma_noch) || (rtype != gf_rdma_noch)) {
1802- request_ctx = mem_get (device->request_ctx_pool);
1803- if (request_ctx == NULL) {
1804- ret = -1;
1805- goto out;
1806- }
1807-
1808- memset (request_ctx, 0, sizeof (*request_ctx));
1809-
1810- request_ctx->pool = device->request_ctx_pool;
1811- request_ctx->peer = peer;
1812-
1813- entry->msg.request.rpc_req->conn_private = request_ctx;
1814-
1815- if (entry->msg.request.rsp_iobref != NULL) {
1816- request_ctx->rsp_iobref
1817- = iobref_ref (entry->msg.request.rsp_iobref);
1818- }
1819- }
1820-
1821- rpc_msg = (struct rpc_msg *) entry->rpchdr[0].iov_base;
1822-
1823- hdr->rm_xid = rpc_msg->rm_xid; /* no need of hton32(rpc_msg->rm_xid),
1824- * since rpc_msg->rm_xid is already
1825- * hton32ed value of actual xid
1826- */
1827- hdr->rm_vers = hton32 (GF_RDMA_VERSION);
1828- hdr->rm_credit = hton32 (peer->send_count);
1829- if (rtype == gf_rdma_areadch) {
1830- hdr->rm_type = hton32 (GF_RDMA_NOMSG);
1831- } else {
1832- hdr->rm_type = hton32 (GF_RDMA_MSG);
1833- }
1834-
1835- chunkptr = &hdr->rm_body.rm_chunks[0];
1836- if (rtype != gf_rdma_noch) {
1837- ret = __gf_rdma_create_read_chunks (peer, entry, rtype,
1838- &chunkptr,
1839- request_ctx);
1840- if (ret != 0) {
1841- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1842- "creation of read chunks failed");
1843- goto out;
1844- }
1845- } else {
1846- *chunkptr++ = 0; /* no read chunks */
1847- }
1848-
1849- if (wtype != gf_rdma_noch) {
1850- ret = __gf_rdma_create_write_chunks (peer, entry, wtype,
1851- &chunkptr,
1852- request_ctx);
1853- if (ret != 0) {
1854- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1855- "creation of write/reply chunk failed");
1856- goto out;
1857- }
1858- } else {
1859- *chunkptr++ = 0; /* no write chunks */
1860- *chunkptr++ = 0; /* no reply chunk */
1861- }
1862-
1863- buf = (char *)chunkptr;
1864-
1865- if (rtype != gf_rdma_areadch) {
1866- iov_unload (buf, entry->rpchdr, entry->rpchdr_count);
1867- buf += iov_length (entry->rpchdr, entry->rpchdr_count);
1868-
1869- iov_unload (buf, entry->proghdr, entry->proghdr_count);
1870- buf += iov_length (entry->proghdr, entry->proghdr_count);
1871-
1872- if (rtype != gf_rdma_readch) {
1873- iov_unload (buf, entry->prog_payload,
1874- entry->prog_payload_count);
1875- buf += iov_length (entry->prog_payload,
1876- entry->prog_payload_count);
1877- }
1878- }
1879-
1880- len = buf - post->buf;
1881-
1882- gf_rdma_post_ref (post);
1883-
1884- ret = gf_rdma_post_send (peer->qp, post, len);
1885- if (!ret) {
1886- ret = len;
1887- } else {
1888- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1889- "gf_rdma_post_send (to %s) failed with ret = %d (%s)",
1890- peer->trans->peerinfo.identifier, ret,
1891- (ret > 0) ? strerror (ret) : "");
1892- gf_rdma_post_unref (post);
1893- __gf_rdma_disconnect (peer->trans);
1894- ret = -1;
1895- }
1896-
1897-out:
1898- if (ret == -1) {
1899- rpc_req = entry->msg.request.rpc_req;
1900-
1901- if (request_ctx != NULL) {
1902- __gf_rdma_request_context_destroy (rpc_req->conn_private);
1903- }
1904-
1905- rpc_req->conn_private = NULL;
1906- }
1907-
1908- return ret;
1909-}
1910-
1911-
1912-static inline void
1913-__gf_rdma_fill_reply_header (gf_rdma_header_t *header, struct iovec *rpchdr,
1914- gf_rdma_reply_info_t *reply_info, int credits)
1915-{
1916- struct rpc_msg *rpc_msg = NULL;
1917-
1918- if (reply_info != NULL) {
1919- header->rm_xid = hton32 (reply_info->rm_xid);
1920- } else {
1921- rpc_msg = rpchdr[0].iov_base; /* assume rpchdr contains
1922- * only one vector.
1923- * (which is true)
1924- */
1925- header->rm_xid = rpc_msg->rm_xid;
1926- }
1927-
1928- header->rm_type = hton32 (GF_RDMA_MSG);
1929- header->rm_vers = hton32 (GF_RDMA_VERSION);
1930- header->rm_credit = hton32 (credits);
1931-
1932- header->rm_body.rm_chunks[0] = 0; /* no read chunks */
1933- header->rm_body.rm_chunks[1] = 0; /* no write chunks */
1934- header->rm_body.rm_chunks[2] = 0; /* no reply chunks */
1935-
1936- return;
1937-}
1938-
1939-
1940-int32_t
1941-__gf_rdma_send_reply_inline (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
1942- gf_rdma_post_t *post,
1943- gf_rdma_reply_info_t *reply_info)
1944-{
1945- gf_rdma_header_t *header = NULL;
1946- int32_t send_size = 0, ret = 0;
1947- char *buf = NULL;
1948-
1949- send_size = iov_length (entry->rpchdr, entry->rpchdr_count)
1950- + iov_length (entry->proghdr, entry->proghdr_count)
1951- + iov_length (entry->prog_payload, entry->prog_payload_count)
1952- + sizeof (gf_rdma_header_t); /*
1953- * remember, no chunklists in the
1954- * reply
1955- */
1956-
1957- if (send_size > GLUSTERFS_RDMA_INLINE_THRESHOLD) {
1958- ret = __gf_rdma_send_error (peer, entry, post, reply_info,
1959- ERR_CHUNK);
1960- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1961- "msg size (%d) is greater than maximum size "
1962- "of msg that can be sent inlined (%d)",
1963- send_size, GLUSTERFS_RDMA_INLINE_THRESHOLD);
1964- goto out;
1965- }
1966-
1967- header = (gf_rdma_header_t *)post->buf;
1968-
1969- __gf_rdma_fill_reply_header (header, entry->rpchdr, reply_info,
1970- peer->send_count);
1971-
1972- buf = (char *)&header->rm_body.rm_chunks[3];
1973-
1974- if (entry->rpchdr_count != 0) {
1975- iov_unload (buf, entry->rpchdr, entry->rpchdr_count);
1976- buf += iov_length (entry->rpchdr, entry->rpchdr_count);
1977- }
1978-
1979- if (entry->proghdr_count != 0) {
1980- iov_unload (buf, entry->proghdr, entry->proghdr_count);
1981- buf += iov_length (entry->proghdr, entry->proghdr_count);
1982- }
1983-
1984- if (entry->prog_payload_count != 0) {
1985- iov_unload (buf, entry->prog_payload,
1986- entry->prog_payload_count);
1987- buf += iov_length (entry->prog_payload,
1988- entry->prog_payload_count);
1989- }
1990-
1991- gf_rdma_post_ref (post);
1992-
1993- ret = gf_rdma_post_send (peer->qp, post, (buf - post->buf));
1994- if (!ret) {
1995- ret = send_size;
1996- } else {
1997- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
1998- "posting send (to %s) failed with ret = %d (%s)",
1999- peer->trans->peerinfo.identifier, ret,
2000- (ret > 0) ? strerror (ret) : "");
2001- gf_rdma_post_unref (post);
2002- __gf_rdma_disconnect (peer->trans);
2003- ret = -1;
2004- }
2005-
2006-out:
2007- return ret;
2008-}
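
The inline path above is chosen purely on size. A condensed sketch of that decision, with the limit (GLUSTERFS_RDMA_INLINE_THRESHOLD in the code above) passed in as a parameter; the helper names here are illustrative.

    /* Sketch: decide between an inline send and a chunked reply. */
    #include <sys/uio.h>
    #include <stddef.h>

    static size_t
    iov_total (const struct iovec *iov, int count)
    {
            size_t len = 0;
            int    i;

            for (i = 0; i < count; i++)
                    len += iov[i].iov_len;
            return len;
    }

    static int
    reply_fits_inline (const struct iovec *hdrs, int hdr_count,
                       const struct iovec *payload, int payload_count,
                       size_t header_size, size_t threshold)
    {
            size_t send_size = iov_total (hdrs, hdr_count)
                             + iov_total (payload, payload_count)
                             + header_size; /* no chunklists in the reply */

            return (send_size <= threshold);
    }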
2009-
2010-
2011-int32_t
2012-__gf_rdma_reply_encode_write_chunks (gf_rdma_peer_t *peer,
2013- uint32_t payload_size,
2014- gf_rdma_post_t *post,
2015- gf_rdma_reply_info_t *reply_info,
2016- uint32_t **ptr)
2017-{
2018- uint32_t chunk_size = 0;
2019- int32_t ret = -1;
2020- gf_rdma_write_array_t *target_array = NULL;
2021- int i = 0;
2022-
2023- target_array = (gf_rdma_write_array_t *)*ptr;
2024-
2025- for (i = 0; i < reply_info->wc_array->wc_nchunks; i++) {
2026- chunk_size +=
2027- reply_info->wc_array->wc_array[i].wc_target.rs_length;
2028- }
2029-
2030- if (chunk_size < payload_size) {
2031- gf_log (GF_RDMA_LOG_NAME, GF_LOG_DEBUG,
2032-                        "length of payload (%d) exceeds the total "
2033-                        "write chunk length (%d)", payload_size, chunk_size);
2034- goto out;
2035- }
2036-
2037- target_array->wc_discrim = hton32 (1);
2038- for (i = 0; (i < reply_info->wc_array->wc_nchunks)
2039- && (payload_size != 0);
2040- i++) {
2041- target_array->wc_array[i].wc_target.rs_offset
2042- = hton64 (reply_info->wc_array->wc_array[i].wc_target.rs_offset);
2043-
2044- target_array->wc_array[i].wc_target.rs_length
2045- = hton32 (min (payload_size,
2046- reply_info->wc_array->wc_array[i].wc_target.rs_length));
2047- }
2048-
2049- target_array->wc_nchunks = hton32 (i);
2050- target_array->wc_array[i].wc_target.rs_handle = 0; /* terminate
2051- chunklist */
2052-
2053- ret = 0;
2054-
2055- *ptr = &target_array->wc_array[i].wc_target.rs_length;
2056-out:
2057- return ret;
2058-}
2059-
2060-
2061-inline int32_t
2062-__gf_rdma_register_local_mr_for_rdma (gf_rdma_peer_t *peer,
2063- struct iovec *vector, int count,
2064- gf_rdma_post_context_t *ctx)
2065-{
2066- int i = 0;
2067- int32_t ret = -1;
2068- gf_rdma_private_t *priv = NULL;
2069- gf_rdma_device_t *device = NULL;
2070-
2071- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, ctx, out);
2072- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, vector, out);
2073-
2074- priv = peer->trans->private;
2075- device = priv->device;
2076-
2077- for (i = 0; i < count; i++) {
2078-                /* What if the memory is registered more than once?
2079-                 * Assume that a single write buffer is passed to afr, which
2080-                 * then passes it to its children. If more than one child
2081-                 * happens to use rdma, the buffer is registered more than
2082-                 * once.
2083-                 * The ib-verbs specification says that multiple registrations
2084-                 * of the same memory location are allowed. Refer to 10.6.3.8
2085-                 * of the Infiniband Architecture Specification Volume 1
2086-                 * (Release 1.2.1).
2087-                 */
2088- ctx->mr[ctx->mr_count] = ibv_reg_mr (device->pd,
2089- vector[i].iov_base,
2090- vector[i].iov_len,
2091- IBV_ACCESS_LOCAL_WRITE);
2092- if (ctx->mr[ctx->mr_count] == NULL) {
2093- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2094- "registering memory for IBV_ACCESS_LOCAL_WRITE "
2095- "failed (%s)", strerror (errno));
2096- goto out;
2097- }
2098-
2099- ctx->mr_count++;
2100- }
2101-
2102- ret = 0;
2103-out:
2104- return ret;
2105-}
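
A stand-alone sketch of the registration step used above. ibv_reg_mr and ibv_dereg_mr are the real verbs calls; the wrapper itself is illustrative.

    /* Sketch: register one buffer for local RDMA access.  Duplicate
     * registrations of the same region are permitted by the
     * InfiniBand spec, as the comment above notes. */
    #include <infiniband/verbs.h>
    #include <errno.h>
    #include <string.h>
    #include <stdio.h>

    static struct ibv_mr *
    register_buffer (struct ibv_pd *pd, void *buf, size_t len)
    {
            struct ibv_mr *mr;

            mr = ibv_reg_mr (pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
            if (mr == NULL)
                    fprintf (stderr, "ibv_reg_mr failed: %s\n",
                             strerror (errno));
            return mr;
    }

    /* ... use mr->lkey in sge entries, then ibv_dereg_mr (mr) when done. */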
2106-
2107-/* 1. assumes xfer_len of data is pointed by vector(s) starting from vec[*idx]
2108- * 2. modifies vec
2109- */
2110-int32_t
2111-__gf_rdma_write (gf_rdma_peer_t *peer, gf_rdma_post_t *post, struct iovec *vec,
2112- uint32_t xfer_len, int *idx, gf_rdma_write_chunk_t *writech)
2113-{
2114- int size = 0, num_sge = 0, i = 0;
2115- int32_t ret = -1;
2116- struct ibv_sge *sg_list = NULL;
2117- struct ibv_send_wr wr = {
2118- .opcode = IBV_WR_RDMA_WRITE,
2119- .send_flags = IBV_SEND_SIGNALED,
2120- }, *bad_wr;
2121-
2122- if ((peer == NULL) || (writech == NULL) || (idx == NULL)
2123- || (post == NULL) || (vec == NULL) || (xfer_len == 0)) {
2124- goto out;
2125- }
2126-
2127- for (i = *idx; size < xfer_len; i++) {
2128- size += vec[i].iov_len;
2129- }
2130-
2131- num_sge = i - *idx;
2132-
2133- sg_list = GF_CALLOC (num_sge, sizeof (struct ibv_sge),
2134- gf_common_mt_sge);
2135- if (sg_list == NULL) {
2136- ret = -1;
2137- goto out;
2138- }
2139-
2140- for ((i = *idx), (num_sge = 0); (xfer_len != 0); i++, num_sge++) {
2141- size = min (xfer_len, vec[i].iov_len);
2142-
2143- sg_list [num_sge].addr = (unsigned long)vec[i].iov_base;
2144- sg_list [num_sge].length = size;
2145- sg_list [num_sge].lkey = post->ctx.mr[i]->lkey;
2146-
2147- xfer_len -= size;
2148- }
2149-
2150- *idx = i;
2151-
2152- if (size < vec[i - 1].iov_len) {
2153- vec[i - 1].iov_base += size;
2154- vec[i - 1].iov_len -= size;
2155- *idx = i - 1;
2156- }
2157-
2158- wr.sg_list = sg_list;
2159- wr.num_sge = num_sge;
2160- wr.wr_id = (unsigned long) gf_rdma_post_ref (post);
2161- wr.wr.rdma.rkey = writech->wc_target.rs_handle;
2162- wr.wr.rdma.remote_addr = writech->wc_target.rs_offset;
2163-
2164- ret = ibv_post_send(peer->qp, &wr, &bad_wr);
2165- if (ret) {
2166- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2167- "rdma write to "
2168- "client (%s) failed with ret = %d (%s)",
2169- peer->trans->peerinfo.identifier, ret,
2170- (ret > 0) ? strerror (ret) : "");
2171- ret = -1;
2172- }
2173-
2174- GF_FREE (sg_list);
2175-out:
2176- return ret;
2177-}
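
The post-loop adjustment above splits the last iovec when a chunk boundary falls inside it. A worked sketch of that bookkeeping in plain C (no verbs types; assumes xfer_len > 0 and that vec holds at least xfer_len bytes):

    /* Sketch: consume xfer_len bytes from vec[*idx..], splitting the
     * last iovec in place when the boundary falls inside it, as the
     * function above does.  E.g. with vec = {4096, 4096} and
     * xfer_len = 6144, the first chunk takes all of vec[0] plus 2048
     * bytes of vec[1]; vec[1] is then advanced past those 2048 bytes
     * and *idx stays on it for the next chunk. */
    #include <sys/uio.h>
    #include <stdint.h>

    static void
    consume_iovecs (struct iovec *vec, uint32_t xfer_len, int *idx)
    {
            int      i = *idx;
            uint32_t size = 0;

            while (xfer_len != 0) {
                    size = (xfer_len < vec[i].iov_len)
                            ? xfer_len : (uint32_t) vec[i].iov_len;
                    xfer_len -= size;
                    i++;
            }

            *idx = i;
            if (size < vec[i - 1].iov_len) {
                    vec[i - 1].iov_base = (char *) vec[i - 1].iov_base + size;
                    vec[i - 1].iov_len -= size;
                    *idx = i - 1;
            }
    }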
2178-
2179-
2180-int32_t
2181-__gf_rdma_do_gf_rdma_write (gf_rdma_peer_t *peer, gf_rdma_post_t *post,
2182- struct iovec *vector, int count,
2183- struct iobref *iobref,
2184- gf_rdma_reply_info_t *reply_info)
2185-{
2186- int i = 0, payload_idx = 0;
2187- uint32_t payload_size = 0, xfer_len = 0;
2188- int32_t ret = -1;
2189-
2190- if (count != 0) {
2191- payload_size = iov_length (vector, count);
2192- }
2193-
2194- if (payload_size == 0) {
2195- ret = 0;
2196- goto out;
2197- }
2198-
2199- ret = __gf_rdma_register_local_mr_for_rdma (peer, vector, count,
2200- &post->ctx);
2201- if (ret == -1) {
2202- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2203- "registering memory region for rdma failed");
2204- goto out;
2205- }
2206-
2207- post->ctx.iobref = iobref_ref (iobref);
2208-
2209- for (i = 0; (i < reply_info->wc_array->wc_nchunks)
2210- && (payload_size != 0);
2211- i++) {
2212- xfer_len = min (payload_size,
2213- reply_info->wc_array->wc_array[i].wc_target.rs_length);
2214-
2215- ret = __gf_rdma_write (peer, post, vector, xfer_len,
2216- &payload_idx,
2217- &reply_info->wc_array->wc_array[i]);
2218- if (ret == -1) {
2219- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2220- "rdma write to client (%s) failed",
2221- peer->trans->peerinfo.identifier);
2222- goto out;
2223- }
2224-
2225- payload_size -= xfer_len;
2226- }
2227-
2228- ret = 0;
2229-out:
2230-
2231- return ret;
2232-}
2233-
2234-
2235-int32_t
2236-__gf_rdma_send_reply_type_nomsg (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
2237- gf_rdma_post_t *post,
2238- gf_rdma_reply_info_t *reply_info)
2239-{
2240- gf_rdma_header_t *header = NULL;
2241- char *buf = NULL;
2242- uint32_t payload_size = 0;
2243- int count = 0, i = 0;
2244- int32_t ret = 0;
2245- struct iovec vector[MAX_IOVEC];
2246-
2247- header = (gf_rdma_header_t *)post->buf;
2248-
2249- __gf_rdma_fill_reply_header (header, entry->rpchdr, reply_info,
2250- peer->send_count);
2251-
2252- header->rm_type = hton32 (GF_RDMA_NOMSG);
2253-
2254- payload_size = iov_length (entry->rpchdr, entry->rpchdr_count) +
2255- iov_length (entry->proghdr, entry->proghdr_count);
2256-
2257- /* encode reply chunklist */
2258- buf = (char *)&header->rm_body.rm_chunks[2];
2259- ret = __gf_rdma_reply_encode_write_chunks (peer, payload_size, post,
2260- reply_info,
2261- (uint32_t **)&buf);
2262- if (ret == -1) {
2263- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2264- "encoding write chunks failed");
2265- ret = __gf_rdma_send_error (peer, entry, post, reply_info,
2266- ERR_CHUNK);
2267- goto out;
2268- }
2269-
2270- gf_rdma_post_ref (post);
2271-
2272- for (i = 0; i < entry->rpchdr_count; i++) {
2273- vector[count++] = entry->rpchdr[i];
2274- }
2275-
2276- for (i = 0; i < entry->proghdr_count; i++) {
2277- vector[count++] = entry->proghdr[i];
2278- }
2279-
2280- ret = __gf_rdma_do_gf_rdma_write (peer, post, vector, count,
2281- entry->iobref, reply_info);
2282- if (ret == -1) {
2283- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2284- "rdma write to peer (%s) failed",
2285- peer->trans->peerinfo.identifier);
2286- gf_rdma_post_unref (post);
2287- goto out;
2288- }
2289-
2290- ret = gf_rdma_post_send (peer->qp, post, (buf - post->buf));
2291- if (ret) {
2292- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2293- "posting a send request to client (%s) failed with "
2294- "ret = %d (%s)", peer->trans->peerinfo.identifier, ret,
2295- (ret > 0) ? strerror (ret) : "");
2296- ret = -1;
2297- gf_rdma_post_unref (post);
2298- } else {
2299- ret = payload_size;
2300- }
2301-
2302-out:
2303- return ret;
2304-}
2305-
2306-
2307-int32_t
2308-__gf_rdma_send_reply_type_msg (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
2309- gf_rdma_post_t *post,
2310- gf_rdma_reply_info_t *reply_info)
2311-{
2312- gf_rdma_header_t *header = NULL;
2313- int32_t send_size = 0, ret = 0;
2314- char *ptr = NULL;
2315- uint32_t payload_size = 0;
2316-
2317- send_size = iov_length (entry->rpchdr, entry->rpchdr_count)
2318- + iov_length (entry->proghdr, entry->proghdr_count)
2319- + GLUSTERFS_RDMA_MAX_HEADER_SIZE;
2320-
2321- if (send_size > GLUSTERFS_RDMA_INLINE_THRESHOLD) {
2322- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2323- "client has provided only write chunks, but the "
2324-                        "combined size of the rpc and program headers (%d) "
2325-                        "exceeds the maximum size of a msg that can be sent "
2326-                        "using RDMA send (%d)", send_size,
2327- GLUSTERFS_RDMA_INLINE_THRESHOLD);
2328-
2329- ret = __gf_rdma_send_error (peer, entry, post, reply_info,
2330- ERR_CHUNK);
2331- goto out;
2332- }
2333-
2334- header = (gf_rdma_header_t *)post->buf;
2335-
2336- __gf_rdma_fill_reply_header (header, entry->rpchdr, reply_info,
2337- peer->send_count);
2338-
2339- payload_size = iov_length (entry->prog_payload,
2340- entry->prog_payload_count);
2341- ptr = (char *)&header->rm_body.rm_chunks[1];
2342-
2343- ret = __gf_rdma_reply_encode_write_chunks (peer, payload_size, post,
2344- reply_info,
2345- (uint32_t **)&ptr);
2346- if (ret == -1) {
2347- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2348- "encoding write chunks failed");
2349- ret = __gf_rdma_send_error (peer, entry, post, reply_info,
2350- ERR_CHUNK);
2351- goto out;
2352- }
2353-
2354- *(uint32_t *)ptr = 0; /* terminate reply chunklist */
2355- ptr += sizeof (uint32_t);
2356-
2357- gf_rdma_post_ref (post);
2358-
2359- ret = __gf_rdma_do_gf_rdma_write (peer, post, entry->prog_payload,
2360- entry->prog_payload_count,
2361- entry->iobref, reply_info);
2362- if (ret == -1) {
2363- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING, "rdma write to peer "
2364- "(%s) failed", peer->trans->peerinfo.identifier);
2365- gf_rdma_post_unref (post);
2366- goto out;
2367- }
2368-
2369- iov_unload (ptr, entry->rpchdr, entry->rpchdr_count);
2370- ptr += iov_length (entry->rpchdr, entry->rpchdr_count);
2371-
2372- iov_unload (ptr, entry->proghdr, entry->proghdr_count);
2373- ptr += iov_length (entry->proghdr, entry->proghdr_count);
2374-
2375- ret = gf_rdma_post_send (peer->qp, post, (ptr - post->buf));
2376- if (ret) {
2377- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2378- "rdma send to client (%s) failed with ret = %d (%s)",
2379- peer->trans->peerinfo.identifier, ret,
2380- (ret > 0) ? strerror (ret) : "");
2381- gf_rdma_post_unref (post);
2382- ret = -1;
2383- } else {
2384- ret = send_size + payload_size;
2385- }
2386-
2387-out:
2388- return ret;
2389-}
2390-
2391-
2392-void
2393-gf_rdma_reply_info_destroy (gf_rdma_reply_info_t *reply_info)
2394-{
2395- if (reply_info == NULL) {
2396- goto out;
2397- }
2398-
2399- if (reply_info->wc_array != NULL) {
2400- GF_FREE (reply_info->wc_array);
2401- reply_info->wc_array = NULL;
2402- }
2403-
2404- mem_put (reply_info);
2405-out:
2406- return;
2407-}
2408-
2409-
2410-gf_rdma_reply_info_t *
2411-gf_rdma_reply_info_alloc (gf_rdma_peer_t *peer)
2412-{
2413- gf_rdma_reply_info_t *reply_info = NULL;
2414- gf_rdma_private_t *priv = NULL;
2415-
2416- priv = peer->trans->private;
2417-
2418- reply_info = mem_get (priv->device->reply_info_pool);
2419- if (reply_info == NULL) {
2420- goto out;
2421- }
2422-
2423- memset (reply_info, 0, sizeof (*reply_info));
2424- reply_info->pool = priv->device->reply_info_pool;
2425-
2426-out:
2427- return reply_info;
2428-}
2429-
2430-
2431-int32_t
2432-__gf_rdma_ioq_churn_reply (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry,
2433- gf_rdma_post_t *post)
2434-{
2435- gf_rdma_reply_info_t *reply_info = NULL;
2436- int32_t ret = -1;
2437- gf_rdma_chunktype_t type = gf_rdma_noch;
2438-
2439- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, peer, out);
2440- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, entry, out);
2441- GF_VALIDATE_OR_GOTO (GF_RDMA_LOG_NAME, post, out);
2442-
2443- reply_info = entry->msg.reply_info;
2444- if (reply_info != NULL) {
2445- type = reply_info->type;
2446- }
2447-
2448- switch (type) {
2449- case gf_rdma_noch:
2450- ret = __gf_rdma_send_reply_inline (peer, entry, post,
2451- reply_info);
2452- if (ret < 0) {
2453- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2454- "failed to send reply to peer (%s) as an "
2455- "inlined rdma msg",
2456- peer->trans->peerinfo.identifier);
2457- }
2458- break;
2459-
2460- case gf_rdma_replych:
2461- ret = __gf_rdma_send_reply_type_nomsg (peer, entry, post,
2462- reply_info);
2463- if (ret < 0) {
2464- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2465- "failed to send reply to peer (%s) as "
2466- "RDMA_NOMSG", peer->trans->peerinfo.identifier);
2467- }
2468- break;
2469-
2470- case gf_rdma_writech:
2471- ret = __gf_rdma_send_reply_type_msg (peer, entry, post,
2472- reply_info);
2473- if (ret < 0) {
2474- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2475- "failed to send reply with write chunks "
2476- "to peer (%s)",
2477- peer->trans->peerinfo.identifier);
2478- }
2479- break;
2480-
2481- default:
2482- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2483- "invalid chunktype (%d) specified for sending reply "
2484-                        "(peer:%s)", type, peer->trans->peerinfo.identifier);
2485- break;
2486- }
2487-
2488- if (reply_info != NULL) {
2489- gf_rdma_reply_info_destroy (reply_info);
2490- }
2491-out:
2492- return ret;
2493-}
2494-
2495-
2496-int32_t
2497-__gf_rdma_ioq_churn_entry (gf_rdma_peer_t *peer, gf_rdma_ioq_t *entry)
2498-{
2499- int32_t ret = 0, quota = 0;
2500- gf_rdma_private_t *priv = NULL;
2501- gf_rdma_device_t *device = NULL;
2502- gf_rdma_options_t *options = NULL;
2503- gf_rdma_post_t *post = NULL;
2504-
2505- priv = peer->trans->private;
2506- options = &priv->options;
2507- device = priv->device;
2508-
2509- quota = __gf_rdma_quota_get (peer);
2510- if (quota > 0) {
2511- post = gf_rdma_get_post (&device->sendq);
2512- if (post == NULL) {
2513- post = gf_rdma_new_post (peer->trans, device,
2514- (options->send_size + 2048),
2515- GF_RDMA_SEND_POST);
2516- }
2517-
2518- if (post == NULL) {
2519- ret = -1;
2520- gf_log_callingfn (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2521- "not able to get a post to send msg");
2522- goto out;
2523- }
2524-
2525- if (entry->is_request) {
2526- ret = __gf_rdma_ioq_churn_request (peer, entry, post);
2527- if (ret < 0) {
2528- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2529- "failed to process request ioq entry "
2530-                                        "to peer (%s)",
2531- peer->trans->peerinfo.identifier);
2532- }
2533- } else {
2534- ret = __gf_rdma_ioq_churn_reply (peer, entry, post);
2535- if (ret < 0) {
2536- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2537- "failed to process reply ioq entry "
2538- "to peer (%s)",
2539- peer->trans->peerinfo.identifier);
2540- }
2541- }
2542-
2543- if (ret != 0) {
2544- __gf_rdma_ioq_entry_free (entry);
2545- }
2546- } else {
2547- ret = 0;
2548- }
2549-
2550-out:
2551- return ret;
2552-}
2553-
2554-
2555-static int32_t
2556-__gf_rdma_ioq_churn (gf_rdma_peer_t *peer)
2557-{
2558- gf_rdma_ioq_t *entry = NULL;
2559- int32_t ret = 0;
2560-
2561- while (!list_empty (&peer->ioq))
2562- {
2563- /* pick next entry */
2564- entry = peer->ioq_next;
2565-
2566- ret = __gf_rdma_ioq_churn_entry (peer, entry);
2567-
2568- if (ret <= 0)
2569- break;
2570- }
2571-
2572- /*
2573- list_for_each_entry_safe (entry, dummy, &peer->ioq, list) {
2574- ret = __gf_rdma_ioq_churn_entry (peer, entry);
2575- if (ret <= 0) {
2576- break;
2577- }
2578- }
2579- */
2580-
2581- return ret;
2582-}
2583-
2584-
2585-static int32_t
2586-gf_rdma_writev (rpc_transport_t *this, gf_rdma_ioq_t *entry)
2587-{
2588- int32_t ret = 0, need_append = 1;
2589- gf_rdma_private_t *priv = NULL;
2590- gf_rdma_peer_t *peer = NULL;
2591-
2592- priv = this->private;
2593- pthread_mutex_lock (&priv->write_mutex);
2594- {
2595- if (!priv->connected) {
2596- gf_log (this->name, GF_LOG_WARNING,
2597- "rdma is not connected to peer (%s)",
2598- this->peerinfo.identifier);
2599- ret = -1;
2600- goto unlock;
2601- }
2602-
2603- peer = &priv->peer;
2604- if (list_empty (&peer->ioq)) {
2605- ret = __gf_rdma_ioq_churn_entry (peer, entry);
2606- if (ret != 0) {
2607- need_append = 0;
2608-
2609- if (ret < 0) {
2610- gf_log (this->name, GF_LOG_WARNING,
2611- "processing ioq entry destined "
2612- "to (%s) failed",
2613- this->peerinfo.identifier);
2614- }
2615- }
2616- }
2617-
2618- if (need_append) {
2619- list_add_tail (&entry->list, &peer->ioq);
2620- }
2621- }
2622-unlock:
2623- pthread_mutex_unlock (&priv->write_mutex);
2624- return ret;
2625-}
2626-
2627-
2628-gf_rdma_ioq_t *
2629-gf_rdma_ioq_new (rpc_transport_t *this, rpc_transport_data_t *data)
2630-{
2631- gf_rdma_ioq_t *entry = NULL;
2632- int count = 0, i = 0;
2633- rpc_transport_msg_t *msg = NULL;
2634- gf_rdma_private_t *priv = NULL;
2635-
2636- if ((data == NULL) || (this == NULL)) {
2637- goto out;
2638- }
2639-
2640- priv = this->private;
2641-
2642- entry = mem_get (priv->device->ioq_pool);
2643- if (entry == NULL) {
2644- goto out;
2645- }
2646- memset (entry, 0, sizeof (*entry));
2647- entry->pool = priv->device->ioq_pool;
2648-
2649- if (data->is_request) {
2650- msg = &data->data.req.msg;
2651- if (data->data.req.rsp.rsphdr_count != 0) {
2652- for (i = 0; i < data->data.req.rsp.rsphdr_count; i++) {
2653- entry->msg.request.rsphdr_vec[i]
2654- = data->data.req.rsp.rsphdr[i];
2655- }
2656-
2657- entry->msg.request.rsphdr_count =
2658- data->data.req.rsp.rsphdr_count;
2659- }
2660-
2661- if (data->data.req.rsp.rsp_payload_count != 0) {
2662- for (i = 0; i < data->data.req.rsp.rsp_payload_count;
2663- i++) {
2664- entry->msg.request.rsp_payload[i]
2665- = data->data.req.rsp.rsp_payload[i];
2666- }
2667-
2668- entry->msg.request.rsp_payload_count =
2669- data->data.req.rsp.rsp_payload_count;
2670- }
2671-
2672- entry->msg.request.rpc_req = data->data.req.rpc_req;
2673-
2674- if (data->data.req.rsp.rsp_iobref != NULL) {
2675- entry->msg.request.rsp_iobref
2676- = iobref_ref (data->data.req.rsp.rsp_iobref);
2677- }
2678- } else {
2679- msg = &data->data.reply.msg;
2680- entry->msg.reply_info = data->data.reply.private;
2681- }
2682-
2683- entry->is_request = data->is_request;
2684-
2685- count = msg->rpchdrcount + msg->proghdrcount + msg->progpayloadcount;
2686-
2687- GF_ASSERT (count <= MAX_IOVEC);
2688-
2689- if (msg->rpchdr != NULL) {
2690- memcpy (&entry->rpchdr[0], msg->rpchdr,
2691- sizeof (struct iovec) * msg->rpchdrcount);
2692- entry->rpchdr_count = msg->rpchdrcount;
2693- }
2694-
2695- if (msg->proghdr != NULL) {
2696- memcpy (&entry->proghdr[0], msg->proghdr,
2697- sizeof (struct iovec) * msg->proghdrcount);
2698- entry->proghdr_count = msg->proghdrcount;
2699- }
2700-
2701- if (msg->progpayload != NULL) {
2702- memcpy (&entry->prog_payload[0], msg->progpayload,
2703- sizeof (struct iovec) * msg->progpayloadcount);
2704- entry->prog_payload_count = msg->progpayloadcount;
2705- }
2706-
2707- if (msg->iobref != NULL) {
2708- entry->iobref = iobref_ref (msg->iobref);
2709- }
2710-
2711- INIT_LIST_HEAD (&entry->list);
2712-
2713-out:
2714- return entry;
2715-}
2716-
2717-
2718-int32_t
2719-gf_rdma_submit_request (rpc_transport_t *this, rpc_transport_req_t *req)
2720-{
2721- int32_t ret = 0;
2722- gf_rdma_ioq_t *entry = NULL;
2723- rpc_transport_data_t data = {0, };
2724-
2725- if (req == NULL) {
2726- goto out;
2727- }
2728-
2729- data.is_request = 1;
2730- data.data.req = *req;
2731-
2732- entry = gf_rdma_ioq_new (this, &data);
2733- if (entry == NULL) {
2734- gf_log (this->name, GF_LOG_WARNING,
2735- "getting a new ioq entry failed (peer:%s)",
2736- this->peerinfo.identifier);
2737- goto out;
2738- }
2739-
2740- ret = gf_rdma_writev (this, entry);
2741-
2742- if (ret > 0) {
2743- ret = 0;
2744- } else if (ret < 0) {
2745- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2746- "sending request to peer (%s) failed",
2747- this->peerinfo.identifier);
2748- rpc_transport_disconnect (this);
2749- }
2750-
2751-out:
2752- return ret;
2753-}
2754-
2755-int32_t
2756-gf_rdma_submit_reply (rpc_transport_t *this, rpc_transport_reply_t *reply)
2757-{
2758- int32_t ret = 0;
2759- gf_rdma_ioq_t *entry = NULL;
2760- rpc_transport_data_t data = {0, };
2761-
2762- if (reply == NULL) {
2763- goto out;
2764- }
2765-
2766- data.data.reply = *reply;
2767-
2768- entry = gf_rdma_ioq_new (this, &data);
2769- if (entry == NULL) {
2770- gf_log (this->name, GF_LOG_WARNING,
2771- "getting a new ioq entry failed (peer:%s)",
2772- this->peerinfo.identifier);
2773- goto out;
2774- }
2775-
2776- ret = gf_rdma_writev (this, entry);
2777- if (ret > 0) {
2778- ret = 0;
2779- } else if (ret < 0) {
2780- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
2781-                        "sending reply to peer (%s) failed",
2782- this->peerinfo.identifier);
2783- rpc_transport_disconnect (this);
2784- }
2785-
2786-out:
2787- return ret;
2788-}
2789-
2790-
2791-static int
2792-gf_rdma_register_peer (gf_rdma_device_t *device, int32_t qp_num,
2793- gf_rdma_peer_t *peer)
2794-{
2795- struct _qpent *ent = NULL;
2796- gf_rdma_qpreg_t *qpreg = NULL;
2797- int32_t hash = 0;
2798- int ret = -1;
2799-
2800- qpreg = &device->qpreg;
2801- hash = qp_num % 42;
2802-
2803- pthread_mutex_lock (&qpreg->lock);
2804- {
2805- ent = qpreg->ents[hash].next;
2806- while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) {
2807- ent = ent->next;
2808- }
2809-
2810- if (ent->qp_num == qp_num) {
2811- ret = 0;
2812- goto unlock;
2813- }
2814-
2815- ent = (struct _qpent *) GF_CALLOC (1, sizeof (*ent),
2816- gf_common_mt_qpent);
2817- if (ent == NULL) {
2818- goto unlock;
2819- }
2820-
2821- /* TODO: ref reg->peer */
2822- ent->peer = peer;
2823- ent->next = &qpreg->ents[hash];
2824- ent->prev = ent->next->prev;
2825- ent->next->prev = ent;
2826- ent->prev->next = ent;
2827- ent->qp_num = qp_num;
2828- qpreg->count++;
2829- ret = 0;
2830- }
2831-unlock:
2832- pthread_mutex_unlock (&qpreg->lock);
2833-
2834- return ret;
2835-}
2836-
2837-
2838-static void
2839-gf_rdma_unregister_peer (gf_rdma_device_t *device, int32_t qp_num)
2840-{
2841- struct _qpent *ent = NULL;
2842- gf_rdma_qpreg_t *qpreg = NULL;
2843- int32_t hash = 0;
2844-
2845- qpreg = &device->qpreg;
2846- hash = qp_num % 42;
2847-
2848- pthread_mutex_lock (&qpreg->lock);
2849- {
2850- ent = qpreg->ents[hash].next;
2851- while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num))
2852- ent = ent->next;
2853- if (ent->qp_num != qp_num) {
2854- pthread_mutex_unlock (&qpreg->lock);
2855- return;
2856- }
2857- ent->prev->next = ent->next;
2858- ent->next->prev = ent->prev;
2859- /* TODO: unref reg->peer */
2860- GF_FREE (ent);
2861- qpreg->count--;
2862- }
2863- pthread_mutex_unlock (&qpreg->lock);
2864-}
2865-
2866-
2867-static gf_rdma_peer_t *
2868-__gf_rdma_lookup_peer (gf_rdma_device_t *device, int32_t qp_num)
2869-{
2870- struct _qpent *ent = NULL;
2871- gf_rdma_peer_t *peer = NULL;
2872- gf_rdma_qpreg_t *qpreg = NULL;
2873- int32_t hash = 0;
2874-
2875- qpreg = &device->qpreg;
2876- hash = qp_num % 42;
2877- ent = qpreg->ents[hash].next;
2878- while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num))
2879- ent = ent->next;
2880-
2881- if (ent != &qpreg->ents[hash]) {
2882- peer = ent->peer;
2883- }
2884-
2885- return peer;
2886-}
2887-
2888-
2889-static void
2890-__gf_rdma_destroy_qp (rpc_transport_t *this)
2891-{
2892- gf_rdma_private_t *priv = NULL;
2893-
2894- priv = this->private;
2895- if (priv->peer.qp) {
2896- gf_rdma_unregister_peer (priv->device, priv->peer.qp->qp_num);
2897- rdma_destroy_qp (priv->peer.cm_id);
2898- }
2899- priv->peer.qp = NULL;
2900-
2901- return;
2902-}
2903-
2904-
2905-static int32_t
2906-gf_rdma_create_qp (rpc_transport_t *this)
2907-{
2908- gf_rdma_private_t *priv = NULL;
2909- gf_rdma_device_t *device = NULL;
2910- int32_t ret = 0;
2911- gf_rdma_peer_t *peer = NULL;
2912- char *device_name = NULL;
2913-
2914- priv = this->private;
2915-
2916- peer = &priv->peer;
2917-
2918- device_name = (char *)ibv_get_device_name (peer->cm_id->verbs->device);
2919- if (device_name == NULL) {
2920- ret = -1;
2921- gf_log (this->name, GF_LOG_WARNING, "cannot get device_name");
2922- goto out;
2923- }
2924-
2925- device = gf_rdma_get_device (this, peer->cm_id->verbs,
2926- device_name);
2927- if (device == NULL) {
2928- ret = -1;
2929- gf_log (this->name, GF_LOG_WARNING, "cannot get device for "
2930- "device %s", device_name);
2931- goto out;
2932- }
2933-
2934- if (priv->device == NULL) {
2935- priv->device = device;
2936- }
2937-
2938- struct ibv_qp_init_attr init_attr = {
2939- .send_cq = device->send_cq,
2940- .recv_cq = device->recv_cq,
2941- .srq = device->srq,
2942- .cap = {
2943- .max_send_wr = peer->send_count,
2944- .max_recv_wr = peer->recv_count,
2945- .max_send_sge = 2,
2946- .max_recv_sge = 1
2947- },
2948- .qp_type = IBV_QPT_RC
2949- };
2950-
2951- ret = rdma_create_qp(peer->cm_id, device->pd, &init_attr);
2952- if (ret != 0) {
2953- gf_log (peer->trans->name, GF_LOG_CRITICAL,
2954- "%s: could not create QP (%s)", this->name,
2955- strerror (errno));
2956- ret = -1;
2957- goto out;
2958- }
2959-
2960- peer->qp = peer->cm_id->qp;
2961-
2962- ret = gf_rdma_register_peer (device, peer->qp->qp_num, peer);
2963-
2964-out:
2965- if (ret == -1)
2966- __gf_rdma_destroy_qp (this);
2967-
2968- return ret;
2969-}
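
gf_rdma_create_qp is one step of the usual librdmacm sequence. A compressed, hypothetical sketch of the client-side flow around it; CM-event reaping (rdma_get_cm_event/rdma_ack_cm_event) and cleanup on error are reduced to comments.

    /* Sketch only: rdma_create_qp is the step performed above. */
    #include <rdma/rdma_cma.h>
    #include <stddef.h>

    static int
    connect_sketch (struct sockaddr *dst, struct ibv_pd *pd,
                    struct ibv_qp_init_attr *attr)
    {
            struct rdma_event_channel *ec = rdma_create_event_channel ();
            struct rdma_cm_id         *id = NULL;
            struct rdma_conn_param     param = {0, };

            if ((ec == NULL) || rdma_create_id (ec, &id, NULL, RDMA_PS_TCP))
                    return -1;

            if (rdma_resolve_addr (id, NULL, dst, 2000))
                    return -1;
            /* ... wait for RDMA_CM_EVENT_ADDR_RESOLVED ... */

            if (rdma_resolve_route (id, 2000))
                    return -1;
            /* ... wait for RDMA_CM_EVENT_ROUTE_RESOLVED ... */

            if (rdma_create_qp (id, pd, attr))      /* the step above */
                    return -1;

            if (rdma_connect (id, &param))
                    return -1;
            /* ... wait for RDMA_CM_EVENT_ESTABLISHED ... */

            return 0;
    }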
2970-
2971-
2972-static int32_t
2973-__gf_rdma_teardown (rpc_transport_t *this)
2974-{
2975- gf_rdma_private_t *priv = NULL;
2976- gf_rdma_peer_t *peer = NULL;
2977-
2978- priv = this->private;
2979- peer = &priv->peer;
2980-
2981- if (peer->cm_id->qp != NULL) {
2982- __gf_rdma_destroy_qp (this);
2983- }
2984-
2985- if (!list_empty (&priv->peer.ioq)) {
2986- __gf_rdma_ioq_flush (peer);
2987- }
2988-
2989- if (peer->cm_id != NULL) {
2990- rdma_destroy_id (peer->cm_id);
2991- peer->cm_id = NULL;
2992- }
2993-
2994- /* TODO: decrement cq size */
2995- return 0;
2996-}
2997-
2998-
2999-static int32_t
3000-gf_rdma_teardown (rpc_transport_t *this)
3001-{
3002- int32_t ret = 0;
3003- gf_rdma_private_t *priv = NULL;
3004-
3005- if (this == NULL) {
3006- goto out;
3007- }
3008-
3009- priv = this->private;
3010-
3011- pthread_mutex_lock (&priv->write_mutex);
3012- {
3013- ret = __gf_rdma_teardown (this);
3014- }
3015- pthread_mutex_unlock (&priv->write_mutex);
3016-
3017-out:
3018- return ret;
3019-}
3020-
3021-
3022-/*
3023- * allocates new memory to hold the write-chunklist. New memory is needed since
3024- * the write-chunklist will be used while sending the reply, and the post holding
3025- * the initial write-chunklist sent from the client will be put back to the srq
3026- * before a pollin event is sent to upper layers.
3027- */
3028-int32_t
3029-gf_rdma_get_write_chunklist (char **ptr, gf_rdma_write_array_t **write_ary)
3030-{
3031- gf_rdma_write_array_t *from = NULL, *to = NULL;
3032- int32_t ret = -1, size = 0, i = 0;
3033-
3034- from = (gf_rdma_write_array_t *) *ptr;
3035- if (from->wc_discrim == 0) {
3036- ret = 0;
3037- goto out;
3038- }
3039-
3040- from->wc_nchunks = ntoh32 (from->wc_nchunks);
3041-
3042- size = sizeof (*from)
3043- + (sizeof (gf_rdma_write_chunk_t) * from->wc_nchunks);
3044-
3045- to = GF_CALLOC (1, size, gf_common_mt_char);
3046- if (to == NULL) {
3047- ret = -1;
3048- goto out;
3049- }
3050-
3051- to->wc_discrim = ntoh32 (from->wc_discrim);
3052- to->wc_nchunks = from->wc_nchunks;
3053-
3054- for (i = 0; i < to->wc_nchunks; i++) {
3055- to->wc_array[i].wc_target.rs_handle
3056- = ntoh32 (from->wc_array[i].wc_target.rs_handle);
3057- to->wc_array[i].wc_target.rs_length
3058- = ntoh32 (from->wc_array[i].wc_target.rs_length);
3059- to->wc_array[i].wc_target.rs_offset
3060- = ntoh64 (from->wc_array[i].wc_target.rs_offset);
3061- }
3062-
3063- *write_ary = to;
3064- ret = 0;
3065- *ptr = (char *)&from->wc_array[i].wc_target.rs_handle;
3066-out:
3067- return ret;
3068-}
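
The layout being copied and byte-swapped above is a counted array. A sketch of the wire structure (names mirror the gf_rdma_* types; sizes assume the 32/64-bit XDR fields used here):

    /* Sketch: on-wire shape of a write chunk array (an RFC 5666
     * write list entry) as decoded above.  All fields arrive in
     * network byte order. */
    #include <stdint.h>

    struct rdma_segment_sketch {
            uint32_t rs_handle;   /* remote rkey */
            uint32_t rs_length;   /* bytes addressable at rs_offset */
            uint64_t rs_offset;   /* remote virtual address */
    };

    struct rdma_write_array_sketch {
            uint32_t wc_discrim;  /* 1 = array present, 0 = absent */
            uint32_t wc_nchunks;  /* number of segments that follow */
            struct rdma_segment_sketch wc_array[]; /* wc_nchunks entries */
    };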
3069-
3070-
3071-/*
3072- * does not allocate new memory to hold the read-chunklist. New memory is not
3073- * needed, since the post is not put back to the srq till we've completed all
3074- * the rdma-reads, and hence the read-chunklist can point to memory held by the post.
3075- */
3076-int32_t
3077-gf_rdma_get_read_chunklist (char **ptr, gf_rdma_read_chunk_t **readch)
3078-{
3079- int32_t ret = -1;
3080- gf_rdma_read_chunk_t *chunk = NULL;
3081- int i = 0;
3082-
3083- chunk = (gf_rdma_read_chunk_t *)*ptr;
3084- if (chunk[0].rc_discrim == 0) {
3085- ret = 0;
3086- goto out;
3087- }
3088-
3089- for (i = 0; chunk[i].rc_discrim != 0; i++) {
3090- chunk[i].rc_discrim = ntoh32 (chunk[i].rc_discrim);
3091- chunk[i].rc_position = ntoh32 (chunk[i].rc_position);
3092- chunk[i].rc_target.rs_handle
3093- = ntoh32 (chunk[i].rc_target.rs_handle);
3094- chunk[i].rc_target.rs_length
3095- = ntoh32 (chunk[i].rc_target.rs_length);
3096- chunk[i].rc_target.rs_offset
3097- = ntoh64 (chunk[i].rc_target.rs_offset);
3098- }
3099-
3100- *readch = &chunk[0];
3101- ret = 0;
3102- *ptr = (char *)&chunk[i].rc_discrim;
3103-out:
3104- return ret;
3105-}
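
By contrast, the read chunklist is not counted but discriminator-terminated: each entry carries its own discriminator plus an XDR position, and a zero discriminator ends the list. A flattened sketch of one entry as walked above (the real type nests the last three fields in an rc_target segment):

    /* Sketch: one read-chunk entry; entries repeat until rc_discrim
     * reads back as 0. */
    #include <stdint.h>

    struct rdma_read_chunk_sketch {
            uint32_t rc_discrim;   /* 1 = another entry follows, 0 = end */
            uint32_t rc_position;  /* XDR offset of the chunk in the msg */
            uint32_t rs_handle;    /* remote rkey */
            uint32_t rs_length;    /* bytes to read */
            uint64_t rs_offset;    /* remote virtual address */
    };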
3106-
3107-
3108-inline int32_t
3109-gf_rdma_decode_error_msg (gf_rdma_peer_t *peer, gf_rdma_post_t *post,
3110- size_t bytes_in_post)
3111-{
3112- gf_rdma_header_t *header = NULL;
3113- struct iobuf *iobuf = NULL;
3114- struct iobref *iobref = NULL;
3115- int32_t ret = -1;
3116- struct rpc_msg rpc_msg = {0, };
3117-
3118- header = (gf_rdma_header_t *)post->buf;
3119- header->rm_body.rm_error.rm_type
3120- = ntoh32 (header->rm_body.rm_error.rm_type);
3121- if (header->rm_body.rm_error.rm_type == ERR_VERS) {
3122- header->rm_body.rm_error.rm_version.gf_rdma_vers_low =
3123- ntoh32 (header->rm_body.rm_error.rm_version.gf_rdma_vers_low);
3124- header->rm_body.rm_error.rm_version.gf_rdma_vers_high =
3125- ntoh32 (header->rm_body.rm_error.rm_version.gf_rdma_vers_high);
3126- }
3127-
3128- rpc_msg.rm_xid = header->rm_xid;
3129- rpc_msg.rm_direction = REPLY;
3130- rpc_msg.rm_reply.rp_stat = MSG_DENIED;
3131-
3132- iobuf = iobuf_get2 (peer->trans->ctx->iobuf_pool, bytes_in_post);
3133- if (iobuf == NULL) {
3134- ret = -1;
3135- goto out;
3136- }
3137-
3138- post->ctx.iobref = iobref = iobref_new ();
3139- if (iobref == NULL) {
3140- ret = -1;
3141- goto out;
3142- }
3143-
3144- iobref_add (iobref, iobuf);
3145- iobuf_unref (iobuf);
3146-
3147- ret = rpc_reply_to_xdr (&rpc_msg, iobuf_ptr (iobuf),
3148- iobuf_pagesize (iobuf), &post->ctx.vector[0]);
3149- if (ret == -1) {
3150- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3151- "Failed to create RPC reply");
3152- goto out;
3153- }
3154-
3155- post->ctx.count = 1;
3156-
3157- iobuf = NULL;
3158- iobref = NULL;
3159-
3160-out:
3161- if (ret == -1) {
3162- if (iobuf != NULL) {
3163- iobuf_unref (iobuf);
3164- }
3165-
3166- if (iobref != NULL) {
3167- iobref_unref (iobref);
3168- }
3169- }
3170-
3171-        return ret;
3172-}
3173-
3174-
3175-int32_t
3176-gf_rdma_decode_msg (gf_rdma_peer_t *peer, gf_rdma_post_t *post,
3177- gf_rdma_read_chunk_t **readch, size_t bytes_in_post)
3178-{
3179- int32_t ret = -1;
3180- gf_rdma_header_t *header = NULL;
3181- gf_rdma_reply_info_t *reply_info = NULL;
3182- char *ptr = NULL;
3183- gf_rdma_write_array_t *write_ary = NULL;
3184- size_t header_len = 0;
3185-
3186- header = (gf_rdma_header_t *)post->buf;
3187-
3188- ptr = (char *)&header->rm_body.rm_chunks[0];
3189-
3190- ret = gf_rdma_get_read_chunklist (&ptr, readch);
3191- if (ret == -1) {
3192- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3193- "cannot get read chunklist from msg");
3194- goto out;
3195- }
3196-
3197- /* skip terminator of read-chunklist */
3198- ptr = ptr + sizeof (uint32_t);
3199-
3200- ret = gf_rdma_get_write_chunklist (&ptr, &write_ary);
3201- if (ret == -1) {
3202- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3203- "cannot get write chunklist from msg");
3204- goto out;
3205- }
3206-
3207- /* skip terminator of write-chunklist */
3208- ptr = ptr + sizeof (uint32_t);
3209-
3210- if (write_ary != NULL) {
3211- reply_info = gf_rdma_reply_info_alloc (peer);
3212- if (reply_info == NULL) {
3213- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3214- "reply_info_alloc failed");
3215- ret = -1;
3216- goto out;
3217- }
3218-
3219- reply_info->type = gf_rdma_writech;
3220- reply_info->wc_array = write_ary;
3221- reply_info->rm_xid = header->rm_xid;
3222- } else {
3223- ret = gf_rdma_get_write_chunklist (&ptr, &write_ary);
3224- if (ret == -1) {
3225- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3226- "cannot get reply chunklist from msg");
3227- goto out;
3228- }
3229-
3230- if (write_ary != NULL) {
3231- reply_info = gf_rdma_reply_info_alloc (peer);
3232- if (reply_info == NULL) {
3233- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3234-                                        "reply_info_alloc failed");
3235- ret = -1;
3236- goto out;
3237- }
3238-
3239- reply_info->type = gf_rdma_replych;
3240- reply_info->wc_array = write_ary;
3241- reply_info->rm_xid = header->rm_xid;
3242- }
3243- }
3244-
3245- /* skip terminator of reply chunk */
3246- ptr = ptr + sizeof (uint32_t);
3247- if (header->rm_type != GF_RDMA_NOMSG) {
3248- header_len = (long)ptr - (long)post->buf;
3249- post->ctx.vector[0].iov_len = (bytes_in_post - header_len);
3250-
3251- post->ctx.hdr_iobuf = iobuf_get2 (peer->trans->ctx->iobuf_pool,
3252- (bytes_in_post - header_len));
3253- if (post->ctx.hdr_iobuf == NULL) {
3254- ret = -1;
3255- goto out;
3256- }
3257-
3258- post->ctx.vector[0].iov_base = iobuf_ptr (post->ctx.hdr_iobuf);
3259- memcpy (post->ctx.vector[0].iov_base, ptr,
3260- post->ctx.vector[0].iov_len);
3261- post->ctx.count = 1;
3262- }
3263-
3264- post->ctx.reply_info = reply_info;
3265-out:
3266- if (ret == -1) {
3267- if (*readch != NULL) {
3268- GF_FREE (*readch);
3269- *readch = NULL;
3270- }
3271-
3272- GF_FREE (write_ary);
3273- }
3274-
3275- return ret;
3276-}
3277-
3278-
3279-/* Assumes only one of either write-chunklist or a reply chunk is present */
3280-int32_t
3281-gf_rdma_decode_header (gf_rdma_peer_t *peer, gf_rdma_post_t *post,
3282- gf_rdma_read_chunk_t **readch, size_t bytes_in_post)
3283-{
3284- int32_t ret = -1;
3285- gf_rdma_header_t *header = NULL;
3286-
3287- header = (gf_rdma_header_t *)post->buf;
3288-
3289- header->rm_xid = ntoh32 (header->rm_xid);
3290- header->rm_vers = ntoh32 (header->rm_vers);
3291- header->rm_credit = ntoh32 (header->rm_credit);
3292- header->rm_type = ntoh32 (header->rm_type);
3293-
3294- switch (header->rm_type) {
3295- case GF_RDMA_MSG:
3296- case GF_RDMA_NOMSG:
3297- ret = gf_rdma_decode_msg (peer, post, readch, bytes_in_post);
3298- if (ret < 0) {
3299- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3300- "cannot decode msg of type (%d)",
3301- header->rm_type);
3302- }
3303-
3304- break;
3305-
3306- case GF_RDMA_MSGP:
3307- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3308- "rdma msg of msg-type GF_RDMA_MSGP should not have "
3309- "been received");
3310- ret = -1;
3311- break;
3312-
3313- case GF_RDMA_DONE:
3314- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3315- "rdma msg of msg-type GF_RDMA_DONE should not have "
3316- "been received");
3317- ret = -1;
3318- break;
3319-
3320- case GF_RDMA_ERROR:
3321- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3322- "received a msg of type RDMA_ERROR");
3323- ret = gf_rdma_decode_error_msg (peer, post, bytes_in_post);
3324- break;
3325-
3326- default:
3327- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3328- "unknown rdma msg-type (%d)", header->rm_type);
3329- }
3330-
3331- return ret;
3332-}
3333-
3334-
3335-int32_t
3336-__gf_rdma_read (gf_rdma_peer_t *peer, gf_rdma_post_t *post, struct iovec *to,
3337- gf_rdma_read_chunk_t *readch)
3338-{
3339- int32_t ret = -1;
3340- struct ibv_sge list = {0, };
3341- struct ibv_send_wr wr = {0, }, *bad_wr = NULL;
3342-
3343- ret = __gf_rdma_register_local_mr_for_rdma (peer, to, 1, &post->ctx);
3344- if (ret == -1) {
3345- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3346- "registering local memory for rdma read failed");
3347- goto out;
3348- }
3349-
3350- list.addr = (unsigned long) to->iov_base;
3351- list.length = to->iov_len;
3352- list.lkey = post->ctx.mr[post->ctx.mr_count - 1]->lkey;
3353-
3354- wr.wr_id = (unsigned long) gf_rdma_post_ref (post);
3355- wr.sg_list = &list;
3356- wr.num_sge = 1;
3357- wr.opcode = IBV_WR_RDMA_READ;
3358- wr.send_flags = IBV_SEND_SIGNALED;
3359- wr.wr.rdma.remote_addr = readch->rc_target.rs_offset;
3360- wr.wr.rdma.rkey = readch->rc_target.rs_handle;
3361-
3362- ret = ibv_post_send (peer->qp, &wr, &bad_wr);
3363- if (ret) {
3364- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3365- "rdma read from client "
3366- "(%s) failed with ret = %d (%s)",
3367- peer->trans->peerinfo.identifier,
3368- ret, (ret > 0) ? strerror (ret) : "");
3369- ret = -1;
3370- gf_rdma_post_unref (post);
3371- }
3372-out:
3373- return ret;
3374-}
3375-
3376-
3377-int32_t
3378-gf_rdma_do_reads (gf_rdma_peer_t *peer, gf_rdma_post_t *post,
3379- gf_rdma_read_chunk_t *readch)
3380-{
3381- int32_t ret = -1, i = 0, count = 0;
3382- size_t size = 0;
3383- char *ptr = NULL;
3384- struct iobuf *iobuf = NULL;
3385- gf_rdma_private_t *priv = NULL;
3386-
3387- priv = peer->trans->private;
3388-
3389- for (i = 0; readch[i].rc_discrim != 0; i++) {
3390- size += readch[i].rc_target.rs_length;
3391- }
3392-
3393- if (i == 0) {
3394- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3395- "message type specified as rdma-read but there are no "
3396- "rdma read-chunks present");
3397- goto out;
3398- }
3399-
3400- post->ctx.gf_rdma_reads = i;
3401-
3402- iobuf = iobuf_get2 (peer->trans->ctx->iobuf_pool, size);
3403- if (iobuf == NULL) {
3404- goto out;
3405- }
3406-
3407- if (post->ctx.iobref == NULL) {
3408- post->ctx.iobref = iobref_new ();
3409- if (post->ctx.iobref == NULL) {
3410- iobuf_unref (iobuf);
3411- goto out;
3412- }
3413- }
3414-
3415- iobref_add (post->ctx.iobref, iobuf);
3416- iobuf_unref (iobuf);
3417-
3418- ptr = iobuf_ptr (iobuf);
3419- iobuf = NULL;
3420-
3421- pthread_mutex_lock (&priv->write_mutex);
3422- {
3423- if (!priv->connected) {
3424- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3425- "transport not connected to peer (%s), "
3426- "not doing rdma reads",
3427- peer->trans->peerinfo.identifier);
3428- goto unlock;
3429- }
3430-
3431- for (i = 0; readch[i].rc_discrim != 0; i++) {
3432- count = post->ctx.count++;
3433- post->ctx.vector[count].iov_base = ptr;
3434- post->ctx.vector[count].iov_len
3435- = readch[i].rc_target.rs_length;
3436-
3437- ret = __gf_rdma_read (peer, post,
3438- &post->ctx.vector[count],
3439- &readch[i]);
3440- if (ret == -1) {
3441- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3442- "rdma read from peer (%s) failed",
3443- peer->trans->peerinfo.identifier);
3444- goto unlock;
3445- }
3446-
3447- ptr += readch[i].rc_target.rs_length;
3448- }
3449-
3450- ret = 0;
3451- }
3452-unlock:
3453- pthread_mutex_unlock (&priv->write_mutex);
3454-out:
3455-
3456- if (ret == -1) {
3457- if (iobuf != NULL) {
3458- iobuf_unref (iobuf);
3459- }
3460- }
3461-
3462- return ret;
3463-}
3464-
3465-
3466-int32_t
3467-gf_rdma_pollin_notify (gf_rdma_peer_t *peer, gf_rdma_post_t *post)
3468-{
3469- int32_t ret = -1;
3470- enum msg_type msg_type = 0;
3471- struct rpc_req *rpc_req = NULL;
3472- gf_rdma_request_context_t *request_context = NULL;
3473- rpc_request_info_t request_info = {0, };
3474- gf_rdma_private_t *priv = NULL;
3475- uint32_t *ptr = NULL;
3476- rpc_transport_pollin_t *pollin = NULL;
3477-
3478- if ((peer == NULL) || (post == NULL)) {
3479- goto out;
3480- }
3481-
3482- if (post->ctx.iobref == NULL) {
3483- post->ctx.iobref = iobref_new ();
3484- if (post->ctx.iobref == NULL) {
3485- goto out;
3486- }
3487-
3488- /* handling the case where both hdr and payload of
3489- * GF_FOP_READ_CBK were received in a single iobuf
3490-                 * because the server sent the entire msg inline without
3491-                 * doing rdma writes.
3492- */
3493- if (post->ctx.hdr_iobuf)
3494- iobref_add (post->ctx.iobref, post->ctx.hdr_iobuf);
3495- }
3496-
3497- pollin = rpc_transport_pollin_alloc (peer->trans,
3498- post->ctx.vector,
3499- post->ctx.count,
3500- post->ctx.hdr_iobuf,
3501- post->ctx.iobref,
3502- post->ctx.reply_info);
3503- if (pollin == NULL) {
3504- goto out;
3505- }
3506-
3507- ptr = (uint32_t *)pollin->vector[0].iov_base;
3508-
3509- request_info.xid = ntoh32 (*ptr);
3510- msg_type = ntoh32 (*(ptr + 1));
3511-
3512- if (msg_type == REPLY) {
3513- ret = rpc_transport_notify (peer->trans,
3514- RPC_TRANSPORT_MAP_XID_REQUEST,
3515- &request_info);
3516- if (ret == -1) {
3517- gf_log (GF_RDMA_LOG_NAME, GF_LOG_DEBUG,
3518- "cannot get request information from rpc "
3519- "layer");
3520- goto out;
3521- }
3522-
3523- rpc_req = request_info.rpc_req;
3524- if (rpc_req == NULL) {
3525- gf_log (GF_RDMA_LOG_NAME, GF_LOG_DEBUG,
3526- "rpc request structure not found");
3527- ret = -1;
3528- goto out;
3529- }
3530-
3531- request_context = rpc_req->conn_private;
3532- rpc_req->conn_private = NULL;
3533-
3534- priv = peer->trans->private;
3535- if (request_context != NULL) {
3536- pthread_mutex_lock (&priv->write_mutex);
3537- {
3538- __gf_rdma_request_context_destroy (request_context);
3539- }
3540- pthread_mutex_unlock (&priv->write_mutex);
3541- } else {
3542- gf_rdma_quota_put (peer);
3543- }
3544-
3545- pollin->is_reply = 1;
3546- }
3547-
3548- ret = rpc_transport_notify (peer->trans, RPC_TRANSPORT_MSG_RECEIVED,
3549- pollin);
3550- if (ret < 0) {
3551- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3552- "transport_notify failed");
3553- }
3554-
3555-out:
3556- if (pollin != NULL) {
3557- pollin->private = NULL;
3558- rpc_transport_pollin_destroy (pollin);
3559- }
3560-
3561- return ret;
3562-}
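
The xid/msg-type peek above relies on the fixed preamble of every ONC RPC message (RFC 5531): the first 32-bit word is the xid and the second is the direction (0 = CALL, 1 = REPLY). A minimal sketch:

    /* Sketch: classify a decoded RPC message the way
     * gf_rdma_pollin_notify does, by peeking at the first two
     * 32-bit words of the RPC header. */
    #include <stdint.h>
    #include <arpa/inet.h>  /* ntohl */

    enum rpc_dir_sketch { RPC_CALL = 0, RPC_REPLY = 1 };

    static void
    peek_rpc_header (const void *base, uint32_t *xid, uint32_t *dir)
    {
            const uint32_t *ptr = base;

            *xid = ntohl (ptr[0]);   /* transaction id */
            *dir = ntohl (ptr[1]);   /* RPC_CALL or RPC_REPLY */
    }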
3563-
3564-
3565-int32_t
3566-gf_rdma_recv_reply (gf_rdma_peer_t *peer, gf_rdma_post_t *post)
3567-{
3568- int32_t ret = -1;
3569- gf_rdma_header_t *header = NULL;
3570- gf_rdma_reply_info_t *reply_info = NULL;
3571- gf_rdma_write_array_t *wc_array = NULL;
3572- int i = 0;
3573- uint32_t *ptr = NULL;
3574- gf_rdma_request_context_t *ctx = NULL;
3575- rpc_request_info_t request_info = {0, };
3576- struct rpc_req *rpc_req = NULL;
3577-
3578- header = (gf_rdma_header_t *)post->buf;
3579- reply_info = post->ctx.reply_info;
3580-
3581- /* no write chunklist, just notify upper layers */
3582- if (reply_info == NULL) {
3583- ret = 0;
3584- goto out;
3585- }
3586-
3587- wc_array = reply_info->wc_array;
3588-
3589- if (header->rm_type == GF_RDMA_NOMSG) {
3590- post->ctx.vector[0].iov_base
3591- = (void *)(long)wc_array->wc_array[0].wc_target.rs_offset;
3592- post->ctx.vector[0].iov_len
3593- = wc_array->wc_array[0].wc_target.rs_length;
3594-
3595- post->ctx.count = 1;
3596- } else {
3597- for (i = 0; i < wc_array->wc_nchunks; i++) {
3598- post->ctx.vector[i + 1].iov_base
3599- = (void *)(long)wc_array->wc_array[i].wc_target.rs_offset;
3600- post->ctx.vector[i + 1].iov_len
3601- = wc_array->wc_array[i].wc_target.rs_length;
3602- }
3603-
3604- post->ctx.count += wc_array->wc_nchunks;
3605- }
3606-
3607- ptr = (uint32_t *)post->ctx.vector[0].iov_base;
3608- request_info.xid = ntoh32 (*ptr);
3609-
3610- ret = rpc_transport_notify (peer->trans,
3611- RPC_TRANSPORT_MAP_XID_REQUEST,
3612- &request_info);
3613- if (ret == -1) {
3614- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3615- "cannot get request information (peer:%s) from rpc "
3616- "layer", peer->trans->peerinfo.identifier);
3617- goto out;
3618- }
3619-
3620- rpc_req = request_info.rpc_req;
3621- if (rpc_req == NULL) {
3622- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3623- "rpc request structure not found");
3624- ret = -1;
3625- goto out;
3626- }
3627-
3628- ctx = rpc_req->conn_private;
3629- if ((post->ctx.iobref == NULL) && ctx->rsp_iobref) {
3630- post->ctx.iobref = iobref_ref (ctx->rsp_iobref);
3631- }
3632-
3633- ret = 0;
3634-
3635- gf_rdma_reply_info_destroy (reply_info);
3636-
3637-out:
3638- if (ret == 0) {
3639- ret = gf_rdma_pollin_notify (peer, post);
3640- if (ret < 0) {
3641- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3642- "pollin notify failed");
3643- }
3644- }
3645-
3646- return ret;
3647-}
3648-
3649-
3650-inline int32_t
3651-gf_rdma_recv_request (gf_rdma_peer_t *peer, gf_rdma_post_t *post,
3652- gf_rdma_read_chunk_t *readch)
3653-{
3654- int32_t ret = -1;
3655-
3656- if (readch != NULL) {
3657- ret = gf_rdma_do_reads (peer, post, readch);
3658- if (ret < 0) {
3659- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3660- "rdma read from peer (%s) failed",
3661- peer->trans->peerinfo.identifier);
3662- }
3663- } else {
3664- ret = gf_rdma_pollin_notify (peer, post);
3665- if (ret == -1) {
3666- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3667- "pollin notification failed");
3668- }
3669- }
3670-
3671- return ret;
3672-}
3673-
3674-void
3675-gf_rdma_process_recv (gf_rdma_peer_t *peer, struct ibv_wc *wc)
3676-{
3677- gf_rdma_post_t *post = NULL;
3678- gf_rdma_read_chunk_t *readch = NULL;
3679- int ret = -1;
3680- uint32_t *ptr = NULL;
3681- enum msg_type msg_type = 0;
3682- gf_rdma_header_t *header = NULL;
3683- gf_rdma_private_t *priv = NULL;
3684-
3685- post = (gf_rdma_post_t *) (long) wc->wr_id;
3686- if (post == NULL) {
3687- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3688- "no post found in successful work completion element");
3689- goto out;
3690- }
3691-
3692- ret = gf_rdma_decode_header (peer, post, &readch, wc->byte_len);
3693- if (ret == -1) {
3694- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3695- "decoding of header failed");
3696- goto out;
3697- }
3698-
3699- header = (gf_rdma_header_t *)post->buf;
3700-
3701- priv = peer->trans->private;
3702-
3703- pthread_mutex_lock (&priv->write_mutex);
3704- {
3705- if (!priv->peer.quota_set) {
3706- priv->peer.quota_set = 1;
3707-
3708-                        /* Initially peer.quota is set to 1 as per RFC 5666. We
3709-                         * have to account for the quota used while sending the
3710-                         * first msg (which may or may not be returned to the
3711-                         * pool at this point) while deriving peer.quota from
3712-                         * header->rm_credit. Hence the arithmetic below,
3713-                         * instead of directly setting it to header->rm_credit.
3714-                         */
3715- priv->peer.quota = header->rm_credit
3716- - ( 1 - priv->peer.quota);
3717- }
3718- }
3719- pthread_mutex_unlock (&priv->write_mutex);
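        /* Worked example of the credit arithmetic above: quota starts
         * at 1 per RFC 5666.  If the first send consumed that credit
         * (quota == 0) and the reply grants rm_credit == 32, then
         *         quota = 32 - (1 - 0) = 31,
         * accounting for the credit still in flight.  If the credit
         * was already returned (quota == 1), quota = 32 - (1 - 1) = 32.
         */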
3720-
3721- switch (header->rm_type) {
3722- case GF_RDMA_MSG:
3723- ptr = (uint32_t *)post->ctx.vector[0].iov_base;
3724- msg_type = ntoh32 (*(ptr + 1));
3725- break;
3726-
3727- case GF_RDMA_NOMSG:
3728- if (readch != NULL) {
3729- msg_type = CALL;
3730- } else {
3731- msg_type = REPLY;
3732- }
3733- break;
3734-
3735- case GF_RDMA_ERROR:
3736- if (header->rm_body.rm_error.rm_type == ERR_CHUNK) {
3737- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3738-                                "peer (%s) couldn't encode or decode the msg "
3739-                                "properly, or write chunks were not provided "
3740- "for replies that were bigger than "
3741- "RDMA_INLINE_THRESHOLD (%d)",
3742- peer->trans->peerinfo.identifier,
3743- GLUSTERFS_RDMA_INLINE_THRESHOLD);
3744- ret = gf_rdma_pollin_notify (peer, post);
3745- if (ret == -1) {
3746- gf_log (GF_RDMA_LOG_NAME, GF_LOG_DEBUG,
3747- "pollin notification failed");
3748- }
3749- goto out;
3750- } else {
3751- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
3752-                                "an error occurred during transmission of the "
3753-                                "msg; disconnecting the transport");
3754- ret = -1;
3755- goto out;
3756- }
3757-
3758- default:
3759- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3760- "invalid rdma msg-type (%d)", header->rm_type);
3761- goto out;
3762- }
3763-
3764- if (msg_type == CALL) {
3765- ret = gf_rdma_recv_request (peer, post, readch);
3766- if (ret < 0) {
3767- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3768- "receiving a request from peer (%s) failed",
3769- peer->trans->peerinfo.identifier);
3770- }
3771- } else {
3772- ret = gf_rdma_recv_reply (peer, post);
3773- if (ret < 0) {
3774- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3775- "receiving a reply from peer (%s) failed",
3776- peer->trans->peerinfo.identifier);
3777- }
3778- }
3779-
3780-out:
3781- if (ret == -1) {
3782- rpc_transport_disconnect (peer->trans);
3783- }
3784-
3785- return;
3786-}
3787-
3788-void *
3789-gf_rdma_async_event_thread (void *context)
3790-{
3791- struct ibv_async_event event;
3792- int ret;
3793-
3794- while (1) {
3795- do {
3796- ret = ibv_get_async_event((struct ibv_context *)context,
3797- &event);
3798-
3799- if (ret && errno != EINTR) {
3800- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3801- "Error getting event (%s)",
3802- strerror (errno));
3803- }
3804- } while(ret && errno == EINTR);
3805-
3806- switch (event.event_type) {
3807- case IBV_EVENT_SRQ_LIMIT_REACHED:
3808- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3809-                                "received srq_limit reached");
3810- break;
3811-
3812- default:
3813- gf_log (GF_RDMA_LOG_NAME, GF_LOG_DEBUG,
3814-                                "event (%d) received", event.event_type);
3815- break;
3816- }
3817-
3818- ibv_ack_async_event(&event);
3819- }
3820-
3821- return 0;
3822-}
3823-
3824-
3825-static void *
3826-gf_rdma_recv_completion_proc (void *data)
3827-{
3828- struct ibv_comp_channel *chan = NULL;
3829-        gf_rdma_device_t           *device = NULL;
3830- gf_rdma_post_t *post = NULL;
3831- gf_rdma_peer_t *peer = NULL;
3832- struct ibv_cq *event_cq = NULL;
3833- struct ibv_wc wc = {0, };
3834- void *event_ctx = NULL;
3835- int32_t ret = 0;
3836-
3837- chan = data;
3838-
3839- while (1) {
3840- ret = ibv_get_cq_event (chan, &event_cq, &event_ctx);
3841- if (ret) {
3842- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
3843- "ibv_get_cq_event failed, terminating recv "
3844-                                "thread: %d (%d)", ret, errno);
3845- continue;
3846- }
3847-
3848- device = event_ctx;
3849-
3850- ret = ibv_req_notify_cq (event_cq, 0);
3851- if (ret) {
3852- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
3853- "ibv_req_notify_cq on %s failed, terminating "
3854- "recv thread: %d (%d)",
3855- device->device_name, ret, errno);
3856- continue;
3857- }
3858-
3859- device = (gf_rdma_device_t *) event_ctx;
3860-
3861- while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) {
3862- post = (gf_rdma_post_t *) (long) wc.wr_id;
3863-
3864- pthread_mutex_lock (&device->qpreg.lock);
3865- {
3866- peer = __gf_rdma_lookup_peer (device,
3867- wc.qp_num);
3868-
3869- /*
3870- * keep a refcount on transport so that it
3871- * does not get freed because of some error
3872- * indicated by wc.status till we are done
3873- * with usage of peer and thereby that of trans.
3874- */
3875- if (peer != NULL) {
3876- rpc_transport_ref (peer->trans);
3877- }
3878- }
3879- pthread_mutex_unlock (&device->qpreg.lock);
3880-
3881- if (wc.status != IBV_WC_SUCCESS) {
3882- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
3883- "recv work request on `%s' returned "
3884- "error (%d)", device->device_name,
3885- wc.status);
3886- if (peer) {
3887- rpc_transport_unref (peer->trans);
3888- rpc_transport_disconnect (peer->trans);
3889- }
3890-
3891- if (post) {
3892- gf_rdma_post_unref (post);
3893- }
3894- continue;
3895- }
3896-
3897- if (peer) {
3898- gf_rdma_process_recv (peer, &wc);
3899- rpc_transport_unref (peer->trans);
3900- } else {
3901- gf_log (GF_RDMA_LOG_NAME,
3902- GF_LOG_DEBUG,
3903- "could not lookup peer for qp_num: %d",
3904- wc.qp_num);
3905- }
3906-
3907- gf_rdma_post_unref (post);
3908- }
3909-
3910- if (ret < 0) {
3911- gf_log (GF_RDMA_LOG_NAME,
3912- GF_LOG_ERROR,
3913- "ibv_poll_cq on `%s' returned error "
3914- "(ret = %d, errno = %d)",
3915- device->device_name, ret, errno);
3916- continue;
3917- }
3918- ibv_ack_cq_events (event_cq, 1);
3919- }
3920-
3921- return NULL;
3922-}
3923-
3924-
3925-void
3926-gf_rdma_handle_failed_send_completion (gf_rdma_peer_t *peer, struct ibv_wc *wc)
3927-{
3928- gf_rdma_post_t *post = NULL;
3929- gf_rdma_device_t *device = NULL;
3930- gf_rdma_private_t *priv = NULL;
3931-
3932- if (peer != NULL) {
3933- priv = peer->trans->private;
3934- if (priv != NULL) {
3935- device = priv->device;
3936- }
3937- }
3938-
3939-
3940- post = (gf_rdma_post_t *) (long) wc->wr_id;
3941-
3942- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
3943- "send work request on `%s' returned error "
3944- "wc.status = %d, wc.vendor_err = %d, post->buf = %p, "
3945- "wc.byte_len = %d, post->reused = %d",
3946- (device != NULL) ? device->device_name : NULL, wc->status,
3947- wc->vendor_err, post->buf, wc->byte_len, post->reused);
3948-
3949- if (wc->status == IBV_WC_RETRY_EXC_ERR) {
3950-                gf_log ("rdma", GF_LOG_ERROR, "connection between client and"
3951-                        " server is not working. Check by running "
3952-                        "'ibv_srq_pingpong'. Also make sure the subnet manager"
3953-                        " is running (e.g. 'opensm'), or check whether the rdma "
3954-                        "port is valid (or active) by running 'ibv_devinfo'. "
3955-                        "Contact the Gluster Support Team if the problem persists."
3956- }
3957-
3958- if (peer) {
3959- rpc_transport_disconnect (peer->trans);
3960- }
3961-
3962- return;
3963-}
3964-
3965-
3966-void
3967-gf_rdma_handle_successful_send_completion (gf_rdma_peer_t *peer,
3968- struct ibv_wc *wc)
3969-{
3970- gf_rdma_post_t *post = NULL;
3971- int reads = 0, ret = 0;
3972- gf_rdma_header_t *header = NULL;
3973-
3974- if (wc->opcode != IBV_WC_RDMA_READ) {
3975- goto out;
3976- }
3977-
3978- post = (gf_rdma_post_t *)(long) wc->wr_id;
3979-
3980- pthread_mutex_lock (&post->lock);
3981- {
3982- reads = --post->ctx.gf_rdma_reads;
3983- }
3984- pthread_mutex_unlock (&post->lock);
3985-
3986- if (reads != 0) {
3987- /* if it is not the last rdma read, we've got nothing to do */
3988- goto out;
3989- }
3990-
3991- header = (gf_rdma_header_t *)post->buf;
3992-
3993- if (header->rm_type == GF_RDMA_NOMSG) {
3994- post->ctx.count = 1;
3995- post->ctx.vector[0].iov_len += post->ctx.vector[1].iov_len;
3996- }
3997-
3998- ret = gf_rdma_pollin_notify (peer, post);
3999- if ((ret == -1) && (peer != NULL)) {
4000- rpc_transport_disconnect (peer->trans);
4001- }
4002-
4003-out:
4004- return;
4005-}
4006-
4007-
4008-static void *
4009-gf_rdma_send_completion_proc (void *data)
4010-{
4011- struct ibv_comp_channel *chan = NULL;
4012- gf_rdma_post_t *post = NULL;
4013- gf_rdma_peer_t *peer = NULL;
4014- struct ibv_cq *event_cq = NULL;
4015- void *event_ctx = NULL;
4016- gf_rdma_device_t *device = NULL;
4017- struct ibv_wc wc = {0, };
4018- char is_request = 0;
4019- int32_t ret = 0, quota_ret = 0;
4020-
4021- chan = data;
4022- while (1) {
4023- ret = ibv_get_cq_event (chan, &event_cq, &event_ctx);
4024- if (ret) {
4025- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
4026-                        "ibv_get_cq_event failed, terminating "
4027- "send thread: %d (%d)", ret, errno);
4028- continue;
4029- }
4030-
4031- device = event_ctx;
4032-
4033- ret = ibv_req_notify_cq (event_cq, 0);
4034- if (ret) {
4035- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
4036- "ibv_req_notify_cq on %s failed, terminating "
4037- "send thread: %d (%d)",
4038- device->device_name, ret, errno);
4039- continue;
4040- }
4041-
4042- while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) {
4043- post = (gf_rdma_post_t *) (long) wc.wr_id;
4044-
4045- pthread_mutex_lock (&device->qpreg.lock);
4046- {
4047- peer = __gf_rdma_lookup_peer (device, wc.qp_num);
4048-
4049- /*
4050- * keep a refcount on transport so that it
4051- * does not get freed because of some error
4052- * indicated by wc.status, till we are done
4053- * with usage of peer and thereby that of trans.
4054- */
4055- if (peer != NULL) {
4056- rpc_transport_ref (peer->trans);
4057- }
4058- }
4059- pthread_mutex_unlock (&device->qpreg.lock);
4060-
4061- if (wc.status != IBV_WC_SUCCESS) {
4062- gf_rdma_handle_failed_send_completion (peer, &wc);
4063- } else {
4064- gf_rdma_handle_successful_send_completion (peer,
4065- &wc);
4066- }
4067-
4068- if (post) {
4069- is_request = post->ctx.is_request;
4070-
4071- ret = gf_rdma_post_unref (post);
4072- if ((ret == 0)
4073- && (wc.status == IBV_WC_SUCCESS)
4074- && !is_request
4075- && (post->type == GF_RDMA_SEND_POST)
4076- && (peer != NULL)) {
4077-                                        /* A GF_RDMA_RECV_POST can end up in
4078-                                         * gf_rdma_send_completion_proc for
4079-                                         * rdma-reads, and we do not take
4080-                                         * quota for getting a GF_RDMA_RECV_POST.
4081-                                         */
4082-
4083- /*
4084- * if it is request, quota is returned
4085- * after reply has come.
4086- */
4087- quota_ret = gf_rdma_quota_put (peer);
4088- if (quota_ret < 0) {
4089- gf_log ("rdma", GF_LOG_DEBUG,
4090- "failed to send "
4091- "message");
4092- }
4093- }
4094- }
4095-
4096- if (peer) {
4097- rpc_transport_unref (peer->trans);
4098- } else {
4099- gf_log (GF_RDMA_LOG_NAME, GF_LOG_DEBUG,
4100- "could not lookup peer for qp_num: %d",
4101- wc.qp_num);
4102- }
4103- }
4104-
4105- if (ret < 0) {
4106- gf_log (GF_RDMA_LOG_NAME, GF_LOG_ERROR,
4107- "ibv_poll_cq on `%s' returned error (ret = %d,"
4108- " errno = %d)",
4109- device->device_name, ret, errno);
4110- continue;
4111- }
4112-
4113- ibv_ack_cq_events (event_cq, 1);
4114- }
4115-
4116- return NULL;
4117-}
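
The loop above follows the completion-channel pattern from the verbs man pages: block in ibv_get_cq_event, re-arm with ibv_req_notify_cq, then drain the CQ with ibv_poll_cq so that nothing completed between draining and re-arming is lost, and acknowledge the event. Reduced to its skeleton (channel is assumed already created; handle_wc is a hypothetical stand-in for the dispatch done above):

    struct ibv_cq *cq = NULL;
    void          *cq_ctx = NULL;
    struct ibv_wc  wc;
    int            n = 0;

    while (ibv_get_cq_event (channel, &cq, &cq_ctx) == 0) {
            ibv_ack_cq_events (cq, 1);           /* acks may be batched */
            if (ibv_req_notify_cq (cq, 0))       /* re-arm *before* draining */
                    break;
            while ((n = ibv_poll_cq (cq, 1, &wc)) > 0)
                    handle_wc (&wc);             /* hypothetical dispatch */
            if (n < 0)
                    break;                       /* poll error */
    }
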
4118-
4119-
4120-static void
4121-gf_rdma_options_init (rpc_transport_t *this)
4122-{
4123- gf_rdma_private_t *priv = NULL;
4124- gf_rdma_options_t *options = NULL;
4125- int32_t mtu = 0;
4126- data_t *temp = NULL;
4127-
4128- /* TODO: validate arguments from options below */
4129-
4130- priv = this->private;
4131- options = &priv->options;
4132-        options->send_size = GLUSTERFS_RDMA_INLINE_THRESHOLD; /* was this->ctx->page_size * 4, i.e. 512 KB */
4133-        options->recv_size = GLUSTERFS_RDMA_INLINE_THRESHOLD; /* was this->ctx->page_size * 4, i.e. 512 KB */
4134- options->send_count = 4096;
4135- options->recv_count = 4096;
4136- options->attr_timeout = GF_RDMA_TIMEOUT;
4137- options->attr_retry_cnt = GF_RDMA_RETRY_CNT;
4138- options->attr_rnr_retry = GF_RDMA_RNR_RETRY;
4139-
4140- temp = dict_get (this->options,
4141- "transport.rdma.work-request-send-count");
4142- if (temp)
4143- options->send_count = data_to_int32 (temp);
4144-
4145- temp = dict_get (this->options,
4146- "transport.rdma.work-request-recv-count");
4147- if (temp)
4148- options->recv_count = data_to_int32 (temp);
4149-
4150- temp = dict_get (this->options, "transport.rdma.attr-timeout");
4151-
4152- if (temp)
4153- options->attr_timeout = data_to_uint8 (temp);
4154-
4155- temp = dict_get (this->options, "transport.rdma.attr-retry-cnt");
4156-
4157- if (temp)
4158- options->attr_retry_cnt = data_to_uint8 (temp);
4159-
4160- temp = dict_get (this->options, "transport.rdma.attr-rnr-retry");
4161-
4162- if (temp)
4163- options->attr_rnr_retry = data_to_uint8 (temp);
4164-
4165- options->port = 1;
4166- temp = dict_get (this->options,
4167- "transport.rdma.port");
4168- if (temp)
4169- options->port = data_to_uint64 (temp);
4170-
4171- options->mtu = mtu = IBV_MTU_2048;
4172- temp = dict_get (this->options,
4173- "transport.rdma.mtu");
4174- if (temp)
4175- mtu = data_to_int32 (temp);
4176- switch (mtu) {
4177- case 256: options->mtu = IBV_MTU_256;
4178- break;
4179- case 512: options->mtu = IBV_MTU_512;
4180- break;
4181- case 1024: options->mtu = IBV_MTU_1024;
4182- break;
4183- case 2048: options->mtu = IBV_MTU_2048;
4184- break;
4185- case 4096: options->mtu = IBV_MTU_4096;
4186- break;
4187- default:
4188- if (temp)
4189- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
4190- "%s: unrecognized MTU value '%s', defaulting "
4191- "to '2048'", this->name,
4192- data_to_str (temp));
4193- else
4194- gf_log (GF_RDMA_LOG_NAME, GF_LOG_TRACE,
4195- "%s: defaulting MTU to '2048'",
4196- this->name);
4197- options->mtu = IBV_MTU_2048;
4198- break;
4199- }
4200-
4201- temp = dict_get (this->options,
4202- "transport.rdma.device-name");
4203- if (temp)
4204- options->device_name = gf_strdup (temp->data);
4205-
4206- return;
4207-}
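
gf_rdma_options_init is a defaults-then-override routine: every field is seeded with a compile-time default, and each transport.rdma.* key present in this->options may replace it. The recurring three lines could be captured in a small helper; a sketch using the same dict API as above (opt_int32 itself is hypothetical):

    static int32_t
    opt_int32 (dict_t *opts, char *key, int32_t dflt)
    {
            data_t *v = dict_get (opts, key);

            return v ? data_to_int32 (v) : dflt;   /* override or default */
    }

    /* e.g.:
     * options->send_count =
     *         opt_int32 (this->options,
     *                    "transport.rdma.work-request-send-count", 4096);
     */
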
4208-
4209-
4210-gf_rdma_ctx_t *
4211-__gf_rdma_ctx_create (void)
4212-{
4213- gf_rdma_ctx_t *rdma_ctx = NULL;
4214- int ret = -1;
4215-
4216- rdma_ctx = GF_CALLOC (1, sizeof (*rdma_ctx), gf_common_mt_char);
4217- if (rdma_ctx == NULL) {
4218- goto out;
4219- }
4220-
4221- rdma_ctx->rdma_cm_event_channel = rdma_create_event_channel ();
4222- if (rdma_ctx->rdma_cm_event_channel == NULL) {
4223- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
4224- "rdma_cm event channel creation failed (%s)",
4225- strerror (errno));
4226- goto out;
4227- }
4228-
4229- ret = pthread_create (&rdma_ctx->rdma_cm_thread, NULL,
4230- gf_rdma_cm_event_handler,
4231- rdma_ctx->rdma_cm_event_channel);
4232- if (ret != 0) {
4233- gf_log (GF_RDMA_LOG_NAME, GF_LOG_WARNING,
4234- "creation of thread to handle rdma-cm events "
4235- "failed (%s)", strerror (ret));
4236- goto out;
4237- }
4238-
4239-out:
4240-        if ((ret != 0) && (rdma_ctx != NULL)) {
4241- if (rdma_ctx->rdma_cm_event_channel != NULL) {
4242- rdma_destroy_event_channel (rdma_ctx->rdma_cm_event_channel);
4243- }
4244-
4245- GF_FREE (rdma_ctx);
4246- rdma_ctx = NULL;
4247- }
4248-
4249- return rdma_ctx;
4250-}
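
__gf_rdma_ctx_create is the usual allocate / create-channel / spawn-thread constructor with one shared error exit; note the guard added above, since the error path must not touch rdma_ctx fields when the allocation itself failed (and pthread_create reports failure as a positive error code, not -1). The same discipline with the gf_ wrappers replaced by plain libc and librdmacm calls, as a sketch:

    #include <stdlib.h>
    #include <pthread.h>
    #include <rdma/rdma_cma.h>

    struct ctx {
            struct rdma_event_channel *chan;
            pthread_t                  cm_thread;
    };

    static void *cm_loop (void *arg);      /* hypothetical event-handler thread */

    static struct ctx *
    ctx_create (void)
    {
            struct ctx *c = calloc (1, sizeof (*c));

            if (!c)
                    return NULL;
            c->chan = rdma_create_event_channel ();
            if (!c->chan)
                    goto err;
            if (pthread_create (&c->cm_thread, NULL, cm_loop, c->chan))
                    goto err;
            return c;
    err:
            if (c->chan)
                    rdma_destroy_event_channel (c->chan);
            free (c);
            return NULL;
    }
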
4251-
4252-static int32_t
4253-gf_rdma_init (rpc_transport_t *this)
4254-{
4255- gf_rdma_private_t *priv = NULL;
4256- int32_t ret = 0;
4257- glusterfs_ctx_t *ctx = NULL;
4258- gf_rdma_options_t *options = NULL;
4259-
4260-        ctx = this->ctx;
4261-
4262- priv = this->private;
4263-
4264- ibv_fork_init ();
4265- gf_rdma_options_init (this);
4266-
4267- options = &priv->options;
4268- priv->peer.send_count = options->send_count;
4269- priv->peer.recv_count = options->recv_count;
4270- priv->peer.send_size = options->send_size;
4271- priv->peer.recv_size = options->recv_size;
4272-
4273- priv->peer.trans = this;
4274- INIT_LIST_HEAD (&priv->peer.ioq);
4275-
4276- pthread_mutex_init (&priv->write_mutex, NULL);
4277- pthread_mutex_init (&priv->recv_mutex, NULL);
4278- pthread_cond_init (&priv->recv_cond, NULL);
4279-
4280- pthread_mutex_lock (&ctx->lock);
4281- {
4282- if (ctx->ib == NULL) {
4283- ctx->ib = __gf_rdma_ctx_create ();
4284- if (ctx->ib == NULL) {
4285- ret = -1;
4286- }
4287- }
4288- }
4289- pthread_mutex_unlock (&ctx->lock);
4290-
4291- return ret;
4292-}
4293-
4294-
4295-static int32_t
4296-gf_rdma_disconnect (rpc_transport_t *this)
4297-{
4298- gf_rdma_private_t *priv = NULL;
4299- int32_t ret = 0;
4300-
4301- priv = this->private;
4302- gf_log_callingfn (this->name, GF_LOG_WARNING,
4303- "disconnect called (peer:%s)",
4304- this->peerinfo.identifier);
4305-
4306- pthread_mutex_lock (&priv->write_mutex);
4307- {
4308- ret = __gf_rdma_disconnect (this);
4309- }
4310- pthread_mutex_unlock (&priv->write_mutex);
4311-
4312- return ret;
4313-}
4314-
4315-
4316-static int32_t
4317-gf_rdma_connect (struct rpc_transport *this, int port)
4318-{
4319- gf_rdma_private_t *priv = NULL;
4320- int32_t ret = 0;
4321- union gf_sock_union sock_union = {{0, }, };
4322- socklen_t sockaddr_len = 0;
4323- gf_rdma_peer_t *peer = NULL;
4324- gf_rdma_ctx_t *rdma_ctx = NULL;
4325- gf_boolean_t connected = _gf_false;
4326-
4327- priv = this->private;
4328-
4329- peer = &priv->peer;
4330-
4331- rpc_transport_ref (this);
4332-
4333- ret = gf_rdma_client_get_remote_sockaddr (this,
4334- &sock_union.sa,
4335- &sockaddr_len, port);
4336- if (ret != 0) {
4337- gf_log (this->name, GF_LOG_DEBUG,
4338- "cannot get remote address to connect");
4339- goto out;
4340- }
4341-
4342- rdma_ctx = this->ctx->ib;
4343-
4344- pthread_mutex_lock (&priv->write_mutex);
4345- {
4346- if (peer->cm_id != NULL) {
4347- ret = -1;
4348- errno = EINPROGRESS;
4349- connected = _gf_true;
4350- goto unlock;
4351- }
4352-
4353- priv->entity = GF_RDMA_CLIENT;
4354-
4355- ret = rdma_create_id (rdma_ctx->rdma_cm_event_channel,
4356- &peer->cm_id, this, RDMA_PS_TCP);
4357- if (ret != 0) {
4358- gf_log (this->name, GF_LOG_ERROR,
4359- "creation of rdma_cm_id failed (%s)",
4360- strerror (errno));
4361- ret = -errno;
4362- goto unlock;
4363- }
4364-
4365- memcpy (&this->peerinfo.sockaddr, &sock_union.storage,
4366- sockaddr_len);
4367- this->peerinfo.sockaddr_len = sockaddr_len;
4368-
4369- if (port > 0)
4370- sock_union.sin.sin_port = htons (port);
4371-
4372- ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family =
4373- ((struct sockaddr *)&this->peerinfo.sockaddr)->sa_family;
4374-
4375- ret = gf_rdma_client_bind (this,
4376- (struct sockaddr *)&this->myinfo.sockaddr,
4377- &this->myinfo.sockaddr_len,
4378- peer->cm_id);
4379- if (ret != 0) {
4380- gf_log (this->name, GF_LOG_WARNING,
4381- "client bind failed: %s", strerror (errno));
4382- goto unlock;
4383- }
4384-
4385- ret = rdma_resolve_addr (peer->cm_id, NULL, &sock_union.sa,
4386- 2000);
4387- if (ret != 0) {
4388- gf_log (this->name, GF_LOG_WARNING,
4389- "rdma_resolve_addr failed (%s)",
4390- strerror (errno));
4391- goto unlock;
4392- }
4393-
4394- priv->connected = 0;
4395- }
4396-unlock:
4397- pthread_mutex_unlock (&priv->write_mutex);
4398-
4399-out:
4400- if (ret != 0) {
4401- if (!connected) {
4402- gf_rdma_teardown (this);
4403- }
4404-
4405- rpc_transport_unref (this);
4406- }
4407-
4408- return ret;
4409-}
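
Note that gf_rdma_connect only performs the first step, rdma_resolve_addr; route resolution, QP setup and rdma_connect happen later, driven by events on the shared channel in gf_rdma_cm_event_handler. For orientation, the whole client handshake in its synchronous form looks roughly like this (declarations and error handling elided; a sketch, not this transport's actual flow):

    rdma_create_id (ec, &id, NULL, RDMA_PS_TCP);
    rdma_resolve_addr (id, NULL, dst, 2000);      /* 2 s timeout, as above */
    rdma_get_cm_event (ec, &ev);                  /* RDMA_CM_EVENT_ADDR_RESOLVED */
    rdma_ack_cm_event (ev);
    rdma_resolve_route (id, 2000);
    rdma_get_cm_event (ec, &ev);                  /* RDMA_CM_EVENT_ROUTE_RESOLVED */
    rdma_ack_cm_event (ev);
    /* ... create PD/CQs/QP on id->verbs here ... */
    rdma_connect (id, &conn_param);
    rdma_get_cm_event (ec, &ev);                  /* RDMA_CM_EVENT_ESTABLISHED */
    rdma_ack_cm_event (ev);
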
4410-
4411-
4412-static int32_t
4413-gf_rdma_listen (rpc_transport_t *this)
4414-{
4415- union gf_sock_union sock_union = {{0, }, };
4416- socklen_t sockaddr_len = 0;
4417- gf_rdma_private_t *priv = NULL;
4418- gf_rdma_peer_t *peer = NULL;
4419- int ret = 0;
4420- gf_rdma_ctx_t *rdma_ctx = NULL;
4421- char service[NI_MAXSERV], host[NI_MAXHOST];
4422-
4423- priv = this->private;
4424- peer = &priv->peer;
4425-
4426- priv->entity = GF_RDMA_SERVER_LISTENER;
4427-
4428- rdma_ctx = this->ctx->ib;
4429-
4430- ret = gf_rdma_server_get_local_sockaddr (this, &sock_union.sa,
4431- &sockaddr_len);
4432- if (ret != 0) {
4433- gf_log (this->name, GF_LOG_WARNING,
4434- "cannot find network address of server to bind to");
4435- goto err;
4436- }
4437-
4438- ret = rdma_create_id (rdma_ctx->rdma_cm_event_channel,
4439- &peer->cm_id, this, RDMA_PS_TCP);
4440- if (ret != 0) {
4441- gf_log (this->name, GF_LOG_WARNING,
4442- "creation of rdma_cm_id failed (%s)",
4443- strerror (errno));
4444- goto err;
4445- }
4446-
4447- memcpy (&this->myinfo.sockaddr, &sock_union.storage,
4448- sockaddr_len);
4449- this->myinfo.sockaddr_len = sockaddr_len;
4450-
4451- ret = getnameinfo ((struct sockaddr *)&this->myinfo.sockaddr,
4452- this->myinfo.sockaddr_len, host, sizeof (host),
4453- service, sizeof (service),
4454- NI_NUMERICHOST);
4455- if (ret != 0) {
4456- gf_log (this->name, GF_LOG_ERROR,
4457- "getnameinfo failed (%s)", gai_strerror (ret));
4458- goto err;
4459- }
4460-
4461-        snprintf (this->myinfo.identifier, sizeof (this->myinfo.identifier), "%s:%s", host, service);
4462-
4463- ret = rdma_bind_addr (peer->cm_id, &sock_union.sa);
4464- if (ret != 0) {
4465- gf_log (this->name, GF_LOG_WARNING,
4466- "rdma_bind_addr failed (%s)", strerror (errno));
4467- goto err;
4468- }
4469-
4470- ret = rdma_listen (peer->cm_id, 10);
4471- if (ret != 0) {
4472- gf_log (this->name, GF_LOG_WARNING,
4473- "rdma_listen failed (%s)", strerror (errno));
4474- goto err;
4475- }
4476-
4477- rpc_transport_ref (this);
4478-
4479- ret = 0;
4480-err:
4481- if (ret < 0) {
4482- if (peer->cm_id != NULL) {
4483- rdma_destroy_id (peer->cm_id);
4484- peer->cm_id = NULL;
4485- }
4486- }
4487-
4488- return ret;
4489-}
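
The listener mirrors the TCP idiom: resolve a local address, bind, listen with a backlog of 10, then wait for connect requests on the event channel (handled in gf_rdma_cm_event_handler, not here). The accept side, compressed into a sketch with declarations and error handling elided:

    rdma_bind_addr (listen_id, (struct sockaddr *) &addr);
    rdma_listen (listen_id, 10);                  /* backlog, as above */

    rdma_get_cm_event (ec, &ev);                  /* RDMA_CM_EVENT_CONNECT_REQUEST */
    child = ev->id;                               /* one rdma_cm_id per peer */
    /* ... create a QP on child, then: */
    rdma_accept (child, &conn_param);
    rdma_ack_cm_event (ev);
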
4490-
4491-
4492-struct rpc_transport_ops tops = {
4493- .submit_request = gf_rdma_submit_request,
4494- .submit_reply = gf_rdma_submit_reply,
4495- .connect = gf_rdma_connect,
4496- .disconnect = gf_rdma_disconnect,
4497- .listen = gf_rdma_listen,
4498-};
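
tops is the vtable the generic rpc-transport layer dispatches through once this shared object is loaded; callers never reference gf_rdma_* symbols directly. Schematically (a sketch of the indirection, not the actual rpc-transport code):

    int32_t
    submit (rpc_transport_t *trans, rpc_transport_req_t *req)
    {
            /* resolves to gf_rdma_submit_request for this transport */
            return trans->ops->submit_request (trans, req);
    }
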
4499-
4500-int32_t
4501-init (rpc_transport_t *this)
4502-{
4503- gf_rdma_private_t *priv = NULL;
4504-
4505- priv = GF_CALLOC (1, sizeof (*priv), gf_common_mt_rdma_private_t);
4506- if (!priv)
4507- return -1;
4508-
4509- this->private = priv;
4510-
4511- if (gf_rdma_init (this)) {
4512- gf_log (this->name, GF_LOG_ERROR,
4513- "Failed to initialize IB Device");
4514- return -1;
4515- }
4516-
4517- return 0;
4518-}
4519-
4520-void
4521-fini (struct rpc_transport *this)
4522-{
4523- /* TODO: verify this function does graceful finish */
4524- gf_rdma_private_t *priv = NULL;
4525-
4526- priv = this->private;
4527-
4528- this->private = NULL;
4529-
4530- if (priv) {
4531- pthread_mutex_destroy (&priv->recv_mutex);
4532- pthread_mutex_destroy (&priv->write_mutex);
4533-
4534- gf_log (this->name, GF_LOG_TRACE,
4535- "called fini on transport: %p", this);
4536- GF_FREE (priv);
4537- }
4538- return;
4539-}
4540-
4541-/* TODO: expand each option */
4542-struct volume_options options[] = {
4543- { .key = {"transport.rdma.port",
4544- "rdma-port"},
4545- .type = GF_OPTION_TYPE_INT,
4546- .min = 1,
4547- .max = 4,
4548-      .description = "HCA port number to use; list valid ports with 'ibv_devinfo'"
4549- },
4550- { .key = {"transport.rdma.mtu",
4551- "rdma-mtu"},
4552- .type = GF_OPTION_TYPE_INT,
4553- },
4554- { .key = {"transport.rdma.device-name",
4555- "rdma-device-name"},
4556- .type = GF_OPTION_TYPE_ANY,
4557-      .description = "IB device to use; list available devices with 'ibv_devinfo'"
4558- },
4559- { .key = {"transport.rdma.work-request-send-count",
4560- "rdma-work-request-send-count"},
4561- .type = GF_OPTION_TYPE_INT,
4562- },
4563- { .key = {"transport.rdma.work-request-recv-count",
4564- "rdma-work-request-recv-count"},
4565- .type = GF_OPTION_TYPE_INT,
4566- },
4567- { .key = {"remote-port",
4568- "transport.remote-port",
4569- "transport.rdma.remote-port"},
4570- .type = GF_OPTION_TYPE_INT
4571- },
4572- { .key = {"transport.rdma.attr-timeout",
4573- "rdma-attr-timeout"},
4574- .type = GF_OPTION_TYPE_INT
4575- },
4576- { .key = {"transport.rdma.attr-retry-cnt",
4577- "rdma-attr-retry-cnt"},
4578- .type = GF_OPTION_TYPE_INT
4579- },
4580- { .key = {"transport.rdma.attr-rnr-retry",
4581- "rdma-attr-rnr-retry"},
4582- .type = GF_OPTION_TYPE_INT
4583- },
4584- { .key = {"transport.rdma.listen-port", "listen-port"},
4585- .type = GF_OPTION_TYPE_INT
4586- },
4587- { .key = {"transport.rdma.connect-path", "connect-path"},
4588- .type = GF_OPTION_TYPE_ANY
4589- },
4590- { .key = {"transport.rdma.bind-path", "bind-path"},
4591- .type = GF_OPTION_TYPE_ANY
4592- },
4593- { .key = {"transport.rdma.listen-path", "listen-path"},
4594- .type = GF_OPTION_TYPE_ANY
4595- },
4596- { .key = {"transport.address-family",
4597- "address-family"},
4598- .value = {"inet", "inet6", "inet/inet6", "inet6/inet",
4599- "unix", "inet-sdp" },
4600- .type = GF_OPTION_TYPE_STR
4601- },
4602- { .key = {"transport.socket.lowlat"},
4603- .type = GF_OPTION_TYPE_BOOL
4604- },
4605- { .key = {NULL} }
4606-};
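
These keys surface as option lines in a client or server volume file when the transport is rdma. A hypothetical fragment (volume, host and brick names illustrative only):

    volume testvol-client-0
        type protocol/client
        option transport-type rdma
        option transport.rdma.mtu 4096
        option transport.rdma.port 1
        option remote-host server1
        option remote-subvolume /export/brick1
    end-volume
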
4607
4608=== removed directory '.pc/01-spelling-error.diff/xlators'
4609=== removed directory '.pc/01-spelling-error.diff/xlators/mgmt'
4610=== removed directory '.pc/01-spelling-error.diff/xlators/mgmt/glusterd'
4611=== removed directory '.pc/01-spelling-error.diff/xlators/mgmt/glusterd/src'
4612=== removed file '.pc/01-spelling-error.diff/xlators/mgmt/glusterd/src/glusterd-store.c'
4613--- .pc/01-spelling-error.diff/xlators/mgmt/glusterd/src/glusterd-store.c 2013-09-27 21:40:14 +0000
4614+++ .pc/01-spelling-error.diff/xlators/mgmt/glusterd/src/glusterd-store.c 1970-01-01 00:00:00 +0000
4615@@ -1,2530 +0,0 @@
4616-/*
4617- Copyright (c) 2007-2012 Red Hat, Inc. <http://www.redhat.com>
4618- This file is part of GlusterFS.
4619-
4620- This file is licensed to you under your choice of the GNU Lesser
4621- General Public License, version 3 or any later version (LGPLv3 or
4622- later), or the GNU General Public License, version 2 (GPLv2), in all
4623- cases as published by the Free Software Foundation.
4624-*/
4625-
4626-#ifndef _CONFIG_H
4627-#define _CONFIG_H
4628-#include "config.h"
4629-#endif
4630-
4631-#include "glusterd-op-sm.h"
4632-#include <inttypes.h>
4633-
4634-
4635-#include "globals.h"
4636-#include "glusterfs.h"
4637-#include "compat.h"
4638-#include "dict.h"
4639-#include "protocol-common.h"
4640-#include "xlator.h"
4641-#include "logging.h"
4642-#include "timer.h"
4643-#include "defaults.h"
4644-#include "compat.h"
4645-#include "compat-errno.h"
4646-#include "statedump.h"
4647-#include "glusterd-mem-types.h"
4648-#include "glusterd.h"
4649-#include "glusterd-sm.h"
4650-#include "glusterd-op-sm.h"
4651-#include "glusterd-utils.h"
4652-#include "glusterd-hooks.h"
4653-#include "store.h"
4654-#include "glusterd-store.h"
4655-
4656-#include "rpc-clnt.h"
4657-#include "common-utils.h"
4658-
4659-#include <sys/resource.h>
4660-#include <inttypes.h>
4661-#include <dirent.h>
4662-
4663-void
4664-glusterd_replace_slash_with_hipen (char *str)
4665-{
4666- char *ptr = NULL;
4667-
4668- ptr = strchr (str, '/');
4669-
4670- while (ptr) {
4671- *ptr = '-';
4672- ptr = strchr (str, '/');
4673- }
4674-}
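
This helper flattens a brick path into a file-name-safe token by turning every slash into a hyphen ('hipen' is a long-standing misspelling in the identifier itself, so it stays as-is). For example:

    char key[] = "/export/brick1";

    glusterd_replace_slash_with_hipen (key);
    /* key is now "-export-brick1"; prefixed with the hostname below,
     * store files end up named like "host1:-export-brick1". */
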
4675-
4676-int32_t
4677-glusterd_store_create_brick_dir (glusterd_volinfo_t *volinfo)
4678-{
4679- int32_t ret = -1;
4680- char brickdirpath[PATH_MAX] = {0,};
4681- glusterd_conf_t *priv = NULL;
4682-
4683- GF_ASSERT (volinfo);
4684-
4685- priv = THIS->private;
4686- GF_ASSERT (priv);
4687-
4688- GLUSTERD_GET_BRICK_DIR (brickdirpath, volinfo, priv);
4689- ret = gf_store_mkdir (brickdirpath);
4690-
4691- return ret;
4692-}
4693-
4694-static void
4695-glusterd_store_key_vol_brick_set (glusterd_brickinfo_t *brickinfo,
4696- char *key_vol_brick, size_t len)
4697-{
4698- GF_ASSERT (brickinfo);
4699- GF_ASSERT (key_vol_brick);
4700- GF_ASSERT (len >= PATH_MAX);
4701-
4702- snprintf (key_vol_brick, len, "%s", brickinfo->path);
4703- glusterd_replace_slash_with_hipen (key_vol_brick);
4704-}
4705-
4706-static void
4707-glusterd_store_brickinfofname_set (glusterd_brickinfo_t *brickinfo,
4708- char *brickfname, size_t len)
4709-{
4710- char key_vol_brick[PATH_MAX] = {0};
4711-
4712- GF_ASSERT (brickfname);
4713- GF_ASSERT (brickinfo);
4714- GF_ASSERT (len >= PATH_MAX);
4715-
4716- glusterd_store_key_vol_brick_set (brickinfo, key_vol_brick,
4717- sizeof (key_vol_brick));
4718- snprintf (brickfname, len, "%s:%s", brickinfo->hostname, key_vol_brick);
4719-}
4720-
4721-static void
4722-glusterd_store_brickinfopath_set (glusterd_volinfo_t *volinfo,
4723- glusterd_brickinfo_t *brickinfo,
4724- char *brickpath, size_t len)
4725-{
4726- char brickfname[PATH_MAX] = {0};
4727- char brickdirpath[PATH_MAX] = {0,};
4728- glusterd_conf_t *priv = NULL;
4729-
4730- GF_ASSERT (brickpath);
4731- GF_ASSERT (brickinfo);
4732- GF_ASSERT (len >= PATH_MAX);
4733-
4734- priv = THIS->private;
4735- GF_ASSERT (priv);
4736-
4737- GLUSTERD_GET_BRICK_DIR (brickdirpath, volinfo, priv);
4738- glusterd_store_brickinfofname_set (brickinfo, brickfname,
4739- sizeof (brickfname));
4740- snprintf (brickpath, len, "%s/%s", brickdirpath, brickfname);
4741-}
4742-
4743-gf_boolean_t
4744-glusterd_store_is_valid_brickpath (char *volname, char *brick)
4745-{
4746- char brickpath[PATH_MAX] = {0};
4747- glusterd_brickinfo_t *brickinfo = NULL;
4748- glusterd_volinfo_t *volinfo = NULL;
4749- int32_t ret = 0;
4750- size_t volname_len = strlen (volname);
4751- xlator_t *this = NULL;
4752-
4753- this = THIS;
4754- GF_ASSERT (this);
4755-
4756- ret = glusterd_brickinfo_new_from_brick (brick, &brickinfo);
4757- if (ret) {
4758- gf_log (this->name, GF_LOG_WARNING, "Failed to create brick "
4759- "info for brick %s", brick);
4760- ret = 0;
4761- goto out;
4762- }
4763- ret = glusterd_volinfo_new (&volinfo);
4764- if (ret) {
4765- gf_log (this->name, GF_LOG_WARNING, "Failed to create volinfo");
4766- ret = 0;
4767- goto out;
4768- }
4769- if (volname_len >= sizeof (volinfo->volname)) {
4770- gf_log (this->name, GF_LOG_WARNING, "volume name too long");
4771- ret = 0;
4772- goto out;
4773- }
4774- memcpy (volinfo->volname, volname, volname_len+1);
4775- glusterd_store_brickinfopath_set (volinfo, brickinfo, brickpath,
4776- sizeof (brickpath));
4777-
4778- ret = (strlen (brickpath) < _POSIX_PATH_MAX);
4779-
4780-out:
4781- if (brickinfo)
4782- glusterd_brickinfo_delete (brickinfo);
4783- if (volinfo)
4784- glusterd_volinfo_delete (volinfo);
4785-
4786- return ret;
4787-}
4788-
4789-int32_t
4790-glusterd_store_volinfo_brick_fname_write (int vol_fd,
4791- glusterd_brickinfo_t *brickinfo,
4792- int32_t brick_count)
4793-{
4794- char key[PATH_MAX] = {0,};
4795- char brickfname[PATH_MAX] = {0,};
4796- int32_t ret = -1;
4797-
4798- snprintf (key, sizeof (key), "%s-%d", GLUSTERD_STORE_KEY_VOL_BRICK,
4799- brick_count);
4800- glusterd_store_brickinfofname_set (brickinfo, brickfname,
4801- sizeof (brickfname));
4802- ret = gf_store_save_value (vol_fd, key, brickfname);
4803- if (ret)
4804- goto out;
4805-
4806-out:
4807- return ret;
4808-}
4809-
4810-int32_t
4811-glusterd_store_create_brick_shandle_on_absence (glusterd_volinfo_t *volinfo,
4812- glusterd_brickinfo_t *brickinfo)
4813-{
4814- char brickpath[PATH_MAX] = {0,};
4815- int32_t ret = 0;
4816-
4817- GF_ASSERT (volinfo);
4818- GF_ASSERT (brickinfo);
4819-
4820- glusterd_store_brickinfopath_set (volinfo, brickinfo, brickpath,
4821- sizeof (brickpath));
4822- ret = gf_store_handle_create_on_absence (&brickinfo->shandle,
4823- brickpath);
4824- return ret;
4825-}
4826-
4827-int32_t
4828-glusterd_store_brickinfo_write (int fd, glusterd_brickinfo_t *brickinfo)
4829-{
4830- char value[256] = {0,};
4831- int32_t ret = 0;
4832-
4833- GF_ASSERT (brickinfo);
4834- GF_ASSERT (fd > 0);
4835-
4836- ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_HOSTNAME,
4837- brickinfo->hostname);
4838- if (ret)
4839- goto out;
4840-
4841- ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PATH,
4842- brickinfo->path);
4843- if (ret)
4844- goto out;
4845-
4846- snprintf (value, sizeof(value), "%d", brickinfo->port);
4847- ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_PORT, value);
4848-
4849- snprintf (value, sizeof(value), "%d", brickinfo->rdma_port);
4850- ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_RDMA_PORT,
4851- value);
4852-
4853- snprintf (value, sizeof(value), "%d", brickinfo->decommissioned);
4854- ret = gf_store_save_value (fd, GLUSTERD_STORE_KEY_BRICK_DECOMMISSIONED,
4855- value);
4856- if (ret)
4857- goto out;
4858-
4859-out:
4860- gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
4861- return ret;
4862-}
4863-
4864-int32_t
4865-glusterd_store_perform_brick_store (glusterd_brickinfo_t *brickinfo)
4866-{
4867- int fd = -1;
4868- int32_t ret = -1;
4869- GF_ASSERT (brickinfo);
4870-
4871- fd = gf_store_mkstemp (brickinfo->shandle);
4872- if (fd <= 0) {
4873- ret = -1;
4874- goto out;
4875- }
4876-
4877- ret = glusterd_store_brickinfo_write (fd, brickinfo);
4878- if (ret)
4879- goto out;
4880-
4881-out:
4882- if (ret && (fd > 0))
4883- gf_store_unlink_tmppath (brickinfo->shandle);
4884- if (fd > 0)
4885- close (fd);
4886- gf_log (THIS->name, GF_LOG_DEBUG, "Returning %d", ret);
4887- return ret;
4888-}
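
glusterd_store_perform_brick_store writes the new brick file through a temporary path (gf_store_mkstemp) and unlinks that temp file on failure; the store handle's temp file is later renamed into place (not shown in this hunk), which is what makes the update atomic on POSIX filesystems. The underlying pattern, reduced to a self-contained sketch:

    #include <stdio.h>
    #include <unistd.h>

    /* Write payload beside the target, then atomically swap it in. */
    static int
    store_atomically (const char *path, const char *tmppath, const char *payload)
    {
            FILE *fp = fopen (tmppath, "w");

            if (!fp)
                    return -1;
            if (fputs (payload, fp) < 0 || fflush (fp) != 0 ||
                fsync (fileno (fp)) != 0) {
                    fclose (fp);
                    unlink (tmppath);
                    return -1;
            }
            if (fclose (fp) != 0) {
                    unlink (tmppath);
                    return -1;
            }
            return rename (tmppath, path);  /* atomic replace on POSIX */
    }
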
4889-
4890-int32_t
4891-glusterd_store_brickinfo (glusterd_volinfo_t *volinfo,
4892- glusterd_brickinfo_t *brickinfo, int32_t brick_count,
4893- int vol_fd)
4894-{
4895- int32_t ret = -1;
4896-
4897- GF_ASSERT (volinfo);
4898- GF_ASSERT (brickinfo);
4899-
4900- ret = glusterd_store_volinfo_brick_fname_write (vol_fd, brickinfo,
4901- brick_count);
4902- if (ret)
4903- goto out;
4904-
4905- ret = glusterd_store_create_brick_dir (volinfo);
4906- if (ret)
4907- goto out;
4908-
4909- ret = glusterd_store_create_brick_shandle_on_absence (volinfo,
4910- brickinfo);
4911- if (ret)
4912- goto out;
4913-
4914- ret = glusterd_store_perform_brick_store (brickinfo);
4915-out:
4916- gf_log (THIS->name, GF_LOG_DEBUG, "Returning with %d", ret);
4917- return ret;
4918-}
4919-
4920-int32_t
4921-glusterd_store_delete_brick (glusterd_volinfo_t *volinfo,
4922- glusterd_brickinfo_t *brickinfo)
4923-{
4924- int32_t ret = -1;
4925- glusterd_conf_t *priv = NULL;
4926- char path[PATH_MAX] = {0,};
4927- char brickpath[PATH_MAX] = {0,};
4928- char *ptr = NULL;
4929- char *tmppath = NULL;
4930- xlator_t *this = NULL;
4931-
4932- this = THIS;
4933- GF_ASSERT (this);
4934- GF_ASSERT (volinfo);
4935- GF_ASSERT (brickinfo);
4936-
4937- priv = this->private;
4938-
4939- GF_ASSERT (priv);
4940-
4941- GLUSTERD_GET_BRICK_DIR (path, volinfo, priv);
4942-
4943- tmppath = gf_strdup (brickinfo->path);
4944-        if (!tmppath) goto out;
4945- ptr = strchr (tmppath, '/');
4946-
4947- while (ptr) {
4948- *ptr = '-';
4949- ptr = strchr (tmppath, '/');
4950- }
4951-
4952- snprintf (brickpath, sizeof (brickpath), "%s/%s:%s",
4953- path, brickinfo->hostname, tmppath);
4954-
4955- GF_FREE (tmppath);
4956-
4957- ret = unlink (brickpath);
4958-
4959- if ((ret < 0) && (errno != ENOENT)) {
4960- gf_log (this->name, GF_LOG_ERROR, "Unlink failed on %s, "
4961- "reason: %s", brickpath, strerror(errno));
4962- ret = -1;
4963- goto out;
4964- } else {
4965- ret = 0;
4966- }
4967-
4968-out:
4969- if (brickinfo->shandle) {
4970- gf_store_handle_destroy (brickinfo->shandle);
4971- brickinfo->shandle = NULL;
4972- }
4973- gf_log (this->name, GF_LOG_DEBUG, "Returning with %d", ret);
4974- return ret;
4975-}
4976-
4977-int32_t
4978-glusterd_store_remove_bricks (glusterd_volinfo_t *volinfo)
4979-{
4980- int32_t ret = 0;
4981- glusterd_brickinfo_t *tmp = NULL;
4982- glusterd_conf_t *priv = NULL;
4983- char brickdir [PATH_MAX] = {0,};
4984- DIR *dir = NULL;
4985- struct dirent *entry = NULL;
4986- char path[PATH_MAX] = {0,};
4987- xlator_t *this = NULL;
4988-
4989- this = THIS;
4990- GF_ASSERT (this);
4991-
4992- GF_ASSERT (volinfo);
4993-
4994- list_for_each_entry (tmp, &volinfo->bricks, brick_list) {
4995- ret = glusterd_store_delete_brick (volinfo, tmp);
4996- if (ret)
4997- goto out;
4998- }
4999-
5000- priv = this->private;
The diff has been truncated for viewing.
