From 16b4c4019c7b7904de171c588ed8c72ca732d2cf Mon Sep 17 00:00:00 2001
From: Garrett D'Amore
Date: Wed, 9 May 2018 17:21:27 -0700
Subject: fixes #352 aio lock is burning hot
 fixes #326 consider nni_taskq_exec_synch()
 fixes #410 kqueue implementation could be smarter
 fixes #411 epoll implementation could be smarter
 fixes #426 synchronous completion can lead to panic
 fixes #421 pipe close race condition/duplicate destroy

This is a major refactoring of two significant parts of the code base,
which are closely interrelated.

First, the aio and taskq frameworks have undergone a number of
simplifications and improvements.  We have ditched a few parts of the
internal API (for example, tasks no longer support cancellation) that
weren't terribly useful but added a lot of complexity, and
aio_schedule now checks for cancellation or other "premature"
completions itself.  The aio framework now uses tasks more tightly, so
that waiting on an aio can devolve into just nni_task_wait().  We did
have to add a "task_prep()" step to prevent race conditions.  (The
hunks below show the resulting caller pattern; a brief sketch also
follows the patch.)

Second, the entire POSIX poller framework has been simplified and made
more robust and more scalable.  There were some fairly inherent race
conditions around the shutdown/close code, where we *thought* we were
synchronizing against the other thread but weren't doing so
adequately.  With a cleaner design we have been able to tighten up the
implementation to remove these race conditions, while substantially
reducing the chance of lock contention and thereby improving
scalability.  The illumos poller also got a performance boost by
polling for multiple events at a time.

In highly "busy" systems, we expect to see vast reductions in lock
contention, and therefore greater scalability, in addition to overall
improved reliability.

One area where we can still do better is that only a single poller
thread is run.  Scaling this out is a task that has to be done
differently for each poller, and carefully, to ensure that close
conditions are safe on all pollers and that there is no chance of
deadlock/livelock while waiting for pfd finalizers.
---
 src/protocol/reqrep0/rep.c | 26 ++++++++++++--------------
 src/protocol/reqrep0/req.c |  4 ++--
 2 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'src/protocol/reqrep0')

diff --git a/src/protocol/reqrep0/rep.c b/src/protocol/reqrep0/rep.c
index 019727be..965cbea7 100644
--- a/src/protocol/reqrep0/rep.c
+++ b/src/protocol/reqrep0/rep.c
@@ -216,8 +216,7 @@ rep0_ctx_send(void *arg, nni_aio *aio)
 		return;
 	}
 
-	rv = nni_aio_schedule_verify(aio, rep0_ctx_cancel_send, ctx);
-	if (rv != 0) {
+	if ((rv = nni_aio_schedule(aio, rep0_ctx_cancel_send, ctx)) != 0) {
 		nni_mtx_unlock(&s->lk);
 		nni_aio_finish_error(aio, rv);
 		return;
@@ -354,6 +353,9 @@ rep0_pipe_stop(void *arg)
 	rep0_sock *s = p->rep;
 	rep0_ctx * ctx;
 
+	nni_aio_stop(p->aio_send);
+	nni_aio_stop(p->aio_recv);
+
 	nni_mtx_lock(&s->lk);
 	if (nni_list_active(&s->recvpipes, p)) {
 		// We are no longer "receivable".
@@ -379,9 +381,6 @@ rep0_pipe_stop(void *arg)
 	}
 	nni_idhash_remove(s->pipes, nni_pipe_id(p->pipe));
 	nni_mtx_unlock(&s->lk);
-
-	nni_aio_stop(p->aio_send);
-	nni_aio_stop(p->aio_recv);
 }
 
 static void
@@ -459,8 +458,7 @@ rep0_ctx_recv(void *arg, nni_aio *aio)
 	nni_mtx_lock(&s->lk);
 	if ((p = nni_list_first(&s->recvpipes)) == NULL) {
 		int rv;
-		rv = nni_aio_schedule_verify(aio, rep0_cancel_recv, ctx);
-		if (rv != 0) {
+		if ((rv = nni_aio_schedule(aio, rep0_cancel_recv, ctx)) != 0) {
 			nni_mtx_unlock(&s->lk);
 			nni_aio_finish_error(aio, rv);
 			return;
@@ -516,10 +514,10 @@ rep0_pipe_recv_cb(void *arg)
 		bool end = false;
 
 		if (hops > s->ttl) {
-			// This isn't malformed, but it has gone through
-			// too many hops. Do not disconnect, because we
-			// can legitimately receive messages with too many
-			// hops from devices, etc.
+			// This isn't malformed, but it has gone
+			// through too many hops. Do not disconnect,
+			// because we can legitimately receive messages
+			// with too many hops from devices, etc.
 			goto drop;
 		}
 		hops++;
@@ -566,9 +564,9 @@ rep0_pipe_recv_cb(void *arg)
 	nni_msg_header_clear(msg);
 	ctx->pipe_id = p->id;
 
-	// If we got a request on a pipe that wasn't busy, we should mark
-	// it sendable. (The sendable flag is not set when there is no
-	// request needing a reply.)
+	// If we got a request on a pipe that wasn't busy, we should
+	// mark it sendable. (The sendable flag is not set when there
+	// is no request needing a reply.)
 	if ((ctx == s->ctx) && (!p->busy)) {
 		nni_pollable_raise(s->sendable);
 	}
diff --git a/src/protocol/reqrep0/req.c b/src/protocol/reqrep0/req.c
index 3ecc8604..bbb0b886 100644
--- a/src/protocol/reqrep0/req.c
+++ b/src/protocol/reqrep0/req.c
@@ -638,7 +638,7 @@ req0_ctx_recv(void *arg, nni_aio *aio)
 
 	if ((msg = ctx->repmsg) == NULL) {
 		int rv;
-		rv = nni_aio_schedule_verify(aio, req0_ctx_cancel_recv, ctx);
+		rv = nni_aio_schedule(aio, req0_ctx_cancel_recv, ctx);
 		if (rv != 0) {
 			nni_mtx_unlock(&s->mtx);
 			nni_aio_finish_error(aio, rv);
@@ -740,7 +740,7 @@ req0_ctx_send(void *arg, nni_aio *aio)
 	}
 	// If no pipes are ready, and the request was a poll (no background
 	// schedule), then fail it. Should be NNG_TIMEDOUT.
-	rv = nni_aio_schedule_verify(aio, req0_ctx_cancel_send, ctx);
+	rv = nni_aio_schedule(aio, req0_ctx_cancel_send, ctx);
 	if ((rv != 0) && (nni_list_empty(&s->readypipes))) {
 		nni_idhash_remove(s->reqids, id);
 		nni_mtx_unlock(&s->mtx);
-- 
cgit v1.2.3-70-g09d2
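
A minimal sketch of the new scheduling pattern this patch adopts, for
readers following the hunks above.  The nni_* calls are the internal
nng API already used in the diff; the my_ctx/my_cancel names and the
waiter list are hypothetical, purely for illustration:

    static void
    my_ctx_recv(my_ctx *ctx, nni_aio *aio)
    {
            int rv;

            nni_mtx_lock(&ctx->mtx);
            // nni_aio_schedule() now performs the cancellation and
            // "premature completion" checks that the old
            // nni_aio_schedule_verify() did; a nonzero return means
            // the operation must not be parked for later completion.
            if ((rv = nni_aio_schedule(aio, my_cancel, ctx)) != 0) {
                    nni_mtx_unlock(&ctx->mtx);
                    nni_aio_finish_error(aio, rv);
                    return;
            }
            // Safe to park the aio until data arrives (the waiter
            // list here is a hypothetical stand-in).
            nni_list_append(&ctx->waiters, aio);
            nni_mtx_unlock(&ctx->mtx);
    }

Folding the schedule call into the error check, as the rep.c hunks do,
keeps the lock held for the shortest possible window between the
schedule decision and either parking the aio or failing it.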