From 5299c3f255dada8605c2cffed9eba1b68d9d42b4 Mon Sep 17 00:00:00 2001
From: Eric Wong
Date: Tue, 28 Mar 2023 12:24:37 +0000
Subject: epollexclusive: use maxevents=1 for epoll_wait

This allows us to avoid both malloc (slow) and alloca (unpredictable
stack usage) at the cost of needing to make more epoll_wait syscalls
in a rare case.

In unicorn (and most servers), I expect the most frequent setup is to
have one active listener serving the majority of the connections, so
the most frequent epoll_wait return value would be 1.  Even with more
than one pending event, any syscall overhead saved by having
epoll_wait retrieve multiple events is dwarfed by Rack app processing
overhead.

Worse yet, if a worker retrieves events faster than it can process
them, the kernel (with or without EPOLLEXCLUSIVE) can enqueue yet
another event for that worker.  In this example, `a' and `b' are both
listeners (U=userspace, K=kernel):

  K: client hits `a' and `b', enqueues them both (events #1 and #2)
  U: epoll_wait(maxevents: 2) => [ a, b ]
  K: enqueues another event for `b' (event #3)
  U: process_client(a.accept) # this takes a long time

While process_client(a.accept) is happening, `b' can have two clients
pending on a given worker.  It's actually better to leave the first
`b' event unretrieved so the second `b' event can go to the
ep->rdllist of another worker.  The kernel only enqueues an epoll item
if it is not already enqueued, so it is impossible for a single
epoll_wait call to ever retrieve `[ b, b ]'.
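As a rough illustration of the per-worker pattern this change relies
on, a self-contained sketch might look like the following.  This is
not unicorn's actual code: `listen_fd' is assumed to be a listening
socket set up elsewhere, and client processing is elided.

/*
 * Sketch: one epoll fd per worker, all watching the same listener
 * with EPOLLEXCLUSIVE, each plucking one event per epoll_wait call.
 */
#include <sys/epoll.h>
#include <sys/socket.h>
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>

void worker_loop(int listen_fd)
{
	struct epoll_event ev = { 0 };
	int epfd = epoll_create1(EPOLL_CLOEXEC); /* per-worker epoll fd */

	if (epfd < 0) abort();

	/* EPOLLEXCLUSIVE (Linux 4.5+) avoids waking every worker per event */
	ev.events = EPOLLIN | EPOLLEXCLUSIVE;
	ev.data.u64 = 0; /* unicorn stores an index into its readers array */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) abort();

	for (;;) {
		/* maxevents=1: pluck one item off ep->rdllist at-a-time */
		int n = epoll_wait(epfd, &ev, 1, -1);

		if (n < 0 && errno != EINTR) abort();
		if (n == 1) {
			int client_fd = accept(listen_fd, NULL, NULL);

			if (client_fd >= 0) {
				/* process_client(client_fd) would run here */
				close(client_fd);
			}
		}
	}
}

With every worker blocked in its own epoll_wait on the same listener,
EPOLLEXCLUSIVE typically wakes only one of them per incoming
connection, and maxevents=1 leaves any additional queued events on
ep->rdllist for other workers to retrieve.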
---
 ext/unicorn_http/epollexclusive.h | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/ext/unicorn_http/epollexclusive.h b/ext/unicorn_http/epollexclusive.h
index 677e1fe..8f4ea9a 100644
--- a/ext/unicorn_http/epollexclusive.h
+++ b/ext/unicorn_http/epollexclusive.h
@@ -64,18 +64,22 @@ static VALUE prep_readers(VALUE cls, VALUE readers)
 
 #if USE_EPOLL
 struct ep_wait {
-	struct epoll_event *events;
+	struct epoll_event event;
 	rb_io_t *fptr;
-	int maxevents;
 	int timeout_msec;
 };
 
 static void *do_wait(void *ptr) /* runs w/o GVL */
 {
 	struct ep_wait *epw = ptr;
-
-	return (void *)(long)epoll_wait(epw->fptr->fd, epw->events,
-				epw->maxevents, epw->timeout_msec);
+	/*
+	 * Linux delivers epoll events in the order received, and using
+	 * maxevents=1 ensures we pluck one item off ep->rdllist
+	 * at-a-time (c.f. fs/eventpoll.c in linux.git, it's quite
+	 * easy-to-understand for anybody familiar with Ruby C).
+	 */
+	return (void *)(long)epoll_wait(epw->fptr->fd, &epw->event, 1,
+				epw->timeout_msec);
 }
 
 /* :nodoc: */
@@ -84,14 +88,10 @@
 get_readers(VALUE epio, VALUE ready, VALUE readers, VALUE timeout_msec)
 {
 	struct ep_wait epw;
-	long i, n;
-	VALUE buf;
+	long n;
 
 	Check_Type(ready, T_ARRAY);
 	Check_Type(readers, T_ARRAY);
-	epw.maxevents = RARRAY_LENINT(readers);
-	buf = rb_str_buf_new(sizeof(struct epoll_event) * epw.maxevents);
-	epw.events = (struct epoll_event *)RSTRING_PTR(buf);
 
 	epio = rb_io_get_io(epio);
 	GetOpenFile(epio, epw.fptr);
@@ -99,17 +99,12 @@ get_readers(VALUE epio, VALUE ready, VALUE readers, VALUE timeout_msec)
 
 	n = (long)rb_thread_call_without_gvl(do_wait, &epw, RUBY_UBF_IO, NULL);
 	if (n < 0) {
 		if (errno != EINTR) rb_sys_fail("epoll_wait");
-		n = 0;
-	}
-	/* Linux delivers events in order received */
-	for (i = 0; i < n; i++) {
-		struct epoll_event *ev = &epw.events[i];
-		VALUE obj = rb_ary_entry(readers, ev->data.u64);
+	} else if (n > 0) { /* maxevents is hardcoded to 1 */
+		VALUE obj = rb_ary_entry(readers, epw.event.data.u64);
 		if (RTEST(obj)) rb_ary_push(ready, obj);
-	}
-	rb_str_resize(buf, 0);
+	} /* n == 0 : timeout */
 	return Qfalse;
 }
 #endif /* USE_EPOLL */
 
-- 
cgit v1.2.3-24-ge0c7