Skip to content

Commit

Permalink
linux: don't delay EPOLL_CTL_DEL operations (libuv#4328)
Browse files Browse the repository at this point in the history
Perform EPOLL_CTL_DEL immediately instead of going through
io_uring's submit queue, otherwise the file descriptor may
be closed by the time the kernel starts the operation.

Fixes: libuv#4323
  • Loading branch information
bnoordhuis authored and Keno committed Mar 23, 2024
1 parent 344a3f5 commit ca3a5a4
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 48 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ if(LIBUV_BUILD_TESTS)
test/test-hrtime.c
test/test-idle.c
test/test-idna.c
test/test-iouring-pollhup.c
test/test-ip4-addr.c
test/test-ip6-addr.c
test/test-ip-name.c
Expand Down
1 change: 1 addition & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ test_run_tests_SOURCES = test/blackhole-server.c \
test/test-hrtime.c \
test/test-idle.c \
test/test-idna.c \
test/test-iouring-pollhup.c \
test/test-ip4-addr.c \
test/test-ip6-addr.c \
test/test-ip-name.c \
Expand Down
95 changes: 47 additions & 48 deletions src/unix/linux.c
Original file line number Diff line number Diff line change
Expand Up @@ -690,23 +690,17 @@ void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
* This avoids a problem where the same file description remains open
* in another process, causing repeated junk epoll events.
*
* Perform EPOLL_CTL_DEL immediately instead of going through
* io_uring's submit queue, otherwise the file descriptor may
* be closed by the time the kernel starts the operation.
*
* We pass in a dummy epoll_event, to work around a bug in old kernels.
*
* Work around a bug in kernels 3.10 to 3.19 where passing a struct that
* has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
*/
memset(&dummy, 0, sizeof(dummy));

if (inv == NULL) {
epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
} else {
uv__epoll_ctl_prep(loop->backend_fd,
&lfields->ctl,
inv->prep,
EPOLL_CTL_DEL,
fd,
&dummy);
}
epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
}


Expand Down Expand Up @@ -1184,6 +1178,10 @@ static void uv__poll_io_uring(uv_loop_t* loop, struct uv__iou* iou) {
}


/* Only for EPOLL_CTL_ADD and EPOLL_CTL_MOD. EPOLL_CTL_DEL should always be
* executed immediately, otherwise the file descriptor may have been closed
* by the time the kernel starts the operation.
*/
static void uv__epoll_ctl_prep(int epollfd,
struct uv__iou* ctl,
struct epoll_event (*events)[256],
Expand All @@ -1195,45 +1193,28 @@ static void uv__epoll_ctl_prep(int epollfd,
uint32_t mask;
uint32_t slot;

if (ctl->ringfd == -1) {
if (!epoll_ctl(epollfd, op, fd, e))
return;

if (op == EPOLL_CTL_DEL)
return; /* Ignore errors, may be racing with another thread. */

if (op != EPOLL_CTL_ADD)
abort();

if (errno != EEXIST)
abort();

/* File descriptor that's been watched before, update event mask. */
if (!epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, e))
return;

abort();
} else {
mask = ctl->sqmask;
slot = (*ctl->sqtail)++ & mask;
assert(op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD);
assert(ctl->ringfd != -1);

pe = &(*events)[slot];
*pe = *e;
mask = ctl->sqmask;
slot = (*ctl->sqtail)++ & mask;

sqe = ctl->sqe;
sqe = &sqe[slot];
pe = &(*events)[slot];
*pe = *e;

memset(sqe, 0, sizeof(*sqe));
sqe->addr = (uintptr_t) pe;
sqe->fd = epollfd;
sqe->len = op;
sqe->off = fd;
sqe->opcode = UV__IORING_OP_EPOLL_CTL;
sqe->user_data = op | slot << 2 | (int64_t) fd << 32;
sqe = ctl->sqe;
sqe = &sqe[slot];

if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
uv__epoll_ctl_flush(epollfd, ctl, events);
}
memset(sqe, 0, sizeof(*sqe));
sqe->addr = (uintptr_t) pe;
sqe->fd = epollfd;
sqe->len = op;
sqe->off = fd;
sqe->opcode = UV__IORING_OP_EPOLL_CTL;
sqe->user_data = op | slot << 2 | (int64_t) fd << 32;

if ((*ctl->sqhead & mask) == (*ctl->sqtail & mask))
uv__epoll_ctl_flush(epollfd, ctl, events);
}


Expand Down Expand Up @@ -1374,8 +1355,22 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
w->events = w->pevents;
e.events = w->pevents;
e.data.fd = w->fd;
fd = w->fd;

uv__epoll_ctl_prep(epollfd, ctl, &prep, op, w->fd, &e);
if (ctl->ringfd != -1) {
uv__epoll_ctl_prep(epollfd, ctl, &prep, op, fd, &e);
continue;
}

if (!epoll_ctl(epollfd, op, fd, &e))
continue;

assert(op == EPOLL_CTL_ADD);
assert(errno == EEXIST);

/* File descriptor that's been watched before, update event mask. */
if (epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &e))
abort();
}

inv.events = events;
Expand Down Expand Up @@ -1463,8 +1458,12 @@ void uv__io_poll(uv_loop_t* loop, int timeout) {
*
* Ignore all errors because we may be racing with another thread
* when the file descriptor is closed.
*
* Perform EPOLL_CTL_DEL immediately instead of going through
* io_uring's submit queue, otherwise the file descriptor may
* be closed by the time the kernel starts the operation.
*/
uv__epoll_ctl_prep(epollfd, ctl, &prep, EPOLL_CTL_DEL, fd, pe);
epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, pe);
continue;
}

Expand Down
111 changes: 111 additions & 0 deletions test/test-iouring-pollhup.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/* Copyright libuv project contributors. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/

#include "uv.h"
#include "task.h"

#ifdef _WIN32

TEST_IMPL(iouring_pollhup) {
RETURN_SKIP("Not on Windows.");
}

#else /* !_WIN32 */

#include <unistd.h> /* close() */

static uv_pipe_t p1;
static uv_pipe_t p2;
static uv_idle_t idle_handle;
static int iters;
static int duped_fd;
static int newpipefds[2];

static void alloc_buffer(uv_handle_t* handle,
size_t suggested_size,
uv_buf_t* buf) {
static char slab[32];
*buf = uv_buf_init(slab, sizeof(slab));
}

static void read_data2(uv_stream_t* stream,
ssize_t nread,
const uv_buf_t* buf) {
if (nread < 0) {
ASSERT_EQ(nread, UV_EOF);
ASSERT_OK(close(duped_fd));
duped_fd = -1;
uv_close((uv_handle_t*) &p2, NULL);
uv_close((uv_handle_t*) &idle_handle, NULL);
} else {
/* If nread == 0 is because of POLLHUP received still from pipefds[0] file
* description which is still referenced in duped_fd. It should not happen
* if close(p1) was called after EPOLL_CTL_DEL.
*/
ASSERT_GT(nread, 0);
}
}

static void idle_cb(uv_idle_t* handle) {
if (++iters == 1) {
ASSERT_OK(uv_pipe_open(&p2, newpipefds[0]));
ASSERT_OK(uv_read_start((uv_stream_t*) &p2, alloc_buffer, read_data2));
} else {
ASSERT_OK(uv_idle_stop(handle));
ASSERT_OK(close(newpipefds[1]));
newpipefds[1] = -1;
}
}

static void read_data(uv_stream_t* stream,
ssize_t nread,
const uv_buf_t* buf) {
ASSERT_EQ(nread, UV_EOF);
uv_close((uv_handle_t*) stream, NULL);
ASSERT_OK(uv_idle_start(&idle_handle, idle_cb));
}

TEST_IMPL(iouring_pollhup) {
uv_loop_t* loop;
int pipefds[2];

loop = uv_default_loop();
ASSERT_OK(uv_pipe_init(loop, &p1, 0));
ASSERT_OK(uv_pipe_init(loop, &p2, 0));
ASSERT_OK(uv_idle_init(loop, &idle_handle));
ASSERT_OK(pipe(pipefds));
ASSERT_OK(pipe(newpipefds));

ASSERT_OK(uv_pipe_open(&p1, pipefds[0]));
duped_fd = dup(pipefds[0]);
ASSERT_NE(duped_fd, -1);

ASSERT_OK(uv_read_start((uv_stream_t*) &p1, alloc_buffer, read_data));
ASSERT_OK(close(pipefds[1])); /* Close write end, generate POLLHUP. */
pipefds[1] = -1;

ASSERT_OK(uv_run(loop, UV_RUN_DEFAULT));

MAKE_VALGRIND_HAPPY(uv_default_loop());
return 0;
}

#endif /* !_WIN32 */
24 changes: 24 additions & 0 deletions test/test-list.h
Original file line number Diff line number Diff line change
Expand Up @@ -560,6 +560,8 @@ TEST_DECLARE (not_writable_after_shutdown)
TEST_DECLARE (not_readable_nor_writable_on_read_error)
TEST_DECLARE (readable_on_eof)

TEST_DECLARE (iouring_pollhup)

TEST_DECLARE (idna_toascii)
TEST_DECLARE (utf8_decode1)
TEST_DECLARE (utf8_decode1_overrun)
Expand Down Expand Up @@ -1200,6 +1202,28 @@ TASK_LIST_START
TEST_ENTRY (req_type_name)
TEST_ENTRY (getters_setters)

<<<<<<< HEAD
=======
#ifndef _WIN32
TEST_ENTRY (fork_timer)
TEST_ENTRY (fork_socketpair)
TEST_ENTRY (fork_socketpair_started)
TEST_ENTRY (fork_signal_to_child)
TEST_ENTRY (fork_signal_to_child_closed)
TEST_ENTRY (fork_close_signal_in_child)
#ifndef __APPLE__
TEST_ENTRY (fork_fs_events_child)
TEST_ENTRY (fork_fs_events_child_dir)
TEST_ENTRY (fork_fs_events_file_parent_child)
#endif
#ifndef __MVS__
TEST_ENTRY (fork_threadpool_queue_work_simple)
#endif
#endif

TEST_ENTRY (iouring_pollhup)

>>>>>>> 3ecce914... linux: don't delay EPOLL_CTL_DEL operations (#4328)
TEST_ENTRY (utf8_decode1)
TEST_ENTRY (utf8_decode1_overrun)
TEST_ENTRY (uname)
Expand Down

0 comments on commit ca3a5a4

Please sign in to comment.