From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linux-kernel-owner@vger.kernel.org>
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1751932AbcCIBZJ (ORCPT <rfc822;w@1wt.eu>);
	Tue, 8 Mar 2016 20:25:09 -0500
Received: from mail.kernel.org ([198.145.29.136]:52060 "EHLO mail.kernel.org"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1750850AbcCIBZC (ORCPT <rfc822;linux-kernel@vger.kernel.org>);
	Tue, 8 Mar 2016 20:25:02 -0500
From: Andy Lutomirski <luto@kernel.org>
To: x86@kernel.org
Cc: linux-kernel@vger.kernel.org, Borislav Petkov <bp@alien8.de>,
        "musl@lists.openwall.com" <musl@lists.openwall.com>,
        Andy Lutomirski <luto@kernel.org>
Subject: [RFC PATCH] x86/vdso/32: Add AT_SYSINFO cancellation helpers
Date: Tue,  8 Mar 2016 17:24:52 -0800
Message-Id: <06079088639eddd756e2092b735ce4a682081308.1457486598.git.luto@kernel.org>
X-Mailer: git-send-email 2.5.0
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org

musl implements system call cancellation in an unusual but clever way.
When a thread issues a cancellable syscall, musl issues the syscall
through a special thunk that looks roughly like this:

cancellable_syscall:
	test whether a cancel is queued
	jnz cancel_me
	int $0x80
end_cancellable_syscall:

If a pthread cancellation signal hits with
cancellable_syscall <= EIP < end_cancellable_syscall, then the
signal interrupted a cancellation point before the syscall in
question started.  If so, musl's signal handler rewrites the calling
context to skip the syscall and simulate a -EINTR return.  The caller
will detect this simulated -EINTR or an actual -EINTR and handle a
possible cancellation event.

This technique doesn't work if int $0x80 is replaced by a call to
AT_SYSINFO: the signal handler can no longer tell whether it's
interrupting a call to AT_SYSINFO or, if it is, where AT_SYSINFO was
called from.

Add minimal helpers so that musl's signal handler can learn the
status of a possible pending AT_SYSINFO invocation and, if it hasn't
entered the kernel yet, abort it without needing to parse the vdso
DWARF unwind data.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---

musl people-

Does this solve your AT_SYSINFO cancellation problem?  I'd like to
make sure it survives an actual implementation before I commit to the ABI.

x86 people-

Are you okay with this idea?


 arch/x86/entry/vdso/Makefile                      |   3 +-
 arch/x86/entry/vdso/vdso32/cancellation_helpers.c | 116 ++++++++++++++++++++++
 arch/x86/entry/vdso/vdso32/vdso32.lds.S           |   2 +
 tools/testing/selftests/x86/unwind_vdso.c         |  57 +++++++++--
 4 files changed, 171 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/entry/vdso/vdso32/cancellation_helpers.c

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index b88846471247..465052b49603 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -130,7 +130,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
 
 targets += vdso32/vdso32.lds
 targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-targets += vdso32/vclock_gettime.o
+targets += vdso32/vclock_gettime.o vdso32/cancellation_helpers.o
 
 KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
 $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
@@ -150,6 +150,7 @@ $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
 $(obj)/vdso32.so.dbg: FORCE \
 		      $(obj)/vdso32/vdso32.lds \
 		      $(obj)/vdso32/vclock_gettime.o \
+		      $(obj)/vdso32/cancellation_helpers.o \
 		      $(obj)/vdso32/note.o \
 		      $(obj)/vdso32/system_call.o \
 		      $(obj)/vdso32/sigreturn.o
diff --git a/arch/x86/entry/vdso/vdso32/cancellation_helpers.c b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
new file mode 100644
index 000000000000..3cb2e88baec6
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/cancellation_helpers.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Andrew Lutomirski
+ * Subject to the GNU Public License, v.2
+ *
+ * This provides helpers to enable libc implementations to cancel
+ * interrupted AT_SYSINFO invocations without needing to parse the
+ * DWARF unwinding instructions.
+ */
+
+#include <asm/signal.h>
+#include <asm/sigframe.h>
+
+extern char __kernel_vsyscall[] __attribute__((visibility("hidden")));
+extern char int80_landing_pad[] __attribute__((visibility("hidden")));
+
+static unsigned long *pending_syscall_retaddr_ptr(const void *context)
+{
+	const struct ucontext_ia32 *uc = context;
+	unsigned long ctx_eip = uc->uc_mcontext.ip;
+	unsigned long offset_into_vsyscall;
+	unsigned long *retaddr;
+
+	/*
+	 * An AT_SYSINFO system call is pending if and only if we're in
+	 * __kernel_vsyscall before int80_landing_pad.  If we're at
+	 * int80_landing_pad or beyond, we've finished the system call
+	 * and are on our way out.
+	 *
+	 * If we're at int80_landing_pad-2, then either we're using the
+	 * int $0x80 slow path because we have no fast system call
+	 * support or we are restarting a fast system call.  Either way,
+	 * the system call is still pending.
+	 */
+
+	if (ctx_eip < (unsigned long)__kernel_vsyscall ||
+	    ctx_eip >= (unsigned long)int80_landing_pad)
+		return NULL;
+
+	/*
+	 * The first three instructions of __kernel_vsyscall are one-byte
+	 * pushes.
+	 */
+	offset_into_vsyscall = (ctx_eip - (unsigned long)__kernel_vsyscall);
+	retaddr = (unsigned long *)uc->uc_mcontext.sp;
+	if (offset_into_vsyscall < 3)
+		retaddr += offset_into_vsyscall;
+	else
+		retaddr += 3;
+
+	/*
+	 * GCC (correctly) fails to deduce that retaddr can't be NULL
+	 * in the success path.  Helping it out reduces code size.
+	 */
+	if (!retaddr)
+		__builtin_unreachable();
+
+	return retaddr;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, returns
+ * the return address of that syscall.  Otherwise returns -1UL.
+ */
+unsigned long __vdso_pending_syscall_return_address(const void *context)
+{
+	unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+	return retaddr ? *retaddr : -1UL;
+}
+
+/*
+ * If context is a sigcontext for a pending AT_SYSINFO syscall, then
+ * this will pop off the call frame and point the context to
+ * AT_SYSINFO's return address.  ESP will contain whatever value it had
+ * immediately prior to the call instruction (i.e. ESP acts as though
+ * the system call returned normally).  EAX will be set to -EINTR.  All
+ * other GPRs will be clobbered.  __vdso_abort_pending_syscall will
+ * return 0.
+ *
+ * If context is a valid sigcontext that does not represent a pending
+ * AT_SYSINFO syscall, then __vdso_abort_pending_syscall returns
+ * -EINVAL.
+ *
+ * If context is not a valid sigcontext at all, behavior is undefined.
+ */
+long __vdso_abort_pending_syscall(void *context)
+{
+	struct ucontext_ia32 *uc = context;
+	unsigned long *retaddr = pending_syscall_retaddr_ptr(context);
+
+	if (!retaddr)
+		return -EINVAL;
+
+	uc->uc_mcontext.ip = *retaddr;
+	uc->uc_mcontext.sp = (unsigned long)(retaddr + 1);
+
+	/*
+	 * Clobber GPRs -- we don't want to implement full unwinding, and we
+	 * don't want userspace to start expecting anything about the final
+	 * state of the GPRs.
+	 *
+	 * (There really are subtleties here.  EAX can be clobbered by
+	 *  syscall restart, and register limitations mean that the
+	 *  saved context has at least one of the argument registers
+	 *  used for a different purpose by the calling sequence just
+	 *  prior to kernel entry.  In the current implementation, that
+	 *  register is EBP, but it could change.)
+	 */
+	uc->uc_mcontext.ax = -EINTR;
+	uc->uc_mcontext.bx = 0xFFFFFFFF;
+	uc->uc_mcontext.cx = 0xFFFFFFFF;
+	uc->uc_mcontext.dx = 0xFFFFFFFF;
+	uc->uc_mcontext.si = 0xFFFFFFFF;
+	uc->uc_mcontext.di = 0xFFFFFFFF;
+	uc->uc_mcontext.bp = 0xFFFFFFFF;
+	return 0;
+}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 31056cf294bf..f04e8bd30755 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,6 +25,8 @@ VERSION
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_time;
+		__vdso_pending_syscall_return_address;
+		__vdso_abort_pending_syscall;
 	};
 
 	LINUX_2.5 {
diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c
index 00a26a82fa98..7c649b4b6834 100644
--- a/tools/testing/selftests/x86/unwind_vdso.c
+++ b/tools/testing/selftests/x86/unwind_vdso.c
@@ -35,6 +35,7 @@ int main()
 #include <syscall.h>
 #include <unistd.h>
 #include <string.h>
+#include <errno.h>
 #include <inttypes.h>
 #include <sys/mman.h>
 #include <signal.h>
@@ -88,8 +89,12 @@ static unsigned long sysinfo;
 static bool got_sysinfo = false;
 static unsigned long return_address;
 
+static unsigned long (*vdso_pending_syscall_return_address)(
+	const void *context);
+
 struct unwind_state {
 	unsigned long ip;	/* trap source */
+	unsigned long ax;	/* ax at call site */
 	int depth;		/* -1 until we hit the trap source */
 };
 
@@ -115,7 +120,7 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
 		unsigned long ebp = _Unwind_GetGR(ctx, 5);
 		unsigned long esi = _Unwind_GetGR(ctx, 6);
 		unsigned long edi = _Unwind_GetGR(ctx, 7);
-		bool ok = (eax == SYS_getpid || eax == getpid()) &&
+		bool ok = (eax == SYS_break || eax == -ENOSYS) &&
 			ebx == 1 && ecx == 2 && edx == 3 &&
 			esi == 4 && edi == 5 && ebp == 6;
 
@@ -125,6 +130,8 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
 		       (ok ? "OK" : "FAIL"),
 		       eax, ebx, ecx, edx, esi, edi, ebp);
 
+		state->ax = eax;
+
 		return _URC_NORMAL_STOP;
 	} else {
 		state->depth++;
@@ -137,6 +144,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
 	ucontext_t *ctx = (ucontext_t *)ctx_void;
 	struct unwind_state state;
 	unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP];
+	unsigned long reported_return_address = 0;
 
 	if (!got_sysinfo && ip == sysinfo) {
 		got_sysinfo = true;
@@ -148,8 +156,15 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
 		       ip, return_address);
 	}
 
-	if (!got_sysinfo)
-		return;		/* Not there yet */
+	if (!got_sysinfo) {
+		if (vdso_pending_syscall_return_address &&
+		    vdso_pending_syscall_return_address(ctx_void) != -1UL) {
+			printf("[FAIL]\t__vdso_pending_syscall_return_address incorrectly detected a pending syscall\n");
+			nerrs++;
+		}
+
+		return;		/* We haven't started AT_SYSINFO yet */
+	}
 
 	if (ip == return_address) {
 		ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF;
@@ -157,11 +172,32 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
 		return;
 	}
 
-	printf("\tSIGTRAP at 0x%lx\n", ip);
+	if (vdso_pending_syscall_return_address) {
+		reported_return_address =
+			vdso_pending_syscall_return_address(ctx_void);
+		if (reported_return_address != -1UL)
+			printf("\tSIGTRAP at 0x%lx, pending syscall will return to 0x%lx\n",
+			       ip, reported_return_address);
+		else
+			printf("\tSIGTRAP at 0x%lx, no syscall pending\n", ip);
+	} else {
+		printf("\tSIGTRAP at 0x%lx\n", ip);
+	}
 
 	state.ip = ip;
 	state.depth = -1;
 	_Unwind_Backtrace(trace_fn, &state);
+
+	if (vdso_pending_syscall_return_address) {
+		unsigned long expected =
+			(state.ax == SYS_break ? return_address : -1UL);
+		if (reported_return_address != expected) {
+			printf("[FAIL]\t  __vdso_pending_syscall_return_address returned 0x%lx; expected 0x%lx\n", reported_return_address, expected);
+			nerrs++;
+		} else {
+			printf("[OK]\t  __vdso_pending_syscall_return_address returned the correct value\n");
+		}
+	}
 }
 
 int main()
@@ -177,12 +213,21 @@ int main()
 		       info.dli_fname, info.dli_fbase);
 	}
 
+	void *vdso = dlopen("linux-gate.so.1", RTLD_NOW);
+	if (vdso)
+		vdso_pending_syscall_return_address = dlsym(vdso, "__vdso_pending_syscall_return_address");
+
 	sethandler(SIGTRAP, sigtrap, 0);
 
-	syscall(SYS_getpid);  /* Force symbol binding without TF set. */
+	syscall(SYS_break);  /* Force symbol binding without TF set. */
 	printf("[RUN]\tSet TF and check a fast syscall\n");
 	set_eflags(get_eflags() | X86_EFLAGS_TF);
-	syscall(SYS_getpid, 1, 2, 3, 4, 5, 6);
+
+	/*
+	 * We need a harmless syscall that will never return its own syscall
+	 * nr.  SYS_break is not implemented and returns -ENOSYS.
+	 */
+	syscall(SYS_break, 1, 2, 3, 4, 5, 6);
 	if (!got_sysinfo) {
 		set_eflags(get_eflags() & ~X86_EFLAGS_TF);
 
-- 
2.5.0