Closed
Description
When looking at validating MPTCP+TFO for the server side, I got a kernel deadlock on top of the export branch from yesterday: a84974a → export/20221121T103206
root@(none):/home/nix.tessares.net/mbaerts/Workspace/multipath-tcp/upstream/mptcp_net-next# cd /opt/packetdrill/gtests/net/mptcp
root@(none):/opt/packetdrill/gtests/net/mptcp# /opt/packetdrill/gtests/net/packetdrill/packetdrill -v --ip_version=ipv4 --local_ip=192.168.0.2 --gateway_ip=192.168.0.1 --netmask_ip=255.255.0.0 --remote_ip=192.0.2.1/24 -D CMSG_LEVEL_IP=SOL_IP -D CMSG_TYPE_RECVERR=IP_RECVERR -D TFO_COOKIE=de4f234f0f433a55 fastopen/server-TCP_FASTOPEN-no-cookie.pkt
socket syscall: 1669138520.527445
setsockopt syscall: 1669138520.527768
bind syscall: 1669138520.528086
listen syscall: 1669138520.528386
inbound injected packet: 0.001298 S 0:500(500) win 65535 <mss 1460,sackOK,TS val 100 ecr 0,nop,wscale 8,FO,nop,nop,mp_capable v1 flags: |H| >
outbound sniffed packet: 0.002449 S. 2366816640:2366816640(0) ack 501 win 65535 <mss 1460,sackOK,TS val 1347476566 ecr 100,nop,wscale 8,mp_capable v1 flags: |H| sender_key: 665474359205250004>
fastopen/server-TCP_FASTOPEN-no-cookie.pkt:14: error handling packet: live packet field ipv4_total_length: expected: 76 (0x4c) vs actual: 72 (0x48)
script packet: 0.012451 S. 0:0(0) ack 501 <mss 1460,nop,nop,sackOK,nop,wscale 8,FO de4f234f0f433a55,nop,nop,mp_capable v1 flags: |H| sender_key: 665474359205250004>
actual packet: 0.002449 S. 0:0(0) ack 501 win 65535 <mss 1460,sackOK,TS val 1347476566 ecr 100,nop,wscale 8,mp_capable v1 flags: |H| sender_key: 665474359205250004>
[ 72.126002] sched: RT throttling activated
[ 91.474002] rcu: INFO: rcu_preempt self-detected stall on CPU
[ 91.474456] rcu: 1-....: (21001 ticks this GP) idle=9084/1/0x4000000000000000 softirq=1107/1107 fqs=5250
[ 91.475178] (t=21001 jiffies g=665 q=4 ncpus=2)
[ 91.475474] CPU: 1 PID: 162 Comm: packetdrill Not tainted 6.1.0-rc5 #361
[ 91.475970] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
[ 91.477741] RIP: 0010:queued_spin_lock_slowpath (arch/x86/include/asm/vdso/processor.h:19)
[ 91.478611] Code: 00 41 55 41 54 55 48 89 fd 53 66 90 ba 01 00 00 00 8b 45 00 85 c0 75 10 f0 0f b1 55 00 85 c0 75 f0 5b 5d 41 5c 41 5d c3 f3 90 <eb> e5 81 fe 00 01 00 00 74 4c 40 30 f6 85 f6 75 6f f0 0f ba 6d 00
All code
========
0: 00 41 55 add %al,0x55(%rcx)
3: 41 54 push %r12
5: 55 push %rbp
6: 48 89 fd mov %rdi,%rbp
9: 53 push %rbx
a: 66 90 xchg %ax,%ax
c: ba 01 00 00 00 mov $0x1,%edx
11: 8b 45 00 mov 0x0(%rbp),%eax
14: 85 c0 test %eax,%eax
16: 75 10 jne 0x28
18: f0 0f b1 55 00 lock cmpxchg %edx,0x0(%rbp)
1d: 85 c0 test %eax,%eax
1f: 75 f0 jne 0x11
21: 5b pop %rbx
22: 5d pop %rbp
23: 41 5c pop %r12
25: 41 5d pop %r13
27: c3 ret
28: f3 90 pause
2a:* eb e5 jmp 0x11 <-- trapping instruction
2c: 81 fe 00 01 00 00 cmp $0x100,%esi
32: 74 4c je 0x80
34: 40 30 f6 xor %sil,%sil
37: 85 f6 test %esi,%esi
39: 75 6f jne 0xaa
3b: f0 lock
3c: 0f .byte 0xf
3d: ba .byte 0xba
3e: 6d insl (%dx),%es:(%rdi)
...
Code starting with the faulting instruction
===========================================
0: eb e5 jmp 0xffffffffffffffe7
2: 81 fe 00 01 00 00 cmp $0x100,%esi
8: 74 4c je 0x56
a: 40 30 f6 xor %sil,%sil
d: 85 f6 test %esi,%esi
f: 75 6f jne 0x80
11: f0 lock
12: 0f .byte 0xf
13: ba .byte 0xba
14: 6d insl (%dx),%es:(%rdi)
...
[ 91.481686] RSP: 0018:ffff9d9d8028fb30 EFLAGS: 00000202
[ 91.482521] RAX: 0000000000000001 RBX: ffff8e2102ec8858 RCX: 0000000000000cc0
[ 91.483469] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8e2102ec8818
[ 91.484035] RBP: ffff8e2102ec8818 R08: 0000000000001f90 R09: ffffffff962068d0
[ 91.484640] R10: 0000000000000008 R11: ffff8e2102f6ab00 R12: ffff8e2102ec8818
[ 91.485208] R13: ffff8e2102ec8dd0 R14: 0000000000000000 R15: 0000000000000001
[ 91.485825] FS: 0000000000000000(0000) GS:ffff8e217dd00000(0000) knlGS:0000000000000000
[ 91.486517] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 91.487029] CR2: 00007f42b4ca4e40 CR3: 0000000058010000 CR4: 0000000000350ee0
[ 91.487667] Call Trace:
[ 91.487896] <TASK>
[ 91.488091] mptcp_destroy_common (include/linux/skbuff.h:1764)
[ 91.488487] mptcp_destroy (include/net/sock.h:1495)
[ 91.488821] __mptcp_destroy_sock (net/mptcp/protocol.c:2886)
[ 91.489214] __mptcp_close (net/mptcp/protocol.c:2959)
[ 91.489547] mptcp_subflow_queue_clean (net/mptcp/subflow.c:1813)
[ 91.489997] __mptcp_close_ssk (net/mptcp/protocol.c:2363)
[ 91.490372] mptcp_destroy_common (net/mptcp/protocol.c:3170)
[ 91.490747] mptcp_destroy (include/net/sock.h:1495)
[ 91.491072] __mptcp_destroy_sock (net/mptcp/protocol.c:2886)
[ 91.491449] __mptcp_close (net/mptcp/protocol.c:2959)
[ 91.491772] mptcp_close (net/mptcp/protocol.c:2974)
[ 91.492084] inet_release (net/ipv4/af_inet.c:432)
[ 91.492400] __sock_release (net/socket.c:651)
[ 91.492731] sock_close (net/socket.c:1367)
[ 91.493029] __fput (fs/file_table.c:321)
[ 91.493314] task_work_run (include/linux/sched.h:2077 (discriminator 1))
[ 91.493654] do_exit (kernel/exit.c:821)
[ 91.493952] do_group_exit (kernel/exit.c:932)
[ 91.494277] get_signal (kernel/signal.c:2825)
[ 91.494603] arch_do_signal_or_restart (arch/x86/kernel/signal.c:869)
[ 91.495034] ? do_futex (kernel/futex/syscalls.c:106)
[ 91.495350] ? __x64_sys_futex (kernel/futex/syscalls.c:183)
[ 91.495720] exit_to_user_mode_prepare (kernel/entry/common.c:170)
[ 91.496158] syscall_exit_to_user_mode (arch/x86/include/asm/jump_label.h:55)
[ 91.496577] do_syscall_64 (arch/x86/entry/common.c:87)
[ 91.496911] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)
[ 91.497372] RIP: 0033:0x7f4169e4a340
[ 91.497704] Code: Unable to access opcode bytes at 0x7f4169e4a316.
Code starting with the faulting instruction
===========================================
[ 91.498249] RSP: 002b:00007f4169db4db8 EFLAGS: 00000282 ORIG_RAX: 00000000000000ca
[ 91.498922] RAX: fffffffffffffe00 RBX: 00007f4169db5640 RCX: 00007f4169e4a340
[ 91.499545] RDX: 0000000000000002 RSI: 0000000000000080 RDI: 000055da090279c0
[ 91.500184] RBP: 00007f4169db4df0 R08: 0000000000000000 R09: 0000000000000000
[ 91.500820] R10: 0000000000000000 R11: 0000000000000282 R12: 00007f4169db5640
[ 91.501389] R13: 0000000000000002 R14: 00007f4169e4d850 R15: 00007ffe04eab210
[ 91.502007] </TASK>
The packetdrill test that can reproduce the issue:
// Receive data with TFO + cookie
--tolerance_usecs=100000
`../common/defaults.sh
sysctl -q net.ipv4.tcp_fastopen=0x602`
// A first connection to store the cookie
0.0 socket(..., SOCK_STREAM, IPPROTO_MPTCP) = 3
+0.0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.0 bind(3, ..., ...) = 0
+0.0 listen(3, 1) = 0
+0.0 < S 0:500(500) win 65535 <mss 1460, sackOK, TS val 100 ecr 0, nop, wscale 8, FO, nop, nop, mpcapable v1 flags[flag_h] nokey>
So here, packetdrill has injected a (valid) SYN with data (TFO). In theory, a SYN+ACK has been sent back by the kernel and a RST has been injected by Packetdrill just after but I cannot prove if it happened before or after the deadlock.
I didn't investigate more for the moment. This has been reproduced on a kernel without debug kconfig so far.