During a ZFS scrub, my kernel crashed in the function avl_rotation in openzfs/module/avl/avl.c because gchild was null in this code: gchild = child->avl_child[right]; gleft = gchild->avl_child[left]; gright = gchild->avl_child[right]; In more detail: while a zpool scrub was active and nothing else was going on, my system crashed due to a page fault in kernel mode. The faulting address was 0x8, which is consistent with reading avl_child[1] (offset 8) through a null gchild pointer. Hardware is CPU: AMD Opteron(tm) X3421 APU (2096.10-MHz K8-class CPU) Origin="AuthenticAMD" Id=0x660f01 Family=0x15 Model=0x60 Stepping=1. Software is 13.1-STABLE as of January 9 (f61fca7409f6). The call chain at the fault is: avl_rotation + 0x51, avl_remove + 0x1b3, scan_io_queues_run_one + 0xad3, taskq_run + 0x1f. The thread name is "zfskern/dsl_scan_iss". The struct trapframe passed to trap_pfault holds: (lldb) p/x *frame (trapframe) $2 = { tf_rdi = 0xfffff800c1210460 tf_rsi = 0xfffff80300b0d968 tf_rdx = 0x0000000000000001 tf_rcx = 0x0000000000000001 tf_r8 = 0xfffff80300b0d968 tf_r9 = 0xfffff802674b7a78 tf_rax = 0x00000000ffffffff tf_rbx = 0x00000000ffffffff tf_rbp = 0xfffffe0125165ca0 tf_r10 = 0x0000000000000000 tf_r11 = 0xfffff80300000a78 tf_r12 = 0x0000000000000000 tf_r13 = 0x0000000000000000 tf_r14 = 0x0000000000000000 tf_r15 = 0xfffff802674b7a78 tf_trapno = 0x0000000c tf_fs = 0x0013 tf_gs = 0x001b tf_addr = 0x0000000000000008 tf_flags = 0x00000001 tf_es = 0x003b tf_ds = 0x003b tf_err = 0x0000000000000000 tf_rip = 0xffffffff81d0f461 tf_cs = 0x0000000000000020 tf_rflags = 0x0000000000010246 tf_rsp = 0xfffffe0125165c68 tf_ss = 0x0000000000000028 } The faulting instruction is the load (movq) at address 0x461 in the annotated disassembly below. It corresponds to line 409 of avl.c.
# avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) 0x410: 55 pushq %rbp 0x411: 48 89 e5 movq %rsp, %rbp 0x414: 41 57 pushq %r15 0x416: 41 56 pushq %r14 0x418: 41 55 pushq %r13 0x41a: 41 54 pushq %r12 0x41c: 53 pushq %rbx 0x41d: 48 83 ec 10 subq $0x10, %rsp 0x421: 49 89 f0 movq %rsi, %r8 0x424: 89 d1 movl %edx, %ecx 0x426: f7 d1 notl %ecx 0x428: c1 e9 1f shrl $0x1f, %ecx # left = 1 0x42b: 41 89 ca movl %ecx, %r10d 0x42e: 41 83 f2 01 xorl $0x1, %r10d # right = 0 0x432: d1 fa sarl %edx 0x434: 89 d0 movl %edx, %eax 0x436: f7 d8 negl %eax # right_heavy = -1 0x438: 4c 8b 4e 10 movq 0x10(%rsi), %r9 0x43c: 8f 4a 78 10 ... bextrl $0x102, %r9d, %r13d 0x445: 49 83 e1 f8 andq $-0x8, %r9 0x449: 4c 8b 1c ce movq (%rsi,%rcx,8), %r11 # child = 0xfffff80300000a78 0x44d: 41 8b 5b 10 movl 0x10(%r11), %ebx 0x451: 83 e3 03 andl $0x3, %ebx 0x454: ff cb decl %ebx # child_bal = -1 0x456: 39 c3 cmpl %eax, %ebx 0x458: 75 73 jne 0x4cd # first if (...) 0x45a: 45 89 d6 movl %r10d, %r14d 0x45d: 4f 8b 24 f3 movq (%r11,%r14,8), %r12 # loads null 0x461: 4d 8b 3c cc movq (%r12,%rcx,8), %r15 # page fault %r12=0, %rcx=1 0x465: 4b 8b 1c f4 movq (%r12,%r14,8), %rbx 0x469: 49 89 1c c8 movq %rbx, (%r8,%rcx,8) 0x46d: 48 85 db testq %rbx, %rbx 0x470: 74 2c je 0x49e # if (gright != NULL)
This happened again while running 13.2-STABLE at 79ce96abd6c5. It probably also happened a couple of other times since my original submission, when the system was unable to save or analyze a crash. This time the AVL tree is corrupt and avl_walk crashes while following a bad pointer. The initial node is: (kgdb) p/x *(avl_node_t *)0xfffff801476764a0 $20 = {avl_child = {0x0, 0xfffff80200004d20}, avl_pcb = 0xfffff801f1c461fa} Following avl_child[1] of that node leads to: (kgdb) p/x *(avl_node_t *)0xfffff80200004d20 $21 = {avl_child = {0x395753c375b177a6, 0xfa91e69b009252c}, avl_pcb = 0xfffff801476764a6} Following avl_child[0] of this second node leads to a general protection fault (GPF), because 0x395753c375b177a6 is used as a base address. The relevant stack frames are: #7 avl_walk (tree=tree@entry=0xfffff80009178260, oldnode=oldnode@entry=0xfffff80147676440, left=left@entry=1) at /usr/src/sys/contrib/openzfs/module/avl/avl.c:147 #8 0xffffffff81c1bea5 in scan_io_queue_gather (queue=0xfffff80009178200, list=0xfffffe010f60eda8, rs=<optimized out>) at /usr/src/sys/contrib/openzfs/module/zfs/dsl_scan.c:2942 #9 scan_io_queues_run_one (arg=0xfffff80009178200) at /usr/src/sys/contrib/openzfs/module/zfs/dsl_scan.c:3093 #10 0xffffffff81b41bbf in taskq_run (arg=0xfffff80041735d80, pending=<optimized out>) at /usr/src/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c:315
Submitted as openzfs issue 15271.