Bug 250351

Summary: [ZFS] [panic] panic: fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)
Product: Base System Reporter: Brooks Davis <brooks>
Component: kernAssignee: Matt Macy <mmacy>
Status: New ---    
Severity: Affects Only Me CC: jbeich, juraj, kib
Priority: ---    
Version: CURRENT   
Hardware: Any   
OS: Any   
Attachments:
Description Flags
Unconditionally enter FPU block with NOCTX
none
latest patch results none

Description Brooks Davis freebsd_committer 2020-10-14 20:19:29 UTC
As of r366676 (GitHub export 4b539280260) I get the following panic at boot after updating /etc/rc.d with openzfs bits (I suspect this is the result of populating /etc/zfs/zpool.cache):

panic: fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)                                   
cpuid = 22                                                                              
time = 1602700258                                                                       
KDB: stack backtrace:                                                                   
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe02056ec310          
vpanic() at vpanic+0x182/frame 0xfffffe02056ec360                                       
panic() at panic+0x43/frame 0xfffffe02056ec3c0                                          
fpudna() at fpudna+0x95/frame 0xfffffe02056ec3e0                                        
trap() at trap+0x34b/frame 0xfffffe02056ec4f0                                           
calltrap() at calltrap+0x8/frame 0xfffffe02056ec4f0                                     
--- trap 0x16, rip = 0xffffffff82171ac5, rsp = 0xfffffe02056ec5c0, rbp = 0xfffffe02056ec
5e0 ---                                                                                 
fletcher_4_sse2_native() at fletcher_4_sse2_native+0x25/frame 0xfffffe02056ec5e0        
abd_fletcher_4_iter() at abd_fletcher_4_iter+0x3e/frame 0xfffffe02056ec620              
abd_iterate_func() at abd_iterate_func+0x149/frame 0xfffffe02056ec6b0                   
abd_fletcher_4_native() at abd_fletcher_4_native+0x5f/frame 0xfffffe02056ec880          
zio_checksum_error_impl() at zio_checksum_error_impl+0x22b/frame 0xfffffe02056eca10     
zio_checksum_error() at zio_checksum_error+0x8f/frame 0xfffffe02056eca40                
zio_checksum_verify() at zio_checksum_verify+0x31/frame 0xfffffe02056ecab0              
zio_execute() at zio_execute+0x69/frame 0xfffffe02056ecb00                              
taskqueue_run_locked() at taskqueue_run_locked+0xaa/frame 0xfffffe02056ecb80            
taskqueue_thread_loop() at taskqueue_thread_loop+0x94/frame 0xfffffe02056ecbb0          
fork_exit() at fork_exit+0x80/frame 0xfffffe02056ecbf0                                  
fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe02056ecbf0                       
--- trap 0, rip = 0, rsp = 0, rbp = 0 ---                                               
KDB: enter: panic                                                                       
[ thread pid 0 tid 100805 ]                                                             
Stopped at      kdb_enter+0x37: movq    $0,0x10ad516(%rip)                              

At jhb@'s suggestion I added logging of fpu_kern_enter(FPU_KERN_NOCTX) and the resulting boot (limited by serial console buffer) is at https://reviews.freebsd.org/P439

I'm happy to provide more information on the pools in question.
Comment 1 Konstantin Belousov freebsd_committer 2020-10-15 00:48:14 UTC
can you show disassemble of your fletcher_4_sse2_native ?
Comment 2 Brooks Davis freebsd_committer 2020-11-03 18:39:27 UTC
Sorry for the delay, I missed kib's comment.  This is with a tree updated to r367298 (bccdcf07274c6503b6c3cd4bc909941645d5992e) and built with CROSS_TOOLCHAIN=llvm11.

000000000005dc70 <fletcher_4_sse2_native>:
   5dc70:       55                      push   %rbp
   5dc71:       48 89 e5                mov    %rsp,%rbp   
   5dc74:       41 57                   push   %r15
   5dc76:       41 56                   push   %r14
   5dc78:       53                      push   %rbx
   5dc79:       50                      push   %rax
   5dc7a:       49 89 d7                mov    %rdx,%r15
   5dc7d:       48 89 f3                mov    %rsi,%rbx
   5dc80:       49 89 fe                mov    %rdi,%r14
   5dc83:       49 01 f7                add    %rsi,%r15
   5dc86:       31 ff                   xor    %edi,%edi
   5dc88:       e8 00 00 00 00          callq  5dc8d <fletcher_4_sse2_native+0x1
d>
   5dc8d:       85 c0                   test   %eax,%eax
   5dc8f:       0f 84 98 00 00 00       je     5dd2d <fletcher_4_sse2_native+0xb
d>
   5dc95:       f3 41 0f 6f 06          movdqu (%r14),%xmm0
   5dc9a:       f3 41 0f 6f 4e 10       movdqu 0x10(%r14),%xmm1
   5dca0:       f3 41 0f 6f 56 20       movdqu 0x20(%r14),%xmm2
   5dca6:       f3 41 0f 6f 5e 30       movdqu 0x30(%r14),%xmm3
   5dcac:       66 0f ef e4             pxor   %xmm4,%xmm4
   5dcb0:       49 39 df                cmp    %rbx,%r15
   5dcb3:       76 44                   jbe    5dcf9 <fletcher_4_sse2_native+0x8
9>
   5dcb5:       66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
   5dcbc:       00 00 00
   5dcbf:       90                      nop
   5dcc0:       f3 0f 6f 2b             movdqu (%rbx),%xmm5
   5dcc4:       66 0f 6f f5             movdqa %xmm5,%xmm6
   5dcc8:       66 0f 62 ec             punpckldq %xmm4,%xmm5
   5dccc:       66 0f 6a f4             punpckhdq %xmm4,%xmm6
   5dcd0:       66 0f d4 c5             paddq  %xmm5,%xmm0
   5dcd4:       66 0f d4 c8             paddq  %xmm0,%xmm1
   5dcd8:       66 0f d4 d1             paddq  %xmm1,%xmm2
   5dcdc:       66 0f d4 da             paddq  %xmm2,%xmm3
   5dce0:       66 0f d4 c6             paddq  %xmm6,%xmm0
   5dce4:       66 0f d4 c8             paddq  %xmm0,%xmm1
   5dce8:       66 0f d4 d1             paddq  %xmm1,%xmm2
   5dcec:       66 0f d4 da             paddq  %xmm2,%xmm3
   5dcf0:       48 83 c3 10             add    $0x10,%rbx
   5dcf4:       4c 39 fb                cmp    %r15,%rbx
5dcf7:       72 c7                   jb     5dcc0 <fletcher_4_sse2_native+0x5
0>
   5dcf9:       f3 41 0f 7f 06          movdqu %xmm0,(%r14)
   5dcfe:       f3 41 0f 7f 4e 10       movdqu %xmm1,0x10(%r14)
   5dd04:       f3 41 0f 7f 56 20       movdqu %xmm2,0x20(%r14)
   5dd0a:       f3 41 0f 7f 5e 30       movdqu %xmm3,0x30(%r14)
   5dd10:       65 48 8b 3c 25 00 00    mov    %gs:0x0,%rdi
   5dd17:       00 00
   5dd19:       80 bf d8 05 00 00 00    cmpb   $0x0,0x5d8(%rdi)
   5dd20:       78 25                   js     5dd47 <fletcher_4_sse2_native+0xd
7>
   5dd22:       48 83 c4 08             add    $0x8,%rsp
   5dd26:       5b                      pop    %rbx
   5dd27:       41 5e                   pop    %r14
   5dd29:       41 5f                   pop    %r15
   5dd2b:       5d                      pop    %rbp
   5dd2c:       c3                      retq
   5dd2d:       65 48 8b 3c 25 00 00    mov    %gs:0x0,%rdi
   5dd34:       00 00
   5dd36:       31 f6                   xor    %esi,%esi
   5dd38:       ba 04 00 00 00          mov    $0x4,%edx
   5dd3d:       e8 00 00 00 00          callq  5dd42 <fletcher_4_sse2_native+0xd
2>
   5dd42:       e9 4e ff ff ff          jmpq   5dc95 <fletcher_4_sse2_native+0x2
5>
   5dd47:       31 f6                   xor    %esi,%esi
   5dd49:       48 83 c4 08             add    $0x8,%rsp
   5dd4d:       5b                      pop    %rbx
   5dd4e:       41 5e                   pop    %r14
   5dd50:       41 5f                   pop    %r15
   5dd52:       5d                      pop    %rbp
   5dd53:       e9 00 00 00 00          jmpq   5dd58 <fletcher_4_sse2_native+0xe
8>
   5dd58:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
   5dd5f:       00
Comment 3 Brooks Davis freebsd_committer 2020-11-03 20:40:55 UTC
I realized I should probably provide a matching backtrace:

KDB: stack backtrace:                                                                   
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xffffffff827878a0          
vpanic() at vpanic+0x182/frame 0xffffffff827878f0                                       
panic() at panic+0x43/frame 0xffffffff82787950                                          
fpu_kern_enter() at fpu_kern_enter+0x231/frame 0xffffffff82787980                       
fletcher_4_sse2_native() at fletcher_4_sse2_native+0xd2/frame 0xffffffff827879b0        
fletcher_4_native() at fletcher_4_native+0xeb/frame 0xffffffff82787ba0                  
fletcher_4_benchmark_impl() at fletcher_4_benchmark_impl+0xe0/frame 0xffffffff82787c40  
fletcher_4_init() at fletcher_4_init+0x1a7/frame 0xffffffff82787c60                     
mi_startup() at mi_startup+0xec/frame 0xffffffff82787cb0                                
btext() at btext+0x2c                                                                   
KDB: enter: panic                                                                       
[ thread pid 0 tid 100000 ]                                                             
Stopped at      kdb_enter+0x37: movq    $0,0x10a4f06(%rip)                              
db>
Comment 4 Konstantin Belousov freebsd_committer 2020-11-03 23:18:52 UTC
Hm but the trace from the comment #3 is quite different, now there is
the fpu_kern_enter() frame that was not in initial description, and a
direct panic() invocation.

What is the panic message ?

(I suspect r366713 might have changed the way the situation reports itself).
Comment 5 Brooks Davis freebsd_committer 2020-11-09 18:21:48 UTC
Latest panic message:

panic: fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)   
cpuid = 38                                                                              
time = 1604945860                                                                       
KDB: stack backtrace:                                                                   
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe02053b8310          
vpanic() at vpanic+0x182/frame 0xfffffe02053b8360                                       
panic() at panic+0x43/frame 0xfffffe02053b83c0                                          
fpudna() at fpudna+0x95/frame 0xfffffe02053b83e0                                        
trap() at trap+0x34b/frame 0xfffffe02053b84f0                                           
calltrap() at calltrap+0x8/frame 0xfffffe02053b84f0                                     
--- trap 0x16, rip = 0xffffffff821735d5, rsp = 0xfffffe02053b85c0, rbp = 0xfffffe02053b8
5e0 ---                                                                                 
fletcher_4_sse2_native() at fletcher_4_sse2_native+0x25/frame 0xfffffe02053b85e0        
abd_fletcher_4_iter() at abd_fletcher_4_iter+0x3e/frame 0xfffffe02053b8620              
abd_iterate_func() at abd_iterate_func+0x149/frame 0xfffffe02053b86b0                   
abd_fletcher_4_native() at abd_fletcher_4_native+0x5f/frame 0xfffffe02053b8880          
zio_checksum_error_impl() at zio_checksum_error_impl+0x22b/frame 0xfffffe02053b8a10     
zio_checksum_error() at zio_checksum_error+0x8f/frame 0xfffffe02053b8a40                
zio_checksum_verify() at zio_checksum_verify+0x31/frame 0xfffffe02053b8ab0              
zio_execute() at zio_execute+0x69/frame 0xfffffe02053b8b00                              
taskqueue_run_locked() at taskqueue_run_locked+0xaa/frame 0xfffffe02053b8b80            
taskqueue_thread_loop() at taskqueue_thread_loop+0x94/frame 0xfffffe02053b8bb0          
fork_exit() at fork_exit+0x80/frame 0xfffffe02053b8bf0                                  
fork_trampoline() at fork_trampoline+0xe/frame 0xfffffe02053b8bf0                       
--- trap 0, rip = 0, rsp = 0, rbp = 0 ---                                               
KDB: enter: panic                                                                       
[ thread pid 0 tid 101231 ]                                                             
Stopped at      kdb_enter+0x37: movq    $0,0x10a3e46(%rip)                              
db>
Comment 6 Konstantin Belousov freebsd_committer 2020-11-09 23:10:04 UTC
(In reply to Brooks Davis from comment #5)
This backtrace again flipped to the #NM exception.  Is disassemble from comment
#2 still relevant ?

Is dump available ?  I want to see printout of *curthread.
Comment 7 Brooks Davis freebsd_committer 2020-11-10 17:07:41 UTC
I don't have a dump as it happens before init.  I'll see what I can do about getting thread info.  Here's a matching disassembly for comment #5:
000000000005d5b0 <fletcher_4_sse2_native>:
   5d5b0:       55                      push   %rbp
   5d5b1:       48 89 e5                mov    %rsp,%rbp
   5d5b4:       41 57                   push   %r15
   5d5b6:       41 56                   push   %r14
   5d5b8:       53                      push   %rbx
   5d5b9:       50                      push   %rax
   5d5ba:       49 89 d7                mov    %rdx,%r15
   5d5bd:       48 89 f3                mov    %rsi,%rbx
   5d5c0:       49 89 fe                mov    %rdi,%r14
   5d5c3:       49 01 f7                add    %rsi,%r15
   5d5c6:       31 ff                   xor    %edi,%edi
   5d5c8:       e8 00 00 00 00          callq  5d5cd <fletcher_4_sse2_native+0x1d>
   5d5cd:       85 c0                   test   %eax,%eax
   5d5cf:       0f 84 98 00 00 00       je     5d66d <fletcher_4_sse2_native+0xbd>
   5d5d5:       f3 41 0f 6f 06          movdqu (%r14),%xmm0
   5d5da:       f3 41 0f 6f 4e 10       movdqu 0x10(%r14),%xmm1
   5d5e0:       f3 41 0f 6f 56 20       movdqu 0x20(%r14),%xmm2
   5d5e6:       f3 41 0f 6f 5e 30       movdqu 0x30(%r14),%xmm3
   5d5ec:       66 0f ef e4             pxor   %xmm4,%xmm4
   5d5f0:       49 39 df                cmp    %rbx,%r15
   5d5f3:       76 44                   jbe    5d639 <fletcher_4_sse2_native+0x89>
   5d5f5:       66 2e 0f 1f 84 00 00    nopw   %cs:0x0(%rax,%rax,1)
   5d5fc:       00 00 00
   5d5ff:       90                      nop
   5d600:       f3 0f 6f 2b             movdqu (%rbx),%xmm5
   5d604:       66 0f 6f f5             movdqa %xmm5,%xmm6
   5d608:       66 0f 62 ec             punpckldq %xmm4,%xmm5
   5d60c:       66 0f 6a f4             punpckhdq %xmm4,%xmm6
   5d610:       66 0f d4 c5             paddq  %xmm5,%xmm0
   5d614:       66 0f d4 c8             paddq  %xmm0,%xmm1   
   5d618:       66 0f d4 d1             paddq  %xmm1,%xmm2
   5d61c:       66 0f d4 da             paddq  %xmm2,%xmm3
   5d620:       66 0f d4 c6             paddq  %xmm6,%xmm0
   5d624:       66 0f d4 c8             paddq  %xmm0,%xmm1
   5d628:       66 0f d4 d1             paddq  %xmm1,%xmm2
   5d62c:       66 0f d4 da             paddq  %xmm2,%xmm3
   5d630:       48 83 c3 10             add    $0x10,%rbx
   5d634:       4c 39 fb                cmp    %r15,%rbx
   5d637:       72 c7                   jb     5d600 <fletcher_4_sse2_native+0x5
   5d639:       f3 41 0f 7f 06          movdqu %xmm0,(%r14)
   5d63e:       f3 41 0f 7f 4e 10       movdqu %xmm1,0x10(%r14)
   5d644:       f3 41 0f 7f 56 20       movdqu %xmm2,0x20(%r14)
   5d64a:       f3 41 0f 7f 5e 30       movdqu %xmm3,0x30(%r14)
   5d650:       65 48 8b 3c 25 00 00    mov    %gs:0x0,%rdi
   5d657:       00 00
   5d659:       80 bf d8 05 00 00 00    cmpb   $0x0,0x5d8(%rdi)
   5d660:       78 25                   js     5d687 <fletcher_4_sse2_native+0xd7>
   5d662:       48 83 c4 08             add    $0x8,%rsp
   5d666:       5b                      pop    %rbx
   5d667:       41 5e                   pop    %r14
   5d669:       41 5f                   pop    %r15
   5d66b:       5d                      pop    %rbp
   5d66c:       c3                      retq
   5d66d:       65 48 8b 3c 25 00 00    mov    %gs:0x0,%rdi
   5d674:       00 00
   5d676:       31 f6                   xor    %esi,%esi
   5d678:       ba 04 00 00 00          mov    $0x4,%edx
   5d67d:       e8 00 00 00 00          callq  5d682 <fletcher_4_sse2_native+0xd2>
   5d682:       e9 4e ff ff ff          jmpq   5d5d5 <fletcher_4_sse2_native+0x25>
   5d687:       31 f6                   xor    %esi,%esi
   5d689:       48 83 c4 08             add    $0x8,%rsp
   5d68d:       5b                      pop    %rbx
   5d68e:       41 5e                   pop    %r14
   5d690:       41 5f                   pop    %r15
   5d692:       5d                      pop    %rbp
   5d693:       e9 00 00 00 00          jmpq   5d698 <fletcher_4_sse2_native+0xe8>
   5d698:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
   5d69f:       00
Comment 8 Konstantin Belousov freebsd_committer 2020-11-22 12:36:04 UTC
Created attachment 219882 [details]
Unconditionally enter FPU block with NOCTX

Try this please, even if only as a debugging aid.  I wonder if some of this code
happens to execute under larger fpu block.
Comment 9 Brooks Davis freebsd_committer 2020-11-24 00:16:22 UTC
I had the for a reboot at the end of the day. With that patch hand applied, I get:
Trying to mount root from ufs:/dev/ada0p2 [rw]...                                       
Dual Console: Serial Primary, Video Secondary                                           
No suitable dump device was found.                                                      
Setting host                                                                            
                                                                                        
Fatal trap 12: page fault while in kernel mode                                          
cpuid = 26; apic id = 26                                                                
fault virtual address   = 0x8                                                           
fault code              = supervisor read data, page not present                        
instruction pointer     = 0x20:0xffffffff81004df2                                       
stack pointer           = 0x28:0xffffffff82786990                                       
frame pointer           = 0x28:0xffffffff827869b0                                       
code segment            = base rx0, limit 0xfffff, type 0x1b                            
                        = DPL 0, pres 1, long 1, def32 0, gran 1                        
processor eflags        = interrupt enabled, resume, IOPL = 0                           
current process         = 0 (swapper)                                                   
trap number             = 12                                                            
panic: page fault                                                                       
cpuid = 26                                                                              
time = 1606176475                                                                       
KDB: stack backtrace:                                                                   
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xffffffff82786640          
vpanic() at vpanic+0x182/frame 0xffffffff82786690                                       
panic() at panic+0x43/frame 0xffffffff827866f0                                          
trap_fatal() at trap_fatal+0x387/frame 0xffffffff82786750                               
trap_pfault() at trap_pfault+0x97/frame 0xffffffff827867b0                              
trap() at trap+0x2ab/frame 0xffffffff827868c0                                           
calltrap() at calltrap+0x8/frame 0xffffffff827868c0                                     
--- trap 0xc, rip = 0xffffffff81004df2, rsp = 0xffffffff82786990, rbp = 0xffffffff827869
b0 ---                                                                                  
fpu_kern_leave() at fpu_kern_leave+0x22/frame 0xffffffff827869b0                        
fletcher_4_native() at fletcher_4_native+0xeb/frame 0xffffffff82786ba0                  
fletcher_4_benchmark_impl() at fletcher_4_benchmark_impl+0xe0/frame 0xffffffff82786c40  
fletcher_4_init() at fletcher_4_init+0x1a7/frame 0xffffffff82786c60                     
mi_startup() at mi_startup+0xec/frame 0xffffffff82786cb0                                
btext() at btext+0x2c                                                                   
KDB: enter: panic                                                                       
[ thread pid 0 tid 100000 ]                                                             
Stopped at      kdb_enter+0x37: movq    $0,0x10a6766(%rip)                              
db>
Comment 10 Konstantin Belousov freebsd_committer 2020-11-24 15:30:14 UTC
(In reply to Brooks Davis from comment #9)
Can you please look up the source line for fletcher_4_native+0xeb ?
Comment 11 Juraj Lutter 2020-11-25 13:53:44 UTC
Has anything changed since this report? r368018 is running fine.
Comment 12 Brooks Davis freebsd_committer 2020-11-25 19:22:44 UTC
That's 5c4bb:

        ops->fini_native(&ctx, zcp);
   5c4bb:       4c 89 ef                mov    %r13,%rdi
   5c4be:       4c 8b 6c 24 38          mov    0x38(%rsp),%r13
   5c4c3:       4c 89 ee                mov    %r13,%rsi
   5c4c6:       41 ff 56 08             callq  *0x8(%r14)
Comment 13 Konstantin Belousov freebsd_committer 2020-11-27 19:15:15 UTC
(In reply to Brooks Davis from comment #12)
Try this please.
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
index 0352b13aa24..4885f18d57d 100644
--- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -998,7 +998,7 @@ EXPORT_SYMBOL(zfs_kfpu_fpregs);
 #endif /* HAVE_KERNEL_FPU_INTERNAL */
 
 static int __init
-zcommon_init(void)
+zcommon_init(void *dummy __unused)
 {
 	int error = kfpu_init();
 	if (error)
@@ -1016,7 +1016,8 @@ zcommon_fini(void)
 	kfpu_fini();
 }
 
-module_init(zcommon_init);
+SYSINIT(zfs_zcommon_init, SI_SUB_CONFIGURE, SI_ORDER_FIRST, zcommon_init, NULL);
+//module_init(zcommon_init);
 module_exit(zcommon_fini);
 
 #endif
Comment 14 Juraj Lutter 2020-11-28 00:43:16 UTC
Created attachment 220037 [details]
latest patch results