On AArch64, registers x9-x18 are not callee-saved, yet they are preserved at many placed in swtch.S. This patch removes code that preserves these registers. It may also be worth it to remove the unused entries in the `pcb_x` field of `struct pcb`, but this would be a more disruptive change with compatibility concerns (e.g. `kgdb` would need to be changed to recognise the new layout the struct). --- a/sys/arm64/arm64/swtch.S +++ b/sys/arm64/arm64/swtch.S @@ -94,18 +94,12 @@ ENTRY(cpu_throw) msr tpidr_el0, x6 ldr x6, [x4, #PCB_TPIDRRO] msr tpidrro_el0, x6 - ldp x8, x9, [x4, #PCB_REGS + 8 * 8] - ldp x10, x11, [x4, #PCB_REGS + 10 * 8] - ldp x12, x13, [x4, #PCB_REGS + 12 * 8] - ldp x14, x15, [x4, #PCB_REGS + 14 * 8] - ldp x16, x17, [x4, #PCB_REGS + 16 * 8] - ldr x19, [x4, #PCB_REGS + 19 * 8] - ldp x20, x21, [x4, #PCB_REGS + 20 * 8] - ldp x22, x23, [x4, #PCB_REGS + 22 * 8] - ldp x24, x25, [x4, #PCB_REGS + 24 * 8] - ldp x26, x27, [x4, #PCB_REGS + 26 * 8] - ldp x28, x29, [x4, #PCB_REGS + 28 * 8] - ldr lr, [x4, #PCB_LR] + ldp x19, x20, [x4, #PCB_REGS + 19 * 8] + ldp x21, x22, [x4, #PCB_REGS + 21 * 8] + ldp x23, x24, [x4, #PCB_REGS + 23 * 8] + ldp x25, x26, [x4, #PCB_REGS + 25 * 8] + ldp x27, x28, [x4, #PCB_REGS + 27 * 8] + ldp x29, lr, [x4, #PCB_REGS + 29 * 8] ret END(cpu_throw) @@ -125,18 +119,12 @@ ENTRY(cpu_switch) ldr x4, [x0, #TD_PCB] /* Store the callee-saved registers */ - stp x8, x9, [x4, #PCB_REGS + 8 * 8] - stp x10, x11, [x4, #PCB_REGS + 10 * 8] - stp x12, x13, [x4, #PCB_REGS + 12 * 8] - stp x14, x15, [x4, #PCB_REGS + 14 * 8] - stp x16, x17, [x4, #PCB_REGS + 16 * 8] - stp x18, x19, [x4, #PCB_REGS + 18 * 8] - stp x20, x21, [x4, #PCB_REGS + 20 * 8] - stp x22, x23, [x4, #PCB_REGS + 22 * 8] - stp x24, x25, [x4, #PCB_REGS + 24 * 8] - stp x26, x27, [x4, #PCB_REGS + 26 * 8] - stp x28, x29, [x4, #PCB_REGS + 28 * 8] - str lr, [x4, #PCB_LR] + stp x19, x20, [x4, #PCB_REGS + 19 * 8] + stp x21, x22, [x4, #PCB_REGS + 21 * 8] + stp x23, x24, [x4, #PCB_REGS + 23 * 8] + stp x25, x26, [x4, #PCB_REGS + 25 * 8] + stp x27, x28, [x4, #PCB_REGS + 27 * 8] + stp x29, lr, [x4, #PCB_REGS + 29 * 8] /* And the old stack pointer */ mov x5, sp mrs x6, tpidrro_el0 @@ -195,18 +183,12 @@ ENTRY(cpu_switch) msr tpidr_el0, x6 ldr x6, [x4, #PCB_TPIDRRO] msr tpidrro_el0, x6 - ldp x8, x9, [x4, #PCB_REGS + 8 * 8] - ldp x10, x11, [x4, #PCB_REGS + 10 * 8] - ldp x12, x13, [x4, #PCB_REGS + 12 * 8] - ldp x14, x15, [x4, #PCB_REGS + 14 * 8] - ldp x16, x17, [x4, #PCB_REGS + 16 * 8] - ldr x19, [x4, #PCB_REGS + 19 * 8] - ldp x20, x21, [x4, #PCB_REGS + 20 * 8] - ldp x22, x23, [x4, #PCB_REGS + 22 * 8] - ldp x24, x25, [x4, #PCB_REGS + 24 * 8] - ldp x26, x27, [x4, #PCB_REGS + 26 * 8] - ldp x28, x29, [x4, #PCB_REGS + 28 * 8] - ldr lr, [x4, #PCB_LR] + ldp x19, x20, [x4, #PCB_REGS + 19 * 8] + ldp x21, x22, [x4, #PCB_REGS + 21 * 8] + ldp x23, x24, [x4, #PCB_REGS + 23 * 8] + ldp x25, x26, [x4, #PCB_REGS + 25 * 8] + ldp x27, x28, [x4, #PCB_REGS + 27 * 8] + ldp x29, lr, [x4, #PCB_REGS + 29 * 8] str xzr, [x4, #PCB_REGS + 18 * 8] ret @@ -258,23 +240,17 @@ ENTRY(fork_trampoline) * will be set to the desired value anyway. */ ERET - + END(fork_trampoline) ENTRY(savectx) /* Store the callee-saved registers */ - stp x8, x9, [x0, #PCB_REGS + 8 * 8] - stp x10, x11, [x0, #PCB_REGS + 10 * 8] - stp x12, x13, [x0, #PCB_REGS + 12 * 8] - stp x14, x15, [x0, #PCB_REGS + 14 * 8] - stp x16, x17, [x0, #PCB_REGS + 16 * 8] - stp x18, x19, [x0, #PCB_REGS + 18 * 8] - stp x20, x21, [x0, #PCB_REGS + 20 * 8] - stp x22, x23, [x0, #PCB_REGS + 22 * 8] - stp x24, x25, [x0, #PCB_REGS + 24 * 8] - stp x26, x27, [x0, #PCB_REGS + 26 * 8] - stp x28, x29, [x0, #PCB_REGS + 28 * 8] - str lr, [x0, #PCB_LR] + stp x19, x20, [x0, #PCB_REGS + 19 * 8] + stp x21, x22, [x0, #PCB_REGS + 21 * 8] + stp x23, x24, [x0, #PCB_REGS + 23 * 8] + stp x25, x26, [x0, #PCB_REGS + 25 * 8] + stp x27, x28, [x0, #PCB_REGS + 27 * 8] + stp x29, lr, [x0, #PCB_REGS + 29 * 8] /* And the old stack pointer */ mov x5, sp mrs x6, tpidrro_el0 diff --git a/sys/arm64/arm64/vm_machdep.c b/sys/arm64/arm64/vm_machdep.c index c52a7e2fc5..feb439314f 100644 --- a/sys/arm64/arm64/vm_machdep.c +++ b/sys/arm64/arm64/vm_machdep.c @@ -105,8 +105,8 @@ cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) td2->td_frame = tf; /* Set the return value registers for fork() */ - td2->td_pcb->pcb_x[8] = (uintptr_t)fork_return; - td2->td_pcb->pcb_x[9] = (uintptr_t)td2; + td2->td_pcb->pcb_x[19] = (uintptr_t)fork_return; + td2->td_pcb->pcb_x[20] = (uintptr_t)td2; td2->td_pcb->pcb_lr = (uintptr_t)fork_trampoline; td2->td_pcb->pcb_sp = (uintptr_t)td2->td_frame; td2->td_pcb->pcb_fpusaved = &td2->td_pcb->pcb_fpustate; @@ -183,8 +183,8 @@ cpu_copy_thread(struct thread *td, struct thread *td0) bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); bcopy(td0->td_pcb, td->td_pcb, sizeof(struct pcb)); - td->td_pcb->pcb_x[8] = (uintptr_t)fork_return; - td->td_pcb->pcb_x[9] = (uintptr_t)td; + td->td_pcb->pcb_x[19] = (uintptr_t)fork_return; + td->td_pcb->pcb_x[20] = (uintptr_t)td; td->td_pcb->pcb_lr = (uintptr_t)fork_trampoline; td->td_pcb->pcb_sp = (uintptr_t)td->td_frame; td->td_pcb->pcb_fpflags &= ~(PCB_FP_STARTED | PCB_FP_KERN | PCB_FP_NOSAVE); @@ -287,8 +287,8 @@ void cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) { - td->td_pcb->pcb_x[8] = (uintptr_t)func; - td->td_pcb->pcb_x[9] = (uintptr_t)arg; + td->td_pcb->pcb_x[19] = (uintptr_t)func; + td->td_pcb->pcb_x[20] = (uintptr_t)arg; } void
Created attachment 232594 [details] Patch
NOTE: Please review the attached patch (instead of the one pasted in comment #0) as the pasted patch is incomplete.
(In reply to Dapeng Gao from comment #0) Looks to me like some of this code is for context switching, for example. If true, see: https://developer.arm.com/documentation/den0024/a/The-Memory-Management-Unit/Context-switching where, in part, it reports: QUOTE Exactly what has to be saved and restored varies between different operating systems, but typically a process context switch includes saving or restoring some or all of the following elements: general-purpose registers X0-X30. Advanced SIMD and Floating-point registers V0 - V31. Some status registers. TTBR0_EL1 and TTBR0. Thread Process ID (TPIDxxx) Registers. Address Space ID (ASID). . . . END QUOTE
(In reply to Mark Millard from comment #3) These code are indeed related to context switching, but they are not expected to preserve all the listed registers. Instead, all changed functions (i.e. cpu_throw, cpu_switch, and savectx) are called by C code as normal C functions, and they only need to obey the C calling convention (i.e. preserving the callee-saved registers). (Also note that none of x0-x7) were preserved in the original code, which confirms the view that caller-saved registers need not be preserved.
(In reply to Dapeng Gao from comment #4) Using an example to indicate my limited understanding: static void sched_throw_tail(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); cpu_throw(td, choosethread()); /* doesn't return */ } cpu_throw does not return, for example, so niether does sched_throw_tail. sched_throw_tail can not restore any context during its return activity because it never returns. The return context is some place else. Any related restore is via another mechanism.
The patch uses ldp and stp with offsets that are odd multiples of 8 bytes. Are there any ARM v8 chips where ldp and stp are faster when accessing an aligned 16 byte block?
(In reply to Mark Millard from comment #5) Hmm. I used to know some out the powerpc64 and powerpc context for such things but it is clear that the code is not as I mis-remember it. So either I've mixed up contexts or just plain have forgotten what I once knew. So just ignore my comments (other than quotes from documentation). I did notice that: https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers lists "Caller-saved temporary registers (X9-X15)" and x8 is in a category of its own there. x16, x17, and x18 are also special: "Registers with a special purpose (X8, X16-X18, X29, X30)". x18 is for platform ABI use, so its handling is ABI-choice specific from what I can tell. (x18 "is an additional temporary register on platforms that don't assign a special meaning to it". As I understand, FreeBSD has a defined use of x18.) The others were (including x29 again): "Argument registers (X0-X7)" "Callee-saved registers (X19-X29)" Out of this, the handling of x8, x16, x17, and x30 are not explicitly listed as either caller-saved, nor callee-saved, so are more volatile. But traps should not mess them up by the time the code is returned to. (Some types of traps happen at arbitrary points in the execution.) Anyway, sorry for the noise.
Created attachment 232611 [details] Grouping MSR/MRS and LD/ST together This additional patch groups the MSR/MRS and LD/ST instructions together rather than interleaving them. This helps with processor stalls.
Forgot to tag this in the commit: https://cgit.freebsd.org/src/commit/?id=e605b87a9e75a7f693527f0aad8189ae9db20f16
Keyword: patch or patch-ready – in lieu of summary line prefix: [patch] * bulk change for the keyword * summary lines may be edited manually (not in bulk). Keyword descriptions and search interface: <https://bugs.freebsd.org/bugzilla/describekeywords.cgi>