Bug 262691 - [patch] Save only callee-saved registers in pcb
Summary: [patch] Save only callee-saved registers in pcb
Status: In Progress
Alias: None
Product: Base System
Classification: Unclassified
Component: arm (show other bugs)
Version: CURRENT
Hardware: arm64 Any
: --- Affects Some People
Assignee: Andrew Turner
URL:
Keywords: patch
Depends on:
Blocks:
 
Reported: 2022-03-20 20:58 UTC by Dapeng Gao
Modified: 2024-01-10 03:38 UTC (History)
4 users (show)

See Also:
linimon: mfc-stable13?


Attachments
Patch (5.49 KB, patch)
2022-03-20 21:04 UTC, Dapeng Gao
no flags Details | Diff
Grouping MSR/MRS and LD/ST together (2.65 KB, patch)
2022-03-21 14:53 UTC, Dapeng Gao
no flags Details | Diff

Note You need to log in before you can comment on or make changes to this bug.
Description Dapeng Gao 2022-03-20 20:58:03 UTC
On AArch64, registers x9-x18 are not callee-saved, yet they are preserved at many placed in swtch.S. This patch removes code that preserves these registers.

It may also be worth it to remove the unused entries in the `pcb_x` field of `struct pcb`, but this would be a more disruptive change with compatibility concerns (e.g. `kgdb` would need to be changed to recognise the new layout the struct).

--- a/sys/arm64/arm64/swtch.S
+++ b/sys/arm64/arm64/swtch.S
@@ -94,18 +94,12 @@ ENTRY(cpu_throw)
 	msr	tpidr_el0, x6
 	ldr	x6, [x4, #PCB_TPIDRRO]
 	msr	tpidrro_el0, x6
-	ldp	x8, x9, [x4, #PCB_REGS + 8 * 8]
-	ldp	x10, x11, [x4, #PCB_REGS + 10 * 8]
-	ldp	x12, x13, [x4, #PCB_REGS + 12 * 8]
-	ldp	x14, x15, [x4, #PCB_REGS + 14 * 8]
-	ldp	x16, x17, [x4, #PCB_REGS + 16 * 8]
-	ldr	     x19, [x4, #PCB_REGS + 19 * 8]
-	ldp	x20, x21, [x4, #PCB_REGS + 20 * 8]
-	ldp	x22, x23, [x4, #PCB_REGS + 22 * 8]
-	ldp	x24, x25, [x4, #PCB_REGS + 24 * 8]
-	ldp	x26, x27, [x4, #PCB_REGS + 26 * 8]
-	ldp	x28, x29, [x4, #PCB_REGS + 28 * 8]
-	ldr	lr, [x4, #PCB_LR]
+	ldp	x19, x20, [x4, #PCB_REGS + 19 * 8]
+	ldp	x21, x22, [x4, #PCB_REGS + 21 * 8]
+	ldp	x23, x24, [x4, #PCB_REGS + 23 * 8]
+	ldp	x25, x26, [x4, #PCB_REGS + 25 * 8]
+	ldp	x27, x28, [x4, #PCB_REGS + 27 * 8]
+	ldp	x29, lr, [x4, #PCB_REGS + 29 * 8]
 
 	ret
 END(cpu_throw)
@@ -125,18 +119,12 @@ ENTRY(cpu_switch)
 	ldr	x4, [x0, #TD_PCB]
 
 	/* Store the callee-saved registers */
-	stp	x8, x9, [x4, #PCB_REGS + 8 * 8]
-	stp	x10, x11, [x4, #PCB_REGS + 10 * 8]
-	stp	x12, x13, [x4, #PCB_REGS + 12 * 8]
-	stp	x14, x15, [x4, #PCB_REGS + 14 * 8]
-	stp	x16, x17, [x4, #PCB_REGS + 16 * 8]
-	stp	x18, x19, [x4, #PCB_REGS + 18 * 8]
-	stp	x20, x21, [x4, #PCB_REGS + 20 * 8]
-	stp	x22, x23, [x4, #PCB_REGS + 22 * 8]
-	stp	x24, x25, [x4, #PCB_REGS + 24 * 8]
-	stp	x26, x27, [x4, #PCB_REGS + 26 * 8]
-	stp	x28, x29, [x4, #PCB_REGS + 28 * 8]
-	str	lr, [x4, #PCB_LR]
+	stp	x19, x20, [x4, #PCB_REGS + 19 * 8]
+	stp	x21, x22, [x4, #PCB_REGS + 21 * 8]
+	stp	x23, x24, [x4, #PCB_REGS + 23 * 8]
+	stp	x25, x26, [x4, #PCB_REGS + 25 * 8]
+	stp	x27, x28, [x4, #PCB_REGS + 27 * 8]
+	stp	x29, lr, [x4, #PCB_REGS + 29 * 8]
 	/* And the old stack pointer */
 	mov	x5, sp
 	mrs	x6, tpidrro_el0
@@ -195,18 +183,12 @@ ENTRY(cpu_switch)
 	msr	tpidr_el0, x6
 	ldr	x6, [x4, #PCB_TPIDRRO]
 	msr	tpidrro_el0, x6
-	ldp	x8, x9, [x4, #PCB_REGS + 8 * 8]
-	ldp	x10, x11, [x4, #PCB_REGS + 10 * 8]
-	ldp	x12, x13, [x4, #PCB_REGS + 12 * 8]
-	ldp	x14, x15, [x4, #PCB_REGS + 14 * 8]
-	ldp	x16, x17, [x4, #PCB_REGS + 16 * 8]
-	ldr	     x19, [x4, #PCB_REGS + 19 * 8]
-	ldp	x20, x21, [x4, #PCB_REGS + 20 * 8]
-	ldp	x22, x23, [x4, #PCB_REGS + 22 * 8]
-	ldp	x24, x25, [x4, #PCB_REGS + 24 * 8]
-	ldp	x26, x27, [x4, #PCB_REGS + 26 * 8]
-	ldp	x28, x29, [x4, #PCB_REGS + 28 * 8]
-	ldr	lr, [x4, #PCB_LR]
+	ldp	x19, x20, [x4, #PCB_REGS + 19 * 8]
+	ldp	x21, x22, [x4, #PCB_REGS + 21 * 8]
+	ldp	x23, x24, [x4, #PCB_REGS + 23 * 8]
+	ldp	x25, x26, [x4, #PCB_REGS + 25 * 8]
+	ldp	x27, x28, [x4, #PCB_REGS + 27 * 8]
+	ldp	x29, lr, [x4, #PCB_REGS + 29 * 8]
 
 	str	xzr, [x4, #PCB_REGS + 18 * 8]
 	ret
@@ -258,23 +240,17 @@ ENTRY(fork_trampoline)
 	 * will be set to the desired value anyway.
 	 */
 	ERET
-	
+
 END(fork_trampoline)
 
 ENTRY(savectx)
 	/* Store the callee-saved registers */
-	stp	x8,  x9,  [x0, #PCB_REGS + 8 * 8]
-	stp	x10, x11, [x0, #PCB_REGS + 10 * 8]
-	stp	x12, x13, [x0, #PCB_REGS + 12 * 8]
-	stp	x14, x15, [x0, #PCB_REGS + 14 * 8]
-	stp	x16, x17, [x0, #PCB_REGS + 16 * 8]
-	stp	x18, x19, [x0, #PCB_REGS + 18 * 8]
-	stp	x20, x21, [x0, #PCB_REGS + 20 * 8]
-	stp	x22, x23, [x0, #PCB_REGS + 22 * 8]
-	stp	x24, x25, [x0, #PCB_REGS + 24 * 8]
-	stp	x26, x27, [x0, #PCB_REGS + 26 * 8]
-	stp	x28, x29, [x0, #PCB_REGS + 28 * 8]
-	str	lr, [x0, #PCB_LR]
+	stp	x19, x20, [x0, #PCB_REGS + 19 * 8]
+	stp	x21, x22, [x0, #PCB_REGS + 21 * 8]
+	stp	x23, x24, [x0, #PCB_REGS + 23 * 8]
+	stp	x25, x26, [x0, #PCB_REGS + 25 * 8]
+	stp	x27, x28, [x0, #PCB_REGS + 27 * 8]
+	stp	x29, lr, [x0, #PCB_REGS + 29 * 8]
 	/* And the old stack pointer */
 	mov	x5, sp
 	mrs	x6, tpidrro_el0
diff --git a/sys/arm64/arm64/vm_machdep.c b/sys/arm64/arm64/vm_machdep.c
index c52a7e2fc5..feb439314f 100644
--- a/sys/arm64/arm64/vm_machdep.c
+++ b/sys/arm64/arm64/vm_machdep.c
@@ -105,8 +105,8 @@ cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
 	td2->td_frame = tf;
 
 	/* Set the return value registers for fork() */
-	td2->td_pcb->pcb_x[8] = (uintptr_t)fork_return;
-	td2->td_pcb->pcb_x[9] = (uintptr_t)td2;
+	td2->td_pcb->pcb_x[19] = (uintptr_t)fork_return;
+	td2->td_pcb->pcb_x[20] = (uintptr_t)td2;
 	td2->td_pcb->pcb_lr = (uintptr_t)fork_trampoline;
 	td2->td_pcb->pcb_sp = (uintptr_t)td2->td_frame;
 	td2->td_pcb->pcb_fpusaved = &td2->td_pcb->pcb_fpustate;
@@ -183,8 +183,8 @@ cpu_copy_thread(struct thread *td, struct thread *td0)
 	bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));
 	bcopy(td0->td_pcb, td->td_pcb, sizeof(struct pcb));
 
-	td->td_pcb->pcb_x[8] = (uintptr_t)fork_return;
-	td->td_pcb->pcb_x[9] = (uintptr_t)td;
+	td->td_pcb->pcb_x[19] = (uintptr_t)fork_return;
+	td->td_pcb->pcb_x[20] = (uintptr_t)td;
 	td->td_pcb->pcb_lr = (uintptr_t)fork_trampoline;
 	td->td_pcb->pcb_sp = (uintptr_t)td->td_frame;
 	td->td_pcb->pcb_fpflags &= ~(PCB_FP_STARTED | PCB_FP_KERN | PCB_FP_NOSAVE);
@@ -287,8 +287,8 @@ void
 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg)
 {
 
-	td->td_pcb->pcb_x[8] = (uintptr_t)func;
-	td->td_pcb->pcb_x[9] = (uintptr_t)arg;
+	td->td_pcb->pcb_x[19] = (uintptr_t)func;
+	td->td_pcb->pcb_x[20] = (uintptr_t)arg;
 }
 
 void
Comment 1 Dapeng Gao 2022-03-20 21:04:33 UTC
Created attachment 232594 [details]
Patch
Comment 2 Dapeng Gao 2022-03-20 21:06:48 UTC
NOTE: Please review the attached patch (instead of the one pasted in comment #0) as the pasted patch is incomplete.
Comment 3 Mark Millard 2022-03-20 21:46:36 UTC
(In reply to Dapeng Gao from comment #0)

Looks to me like some of this code is for context switching, for
example. If true, see:

https://developer.arm.com/documentation/den0024/a/The-Memory-Management-Unit/Context-switching

where, in part, it reports:

QUOTE
Exactly what has to be saved and restored varies between different operating systems, but typically a process context switch includes saving or restoring some or all of the following elements:

general-purpose registers X0-X30.
Advanced SIMD and Floating-point registers V0 - V31.
Some status registers.
TTBR0_EL1 and TTBR0.
Thread Process ID (TPIDxxx) Registers.
Address Space ID (ASID).
. . .
END QUOTE
Comment 4 Dapeng Gao 2022-03-20 21:58:20 UTC
(In reply to Mark Millard from comment #3)

These code are indeed related to context switching, but they are not expected to preserve all the listed registers. Instead, all changed functions (i.e. cpu_throw, cpu_switch, and savectx) are called by C code as normal C functions, and they only need to obey the C calling convention (i.e. preserving the callee-saved registers).

(Also note that none of x0-x7) were preserved in the original code, which confirms the view that caller-saved registers need not be preserved.
Comment 5 Mark Millard 2022-03-20 22:16:24 UTC
(In reply to Dapeng Gao from comment #4)

Using an example to indicate my limited understanding:

static void
sched_throw_tail(struct thread *td)
{

        mtx_assert(&sched_lock, MA_OWNED);
        KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
        cpu_throw(td, choosethread());  /* doesn't return */
}

cpu_throw does not return, for example, so niether does sched_throw_tail.
sched_throw_tail can not restore any context during its return activity
because it never returns. The return context is some place else. Any
related restore is via another mechanism.
Comment 6 John F. Carr 2022-03-20 22:40:06 UTC
The patch uses ldp and stp with offsets that are odd multiples of 8 bytes.  Are there any ARM v8 chips where ldp and stp are faster when accessing an aligned 16 byte block?
Comment 7 Mark Millard 2022-03-21 00:53:47 UTC
(In reply to Mark Millard from comment #5)

Hmm. I used to know some out the powerpc64 and powerpc
context for such things but it is clear that the code
is not as I mis-remember it. So either I've mixed up
contexts or just plain have forgotten what I once
knew.

So just ignore my comments (other than quotes from
documentation).

I did notice that:

https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers

lists "Caller-saved temporary registers (X9-X15)" and x8 is
in a category of its own there. x16, x17, and x18 are also
special: "Registers with a special purpose (X8, X16-X18, X29,
X30)". x18 is for platform ABI use, so its handling is
ABI-choice specific from what I can tell. (x18 "is an
additional temporary register on platforms that don't
assign a special meaning to it". As I understand, FreeBSD
has a defined use of x18.)

The others were (including x29 again):

"Argument registers (X0-X7)"
"Callee-saved registers (X19-X29)"

Out of this, the handling of x8, x16, x17, and x30 are not
explicitly listed as either caller-saved, nor callee-saved,
so are more volatile. But traps should not mess them up by
the time the code is returned to. (Some types of traps
happen at arbitrary points in the execution.)


Anyway, sorry for the noise.
Comment 8 Dapeng Gao 2022-03-21 14:53:32 UTC
Created attachment 232611 [details]
Grouping MSR/MRS and LD/ST together

This additional patch groups the MSR/MRS and LD/ST instructions together rather than interleaving them. This helps with processor stalls.
Comment 9 Andrew Turner freebsd_committer freebsd_triage 2022-05-25 10:40:50 UTC
Forgot to tag this in the commit: https://cgit.freebsd.org/src/commit/?id=e605b87a9e75a7f693527f0aad8189ae9db20f16
Comment 10 Graham Perrin freebsd_committer freebsd_triage 2022-10-17 12:39:19 UTC
Keyword: 

    patch
or  patch-ready

– in lieu of summary line prefix: 

    [patch]

* bulk change for the keyword
* summary lines may be edited manually (not in bulk). 

Keyword descriptions and search interface: 

    <https://bugs.freebsd.org/bugzilla/describekeywords.cgi>