FreeBSD Bugzilla – Attachment 235195 Details for Bug 264867
kevent EVFILT_TIMER, timer expiration with twice the period
[patch] 13.1 patch
diff (text/plain), 29.92 KB, created by Mark Johnston on 2022-07-11 20:15:40 UTC
Flags: patch, obsolete
>diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c >index 48e06ee082fc..89d19bca9317 100644 >--- a/sys/kern/kern_clocksource.c >+++ b/sys/kern/kern_clocksource.c >@@ -65,8 +65,9 @@ static int doconfigtimer(void); > static void configtimer(int start); > static int round_freq(struct eventtimer *et, int freq); > >-static sbintime_t getnextcpuevent(int idle); >-static sbintime_t getnextevent(void); >+struct pcpu_state; >+static sbintime_t getnextcpuevent(struct pcpu_state *state, int idle); >+static sbintime_t getnextevent(struct pcpu_state *state); > static int handleevents(sbintime_t now, int fake); > > static struct mtx et_hw_mtx; >@@ -213,8 +214,8 @@ handleevents(sbintime_t now, int fake) > callout_process(now); > } > >- t = getnextcpuevent(0); > ET_HW_LOCK(state); >+ t = getnextcpuevent(state, 0); > if (!busy) { > state->idle = 0; > state->nextevent = t; >@@ -229,13 +230,11 @@ handleevents(sbintime_t now, int fake) > * Schedule binuptime of the next event on current CPU. > */ > static sbintime_t >-getnextcpuevent(int idle) >+getnextcpuevent(struct pcpu_state *state, int idle) > { > sbintime_t event; >- struct pcpu_state *state; > u_int hardfreq; > >- state = DPCPU_PTR(timerstate); > /* Handle hardclock() events, skipping some if CPU is idle. */ > event = state->nexthard; > if (idle) { >@@ -266,9 +265,8 @@ getnextcpuevent(int idle) > * Schedule binuptime of the next event on all CPUs. > */ > static sbintime_t >-getnextevent(void) >+getnextevent(struct pcpu_state *state) > { >- struct pcpu_state *state; > sbintime_t event; > #ifdef SMP > int cpu; >@@ -278,7 +276,6 @@ getnextevent(void) > > c = -1; > #endif >- state = DPCPU_PTR(timerstate); > event = state->nextevent; > #ifdef SMP > if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { >@@ -385,10 +382,10 @@ loadtimer(sbintime_t now, int start) > uint64_t tmp; > int eq; > >- if (timer->et_flags & ET_FLAGS_PERCPU) { >- state = DPCPU_PTR(timerstate); >+ state = DPCPU_PTR(timerstate); >+ if (timer->et_flags & ET_FLAGS_PERCPU) > next = &state->nexttick; >- } else >+ else > next = &nexttick; > if (periodic) { > if (start) { >@@ -407,7 +404,7 @@ loadtimer(sbintime_t now, int start) > et_start(timer, new, timerperiod); > } > } else { >- new = getnextevent(); >+ new = getnextevent(state); > eq = (new == *next); > CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d", > curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq); >@@ -681,14 +678,12 @@ cpu_initclocks_bsp(void) > void > cpu_initclocks_ap(void) > { >- sbintime_t now; > struct pcpu_state *state; > struct thread *td; > > state = DPCPU_PTR(timerstate); >- now = sbinuptime(); > ET_HW_LOCK(state); >- state->now = now; >+ state->now = sbinuptime(); > hardclock_sync(curcpu); > spinlock_enter(); > ET_HW_UNLOCK(state); >@@ -772,14 +767,14 @@ cpu_idleclock(void) > ) > return (-1); > state = DPCPU_PTR(timerstate); >+ ET_HW_LOCK(state); > if (periodic) > now = state->now; > else > now = sbinuptime(); > CTR3(KTR_SPARE2, "idle at %d: now %d.%08x", > curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); >- t = getnextcpuevent(1); >- ET_HW_LOCK(state); >+ t = getnextcpuevent(state, 1); > state->idle = 1; > state->nextevent = t; > if (!periodic) >@@ -799,15 +794,15 @@ cpu_activeclock(void) > struct thread *td; > > state = DPCPU_PTR(timerstate); >- if (state->idle == 0 || busy) >+ if (atomic_load_int(&state->idle) == 0 || busy) > return; >+ spinlock_enter(); > if (periodic) > now = state->now; > else > now = sbinuptime(); > CTR3(KTR_SPARE2, "active at %d: now %d.%08x", > curcpu, (int)(now >> 
32), (u_int)(now & 0xffffffff)); >- spinlock_enter(); > td = curthread; > td->td_intr_nesting_level++; > handleevents(now, 1); >diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c >index 1f859b286843..d851d2f91d49 100644 >--- a/sys/kern/sched_ule.c >+++ b/sys/kern/sched_ule.c >@@ -226,9 +226,16 @@ static int __read_mostly sched_idlespins = 10000; > static int __read_mostly sched_idlespinthresh = -1; > > /* >- * tdq - per processor runqs and statistics. All fields are protected by the >- * tdq_lock. The load and lowpri may be accessed without to avoid excess >- * locking in sched_pickcpu(); >+ * tdq - per processor runqs and statistics. A mutex synchronizes access to >+ * most fields. Some fields are loaded or modified without the mutex. >+ * >+ * Locking protocols: >+ * (c) constant after initialization >+ * (f) flag, set with the tdq lock held, cleared on local CPU >+ * (l) all accesses are CPU-local >+ * (ls) stores are performed by the local CPU, loads may be lockless >+ * (t) all accesses are protected by the tdq mutex >+ * (ts) stores are serialized by the tdq mutex, loads may be lockless > */ > struct tdq { > /* >@@ -236,32 +243,41 @@ struct tdq { > * tdq_lock is padded to avoid false sharing with tdq_load and > * tdq_cpu_idle. > */ >- struct mtx_padalign tdq_lock; /* run queue lock. */ >- struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ >- volatile int tdq_load; /* Aggregate load. */ >- volatile int tdq_cpu_idle; /* cpu_idle() is active. */ >- int tdq_sysload; /* For loadavg, !ITHD load. */ >- volatile int tdq_transferable; /* Transferable thread count. */ >- volatile short tdq_switchcnt; /* Switches this tick. */ >- volatile short tdq_oldswitchcnt; /* Switches last tick. */ >- u_char tdq_lowpri; /* Lowest priority thread. */ >- u_char tdq_owepreempt; /* Remote preemption pending. */ >- u_char tdq_idx; /* Current insert index. */ >- u_char tdq_ridx; /* Current removal index. */ >- int tdq_id; /* cpuid. */ >- struct runq tdq_realtime; /* real-time run queue. */ >- struct runq tdq_timeshare; /* timeshare run queue. */ >- struct runq tdq_idle; /* Queue of IDLE threads. */ >+ struct mtx_padalign tdq_lock; /* run queue lock. */ >+ struct cpu_group *tdq_cg; /* (c) Pointer to cpu topology. */ >+ struct thread *tdq_curthread; /* (t) Current executing thread. */ >+ int tdq_load; /* (ts) Aggregate load. */ >+ int tdq_sysload; /* (ts) For loadavg, !ITHD load. */ >+ int tdq_cpu_idle; /* (ls) cpu_idle() is active. */ >+ int tdq_transferable; /* (ts) Transferable thread count. */ >+ short tdq_switchcnt; /* (l) Switches this tick. */ >+ short tdq_oldswitchcnt; /* (l) Switches last tick. */ >+ u_char tdq_lowpri; /* (ts) Lowest priority thread. */ >+ u_char tdq_owepreempt; /* (f) Remote preemption pending. */ >+ u_char tdq_idx; /* (t) Current insert index. */ >+ u_char tdq_ridx; /* (t) Current removal index. */ >+ int tdq_id; /* (c) cpuid. */ >+ struct runq tdq_realtime; /* (t) real-time run queue. */ >+ struct runq tdq_timeshare; /* (t) timeshare run queue. */ >+ struct runq tdq_idle; /* (t) Queue of IDLE threads. */ > char tdq_name[TDQ_NAME_LEN]; > #ifdef KTR > char tdq_loadname[TDQ_LOADNAME_LEN]; > #endif >-} __aligned(64); >+}; > > /* Idle thread states and config. */ > #define TDQ_RUNNING 1 > #define TDQ_IDLE 2 > >+/* Lockless accessors. 
*/ >+#define TDQ_LOAD(tdq) atomic_load_int(&(tdq)->tdq_load) >+#define TDQ_TRANSFERABLE(tdq) atomic_load_int(&(tdq)->tdq_transferable) >+#define TDQ_SWITCHCNT(tdq) (atomic_load_short(&(tdq)->tdq_switchcnt) + \ >+ atomic_load_short(&(tdq)->tdq_oldswitchcnt)) >+#define TDQ_SWITCHCNT_INC(tdq) (atomic_store_short(&(tdq)->tdq_switchcnt, \ >+ atomic_load_short(&(tdq)->tdq_switchcnt) + 1)) >+ > #ifdef SMP > struct cpu_group __read_mostly *cpu_top; /* CPU topology */ > >@@ -306,6 +322,7 @@ static struct tdq tdq_cpu; > #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) > #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) > >+static void sched_setpreempt(int); > static void sched_priority(struct thread *); > static void sched_thread_priority(struct thread *, u_char); > static int sched_interact_score(struct thread *); >@@ -321,18 +338,18 @@ static void tdq_load_rem(struct tdq *, struct thread *); > static __inline void tdq_runq_add(struct tdq *, struct thread *, int); > static __inline void tdq_runq_rem(struct tdq *, struct thread *); > static inline int sched_shouldpreempt(int, int, int); >-void tdq_print(int cpu); >+static void tdq_print(int cpu); > static void runq_print(struct runq *rq); >-static void tdq_add(struct tdq *, struct thread *, int); >+static int tdq_add(struct tdq *, struct thread *, int); > #ifdef SMP >-static struct thread *tdq_move(struct tdq *, struct tdq *); >+static int tdq_move(struct tdq *, struct tdq *); > static int tdq_idled(struct tdq *); >-static void tdq_notify(struct tdq *, struct thread *); >+static void tdq_notify(struct tdq *, int lowpri); > static struct thread *tdq_steal(struct tdq *, int); > static struct thread *runq_steal(struct runq *, int); > static int sched_pickcpu(struct thread *, int); > static void sched_balance(void); >-static int sched_balance_pair(struct tdq *, struct tdq *); >+static bool sched_balance_pair(struct tdq *, struct tdq *); > static inline struct tdq *sched_setcpu(struct thread *, int, int); > static inline void thread_unblock_switch(struct thread *, struct mtx *); > static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); >@@ -396,7 +413,7 @@ runq_print(struct runq *rq) > /* > * Print the status of a per-cpu thread queue. Should be a ddb show cmd. > */ >-void >+static void __unused > tdq_print(int cpu) > { > struct tdq *tdq; >@@ -606,7 +623,7 @@ tdq_setlowpri(struct tdq *tdq, struct thread *ctd) > > TDQ_LOCK_ASSERT(tdq, MA_OWNED); > if (ctd == NULL) >- ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; >+ ctd = tdq->tdq_curthread; > td = tdq_choose(tdq); > if (td == NULL || td->td_priority > ctd->td_priority) > tdq->tdq_lowpri = ctd->td_priority; >@@ -697,7 +714,7 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, > if (!CPU_ISSET(c, &cg->cg_mask)) > continue; > tdq = TDQ_CPU(c); >- l = tdq->tdq_load; >+ l = TDQ_LOAD(tdq); > if (c == s->cs_prefer) { > if (__predict_false(s->cs_running)) > l--; >@@ -712,7 +729,8 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, > * If the threads is already on the CPU, don't look on the TDQ > * priority, since it can be the priority of the thread itself. 
> */ >- if (l > s->cs_load || (tdq->tdq_lowpri <= s->cs_pri && >+ if (l > s->cs_load || >+ (atomic_load_char(&tdq->tdq_lowpri) <= s->cs_pri && > (!s->cs_running || c != s->cs_prefer)) || > !CPU_ISSET(c, s->cs_mask)) > continue; >@@ -767,14 +785,14 @@ cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, > if (!CPU_ISSET(c, &cg->cg_mask)) > continue; > tdq = TDQ_CPU(c); >- l = tdq->tdq_load; >+ l = TDQ_LOAD(tdq); > load = l * 256; > total += load; > > /* > * Check this CPU is acceptable. > */ >- if (l < s->cs_load || (tdq->tdq_transferable < s->cs_trans) || >+ if (l < s->cs_load || TDQ_TRANSFERABLE(tdq) < s->cs_trans || > !CPU_ISSET(c, s->cs_mask)) > continue; > >@@ -846,13 +864,13 @@ sched_balance_group(struct cpu_group *cg) > if (CPU_EMPTY(&lmask)) > break; > tdq = TDQ_CPU(high); >- if (tdq->tdq_load == 1) { >+ if (TDQ_LOAD(tdq) == 1) { > /* > * There is only one running thread. We can't move > * it from here, so tell it to pick new CPU by itself. > */ > TDQ_LOCK(tdq); >- td = pcpu_find(high)->pc_curthread; >+ td = tdq->tdq_curthread; > if ((td->td_flags & TDF_IDLETD) == 0 && > THREAD_CAN_MIGRATE(td)) { > td->td_flags |= TDF_NEEDRESCHED | TDF_PICKCPU; >@@ -864,9 +882,9 @@ sched_balance_group(struct cpu_group *cg) > } > anylow = 1; > nextlow: >- if (tdq->tdq_transferable == 0) >+ if (TDQ_TRANSFERABLE(tdq) == 0) > continue; >- low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1); >+ low = sched_lowest(cg, &lmask, -1, TDQ_LOAD(tdq) - 1, high, 1); > /* Stop if we looked well and found no less loaded CPU. */ > if (anylow && low == -1) > break; >@@ -929,37 +947,49 @@ tdq_unlock_pair(struct tdq *one, struct tdq *two) > } > > /* >- * Transfer load between two imbalanced thread queues. >+ * Transfer load between two imbalanced thread queues. Returns true if a thread >+ * was moved between the queues, and false otherwise. > */ >-static int >+static bool > sched_balance_pair(struct tdq *high, struct tdq *low) > { >- struct thread *td; >- int cpu; >+ int cpu, lowpri; >+ bool ret; > >+ ret = false; > tdq_lock_pair(high, low); >- td = NULL; >+ > /* > * Transfer a thread from high to low. > */ >- if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && >- (td = tdq_move(high, low)) != NULL) { >- /* >- * In case the target isn't the current cpu notify it of the >- * new load, possibly sending an IPI to force it to reschedule. >- */ >- cpu = TDQ_ID(low); >- if (cpu != PCPU_GET(cpuid)) >- tdq_notify(low, td); >+ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load) { >+ lowpri = tdq_move(high, low); >+ if (lowpri != -1) { >+ /* >+ * In case the target isn't the current CPU notify it of >+ * the new load, possibly sending an IPI to force it to >+ * reschedule. Otherwise maybe schedule a preemption. >+ */ >+ cpu = TDQ_ID(low); >+ if (cpu != PCPU_GET(cpuid)) >+ tdq_notify(low, lowpri); >+ else >+ sched_setpreempt(low->tdq_lowpri); >+ ret = true; >+ } > } > tdq_unlock_pair(high, low); >- return (td != NULL); >+ return (ret); > } > > /* >- * Move a thread from one thread queue to another. >+ * Move a thread from one thread queue to another. Returns -1 if the source >+ * queue was empty, else returns the maximum priority of all threads in >+ * the destination queue prior to the addition of the new thread. In the latter >+ * case, this priority can be used to determine whether an IPI needs to be >+ * delivered. 
> */ >-static struct thread * >+static int > tdq_move(struct tdq *from, struct tdq *to) > { > struct thread *td; >@@ -973,7 +1003,7 @@ tdq_move(struct tdq *from, struct tdq *to) > cpu = TDQ_ID(to); > td = tdq_steal(tdq, cpu); > if (td == NULL) >- return (NULL); >+ return (-1); > > /* > * Although the run queue is locked the thread may be >@@ -984,9 +1014,7 @@ tdq_move(struct tdq *from, struct tdq *to) > THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from)); > td->td_lock = TDQ_LOCKPTR(to); > td_get_sched(td)->ts_cpu = cpu; >- tdq_add(to, td, SRQ_YIELDING); >- >- return (td); >+ return (tdq_add(to, td, SRQ_YIELDING)); > } > > /* >@@ -1005,15 +1033,15 @@ tdq_idled(struct tdq *tdq) > return (1); > CPU_FILL(&mask); > CPU_CLR(PCPU_GET(cpuid), &mask); >- restart: >- switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; >+restart: >+ switchcnt = TDQ_SWITCHCNT(tdq); > for (cg = tdq->tdq_cg, goup = 0; ; ) { > cpu = sched_highest(cg, &mask, steal_thresh, 1); > /* > * We were assigned a thread but not preempted. Returning > * 0 here will cause our caller to switch to it. > */ >- if (tdq->tdq_load) >+ if (TDQ_LOAD(tdq)) > return (0); > > /* >@@ -1049,8 +1077,8 @@ tdq_idled(struct tdq *tdq) > * this situation about 20% of the time on an 8 core > * 16 thread Ryzen 7, but it still helps performance. > */ >- if (steal->tdq_load < steal_thresh || >- steal->tdq_transferable == 0) >+ if (TDQ_LOAD(steal) < steal_thresh || >+ TDQ_TRANSFERABLE(steal) == 0) > goto restart; > /* > * Try to lock both queues. If we are assigned a thread while >@@ -1075,16 +1103,16 @@ tdq_idled(struct tdq *tdq) > * of date. The latter is rare. In either case restart > * the search. > */ >- if (steal->tdq_load < steal_thresh || >- steal->tdq_transferable == 0 || >- switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) { >+ if (TDQ_LOAD(steal) < steal_thresh || >+ TDQ_TRANSFERABLE(steal) == 0 || >+ switchcnt != TDQ_SWITCHCNT(tdq)) { > tdq_unlock_pair(tdq, steal); > goto restart; > } > /* > * Steal the thread and switch to it. > */ >- if (tdq_move(steal, tdq) != NULL) >+ if (tdq_move(steal, tdq) != -1) > break; > /* > * We failed to acquire a thread even though it looked >@@ -1104,20 +1132,27 @@ tdq_idled(struct tdq *tdq) > > /* > * Notify a remote cpu of new work. Sends an IPI if criteria are met. >+ * >+ * "lowpri" is the minimum scheduling priority among all threads on >+ * the queue prior to the addition of the new thread. > */ > static void >-tdq_notify(struct tdq *tdq, struct thread *td) >+tdq_notify(struct tdq *tdq, int lowpri) > { >- struct thread *ctd; >- int pri; > int cpu; > >+ TDQ_LOCK_ASSERT(tdq, MA_OWNED); >+ KASSERT(tdq->tdq_lowpri <= lowpri, >+ ("tdq_notify: lowpri %d > tdq_lowpri %d", lowpri, tdq->tdq_lowpri)); >+ > if (tdq->tdq_owepreempt) > return; >- cpu = td_get_sched(td)->ts_cpu; >- pri = td->td_priority; >- ctd = pcpu_find(cpu)->pc_curthread; >- if (!sched_shouldpreempt(pri, ctd->td_priority, 1)) >+ >+ /* >+ * Check to see if the newly added thread should preempt the one >+ * currently running. >+ */ >+ if (!sched_shouldpreempt(tdq->tdq_lowpri, lowpri, 1)) > return; > > /* >@@ -1127,14 +1162,15 @@ tdq_notify(struct tdq *tdq, struct thread *td) > */ > atomic_thread_fence_seq_cst(); > >- if (TD_IS_IDLETHREAD(ctd)) { >- /* >- * If the MD code has an idle wakeup routine try that before >- * falling back to IPI. >- */ >- if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu)) >- return; >- } >+ /* >+ * Try to figure out if we can signal the idle thread instead of sending >+ * an IPI. 
This check is racy; at worst, we will deliever an IPI >+ * unnecessarily. >+ */ >+ cpu = TDQ_ID(tdq); >+ if (TD_IS_IDLETHREAD(tdq->tdq_curthread) && >+ (atomic_load_int(&tdq->tdq_cpu_idle) == 0 || cpu_idle_wakeup(cpu))) >+ return; > > /* > * The run queues have been updated, so any switch on the remote CPU >@@ -1326,13 +1362,15 @@ sched_pickcpu(struct thread *td, int flags) > * expired and it is idle, run it there. > */ > if (THREAD_CAN_SCHED(td, ts->ts_cpu) && >- tdq->tdq_lowpri >= PRI_MIN_IDLE && >+ atomic_load_int(&tdq->tdq_lowpri) >= PRI_MIN_IDLE && > SCHED_AFFINITY(ts, CG_SHARE_L2)) { > if (cg->cg_flags & CG_FLAG_THREAD) { > /* Check all SMT threads for being idle. */ > for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) { >+ pri = >+ atomic_load_char(&TDQ_CPU(cpu)->tdq_lowpri); > if (CPU_ISSET(cpu, &cg->cg_mask) && >- TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) >+ pri < PRI_MIN_IDLE) > break; > } > if (cpu > cg->cg_last) { >@@ -1403,8 +1441,8 @@ sched_pickcpu(struct thread *td, int flags) > */ > tdq = TDQ_CPU(cpu); > if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri && >- tdq->tdq_lowpri < PRI_MIN_IDLE && >- TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) { >+ atomic_load_char(&tdq->tdq_lowpri) < PRI_MIN_IDLE && >+ TDQ_LOAD(TDQ_SELF()) <= TDQ_LOAD(tdq) + 1) { > SCHED_STAT_INC(pickcpu_local); > cpu = self; > } >@@ -1507,6 +1545,7 @@ sched_setup(void *dummy) > TDQ_LOCK(tdq); > thread0.td_lock = TDQ_LOCKPTR(tdq); > tdq_load_add(tdq, &thread0); >+ tdq->tdq_curthread = &thread0; > tdq->tdq_lowpri = thread0.td_priority; > TDQ_UNLOCK(tdq); > } >@@ -2001,7 +2040,7 @@ tdq_trysteal(struct tdq *tdq) > * If a thread was added while interrupts were disabled don't > * steal one here. > */ >- if (tdq->tdq_load > 0) { >+ if (TDQ_LOAD(tdq) > 0) { > TDQ_LOCK(tdq); > break; > } >@@ -2043,8 +2082,8 @@ tdq_trysteal(struct tdq *tdq) > * At this point unconditionally exit the loop to bound > * the time spent in the critcal section. > */ >- if (steal->tdq_load < steal_thresh || >- steal->tdq_transferable == 0) >+ if (TDQ_LOAD(steal) < steal_thresh || >+ TDQ_TRANSFERABLE(steal) == 0) > continue; > /* > * Try to lock both queues. If we are assigned a thread while >@@ -2061,8 +2100,8 @@ tdq_trysteal(struct tdq *tdq) > * The data returned by sched_highest() is stale and > * the chosen CPU no longer has an eligible thread. > */ >- if (steal->tdq_load < steal_thresh || >- steal->tdq_transferable == 0) { >+ if (TDQ_LOAD(steal) < steal_thresh || >+ TDQ_TRANSFERABLE(steal) == 0) { > TDQ_UNLOCK(steal); > break; > } >@@ -2071,7 +2110,7 @@ tdq_trysteal(struct tdq *tdq) > * bail out and let the idle thread to a more complete search > * outside of a critical section. 
> */ >- if (tdq_move(steal, tdq) == NULL) { >+ if (tdq_move(steal, tdq) == -1) { > TDQ_UNLOCK(steal); > break; > } >@@ -2090,6 +2129,7 @@ static struct mtx * > sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) > { > struct tdq *tdn; >+ int lowpri; > > KASSERT(THREAD_CAN_MIGRATE(td) || > (td_get_sched(td)->ts_flags & TSF_BOUND) != 0, >@@ -2107,8 +2147,8 @@ sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) > */ > TDQ_UNLOCK(tdq); > TDQ_LOCK(tdn); >- tdq_add(tdn, td, flags); >- tdq_notify(tdn, td); >+ lowpri = tdq_add(tdn, td, flags); >+ tdq_notify(tdn, lowpri); > TDQ_UNLOCK(tdn); > TDQ_LOCK(tdq); > #endif >@@ -2162,9 +2202,9 @@ sched_switch(struct thread *td, int flags) > (flags & SW_PREEMPT) != 0; > td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND); > td->td_owepreempt = 0; >- tdq->tdq_owepreempt = 0; >+ atomic_store_char(&tdq->tdq_owepreempt, 0); > if (!TD_IS_IDLETHREAD(td)) >- tdq->tdq_switchcnt++; >+ TDQ_SWITCHCNT_INC(tdq); > > /* > * Always block the thread lock so we can drop the tdq lock early. >@@ -2217,6 +2257,7 @@ sched_switch(struct thread *td, int flags) > * thread-queue locked. > */ > TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); >+ MPASS(td == tdq->tdq_curthread); > newtd = choosethread(); > sched_pctcpu_update(td_get_sched(newtd), 0); > TDQ_UNLOCK(tdq); >@@ -2523,6 +2564,7 @@ sched_clock(struct thread *td, int cnt) > */ > tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; > tdq->tdq_switchcnt = tdq->tdq_load; >+ > /* > * Advance the insert index once for each tick to ensure that all > * threads get a chance to run. >@@ -2579,10 +2621,10 @@ sched_runnable(void) > > tdq = TDQ_SELF(); > if ((curthread->td_flags & TDF_IDLETD) != 0) { >- if (tdq->tdq_load > 0) >+ if (TDQ_LOAD(tdq) > 0) > goto out; > } else >- if (tdq->tdq_load - 1 > 0) >+ if (TDQ_LOAD(tdq) - 1 > 0) > goto out; > load = 0; > out: >@@ -2603,30 +2645,31 @@ sched_choose(void) > tdq = TDQ_SELF(); > TDQ_LOCK_ASSERT(tdq, MA_OWNED); > td = tdq_choose(tdq); >- if (td) { >+ if (td != NULL) { > tdq_runq_rem(tdq, td); > tdq->tdq_lowpri = td->td_priority; >- return (td); >+ } else { >+ tdq->tdq_lowpri = PRI_MAX_IDLE; >+ td = PCPU_GET(idlethread); > } >- tdq->tdq_lowpri = PRI_MAX_IDLE; >- return (PCPU_GET(idlethread)); >+ tdq->tdq_curthread = td; >+ return (td); > } > > /* >- * Set owepreempt if necessary. Preemption never happens directly in ULE, >- * we always request it once we exit a critical section. >+ * Set owepreempt if the currently running thread has lower priority than "pri". >+ * Preemption never happens directly in ULE, we always request it once we exit a >+ * critical section. > */ >-static inline void >-sched_setpreempt(struct thread *td) >+static void >+sched_setpreempt(int pri) > { > struct thread *ctd; > int cpri; >- int pri; >- >- THREAD_LOCK_ASSERT(curthread, MA_OWNED); > > ctd = curthread; >- pri = td->td_priority; >+ THREAD_LOCK_ASSERT(ctd, MA_OWNED); >+ > cpri = ctd->td_priority; > if (pri < cpri) > ctd->td_flags |= TDF_NEEDRESCHED; >@@ -2642,9 +2685,10 @@ sched_setpreempt(struct thread *td) > * thread to it. This is the internal function called when the tdq is > * predetermined. 
> */ >-void >+static int > tdq_add(struct tdq *tdq, struct thread *td, int flags) > { >+ int lowpri; > > TDQ_LOCK_ASSERT(tdq, MA_OWNED); > THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); >@@ -2655,10 +2699,12 @@ tdq_add(struct tdq *tdq, struct thread *td, int flags) > KASSERT(td->td_flags & TDF_INMEM, > ("sched_add: thread swapped out")); > >- if (td->td_priority < tdq->tdq_lowpri) >+ lowpri = tdq->tdq_lowpri; >+ if (td->td_priority < lowpri) > tdq->tdq_lowpri = td->td_priority; > tdq_runq_add(tdq, td, flags); > tdq_load_add(tdq, td); >+ return (lowpri); > } > > /* >@@ -2672,7 +2718,7 @@ sched_add(struct thread *td, int flags) > { > struct tdq *tdq; > #ifdef SMP >- int cpu; >+ int cpu, lowpri; > #endif > > KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", >@@ -2696,11 +2742,11 @@ sched_add(struct thread *td, int flags) > */ > cpu = sched_pickcpu(td, flags); > tdq = sched_setcpu(td, cpu, flags); >- tdq_add(tdq, td, flags); >+ lowpri = tdq_add(tdq, td, flags); > if (cpu != PCPU_GET(cpuid)) >- tdq_notify(tdq, td); >+ tdq_notify(tdq, lowpri); > else if (!(flags & SRQ_YIELDING)) >- sched_setpreempt(td); >+ sched_setpreempt(td->td_priority); > #else > tdq = TDQ_SELF(); > /* >@@ -2714,9 +2760,9 @@ sched_add(struct thread *td, int flags) > else > thread_lock_set(td, TDQ_LOCKPTR(tdq)); > } >- tdq_add(tdq, td, flags); >+ (void)tdq_add(tdq, td, flags); > if (!(flags & SRQ_YIELDING)) >- sched_setpreempt(td); >+ sched_setpreempt(td->td_priority); > #endif > if (!(flags & SRQ_HOLDTD)) > thread_unlock(td); >@@ -2874,10 +2920,10 @@ sched_load(void) > > total = 0; > CPU_FOREACH(i) >- total += TDQ_CPU(i)->tdq_sysload; >+ total += atomic_load_int(&TDQ_CPU(i)->tdq_sysload); > return (total); > #else >- return (TDQ_SELF()->tdq_sysload); >+ return (atomic_load_int(&TDQ_SELF()->tdq_sysload)); > #endif > } > >@@ -2917,18 +2963,18 @@ sched_idletd(void *dummy) > THREAD_NO_SLEEPING(); > oldswitchcnt = -1; > for (;;) { >- if (tdq->tdq_load) { >+ if (TDQ_LOAD(tdq)) { > thread_lock(td); > mi_switch(SW_VOL | SWT_IDLE); > } >- switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; >+ switchcnt = TDQ_SWITCHCNT(tdq); > #ifdef SMP > if (always_steal || switchcnt != oldswitchcnt) { > oldswitchcnt = switchcnt; > if (tdq_idled(tdq) == 0) > continue; > } >- switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; >+ switchcnt = TDQ_SWITCHCNT(tdq); > #else > oldswitchcnt = switchcnt; > #endif >@@ -2941,23 +2987,23 @@ sched_idletd(void *dummy) > */ > if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { > for (i = 0; i < sched_idlespins; i++) { >- if (tdq->tdq_load) >+ if (TDQ_LOAD(tdq)) > break; > cpu_spinwait(); > } > } > > /* If there was context switch during spin, restart it. */ >- switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; >- if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt) >+ switchcnt = TDQ_SWITCHCNT(tdq); >+ if (TDQ_LOAD(tdq) != 0 || switchcnt != oldswitchcnt) > continue; > > /* Run main MD idle handler. */ >- tdq->tdq_cpu_idle = 1; >+ atomic_store_int(&tdq->tdq_cpu_idle, 1); > /* >- * Make sure that tdq_cpu_idle update is globally visible >- * before cpu_idle() read tdq_load. The order is important >- * to avoid race with tdq_notify. >+ * Make sure that the tdq_cpu_idle update is globally visible >+ * before cpu_idle() reads tdq_load. The order is important >+ * to avoid races with tdq_notify(). > */ > atomic_thread_fence_seq_cst(); > /* >@@ -2965,21 +3011,21 @@ sched_idletd(void *dummy) > * threads often enough to make it worthwhile to do so in > * order to avoid calling cpu_idle(). 
> */ >- if (tdq->tdq_load != 0) { >- tdq->tdq_cpu_idle = 0; >+ if (TDQ_LOAD(tdq) != 0) { >+ atomic_store_int(&tdq->tdq_cpu_idle, 0); > continue; > } > cpu_idle(switchcnt * 4 > sched_idlespinthresh); >- tdq->tdq_cpu_idle = 0; >+ atomic_store_int(&tdq->tdq_cpu_idle, 0); > > /* > * Account thread-less hardware interrupts and > * other wakeup reasons equal to context switches. > */ >- switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; >+ switchcnt = TDQ_SWITCHCNT(tdq); > if (switchcnt != oldswitchcnt) > continue; >- tdq->tdq_switchcnt++; >+ TDQ_SWITCHCNT_INC(tdq); > oldswitchcnt++; > } > } >diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c >index 53b32672132a..d7647b2e25ef 100644 >--- a/sys/x86/x86/cpu_machdep.c >+++ b/sys/x86/x86/cpu_machdep.c >@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); > #include "opt_maxmem.h" > #include "opt_mp_watchdog.h" > #include "opt_platform.h" >+#include "opt_sched.h" > #ifdef __i386__ > #include "opt_apic.h" > #endif >@@ -528,32 +529,24 @@ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ > SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, > 0, "Use MONITOR/MWAIT for short idle"); > >-static void >-cpu_idle_acpi(sbintime_t sbt) >+static bool >+cpu_idle_enter(int *statep, int newstate) > { >- int *state; >+ KASSERT(atomic_load_int(statep) == STATE_RUNNING, >+ ("%s: state %d", __func__, atomic_load_int(statep))); > >- state = &PCPU_PTR(monitorbuf)->idle_state; >- atomic_store_int(state, STATE_SLEEPING); >- >- /* See comments in cpu_idle_hlt(). */ >- disable_intr(); >- if (sched_runnable()) >- enable_intr(); >- else if (cpu_idle_hook) >- cpu_idle_hook(sbt); >- else >- acpi_cpu_c1(); >- atomic_store_int(state, STATE_RUNNING); >-} >- >-static void >-cpu_idle_hlt(sbintime_t sbt) >-{ >- int *state; >- >- state = &PCPU_PTR(monitorbuf)->idle_state; >- atomic_store_int(state, STATE_SLEEPING); >+ /* >+ * A fence is needed to prevent reordering of the load in >+ * sched_runnable() with this store to the idle state word. Without it, >+ * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having >+ * added load to the queue, and elide an IPI. Then, sched_runnable() >+ * can observe tdq_load == 0, so the CPU ends up idling with pending >+ * work. >+ */ >+ atomic_store_int(statep, newstate); >+#if defined(SCHED_ULE) && defined(SMP) >+ atomic_thread_fence_seq_cst(); >+#endif > > /* > * Since we may be in a critical section from cpu_idle(), if >@@ -572,35 +565,62 @@ cpu_idle_hlt(sbintime_t sbt) > * interrupt. > */ > disable_intr(); >- if (sched_runnable()) >+ if (sched_runnable()) { > enable_intr(); >- else >- acpi_cpu_c1(); >- atomic_store_int(state, STATE_RUNNING); >+ atomic_store_int(statep, STATE_RUNNING); >+ return (false); >+ } else { >+ return (true); >+ } > } > > static void >-cpu_idle_mwait(sbintime_t sbt) >+cpu_idle_exit(int *statep) >+{ >+ atomic_store_int(statep, STATE_RUNNING); >+} >+ >+static void >+cpu_idle_acpi(sbintime_t sbt) > { > int *state; > > state = &PCPU_PTR(monitorbuf)->idle_state; >- atomic_store_int(state, STATE_MWAIT); >+ if (cpu_idle_enter(state, STATE_SLEEPING)) { >+ if (cpu_idle_hook) >+ cpu_idle_hook(sbt); >+ else >+ acpi_cpu_c1(); >+ cpu_idle_exit(state); >+ } >+} > >- /* See comments in cpu_idle_hlt(). 
*/ >- disable_intr(); >- if (sched_runnable()) { >+static void >+cpu_idle_hlt(sbintime_t sbt) >+{ >+ int *state; >+ >+ state = &PCPU_PTR(monitorbuf)->idle_state; >+ if (cpu_idle_enter(state, STATE_SLEEPING)) { >+ acpi_cpu_c1(); > atomic_store_int(state, STATE_RUNNING); >- enable_intr(); >- return; > } >+} > >- cpu_monitor(state, 0, 0); >- if (atomic_load_int(state) == STATE_MWAIT) >- __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); >- else >- enable_intr(); >- atomic_store_int(state, STATE_RUNNING); >+static void >+cpu_idle_mwait(sbintime_t sbt) >+{ >+ int *state; >+ >+ state = &PCPU_PTR(monitorbuf)->idle_state; >+ if (cpu_idle_enter(state, STATE_MWAIT)) { >+ cpu_monitor(state, 0, 0); >+ if (atomic_load_int(state) == STATE_MWAIT) >+ __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); >+ else >+ enable_intr(); >+ cpu_idle_exit(state); >+ } > } > > static void
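
For reference, the core ordering requirement the patch documents in cpu_idle_enter() and sched_idletd() is a store/fence/load handshake between an idling CPU and tdq_notify(): the idle side publishes its idle state, issues a full fence, then re-checks the run queue, while the waking side publishes the new load, issues a full fence, then reads the idle state. Below is a minimal user-space sketch of that pattern using C11 atomics; the names (idle_enter, notify_needs_wakeup, cpu_idle_state, queue_load) are illustrative stand-ins rather than kernel identifiers, and the locking that the kernel performs around the load update is omitted.

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int cpu_idle_state;	/* stand-in for tdq_cpu_idle / idle_state */
static _Atomic int queue_load;		/* stand-in for tdq_load */

/* Idle side: announce idling, then re-check for work before sleeping. */
static bool
idle_enter(void)
{
	atomic_store_explicit(&cpu_idle_state, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* order the store before the load */
	if (atomic_load_explicit(&queue_load, memory_order_relaxed) != 0) {
		/* Work appeared concurrently; do not sleep. */
		atomic_store_explicit(&cpu_idle_state, 0, memory_order_relaxed);
		return (false);
	}
	/* Safe to sleep: since we saw no load, the waker's check will see state == 1. */
	return (true);
}

/* Waking side: publish new work, then decide whether a wakeup/IPI is needed. */
static bool
notify_needs_wakeup(void)
{
	atomic_fetch_add_explicit(&queue_load, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* order the store before the load */
	return (atomic_load_explicit(&cpu_idle_state, memory_order_relaxed) != 0);
}

With both fences in place at least one side observes the other's store, so the combination "CPU sleeps with pending work and no IPI is sent" cannot occur; without them, both loads may effectively complete before the corresponding stores, which is the race the patch closes.
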
Attachments on bug 264867: 234909 | 235005 | 235023 | 235195