* [PATCH 00/18] sched: balance callbacks v4
@ 2015-06-11 12:46 Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 01/18] sched: Replace post_schedule with a balance callback list Peter Zijlstra
                   ` (18 more replies)
  0 siblings, 19 replies; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

Mike stumbled over a cute bug caused by the RT/DL balancing ops.

The exact scenario is __sched_setscheduler() changing a (runnable) task from
FIFO to OTHER. In switched_from_rt(), where we do pull_rt_task(), we temporarily
drop rq->lock. This gap allows regular CFS load-balancing to step in and
migrate our task.

However, check_class_changed() will happily continue with switched_to_fair()
which assumes our task is still on the old rq and makes the kernel go boom.

Instead of trying to patch this up and making things complicated, simply disallow
these methods from dropping rq->lock, extend the current post_schedule stuff into
a balance callback list, and use that.

This survives Mike's testcase.
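
For readers new to the pattern, below is a minimal user-space sketch of the
callback-list idea (my simplification, not the kernel code; the real
queue_balance_callback()/balance_callback() in patch 01 run under rq->lock and
also guard against double-queueing a head):

#include <stdio.h>

struct rq;

struct callback_head {
	struct callback_head *next;
	void (*func)(struct rq *rq);
};

struct rq {
	struct callback_head *balance_callback;	/* LIFO list; in the kernel: protected by rq->lock */
};

/* Queue balancing work; the kernel version is called with rq->lock held. */
static void queue_balance_callback(struct rq *rq, struct callback_head *head,
				   void (*func)(struct rq *rq))
{
	head->func = func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

/* Run and clear the queued work; the kernel version re-takes rq->lock. */
static void balance_callback(struct rq *rq)
{
	struct callback_head *head = rq->balance_callback;

	rq->balance_callback = NULL;
	while (head) {
		struct callback_head *next = head->next;

		head->func(rq);
		head = next;
	}
}

static void push_tasks(struct rq *rq)
{
	(void)rq;
	printf("pushing tasks -- rq->lock was never dropped in between\n");
}

int main(void)
{
	struct rq rq = { .balance_callback = NULL };
	struct callback_head cb = { 0 };

	queue_balance_callback(&rq, &cb, push_tasks);	/* e.g. from switched_from_rt() */
	balance_callback(&rq);				/* e.g. after __sched_setscheduler() */
	return 0;
}

The point is that sched_class methods only queue work while holding rq->lock;
the actual pushing/pulling runs later, at a point where dropping and re-taking
the lock is safe.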

Changes since -v3:
 - reworked the hrtimer stuff, again. -- Kirill, Oleg
 - small changes to the new lockdep stuff

Changes since -v2:
 - reworked the hrtimer patch. -- Kirill, tglx
 - added lock pinning

Changes since -v1:
 - make SMP=n build,
 - cured switched_from_dl()'s cancel_dl_timer().

No real tests on the new parts other than booting / building kernels.





* [PATCH 01/18] sched: Replace post_schedule with a balance callback list
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-11 15:32   ` Kirill Tkhai
  2015-06-18 23:00   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 02/18] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
                   ` (17 subsequent siblings)
  18 siblings, 2 replies; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-1.patch --]
[-- Type: text/plain, Size: 7096 bytes --]

Generalize the post_schedule() stuff into a balance callback list.
This allows us to more easily use it outside of schedule() and across
sched_classes.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   36 ++++++++++++++++++++++++------------
 kernel/sched/deadline.c |   21 +++++++++++----------
 kernel/sched/rt.c       |   25 +++++++++++--------------
 kernel/sched/sched.h    |   19 +++++++++++++++++--
 4 files changed, 63 insertions(+), 38 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(str
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-	if (rq->post_schedule) {
-		unsigned long flags;
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
 
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->curr->sched_class->post_schedule)
-			rq->curr->sched_class->post_schedule(rq);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
 
-		rq->post_schedule = 0;
+		func(rq);
 	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
@@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(
 	/* finish_task_switch() drops rq->lock and enables preemtion */
 	preempt_disable();
 	rq = finish_task_switch(prev);
-	post_schedule(rq);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -2822,7 +2834,7 @@ static void __sched __schedule(void)
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
-	post_schedule(rq);
+	balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -7216,7 +7228,7 @@ void __init sched_init(void)
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-		rq->post_schedule = 0;
+		rq->balance_callback = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+
+static void push_dl_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	rq->post_schedule = has_pushable_dl_tasks(rq);
+	if (!has_pushable_dl_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -296,7 +303,7 @@ static inline int pull_dl_task(struct rq
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(st
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1544,11 +1551,6 @@ static int pull_dl_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_dl(struct rq *rq)
-{
-	push_dl_tasks(rq);
-}
-
 /*
  * Since the task is not running and a reschedule is not going to happen
  * anytime soon on its runqueue, we try pushing it away now.
@@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
-	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,13 +354,16 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+
+static void push_rt_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	/*
-	 * We detect this state here so that we can avoid taking the RQ
-	 * lock again later if there is no need to push
-	 */
-	rq->post_schedule = has_pushable_tasks(rq);
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -2042,11 +2045,6 @@ static int pull_rt_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_rt(struct rq *rq)
-{
-	push_rt_tasks(rq);
-}
-
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
@@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
-	.post_schedule		= post_schedule_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
 #endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
+	struct callback_head *balance_callback;
+
 	unsigned char idle_balance;
 	/* For active balancing */
-	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
@@ -767,6 +768,21 @@ extern int migrate_swap(struct task_stru
 
 #ifdef CONFIG_SMP
 
+static inline void
+queue_balance_callback(struct rq *rq,
+		       struct callback_head *head,
+		       void (*func)(struct rq *rq))
+{
+	lockdep_assert_held(&rq->lock);
+
+	if (unlikely(head->next))
+		return;
+
+	head->func = (void (*)(struct callback_head *))func;
+	head->next = rq->balance_callback;
+	rq->balance_callback = head;
+}
+
 extern void sched_ttwu_pending(void);
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -1192,7 +1208,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
-	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 




* [PATCH 02/18] sched: Use replace normalize_task() with __sched_setscheduler()
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 01/18] sched: Replace post_schedule with a balance callback list Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:00   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 03/18] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
                   ` (16 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-6.patch --]
[-- Type: text/plain, Size: 3953 bytes --]

Reduce duplicate logic; normalize_task() is a simplified version of
__sched_setscheduler(). Parametrize the difference and collapse.

This reduces the number of check_class_changed() sites.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   65 ++++++++++++++++++----------------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3428,7 +3428,7 @@ static bool dl_param_changed(struct task
 
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
-				bool user)
+				bool user, bool pi)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3614,18 +3614,20 @@ static int __sched_setscheduler(struct t
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 
-	/*
-	 * Take priority boosted tasks into account. If the new
-	 * effective priority is unchanged, we just store the new
-	 * normal parameters and do not touch the scheduler class and
-	 * the runqueue. This will be done when the task deboost
-	 * itself.
-	 */
-	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-	if (new_effective_prio == oldprio) {
-		__setscheduler_params(p, attr);
-		task_rq_unlock(rq, p, &flags);
-		return 0;
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		if (new_effective_prio == oldprio) {
+			__setscheduler_params(p, attr);
+			task_rq_unlock(rq, p, &flags);
+			return 0;
+		}
 	}
 
 	queued = task_on_rq_queued(p);
@@ -3636,7 +3638,7 @@ static int __sched_setscheduler(struct t
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr, true);
+	__setscheduler(rq, p, attr, pi);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3651,7 +3653,8 @@ static int __sched_setscheduler(struct t
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
 
 	return 0;
 }
@@ -3672,7 +3675,7 @@ static int _sched_setscheduler(struct ta
 		attr.sched_policy = policy;
 	}
 
-	return __sched_setscheduler(p, &attr, check);
+	return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3693,7 +3696,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-	return __sched_setscheduler(p, attr, true);
+	return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -7354,32 +7357,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-	const struct sched_class *prev_class = p->sched_class;
+	struct task_struct *g, *p;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
-	int old_prio = p->prio;
-	int queued;
-
-	queued = task_on_rq_queued(p);
-	if (queued)
-		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr, false);
-	if (queued) {
-		enqueue_task(rq, p, 0);
-		resched_curr(rq);
-	}
-
-	check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7406,9 +7389,7 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		rq = task_rq_lock(p, &flags);
-		normalize_task(rq, p);
-		task_rq_unlock(rq, p, &flags);
+		__sched_setscheduler(p, &attr, false, false);
 	}
 	read_unlock(&tasklist_lock);
 }




* [PATCH 03/18] sched: Allow balance callbacks for check_class_changed()
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 01/18] sched: Replace post_schedule with a balance callback list Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 02/18] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:01   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 04/18] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
                   ` (15 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-7.patch --]
[-- Type: text/plain, Size: 3024 bytes --]

In order to remove dropping rq->lock from the
switched_{to,from}()/prio_changed() sched_class methods, run the
balance callbacks after it.

We need to remove dropping rq->lock because it's buggy; suppose we use
sched_setattr()/sched_setscheduler() to change a running task from FIFO
to OTHER.

By the time we get to switched_from_rt() the task is already enqueued
on the cfs runqueues. If switched_from_rt() does pull_rt_task() and
drops rq->lock, load-balancing can come in and move our task @p to
another rq.

The subsequent switched_to_fair() still assumes @p is on @rq and bad
things will happen.

By using balance callbacks we delay the load-balancing operations
{rt,dl}x{push,pull} until we've done all the important work and the
task is fully set up.

Furthermore, the balance callbacks do not know about @p, therefore
they cannot get confused like this.

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1001,7 +1001,11 @@ inline int task_curr(const struct task_s
 }
 
 /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
@@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
-		/* Possble rq->lock 'hole'.  */
+
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1491,8 +1495,12 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * XXX can drop rq->lock; most likely ok.
+		 */
 		p->sched_class->task_woken(rq, p);
+	}
 
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -3094,7 +3102,11 @@ void rt_mutex_setprio(struct task_struct
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
+	preempt_disable(); /* avoid rq from going away on us */
 	__task_rq_unlock(rq);
+
+	balance_callback(rq);
+	preempt_enable();
 }
 #endif
 
@@ -3655,11 +3667,18 @@ static int __sched_setscheduler(struct t
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
+	preempt_disable(); /* avoid rq from going away on us */
 	task_rq_unlock(rq, p, &flags);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
 
+	/*
+	 * Run balance callbacks after we've adjusted the PI chain.
+	 */
+	balance_callback(rq);
+	preempt_enable();
+
 	return 0;
 }
 




* [PATCH 04/18] sched,rt: Remove return value from pull_rt_task()
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (2 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 03/18] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:01   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 05/18] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
                   ` (14 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-2.patch --]
[-- Type: text/plain, Size: 2408 bytes --]

In order to be able to use pull_rt_task() from a callback, we need to
do away with the return value.

Since the return value indicates whether we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to rt balancing.

Too many reschedules are not a correctness issue; too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -260,7 +260,7 @@ int alloc_rt_sched_group(struct task_gro
 
 #ifdef CONFIG_SMP
 
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
@@ -415,9 +415,8 @@ static inline bool need_pull_rt_task(str
 	return false;
 }
 
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1955,14 +1954,15 @@ static void push_irq_work_func(struct ir
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
+	bool resched = false;
 	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1973,7 +1973,7 @@ static int pull_rt_task(struct rq *this_
 #ifdef HAVE_RT_PUSH_IPI
 	if (sched_feat(RT_PUSH_IPI)) {
 		tell_cpu_to_push(this_rq);
-		return 0;
+		return;
 	}
 #endif
 
@@ -2026,7 +2026,7 @@ static int pull_rt_task(struct rq *this_
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -2042,7 +2042,8 @@ static int pull_rt_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -2138,8 +2139,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	if (pull_rt_task(rq))
-		resched_curr(rq);
+	pull_rt_task(rq);
 }
 
 void __init init_sched_rt_class(void)




* [PATCH 05/18] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (3 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 04/18] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:01   ` [tip:sched/hrtimers] sched, rt: Convert switched_{from, to}_rt() " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 06/18] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
                   ` (13 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-3.patch --]
[-- Type: text/plain, Size: 3009 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_rt() / prio_changed_rt() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,16 +354,23 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 
 static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
+	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -2139,7 +2146,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	pull_rt_task(rq);
+	queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2160,8 +2167,6 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	/*
 	 * If we are already running, then there's nothing
 	 * that needs to be done. But if we are not running
@@ -2171,13 +2176,12 @@ static void switched_to_rt(struct rq *rq
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
-		    /* Don't resched if we changed runqueues */
-		    push_rt_task(rq) && rq != task_rq(p))
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched && p->prio < rq->curr->prio)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
+#endif /* CONFIG_SMP */
 	}
 }
 
@@ -2198,14 +2202,13 @@ prio_changed_rt(struct rq *rq, struct ta
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			pull_rt_task(rq);
+			queue_pull_task(rq);
+
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule. Note, the above pull_rt_task
-		 * can release the rq lock and p could migrate.
-		 * Only reschedule if p is still on the same runqueue.
+		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */




* [PATCH 06/18] sched,dl: Remove return value from pull_dl_task()
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (4 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 05/18] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 07/18] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
                   ` (12 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-4.patch --]
[-- Type: text/plain, Size: 2135 bytes --]

In order to be able to use pull_dl_task() from a callback, we need to
do away with the return value.

Since the return value indicates whether we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to dl balancing.

Too many reschedules are not a correctness issue; too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -298,9 +298,8 @@ static inline bool need_pull_dl_task(str
 	return false;
 }
 
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1049,7 +1048,7 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static int pull_dl_task(struct rq *this_rq);
+static void pull_dl_task(struct rq *this_rq);
 
 #endif /* CONFIG_SMP */
 
@@ -1480,15 +1479,16 @@ static void push_dl_tasks(struct rq *rq)
 		;
 }
 
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
 	struct task_struct *p;
+	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
 
 	if (likely(!dl_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1543,7 +1543,7 @@ static int pull_dl_task(struct rq *this_
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -1556,7 +1556,8 @@ static int pull_dl_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -1712,8 +1713,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	if (pull_dl_task(rq))
-		resched_curr(rq);
+	pull_dl_task(rq);
 }
 
 /*




* [PATCH 07/18] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (5 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 06/18] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:02   ` [tip:sched/hrtimers] sched, dl: Convert switched_{from, to}_dl() " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 08/18] hrtimer: Remove HRTIMER_STATE_MIGRATE Peter Zijlstra
                   ` (11 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-post_schedule-5.patch --]
[-- Type: text/plain, Size: 3223 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_dl() / prio_changed_dl() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,16 +213,23 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 
 static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_dl_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
+	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -305,6 +312,10 @@ static inline void pull_dl_task(struct r
 static inline void queue_push_tasks(struct rq *rq)
 {
 }
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -1048,8 +1059,6 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static void pull_dl_task(struct rq *this_rq);
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1713,7 +1722,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	pull_dl_task(rq);
+	queue_pull_task(rq);
 }
 
 /*
@@ -1722,21 +1731,16 @@ static void switched_from_dl(struct rq *
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
-			push_dl_task(rq) && rq != task_rq(p))
-			/* Only reschedule if pushing failed */
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched) {
-			if (dl_task(rq->curr))
-				check_preempt_curr_dl(rq, p, 0);
-			else
-				resched_curr(rq);
-		}
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (dl_task(rq->curr))
+			check_preempt_curr_dl(rq, p, 0);
+		else
+			resched_curr(rq);
+#endif
 	}
 }
 
@@ -1756,15 +1760,14 @@ static void prio_changed_dl(struct rq *r
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			pull_dl_task(rq);
+			queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
 		 * then reschedule, provided p is still on this
 		 * runqueue.
 		 */
-		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
-		    rq->curr == p)
+		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
 			resched_curr(rq);
 #else
 		/*




* [PATCH 08/18] hrtimer: Remove HRTIMER_STATE_MIGRATE
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (6 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 07/18] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 22:18   ` [tip:timers/core] " tip-bot for Oleg Nesterov
  2015-06-11 12:46 ` [PATCH 09/18] hrtimer: Fix hrtimer_is_queued() hole Peter Zijlstra
                   ` (10 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: oleg_nesterov-hrtimer-allow_hrtimer__function_to_free_the_timer.patch --]
[-- Type: text/plain, Size: 2199 bytes --]

I do not understand HRTIMER_STATE_MIGRATE. Unless I am totally
confused, it looks buggy and simply unneeded.

migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but this
is not enough: this can fool, say, hrtimer_is_queued() in
dequeue_signal().

Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED?
This fixes the race and we can kill STATE_MIGRATE.
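
To make the inconsistency concrete, here is a stand-alone sketch (user-space,
not kernel code, using the pre-patch state values; in this era hrtimer_active()
tests state != INACTIVE while hrtimer_is_queued() tests state & ENQUEUED):

#include <stdio.h>

/* Pre-patch state values, copied for illustration only. */
#define HRTIMER_STATE_INACTIVE	0x00
#define HRTIMER_STATE_ENQUEUED	0x01
#define HRTIMER_STATE_MIGRATE	0x04

int main(void)
{
	/* What migrate_hrtimer_list() used to set while moving a timer: */
	unsigned long state = HRTIMER_STATE_MIGRATE;

	printf("hrtimer_active():    %d\n", state != HRTIMER_STATE_INACTIVE);	/* 1 */
	printf("hrtimer_is_queued(): %d\n", !!(state & HRTIMER_STATE_ENQUEUED));	/* 0 */

	/* With this patch the timer stays ENQUEUED across the migration: */
	state = HRTIMER_STATE_ENQUEUED;

	printf("hrtimer_active():    %d\n", state != HRTIMER_STATE_INACTIVE);	/* 1 */
	printf("hrtimer_is_queued(): %d\n", !!(state & HRTIMER_STATE_ENQUEUED));	/* 1 */
	return 0;
}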

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/hrtimer.h |    6 +-----
 kernel/time/hrtimer.c   |    7 ++-----
 2 files changed, 3 insertions(+), 10 deletions(-)

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
  * the handling of the timer.
  *
  * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1510,11 +1510,11 @@ static void migrate_hrtimer_list(struct
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1525,9 +1525,6 @@ static void migrate_hrtimer_list(struct
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 




* [PATCH 09/18] hrtimer: Fix hrtimer_is_queued() hole
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (7 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 08/18] hrtimer: Remove HRTIMER_STATE_MIGRATE Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 22:18   ` [tip:timers/core] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 10/18] seqcount: Rename write_seqcount_barrier() Peter Zijlstra
                   ` (9 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-hrtimer-fix-restart.patch --]
[-- Type: text/plain, Size: 2226 bytes --]

A queued hrtimer that gets restarted (hrtimer_start*() while
hrtimer_is_queued()) will briefly appear as unqueued/inactive, even
though the timer has always been active; we merely moved it.

Close this hole by preserving timer->state in
hrtimer_start_range_ns()'s remove_hrtimer() call.
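
In terms of the state word, the window looks like this (stand-alone sketch,
not kernel code, using the state values of this series):

#include <stdio.h>

#define HRTIMER_STATE_INACTIVE	0x00
#define HRTIMER_STATE_ENQUEUED	0x01
#define HRTIMER_STATE_CALLBACK	0x02

int main(void)
{
	/* A queued timer whose callback is not running: */
	unsigned long state = HRTIMER_STATE_ENQUEUED;

	/* Old remove_hrtimer(): keep only the CALLBACK bit... */
	unsigned long old = state & HRTIMER_STATE_CALLBACK;
	/* ...new restart path: preserve the state as-is. */
	unsigned long new = state;

	printf("old: 0x%02lx -> transiently looks inactive\n", old);
	printf("new: 0x%02lx -> still looks queued/active\n", new);
	return 0;
}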

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/time/hrtimer.c |   23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -891,10 +891,10 @@ static void __remove_hrtimer(struct hrti
  * remove hrtimer, called with base lock held
  */
 static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
+		unsigned long state = timer->state;
 		int reprogram;
 
 		/*
@@ -908,12 +908,15 @@ remove_hrtimer(struct hrtimer *timer, st
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
-		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
+
+		if (!restart) {
+			/*
+			 * We must preserve the CALLBACK state flag here,
+			 * otherwise we could move the timer base in
+			 * switch_hrtimer_base.
+			 */
+			state &= HRTIMER_STATE_CALLBACK;
+		}
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -938,7 +941,7 @@ void hrtimer_start_range_ns(struct hrtim
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base, true);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -1007,7 +1010,7 @@ int hrtimer_try_to_cancel(struct hrtimer
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base);
+		ret = remove_hrtimer(timer, base, false);
 
 	unlock_hrtimer_base(timer, &flags);
 




* [PATCH 10/18] seqcount: Rename write_seqcount_barrier()
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (8 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 09/18] hrtimer: Fix hrtimer_is_queued() hole Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 22:19   ` [tip:timers/core] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier() Peter Zijlstra
                   ` (8 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz, Paul McKenney, Al Viro, Linus Torvalds

[-- Attachment #1: peterz-seqcount-rename-barrier.patch --]
[-- Type: text/plain, Size: 3255 bytes --]

I'll shortly be introducing another seqcount primitive that's useful
to provide ordering semantics and would like to use the
write_seqcount_barrier() name for that.

Seeing how there's only one user of the current primitive, let's rename
it to invalidate, as that appears to be what it's doing.

While there, employ lockdep_assert_held() instead of
assert_spin_locked() to not generate debug code for regular kernels.

Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 fs/dcache.c             |   16 ++++++++--------
 include/linux/seqlock.h |    6 +++---
 2 files changed, 11 insertions(+), 11 deletions(-)

--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -322,17 +322,17 @@ static void dentry_free(struct dentry *d
 }
 
 /**
- * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
  * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
  */
-static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
 {
-	assert_spin_locked(&dentry->d_lock);
-	/* Go through a barrier */
-	write_seqcount_barrier(&dentry->d_seq);
+	lockdep_assert_held(&dentry->d_lock);
+	/* Go through am invalidation barrier */
+	write_seqcount_invalidate(&dentry->d_seq);
 }
 
 /*
@@ -372,7 +372,7 @@ static void dentry_unlink_inode(struct d
 	struct inode *inode = dentry->d_inode;
 	__d_clear_type_and_inode(dentry);
 	hlist_del_init(&dentry->d_u.d_alias);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&inode->i_lock);
 	if (!inode->i_nlink)
@@ -494,7 +494,7 @@ void __d_drop(struct dentry *dentry)
 		__hlist_bl_del(&dentry->d_hash);
 		dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
-		dentry_rcuwalk_barrier(dentry);
+		dentry_rcuwalk_invalidate(dentry);
 	}
 }
 EXPORT_SYMBOL(__d_drop);
@@ -1752,7 +1752,7 @@ static void __d_instantiate(struct dentr
 	if (inode)
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
 	__d_set_inode_and_type(dentry, inode, add_flags);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	fsnotify_d_instantiate(dentry, inode);
 }
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -266,13 +266,13 @@ static inline void write_seqcount_end(se
 }
 
 /**
- * write_seqcount_barrier - invalidate in-progress read-side seq operations
+ * write_seqcount_invalidate - invalidate in-progress read-side seq operations
  * @s: pointer to seqcount_t
  *
- * After write_seqcount_barrier, no read-side seq operations will complete
+ * After write_seqcount_invalidate, no read-side seq operations will complete
  * successfully and see data older than this.
  */
-static inline void write_seqcount_barrier(seqcount_t *s)
+static inline void write_seqcount_invalidate(seqcount_t *s)
 {
 	smp_wmb();
 	s->sequence+=2;




* [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (9 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 10/18] seqcount: Rename write_seqcount_barrier() Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-11 15:33   ` Paul E. McKenney
  2015-06-11 12:46 ` [PATCH 12/18] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
                   ` (7 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz, Al Viro, Linus Torvalds, Paul McKenney

[-- Attachment #1: peterz-seqcount-new-barrier.patch --]
[-- Type: text/plain, Size: 1750 bytes --]

Introduce raw_write_seqcount_barrier(), a new construct that can be
used to provide write barrier semantics in seqcount read loops instead
of the usual consistency guarantee.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/seqlock.h |   42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -233,6 +233,48 @@ static inline void raw_write_seqcount_en
 	s->sequence++;
 }
 
+/**
+ * raw_write_seqcount_barrier - do a seq write barrier
+ * @s: pointer to seqcount_t
+ *
+ * This can be used to provide an ordering guarantee instead of the
+ * usual consistency guarantee. It is one wmb cheaper, because we can
+ * collapse the two back-to-back wmb()s.
+ *
+ *      seqcount_t seq;
+ *      bool X = true, Y = false;
+ *
+ *      void read(void)
+ *      {
+ *              bool x, y;
+ *
+ *              do {
+ *                      int s = read_seqcount_begin(&seq);
+ *
+ *                      x = X; y = Y;
+ *
+ *              } while (read_seqcount_retry(&seq, s));
+ *
+ *              BUG_ON(!x && !y);
+ *      }
+ *
+ *      void write(void)
+ *      {
+ *              Y = true;
+ *
+ *              write_seqcount_begin(seq);
+ *              write_seqcount_end(seq);
+ *
+ *              X = false;
+ *      }
+ */
+static inline void raw_write_seqcount_barrier(seqcount_t *s)
+{
+	s->sequence++;
+	smp_wmb();
+	s->sequence++;
+}
+
 /*
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t




* [PATCH 12/18] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (10 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier() Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 22:19   ` [tip:timers/core] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 13/18] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
                   ` (6 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz, Al Viro, Linus Torvalds, Paul McKenney

[-- Attachment #1: peterz-hrtimer-base-running.patch --]
[-- Type: text/plain, Size: 10456 bytes --]

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would then result in an obvious use-after-free.

Solve this by using a scheme similar to regular timers; track the
currently running timer in hrtimer_cpu_base::running.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/hrtimer.h |   41 ++++++-----------
 kernel/time/hrtimer.c   |  114 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 107 insertions(+), 48 deletions(-)

--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,30 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(con
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-	return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(stru
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -111,6 +112,18 @@ static inline int hrtimer_clockid_to_bas
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base	migration_cpu_base.clock_base[0]
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_bas
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +207,8 @@ switch_hrtimer_base(struct hrtimer *time
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -840,11 +853,7 @@ static int enqueue_hrtimer(struct hrtime
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -909,14 +918,9 @@ remove_hrtimer(struct hrtimer *timer, st
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
-		if (!restart) {
-			/*
-			 * We must preserve the CALLBACK state flag here,
-			 * otherwise we could move the timer base in
-			 * switch_hrtimer_base.
-			 */
-			state &= HRTIMER_STATE_CALLBACK;
-		}
+		if (!restart)
+			state = HRTIMER_STATE_INACTIVE;
+
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -1117,6 +1121,51 @@ void hrtimer_init(struct hrtimer *timer,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+
+	do {
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			return true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being ran
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer_clock_base *base,
 			  struct hrtimer *timer, ktime_t *now)
@@ -1124,10 +1173,21 @@ static void __run_hrtimer(struct hrtimer
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1143,7 +1203,7 @@ static void __run_hrtimer(struct hrtimer
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1155,9 +1215,17 @@ static void __run_hrtimer(struct hrtimer
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH 13/18] sched,dl: Fix sched class hopping CBS hole
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (11 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 12/18] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 14/18] sched: Move code around Peter Zijlstra
                   ` (5 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz, Luca Abeni, Juri Lelli

[-- Attachment #1: peterz-sched-dl-cbs-bandwidth-timer.patch --]
[-- Type: text/plain, Size: 9142 bytes --]

We still have a few pending issues with the deadline code, one of which
is that switching between scheduling classes can 'leak' CBS state.

Close the hole by retaining the current CBS state when leaving
SCHED_DEADLINE and unconditionally programming the deadline timer.
The timer will then reset the CBS state if the task is still
!SCHED_DEADLINE by the time it hits.

If the task left SCHED_DEADLINE it will not call task_dead_dl() and
we'll not cancel the hrtimer, leaving us with a pending timer in freed
memory. Avoid this by giving the timer a task reference; this avoids
littering the task exit path for this rather uncommon case.
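
Condensed, the resulting scheme is roughly the following sketch (taken
from the hunks below; locking and error handling omitted, so not a
drop-in):

  /* switched_from_dl(): arm the timer instead of cancelling it */
  if (!start_dl_timer(p))
  	__dl_clear_params(p);

  /* start_dl_timer(): pin the task while the timer is queued */
  if (!hrtimer_is_queued(timer)) {
  	get_task_struct(p);
  	hrtimer_start(timer, act, HRTIMER_MODE_ABS);
  }

  /* dl_task_timer(): reset CBS state if we are still !dl ... */
  if (!dl_task(p))
  	__dl_clear_params(p);

  /* ... and, at the very end of the callback, drop the reference */
  put_task_struct(p);	/* may free the task, and with it this hrtimer */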

In order to do this, I had to move dl_task_offline_migration() below
the replenishment, such that the task_rq()->lock fully covers that.
While doing this, I noticed that it (was) buggy in assuming a task is
enqueued and/or we need to enqueue the task now. Fixing this means
select_task_rq_dl() might encounter an offline rq -- look into that.
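
That is, the offline handling now runs at the tail of dl_task_timer(),
after the replenishment, and hands back whatever (locked) rq the task
ended up on; roughly (see the hunk below):

  if (unlikely(!rq->online))
  	rq = dl_task_offline_migration(rq, p); /* returns the new, locked rq */

  if (has_pushable_dl_tasks(rq))
  	push_dl_task(rq);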

As a result this kills cancel_dl_timer() which included a rq->lock
break.

Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling")
Cc: Wanpeng Li <wanpeng.li@linux.intel.com>
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |  152 +++++++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 66 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -234,7 +234,7 @@ static inline void queue_pull_task(struc
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
 	bool fallback = false;
@@ -268,14 +268,19 @@ static void dl_task_offline_migration(st
 		double_lock_balance(rq, later_rq);
 	}
 
+	/*
+	 * By now the task is replenished and enqueued; migrate it.
+	 */
 	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, ENQUEUE_REPLENISH);
+	activate_task(later_rq, p, 0);
 
 	if (!fallback)
 		resched_curr(later_rq);
 
-	double_unlock_balance(rq, later_rq);
+	double_unlock_balance(later_rq, rq);
+
+	return later_rq;
 }
 
 #else
@@ -515,22 +520,23 @@ static void update_dl_entity(struct sche
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct sched_dl_entity *dl_se = &p->dl;
+	struct hrtimer *timer = &dl_se->dl_timer;
+	struct rq *rq = task_rq(p);
 	ktime_t now, act;
 	s64 delta;
 
-	if (boosted)
-		return 0;
+	lockdep_assert_held(&rq->lock);
+
 	/*
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
 	 */
 	act = ns_to_ktime(dl_se->deadline);
-	now = hrtimer_cb_get_time(&dl_se->dl_timer);
+	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
 
@@ -542,7 +548,19 @@ static int start_dl_timer(struct sched_d
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
+	/*
+	 * !enqueued will guarantee another callback; even if one is already in
+	 * progress. This ensures a balanced {get,put}_task_struct().
+	 *
+	 * The race against __run_timer() clearing the enqueued state is
+	 * harmless because we're holding task_rq()->lock, therefore the timer
+	 * expiring after we've done the check will wait on its task_rq_lock()
+	 * and observe our state.
+	 */
+	if (!hrtimer_is_queued(timer)) {
+		get_task_struct(p);
+		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+	}
 
 	return 1;
 }
@@ -572,35 +590,40 @@ static enum hrtimer_restart dl_task_time
 	rq = task_rq_lock(p, &flags);
 
 	/*
-	 * We need to take care of several possible races here:
-	 *
-	 *   - the task might have changed its scheduling policy
-	 *     to something different than SCHED_DEADLINE
-	 *   - the task might have changed its reservation parameters
-	 *     (through sched_setattr())
-	 *   - the task might have been boosted by someone else and
-	 *     might be in the boosting/deboosting path
+	 * The task might have changed its scheduling policy to something
+	 * different than SCHED_DEADLINE (through switched_from_dl()).
+	 */
+	if (!dl_task(p)) {
+		__dl_clear_params(p);
+		goto unlock;
+	}
+
+	/*
+	 * This is possible if switched_from_dl() raced against a running
+	 * callback that took the above !dl_task() path and we've since then
+	 * switched back into SCHED_DEADLINE.
 	 *
-	 * In all this cases we bail out, as the task is already
-	 * in the runqueue or is going to be enqueued back anyway.
+	 * There's nothing to do except drop our task reference.
 	 */
-	if (!dl_task(p) || dl_se->dl_new ||
-	    dl_se->dl_boosted || !dl_se->dl_throttled)
+	if (dl_se->dl_new)
 		goto unlock;
 
-	sched_clock_tick();
-	update_rq_clock(rq);
+	/*
+	 * The task might have been boosted by someone else and might be in the
+	 * boosting/deboosting path, it's not throttled.
+	 */
+	if (dl_se->dl_boosted)
+		goto unlock;
 
-#ifdef CONFIG_SMP
 	/*
-	 * If we find that the rq the task was on is no longer
-	 * available, we need to select a new rq.
+	 * Spurious timer due to start_dl_timer() race; or we already received
+	 * a replenishment from rt_mutex_setprio().
 	 */
-	if (unlikely(!rq->online)) {
-		dl_task_offline_migration(rq, p);
+	if (!dl_se->dl_throttled)
 		goto unlock;
-	}
-#endif
+
+	sched_clock_tick();
+	update_rq_clock(rq);
 
 	/*
 	 * If the throttle happened during sched-out; like:
@@ -626,17 +649,38 @@ static enum hrtimer_restart dl_task_time
 		check_preempt_curr_dl(rq, p, 0);
 	else
 		resched_curr(rq);
+
 #ifdef CONFIG_SMP
 	/*
-	 * Queueing this task back might have overloaded rq,
-	 * check if we need to kick someone away.
+	 * Perform balancing operations here; after the replenishments.  We
+	 * cannot drop rq->lock before this, otherwise the assertion in
+	 * start_dl_timer() about not missing updates is not true.
+	 *
+	 * If we find that the rq the task was on is no longer available, we
+	 * need to select a new rq.
+	 *
+	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
+	 */
+	if (unlikely(!rq->online))
+		rq = dl_task_offline_migration(rq, p);
+
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
 	 */
 	if (has_pushable_dl_tasks(rq))
 		push_dl_task(rq);
 #endif
+
 unlock:
 	task_rq_unlock(rq, p, &flags);
 
+	/*
+	 * This can free the task_struct, including this hrtimer, do not touch
+	 * anything related to that after this.
+	 */
+	put_task_struct(p);
+
 	return HRTIMER_NORESTART;
 }
 
@@ -696,7 +740,7 @@ static void update_curr_dl(struct rq *rq
 	if (dl_runtime_exceeded(rq, dl_se)) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -1178,7 +1222,6 @@ static void task_fork_dl(struct task_str
 
 static void task_dead_dl(struct task_struct *p)
 {
-	struct hrtimer *timer = &p->dl.dl_timer;
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 	/*
@@ -1188,8 +1231,6 @@ static void task_dead_dl(struct task_str
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
-
-	hrtimer_cancel(timer);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1674,37 +1715,16 @@ void init_sched_dl_class(void)
 
 #endif /* CONFIG_SMP */
 
-/*
- *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
-	struct hrtimer *dl_timer = &p->dl.dl_timer;
-
-	/* Nobody will change task's class if pi_lock is held */
-	lockdep_assert_held(&p->pi_lock);
-
-	if (hrtimer_active(dl_timer)) {
-		int ret = hrtimer_try_to_cancel(dl_timer);
-
-		if (unlikely(ret == -1)) {
-			/*
-			 * Note, p may migrate OR new deadline tasks
-			 * may appear in rq when we are unlocking it.
-			 * A caller of us must be fine with that.
-			 */
-			raw_spin_unlock(&rq->lock);
-			hrtimer_cancel(dl_timer);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-}
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
-	/* XXX we should retain the bw until 0-lag */
-	cancel_dl_timer(rq, p);
-	__dl_clear_params(p);
+	/*
+	 * Start the deadline timer; if we switch back to dl before this we'll
+	 * continue consuming our current CBS slice. If we stay outside of
+	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
+	 * task.
+	 */
+	if (!start_dl_timer(p))
+		__dl_clear_params(p);
 
 	/*
 	 * Since this might be the only -deadline task on the rq,



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH 14/18] sched: Move code around
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (12 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 13/18] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 15/18] sched: Streamline the task migration locking a little Peter Zijlstra
                   ` (4 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-move-smp.patch --]
[-- Type: text/plain, Size: 12382 bytes --]

In preparation for reworking set_cpus_allowed_ptr(), move some code
around. This also removes some superfluous #ifdefs and adds comments
to some #endifs.

   text    data     bss     dec     hex filename
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.pre
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.post

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |  364 +++++++++++++++++++++++++---------------------------
 1 file changed, 178 insertions(+), 186 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1046,6 +1046,180 @@ void check_preempt_curr(struct rq *rq, s
 }
 
 #ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_held(&rq->lock);
+
+	dequeue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, new_cpu);
+	raw_spin_unlock(&rq->lock);
+
+	rq = cpu_rq(new_cpu);
+
+	raw_spin_lock(&rq->lock);
+	BUG_ON(task_cpu(p) != new_cpu);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	enqueue_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+
+	return rq;
+}
+
+struct migration_arg {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	if (unlikely(!cpu_active(dest_cpu)))
+		return ret;
+
+	rq = cpu_rq(src_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/* Already moved. */
+	if (task_cpu(p) != src_cpu)
+		goto done;
+
+	/* Affinity changed (again). */
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+		goto fail;
+
+	/*
+	 * If we're not on a rq, the next wake-up will ensure we're
+	 * placed properly.
+	 */
+	if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+done:
+	ret = 1;
+fail:
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+	return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+	struct migration_arg *arg = data;
+
+	/*
+	 * The original target cpu might have gone down and we might
+	 * be on another cpu but it doesn't matter.
+	 */
+	local_irq_disable();
+	/*
+	 * We need to explicitly wake pending tasks before running
+	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+	 */
+	sched_ttwu_pending();
+	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+	local_irq_enable();
+	return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask);
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	unsigned long flags;
+	struct rq *rq;
+	unsigned int dest_cpu;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &flags);
+
+	if (cpumask_equal(&p->cpus_allowed, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+		struct migration_arg arg = { p, dest_cpu };
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, p, &flags);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		tlb_migrate_finish(p->mm);
+		return 0;
+	} else if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+out:
+	task_rq_unlock(rq, p, &flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1186,13 +1360,6 @@ int migrate_swap(struct task_struct *cur
 	return ret;
 }
 
-struct migration_arg {
-	struct task_struct *task;
-	int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -1325,9 +1492,7 @@ void kick_process(struct task_struct *p)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
 
-#ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
@@ -1432,7 +1597,7 @@ static void update_avg(u64 *avg, u64 sam
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -4767,149 +4932,6 @@ int task_can_attach(struct task_struct *
 }
 
 #ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
-	struct rq *rq = task_rq(p);
-
-	lockdep_assert_held(&rq->lock);
-
-	dequeue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_MIGRATING;
-	set_task_cpu(p, new_cpu);
-	raw_spin_unlock(&rq->lock);
-
-	rq = cpu_rq(new_cpu);
-
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	enqueue_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-
-	return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- *    stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- *    off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- *    it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- *    is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	unsigned long flags;
-	struct rq *rq;
-	unsigned int dest_cpu;
-	int ret = 0;
-
-	rq = task_rq_lock(p, &flags);
-
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
-		goto out;
-
-	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	do_set_cpus_allowed(p, new_mask);
-
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		tlb_migrate_finish(p->mm);
-		return 0;
-	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-out:
-	task_rq_unlock(rq, p, &flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-	struct rq *rq;
-	int ret = 0;
-
-	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
-
-	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
-
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
-}
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
@@ -4957,35 +4979,9 @@ void sched_setnuma(struct task_struct *p
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
-	struct migration_arg *arg = data;
-
-	/*
-	 * The original target cpu might have gone down and we might
-	 * be on another cpu but it doesn't matter.
-	 */
-	local_irq_disable();
-	/*
-	 * We need to explicitly wake pending tasks before running
-	 * __migrate_task() such that we will not miss enforcing cpus_allowed
-	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
-	 */
-	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
-	local_irq_enable();
-	return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -5088,7 +5084,6 @@ static void migrate_tasks(unsigned int d
 
 	rq->stop = stop;
 }
-
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5267,7 +5262,7 @@ static void register_sched_domain_sysctl
 static void unregister_sched_domain_sysctl(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
 
 static void set_rq_online(struct rq *rq)
 {
@@ -5414,9 +5409,6 @@ static int __init migration_init(void)
 	return 0;
 }
 early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
 
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
@@ -6642,7 +6634,7 @@ static int __sdt_alloc(const struct cpum
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
-		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sd)
 				return -ENOMEM;



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH 15/18] sched: Streamline the task migration locking a little
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (13 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 14/18] sched: Move code around Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 16/18] lockdep: Simplify lock_release() Peter Zijlstra
                   ` (3 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-sched-migrate-locking.patch --]
[-- Type: text/plain, Size: 4633 bytes --]

The whole migrate_task{,s}() locking seems a little shaky, there's a
lot of dropping and reacquiring happening. Pull the locking up into the
callers as far as possible to streamline the lot.
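
The caller-side pattern this results in (trimmed from the
migration_cpu_stop() hunk below; @dest_cpu stands in for arg->dest_cpu):

  raw_spin_lock(&p->pi_lock);
  raw_spin_lock(&rq->lock);
  /*
   * With both locks held, task_rq(p) != rq means p cannot be migrated
   * here, and !task_on_rq_queued(p) means it cannot get enqueued behind
   * our back either.
   */
  if (task_rq(p) == rq && task_on_rq_queued(p))
  	rq = __migrate_task(rq, p, dest_cpu); /* returns the (locked) rq p is on */
  raw_spin_unlock(&rq->lock);
  raw_spin_unlock(&p->pi_lock);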

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   76 +++++++++++++++++++++++-----------------------------
 1 file changed, 34 insertions(+), 42 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1065,10 +1065,8 @@ void check_preempt_curr(struct rq *rq, s
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
 {
-	struct rq *rq = task_rq(p);
-
 	lockdep_assert_held(&rq->lock);
 
 	dequeue_task(rq, p, 0);
@@ -1100,41 +1098,19 @@ struct migration_arg {
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
  */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
 {
-	struct rq *rq;
-	int ret = 0;
-
 	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
+		return rq;
 
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
+		return rq;
 
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
+	rq = move_queued_task(rq, p, dest_cpu);
+
+	return rq;
 }
 
 /*
@@ -1145,6 +1121,8 @@ static int __migrate_task(struct task_st
 static int migration_cpu_stop(void *data)
 {
 	struct migration_arg *arg = data;
+	struct task_struct *p = arg->task;
+	struct rq *rq = this_rq();
 
 	/*
 	 * The original target cpu might have gone down and we might
@@ -1157,7 +1135,19 @@ static int migration_cpu_stop(void *data
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
 	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/*
+	 * If task_rq(p) != rq, it cannot be migrated here, because we're
+	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+	 * we're holding p->pi_lock.
+	 */
+	if (task_rq(p) == rq && task_on_rq_queued(p))
+		rq = __migrate_task(rq, p, arg->dest_cpu);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+
 	local_irq_enable();
 	return 0;
 }
@@ -1212,7 +1202,7 @@ int set_cpus_allowed_ptr(struct task_str
 		tlb_migrate_finish(p->mm);
 		return 0;
 	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
+		rq = move_queued_task(rq, p, dest_cpu);
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -5049,9 +5039,9 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
+	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
 
@@ -5073,7 +5063,7 @@ static void migrate_tasks(unsigned int d
 	 */
 	update_rq_clock(rq);
 
-	for ( ; ; ) {
+	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
 		 * remaining thread.
@@ -5086,12 +5076,14 @@ static void migrate_tasks(unsigned int d
 		next->sched_class->put_prev_task(rq, next);
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_cpu, next);
-		raw_spin_unlock(&rq->lock);
-
-		__migrate_task(next, dead_cpu, dest_cpu);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-		raw_spin_lock(&rq->lock);
+		rq = __migrate_task(rq, next, dest_cpu);
+		if (rq != dead_rq) {
+			raw_spin_unlock(&rq->lock);
+			rq = dead_rq;
+			raw_spin_lock(&rq->lock);
+		}
 	}
 
 	rq->stop = stop;
@@ -5343,7 +5335,7 @@ migration_call(struct notifier_block *nf
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
-		migrate_tasks(cpu);
+		migrate_tasks(rq);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH 16/18] lockdep: Simplify lock_release()
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (14 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 15/18] sched: Streamline the task migration locking a little Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 17/18] lockdep: Implement lock pinning Peter Zijlstra
                   ` (2 subsequent siblings)
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peterz-lockdep-simplify-release.patch --]
[-- Type: text/plain, Size: 4879 bytes --]

lock_release() takes this nested argument that's mostly pointless
these days; remove the implementation but leave the argument as a
rudiment for now.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/locking/lockdep.c |  119 +++++++----------------------------------------
 1 file changed, 18 insertions(+), 101 deletions(-)

--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3260,26 +3260,6 @@ print_unlock_imbalance_bug(struct task_s
 	return 0;
 }
 
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
-			unsigned long ip)
-{
-	if (unlikely(!debug_locks))
-		return 0;
-	/*
-	 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
-		return 0;
-
-	if (curr->lockdep_depth <= 0)
-		return print_unlock_imbalance_bug(curr, lock, ip);
-
-	return 1;
-}
-
 static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 {
 	if (hlock->instance == lock)
@@ -3376,31 +3356,35 @@ __lock_set_class(struct lockdep_map *loc
 }
 
 /*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
  */
 static int
-lock_release_non_nested(struct task_struct *curr,
-			struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 {
+	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
 	unsigned int depth;
 	int i;
 
-	/*
-	 * Check whether the lock exists in the current stack
-	 * of held locks:
-	 */
+	if (unlikely(!debug_locks))
+		return 0;
+
 	depth = curr->lockdep_depth;
 	/*
 	 * So we're all set to release this lock.. wait what lock? We don't
 	 * own any locks, you've been drinking again?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth))
-		return 0;
+	if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+		 return print_unlock_imbalance_bug(curr, lock, ip);
 
+	/*
+	 * Check whether the lock exists in the current stack
+	 * of held locks:
+	 */
 	prev_hlock = NULL;
 	for (i = depth-1; i >= 0; i--) {
 		hlock = curr->held_locks + i;
@@ -3456,78 +3440,10 @@ lock_release_non_nested(struct task_stru
 	 */
 	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
 		return 0;
-	return 1;
-}
 
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
-			       struct lockdep_map *lock, unsigned long ip)
-{
-	struct held_lock *hlock;
-	unsigned int depth;
-
-	/*
-	 * Pop off the top of the lock stack:
-	 */
-	depth = curr->lockdep_depth - 1;
-	hlock = curr->held_locks + depth;
-
-	/*
-	 * Is the unlock non-nested:
-	 */
-	if (hlock->instance != lock || hlock->references)
-		return lock_release_non_nested(curr, lock, ip);
-	curr->lockdep_depth--;
-
-	/*
-	 * No more locks, but somehow we've got hash left over, who left it?
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
-		return 0;
-
-	curr->curr_chain_key = hlock->prev_chain_key;
-
-	lock_release_holdtime(hlock);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
-	hlock->prev_chain_key = 0;
-	hlock->class_idx = 0;
-	hlock->acquire_ip = 0;
-	hlock->irq_context = 0;
-#endif
 	return 1;
 }
 
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
-{
-	struct task_struct *curr = current;
-
-	if (!check_unlock(curr, lock, ip))
-		return;
-
-	if (nested) {
-		if (!lock_release_nested(curr, lock, ip))
-			return;
-	} else {
-		if (!lock_release_non_nested(curr, lock, ip))
-			return;
-	}
-
-	check_chain_key(curr);
-}
-
 static int __lock_is_held(struct lockdep_map *lock)
 {
 	struct task_struct *curr = current;
@@ -3639,7 +3555,8 @@ void lock_release(struct lockdep_map *lo
 	check_flags(flags);
 	current->lockdep_recursion = 1;
 	trace_lock_release(lock, ip);
-	__lock_release(lock, nested, ip);
+	if (__lock_release(lock, nested, ip))
+		check_chain_key(current);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH 17/18] lockdep: Implement lock pinning
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (15 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 16/18] lockdep: Simplify lock_release() Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-06-11 12:46 ` [PATCH 18/18] sched,lockdep: Employ " Peter Zijlstra
  2015-12-29  5:41 ` [PATCH 00/18] sched: balance callbacks v4 Byungchul Park
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peter_zijlstra-lockdep-implement_lock_pinning.patch --]
[-- Type: text/plain, Size: 4999 bytes --]

Add a lockdep annotation that WARNs if you 'accidentally' unlock a
lock.

This is especially helpful for code with callbacks, where the upper
layer assumes a lock remains taken but a lower layer thinks it maybe
can drop and reacquire the lock.

By unwittingly breaking up the lock, races can be introduced.

Lock pinning is a lockdep annotation that helps with this, when you
lockdep_pin_lock() a held lock, any unlock without a
lockdep_unpin_lock() will produce a WARN. Think of this as a relative
of lockdep_assert_held(), except you don't only assert it's held now,
but ensure it stays held until you release your assertion.
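
Typical usage would look something like this (do_stuff() is just a
placeholder for code that must not drop the lock):

  raw_spin_lock(&rq->lock);
  lockdep_pin_lock(&rq->lock);

  do_stuff(rq);		/* any rq->lock release in here will WARN */

  lockdep_unpin_lock(&rq->lock);
  raw_spin_unlock(&rq->lock);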

RFC: a possible alternative API would be something like:

  int cookie = lockdep_pin_lock(&foo);
  ...
  lockdep_unpin_lock(&foo, cookie);

Where we pick a random number for the pin_count; this makes it
impossible to sneak a lock break in without also passing the right
cookie along.

I've not done this because it ends up generating code for !LOCKDEP,
esp. if you need to pass the cookie around for some reason.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/lockdep.h  |   10 +++++
 kernel/locking/lockdep.c |   80 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -255,6 +255,7 @@ struct held_lock {
 	unsigned int check:1;       /* see lock_acquire() comment */
 	unsigned int hardirqs_off:1;
 	unsigned int references:12;					/* 32 bits */
+	unsigned int pin_count;
 };
 
 /*
@@ -354,6 +355,9 @@ extern void lockdep_set_current_reclaim_
 extern void lockdep_clear_current_reclaim_state(void);
 extern void lockdep_trace_alloc(gfp_t mask);
 
+extern void lock_pin_lock(struct lockdep_map *lock);
+extern void lock_unpin_lock(struct lockdep_map *lock);
+
 # define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
@@ -368,6 +372,9 @@ extern void lockdep_trace_alloc(gfp_t ma
 
 #define lockdep_recursing(tsk)	((tsk)->lockdep_recursion)
 
+#define lockdep_pin_lock(l)		lock_pin_lock(&(l)->dep_map)
+#define lockdep_unpin_lock(l)	lock_unpin_lock(&(l)->dep_map)
+
 #else /* !CONFIG_LOCKDEP */
 
 static inline void lockdep_off(void)
@@ -420,6 +427,9 @@ struct lock_class_key { };
 
 #define lockdep_recursing(tsk)			(0)
 
+#define lockdep_pin_lock(l)				do { (void)(l); } while (0)
+#define lockdep_unpin_lock(l)			do { (void)(l); } while (0)
+
 #endif /* !LOCKDEP */
 
 #ifdef CONFIG_LOCK_STAT
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
+	hlock->pin_count = 0;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3403,6 +3404,8 @@ __lock_release(struct lockdep_map *lock,
 	if (hlock->instance == lock)
 		lock_release_holdtime(hlock);
 
+	WARN(hlock->pin_count, "releasing a pinned lock\n");
+
 	if (hlock->references) {
 		hlock->references--;
 		if (hlock->references) {
@@ -3459,6 +3462,49 @@ static int __lock_is_held(struct lockdep
 	return 0;
 }
 
+static void __lock_pin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			hlock->pin_count++;
+			return;
+		}
+	}
+
+	WARN(1, "pinning an unheld lock\n");
+}
+
+static void __lock_unpin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+				return;
+
+			hlock->pin_count--;
+			return;
+		}
+	}
+
+	WARN(1, "unpinning an unheld lock\n");
+}
+
 /*
  * Check whether we follow the irq-flags state precisely:
  */
@@ -3582,6 +3628,40 @@ int lock_is_held(struct lockdep_map *loc
 }
 EXPORT_SYMBOL_GPL(lock_is_held);
 
+void lock_pin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_pin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_unpin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
 	current->lockdep_reclaim_gfp = gfp_mask;



^ permalink raw reply	[flat|nested] 58+ messages in thread

* [PATCH 18/18] sched,lockdep: Employ lock pinning
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (16 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 17/18] lockdep: Implement lock pinning Peter Zijlstra
@ 2015-06-11 12:46 ` Peter Zijlstra
  2015-06-18 23:04   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  2015-12-29  5:41 ` [PATCH 00/18] sched: balance callbacks v4 Byungchul Park
  18 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-11 12:46 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, tglx, juri.lelli, pang.xunlei, oleg, wanpeng.li,
	linux-kernel, peterz

[-- Attachment #1: peter_zijlstra-schedlockdep-employ_lock_pinning.patch --]
[-- Type: text/plain, Size: 8209 bytes --]

Employ the new lockdep lock pinning annotation to ensure no
'accidental' lock-breaks happen with rq->lock.
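
Where a lock break is known and intended, the pin is lifted around just
that call; e.g. (as in the pick_next_task_rt() hunk below):

  lockdep_unpin_lock(&rq->lock);
  pull_rt_task(rq);	/* may drop and re-acquire rq->lock */
  lockdep_pin_lock(&rq->lock);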

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   42 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/deadline.c |    8 ++++++++
 kernel/sched/fair.c     |   11 ++++++++---
 kernel/sched/rt.c       |    8 ++++++++
 kernel/sched/sched.h    |   10 ++++++++--
 5 files changed, 71 insertions(+), 8 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1201,8 +1201,15 @@ int set_cpus_allowed_ptr(struct task_str
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
-	} else if (task_on_rq_queued(p))
+	} else if (task_on_rq_queued(p)) {
+		/*
+		 * OK, since we're going to drop the lock immediately
+		 * afterwards anyway.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		rq = move_queued_task(rq, p, dest_cpu);
+		lockdep_pin_lock(&rq->lock);
+	}
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -1562,6 +1569,8 @@ static int select_fallback_rq(int cpu, s
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
@@ -1652,9 +1661,12 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
-		 * XXX can drop rq->lock; most likely ok.
+		 * Our task @p is fully woken up and running; so it's safe to
+		 * drop the rq->lock, hereafter rq is only used for statistics.
 		 */
+		lockdep_unpin_lock(&rq->lock);
 		p->sched_class->task_woken(rq, p);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (rq->idle_stamp) {
@@ -1674,6 +1686,8 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
+	lockdep_assert_held(&rq->lock);
+
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
@@ -1718,6 +1732,7 @@ void sched_ttwu_pending(void)
 		return;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
+	lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1725,6 +1740,7 @@ void sched_ttwu_pending(void)
 		ttwu_do_activate(rq, p, 0);
 	}
 
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1821,7 +1837,9 @@ static void ttwu_queue(struct task_struc
 #endif
 
 	raw_spin_lock(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 	ttwu_do_activate(rq, p, 0);
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1916,9 +1934,17 @@ static void try_to_wake_up_local(struct
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet picked a replacement task.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (!(p->state & TASK_NORMAL))
@@ -2538,6 +2564,7 @@ context_switch(struct rq *rq, struct tas
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
+	lockdep_unpin_lock(&rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
 	/* Here we just switch the register state and the stack. */
@@ -2960,6 +2987,7 @@ static void __sched __schedule(void)
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
@@ -3002,8 +3030,10 @@ static void __sched __schedule(void)
 
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
-	} else
+	} else {
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock_irq(&rq->lock);
+	}
 
 	balance_callback(rq);
 }
@@ -5071,6 +5101,11 @@ static void migrate_tasks(struct rq *dea
 		if (rq->nr_running == 1)
 			break;
 
+		/*
+		 * Ensure rq->lock covers the entire task selection
+		 * until the migration.
+		 */
+		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
@@ -5078,6 +5113,7 @@ static void migrate_tasks(struct rq *dea
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
+		lockdep_unpin_lock(&rq->lock);
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
 			raw_spin_unlock(&rq->lock);
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1153,7 +1153,15 @@ struct task_struct *pick_next_task_dl(st
 	dl_rq = &rq->dl;
 
 	if (need_pull_dl_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_dl_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a stop task can slip in, in which case we need to
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5392,7 +5392,15 @@ pick_next_task_fair(struct rq *rq, struc
 	return p;
 
 idle:
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	lockdep_unpin_lock(&rq->lock);
 	new_tasks = idle_balance(rq);
+	lockdep_pin_lock(&rq->lock);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
@@ -7426,9 +7434,6 @@ static int idle_balance(struct rq *this_
 		goto out;
 	}
 
-	/*
-	 * Drop the rq->lock, but keep IRQ/preempt disabled.
-	 */
 	raw_spin_unlock(&this_rq->lock);
 
 	update_blocked_averages(this_cpu);
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1478,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct
 	struct rt_rq *rt_rq = &rq->rt;
 
 	if (need_pull_rt_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_rt_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a dl or stop task can slip in, in which case we need
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1438,8 +1438,10 @@ static inline struct rq *__task_rq_lock(
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 
 		while (unlikely(task_on_rq_migrating(p)))
@@ -1476,8 +1478,10 @@ static inline struct rq *task_rq_lock(st
 		 * If we observe the new cpu in task_rq_lock, the acquire will
 		 * pair with the WMB to ensure we must then also see migrating.
 		 */
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 
@@ -1489,6 +1493,7 @@ static inline struct rq *task_rq_lock(st
 static inline void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1497,6 +1502,7 @@ task_rq_unlock(struct rq *rq, struct tas
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }



^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 01/18] sched: Replace post_schedule with a balance callback list
  2015-06-11 12:46 ` [PATCH 01/18] sched: Replace post_schedule with a balance callback list Peter Zijlstra
@ 2015-06-11 15:32   ` Kirill Tkhai
  2015-06-18 23:00   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 58+ messages in thread
From: Kirill Tkhai @ 2015-06-11 15:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

A thought that just came to me:

On Thu, 2015-06-11 at 14:46 +0200, Peter Zijlstra wrote:
> Generalize the post_schedule() stuff into a balance callback list.
> This allows us to more easily use it outside of schedule() and cross
> sched_class.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c     |   36 ++++++++++++++++++++++++------------
>  kernel/sched/deadline.c |   21 +++++++++++----------
>  kernel/sched/rt.c       |   25 +++++++++++--------------
>  kernel/sched/sched.h    |   19 +++++++++++++++++--
>  4 files changed, 63 insertions(+), 38 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(str
>  #ifdef CONFIG_SMP
>  
>  /* rq->lock is NOT held, but preemption is disabled */
> -static inline void post_schedule(struct rq *rq)
> +static void __balance_callback(struct rq *rq)
>  {
> -       if (rq->post_schedule) {
> -               unsigned long flags;
> +       struct callback_head *head, *next;
> +       void (*func)(struct rq *rq);
> +       unsigned long flags;
>  
> -               raw_spin_lock_irqsave(&rq->lock, flags);
> -               if (rq->curr->sched_class->post_schedule)
> -                       rq->curr->sched_class->post_schedule(rq);
> -               raw_spin_unlock_irqrestore(&rq->lock, flags);
> +       raw_spin_lock_irqsave(&rq->lock, flags);
> +       head = rq->balance_callback;
> +       rq->balance_callback = NULL;
> +       while (head) {
> +               func = (void (*)(struct rq *))head->func;
> +               next = head->next;
> +               head->next = NULL;
> +               head = next;
>  
> -               rq->post_schedule = 0;
> +               func(rq);
>         }
> +       raw_spin_unlock_irqrestore(&rq->lock, flags);
> +}
> +
> +static inline void balance_callback(struct rq *rq)
> +{
> +       if (unlikely(rq->balance_callback))
> +               __balance_callback(rq);
>  }
>  
>  #else
>  
> -static inline void post_schedule(struct rq *rq)
> +static inline void balance_callback(struct rq *rq)
>  {
>  }
>  
> @@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(
>         /* finish_task_switch() drops rq->lock and enables preemtion */
>         preempt_disable();
>         rq = finish_task_switch(prev);
> -       post_schedule(rq);
> +       balance_callback(rq);
>         preempt_enable();
>  
>         if (current->set_child_tid)
> @@ -2822,7 +2834,7 @@ static void __sched __schedule(void)
>         } else
>                 raw_spin_unlock_irq(&rq->lock);
>  
> -       post_schedule(rq);
> +       balance_callback(rq);
>  }
>  
>  static inline void sched_submit_work(struct task_struct *tsk)
> @@ -7216,7 +7228,7 @@ void __init sched_init(void)
>                 rq->sd = NULL;
>                 rq->rd = NULL;
>                 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
> -               rq->post_schedule = 0;
> +               rq->balance_callback = NULL;
>                 rq->active_balance = 0;
>                 rq->next_balance = jiffies;
>                 rq->push_cpu = 0;
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
>         return dl_task(prev);
>  }
>  
> -static inline void set_post_schedule(struct rq *rq)
> +static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
> +
> +static void push_dl_tasks(struct rq *);
> +
> +static inline void queue_push_tasks(struct rq *rq)
>  {
> -       rq->post_schedule = has_pushable_dl_tasks(rq);
> +       if (!has_pushable_dl_tasks(rq))
> +               return;
> +
> +       queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
>  }
>  
>  static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
> @@ -296,7 +303,7 @@ static inline int pull_dl_task(struct rq
>         return 0;
>  }
>  
> -static inline void set_post_schedule(struct rq *rq)
> +static inline void queue_push_tasks(struct rq *rq)
>  {
>  }
>  #endif /* CONFIG_SMP */
> @@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(st
>         if (hrtick_enabled(rq))
>                 start_hrtick_dl(rq, p);
>  
> -       set_post_schedule(rq);
> +       queue_push_tasks(rq);
>  
>         return p;
>  }
> @@ -1544,11 +1551,6 @@ static int pull_dl_task(struct rq *this_
>         return ret;
>  }
>  
> -static void post_schedule_dl(struct rq *rq)
> -{
> -       push_dl_tasks(rq);
> -}
> -
>  /*
>   * Since the task is not running and a reschedule is not going to happen
>   * anytime soon on its runqueue, we try pushing it away now.
> @@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class
>         .set_cpus_allowed       = set_cpus_allowed_dl,
>         .rq_online              = rq_online_dl,
>         .rq_offline             = rq_offline_dl,
> -       .post_schedule          = post_schedule_dl,
>         .task_woken             = task_woken_dl,
>  #endif
>  
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -354,13 +354,16 @@ static inline int has_pushable_tasks(str
>         return !plist_head_empty(&rq->rt.pushable_tasks);
>  }
>  
> -static inline void set_post_schedule(struct rq *rq)
> +static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
> +
> +static void push_rt_tasks(struct rq *);
> +
> +static inline void queue_push_tasks(struct rq *rq)
>  {
> -       /*
> -        * We detect this state here so that we can avoid taking the RQ
> -        * lock again later if there is no need to push
> -        */
> -       rq->post_schedule = has_pushable_tasks(rq);
> +       if (!has_pushable_tasks(rq))
> +               return;
> +
> +       queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
>  }
>  
>  static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
> @@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq
>         return 0;
>  }
>  
> -static inline void set_post_schedule(struct rq *rq)
> +static inline void queue_push_tasks(struct rq *rq)
>  {
>  }
>  #endif /* CONFIG_SMP */
> @@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct
>         /* The running task is never eligible for pushing */
>         dequeue_pushable_task(rq, p);
>  
> -       set_post_schedule(rq);
> +       queue_push_tasks(rq);
>  
>         return p;
>  }
> @@ -2042,11 +2045,6 @@ static int pull_rt_task(struct rq *this_
>         return ret;
>  }
>  
> -static void post_schedule_rt(struct rq *rq)
> -{
> -       push_rt_tasks(rq);
> -}
> -
>  /*
>   * If we are not running and we are not going to reschedule soon, we should
>   * try to push tasks away now
> @@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class
>         .set_cpus_allowed       = set_cpus_allowed_rt,
>         .rq_online              = rq_online_rt,
>         .rq_offline             = rq_offline_rt,
> -       .post_schedule          = post_schedule_rt,
>         .task_woken             = task_woken_rt,
>         .switched_from          = switched_from_rt,
>  #endif
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -624,9 +624,10 @@ struct rq {
>         unsigned long cpu_capacity;
>         unsigned long cpu_capacity_orig;
>  
> +       struct callback_head *balance_callback;
> +
>         unsigned char idle_balance;
>         /* For active balancing */
> -       int post_schedule;
>         int active_balance;
>         int push_cpu;
>         struct cpu_stop_work active_balance_work;
> @@ -767,6 +768,21 @@ extern int migrate_swap(struct task_stru
>  
>  #ifdef CONFIG_SMP
>  
> +static inline void
> +queue_balance_callback(struct rq *rq,
> +                      struct callback_head *head,
> +                      void (*func)(struct rq *rq))
> +{
> +       lockdep_assert_held(&rq->lock);
> +
> +       if (unlikely(head->next))
> +               return;
> +
> +       head->func = (void (*)(struct callback_head *))func;
> +       head->next = rq->balance_callback;
> +       rq->balance_callback = head;
> +}

Maybe we should queue a higher priority callback at the head?
And set aside a callback whose class's priority is lower than rq->curr's.

> +
>  extern void sched_ttwu_pending(void);
>  
>  #define rcu_dereference_check_sched_domain(p) \
> @@ -1192,7 +1208,6 @@ struct sched_class {
>         int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
>         void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
>  
> -       void (*post_schedule) (struct rq *this_rq);
>         void (*task_waking) (struct task_struct *task);
>         void (*task_woken) (struct rq *this_rq, struct task_struct *task);
>  
> 
> 



^ permalink raw reply	[flat|nested] 58+ messages in thread
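
For reference, a minimal sketch of the consumer side of the callback list
introduced above: queue_balance_callback() only links a callback onto
rq->balance_callback under rq->lock, so something must later walk that list
and invoke the callbacks.  The function name and exact locking below are
illustrative, derived from the hunks quoted above rather than taken from the
patch itself:

static void sketch_balance_callback(struct rq *rq)
{
	struct callback_head *head, *next;
	void (*func)(struct rq *rq);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	head = rq->balance_callback;
	rq->balance_callback = NULL;
	while (head) {
		next = head->next;
		head->next = NULL;	/* clearing ->next allows re-queueing */
		func = (void (*)(struct rq *rq))head->func;
		func(rq);		/* e.g. push_rt_tasks() or push_dl_tasks() */
		head = next;
	}
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}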

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-11 12:46 ` [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier() Peter Zijlstra
@ 2015-06-11 15:33   ` Paul E. McKenney
  2015-06-11 21:45     ` Paul E. McKenney
  0 siblings, 1 reply; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-11 15:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, Jun 11, 2015 at 02:46:47PM +0200, Peter Zijlstra wrote:
> Introduce raw_write_seqcount_barrier(), a new construct that can be
> used to provide write barrier semantics in seqcount read loops instead
> of the usual consistency guarantee.
> 
> Cc: Al Viro <viro@ZenIV.linux.org.uk>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
> Suggested-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/seqlock.h |   42 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 42 insertions(+)
> 
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -233,6 +233,48 @@ static inline void raw_write_seqcount_en
>  	s->sequence++;
>  }
> 
> +/**
> + * raw_write_seqcount_barrier - do a seq write barrier
> + * @s: pointer to seqcount_t
> + *
> + * This can be used to provide an ordering guarantee instead of the
> + * usual consistency guarantee. It is one wmb cheaper, because we can
> + * collapse the two back-to-back wmb()s.
> + *
> + *      seqcount_t seq;
> + *      bool X = true, Y = false;
> + *
> + *      void read(void)
> + *      {
> + *              bool x, y;
> + *
> + *              do {
> + *                      int s = read_seqcount_begin(&seq);
> + *
> + *                      x = X; y = Y;
> + *
> + *              } while (read_seqcount_retry(&seq, s));
> + *
> + *              BUG_ON(!x && !y);
> + *      }
> + *
> + *      void write(void)
> + *      {
> + *              Y = true;
> + *
> + *              write_seqcount_begin(seq);
> + *              write_seqcount_end(seq);
> + *
> + *              X = false;
> + *      }

So when using this, write() would instead look like this?

	void write(void)
	{
		Y = true;
		raw_write_seqcount_barrier(seq);
		X = false;
	}

I suggest calling this out explicitly.  Agreed, it should be obvious,
but some poor sot is going to be reading this at 3AM local time after
a couple days of no sleep, in which case obvious might not be so obvious.

I also would suggest READ_ONCE() and WRITE_ONCE() to keep the compiler
trickiness down to a dull roar.  Understood, it is hard to make anything
bad happen in this case, but small changes could result in badness.

> + */
> +static inline void raw_write_seqcount_barrier(seqcount_t *s)
> +{
> +	s->sequence++;
> +	smp_wmb();
> +	s->sequence++;
> +}
> +
>  /*
>   * raw_write_seqcount_latch - redirect readers to even/odd copy
>   * @s: pointer to seqcount_t

Looks good otherwise.

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>


^ permalink raw reply	[flat|nested] 58+ messages in thread
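
A sketch of the example from the comment above with Paul's two suggestions
applied: write() calls raw_write_seqcount_barrier() directly, and the
accesses to X and Y are wrapped in READ_ONCE()/WRITE_ONCE().  This is a
kernel-style illustration only, not part of the patch:

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/seqlock.h>
#include <linux/bug.h>

static seqcount_t seq = SEQCNT_ZERO(seq);
static bool X = true, Y = false;

static void reader(void)
{
	bool x, y;
	unsigned int s;

	do {
		s = read_seqcount_begin(&seq);
		x = READ_ONCE(X);
		y = READ_ONCE(Y);
	} while (read_seqcount_retry(&seq, s));

	BUG_ON(!x && !y);	/* the ordering guarantee under discussion */
}

static void writer(void)
{
	WRITE_ONCE(Y, true);
	raw_write_seqcount_barrier(&seq);	/* replaces begin() + end() */
	WRITE_ONCE(X, false);
}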

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-11 15:33   ` Paul E. McKenney
@ 2015-06-11 21:45     ` Paul E. McKenney
  2015-06-12  7:08       ` Peter Zijlstra
                         ` (2 more replies)
  0 siblings, 3 replies; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-11 21:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, Jun 11, 2015 at 08:33:41AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 11, 2015 at 02:46:47PM +0200, Peter Zijlstra wrote:
> > Introduce raw_write_seqcount_barrier(), a new construct that can be
> > used to provide write barrier semantics in seqcount read loops instead
> > of the usual consistency guarantee.
> > 
> > Cc: Al Viro <viro@ZenIV.linux.org.uk>
> > Cc: Linus Torvalds <torvalds@linux-foundation.org>
> > Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
> > Suggested-by: Oleg Nesterov <oleg@redhat.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> > ---
> >  include/linux/seqlock.h |   42 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 42 insertions(+)
> > 
> > --- a/include/linux/seqlock.h
> > +++ b/include/linux/seqlock.h
> > @@ -233,6 +233,48 @@ static inline void raw_write_seqcount_en
> >  	s->sequence++;
> >  }
> > 
> > +/**
> > + * raw_write_seqcount_barrier - do a seq write barrier
> > + * @s: pointer to seqcount_t
> > + *
> > + * This can be used to provide an ordering guarantee instead of the
> > + * usual consistency guarantee. It is one wmb cheaper, because we can
> > + * collapse the two back-to-back wmb()s.
> > + *
> > + *      seqcount_t seq;
> > + *      bool X = true, Y = false;
> > + *
> > + *      void read(void)
> > + *      {
> > + *              bool x, y;
> > + *
> > + *              do {
> > + *                      int s = read_seqcount_begin(&seq);
> > + *
> > + *                      x = X; y = Y;
> > + *
> > + *              } while (read_seqcount_retry(&seq, s));
> > + *
> > + *              BUG_ON(!x && !y);
> > + *      }
> > + *
> > + *      void write(void)
> > + *      {
> > + *              Y = true;
> > + *
> > + *              write_seqcount_begin(seq);
> > + *              write_seqcount_end(seq);
> > + *
> > + *              X = false;
> > + *      }
> 
> So when using this, write() would instead look like this?
> 
> 	void write(void)
> 	{
> 		Y = true;
> 		raw_write_seqcount_barrier(seq);
> 		X = false;
> 	}
> 
> I suggest calling this out explicitly.  Agreed, it should be obvious,
> but some poor sot is going to be reading this at 3AM local time after
> a couple days of no sleep, in which case obvious might not be so obvious.
> 
> I also would suggest READ_ONCE() and WRITE_ONCE() to keep the compiler
> trickiness down to a dull roar.  Understood, it is hard to make anything
> bad happen in this case, but small changes could result in badness.
> 
> > + */
> > +static inline void raw_write_seqcount_barrier(seqcount_t *s)
> > +{
> > +	s->sequence++;
> > +	smp_wmb();
> > +	s->sequence++;
> > +}
> > +
> >  /*
> >   * raw_write_seqcount_latch - redirect readers to even/odd copy
> >   * @s: pointer to seqcount_t
> 
> Looks good otherwise.
> 
> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Color me slow and stupid.  Maybe due to reviewing a patch too early in
the morning, who knows?

There is nothing above that prevents the compiler and the CPU from
reordering the assignments to X and Y with the increment of s->sequence++.
One fix would be as follows:

	static inline void raw_write_seqcount_barrier(seqcount_t *s)
	{
		smp_wmb();
		s->sequence++;
		smp_wmb();
		s->sequence++;
		smp_wmb();
	}

Of course, this assumes that the accesses surrounding the call to
raw_write_seqcount_barrier() are writes.  If they can be a reads,
the two added smp_wmb() calls need to be full barriers.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-11 21:45     ` Paul E. McKenney
@ 2015-06-12  7:08       ` Peter Zijlstra
  2015-06-12 18:59       ` Oleg Nesterov
  2015-06-17 12:29       ` Peter Zijlstra
  2 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-12  7:08 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, Jun 11, 2015 at 02:45:57PM -0700, Paul E. McKenney wrote:
> Color me slow and stupid.  Maybe due to reviewing a patch too early in
> the morning, who knows?
> 
> There is nothing above that prevents the compiler and the CPU from
> reordering the assignments to X and Y with the increment of s->sequence++.

That's actually fine. As long as we observe an odd value the read side
will repeat.

> Of course, this assumes that the accesses surrounding the call to
> raw_write_seqcount_barrier() are writes.  

Which is why it's got both write and barrier in the name :-)

^ permalink raw reply	[flat|nested] 58+ messages in thread
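
The "read side will repeat" behaviour relied on above comes from the seqcount
read primitives themselves.  A simplified sketch of what
read_seqcount_begin()/read_seqcount_retry() boil down to (illustrative; the
real helpers in include/linux/seqlock.h add lockdep and other details):

/* Stall while a writer holds the sequence count odd. */
static inline unsigned int sketch_read_begin(const seqcount_t *s)
{
	unsigned int ret;

	while ((ret = READ_ONCE(s->sequence)) & 1)
		cpu_relax();	/* odd: a writer is in progress, spin */

	smp_rmb();		/* order the data loads after the sequence load */
	return ret;
}

/* True if a writer ran (or is running) since sketch_read_begin(). */
static inline int sketch_read_retry(const seqcount_t *s, unsigned int start)
{
	smp_rmb();		/* order the data loads before the recheck */
	return READ_ONCE(s->sequence) != start;
}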

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-11 21:45     ` Paul E. McKenney
  2015-06-12  7:08       ` Peter Zijlstra
@ 2015-06-12 18:59       ` Oleg Nesterov
  2015-06-17 12:29       ` Peter Zijlstra
  2 siblings, 0 replies; 58+ messages in thread
From: Oleg Nesterov @ 2015-06-12 18:59 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On 06/11, Paul E. McKenney wrote:
>
> > > + *      seqcount_t seq;
> > > + *      bool X = true, Y = false;
> > > + *
> > > + *      void read(void)
> > > + *      {
> > > + *              bool x, y;
> > > + *
> > > + *              do {
> > > + *                      int s = read_seqcount_begin(&seq);
> > > + *
> > > + *                      x = X; y = Y;
> > > + *
> > > + *              } while (read_seqcount_retry(&seq, s));
> > > + *
> > > + *              BUG_ON(!x && !y);
> > > + *      }
> > > + *
> > > + *      void write(void)
> > > + *      {
> > > + *              Y = true;
> > > + *
> > > + *              write_seqcount_begin(seq);
> > > + *              write_seqcount_end(seq);
> > > + *
> > > + *              X = false;
> > > + *      }
> >
> > > +static inline void raw_write_seqcount_barrier(seqcount_t *s)
> > > +{
> > > +	s->sequence++;
> > > +	smp_wmb();
> > > +	s->sequence++;
> > > +}
> > > +
> > >  /*
> > >   * raw_write_seqcount_latch - redirect readers to even/odd copy
> > >   * @s: pointer to seqcount_t
> >
> > Looks good otherwise.
> >
> > Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>
> Color me slow and stupid.  Maybe due to reviewing a patch too early in
> the morning, who knows?
>
> There is nothing above that prevents the compiler and the CPU from
> reordering the assignments to X and Y with the increment of s->sequence++.

Yes, but this doesn't matter, I think. The writer does

	Y = true;
	1st_increment;

	wmb();

	2nd_increment;
	X = false;

and we do not care about reordering before or after wmb() at all. But we
rely on the fact that 1st_increment can not be reordered with "X = false",
and that "Y = true" can not be reordered with the 2nd_increment.


And another simple "proof" is that seqcount_barrier() is equivalent to
write_seqcount_begin() + write_seqcount_end() and thus the code above
is correct, or the ACQUIRE/RELEASE semantics of seqcount_t is broken ;)

Oleg.


^ permalink raw reply	[flat|nested] 58+ messages in thread
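
Spelling out the expansion behind Oleg's equivalence argument, using the
raw_write_seqcount_begin()/raw_write_seqcount_end() bodies from
include/linux/seqlock.h of that era (shown as a sketch; see the header for
the authoritative definitions):

/*
 * raw_write_seqcount_begin():	s->sequence++; smp_wmb();
 * raw_write_seqcount_end():	smp_wmb();     s->sequence++;
 *
 * so begin() immediately followed by end() expands to:
 */
static inline void sketch_begin_plus_end(seqcount_t *s)
{
	s->sequence++;
	smp_wmb();
	smp_wmb();	/* the redundant barrier the new helper drops */
	s->sequence++;
}
/*
 * which is raw_write_seqcount_barrier() plus one superfluous smp_wmb(),
 * matching the "one wmb cheaper" wording in the changelog.
 */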

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-11 21:45     ` Paul E. McKenney
  2015-06-12  7:08       ` Peter Zijlstra
  2015-06-12 18:59       ` Oleg Nesterov
@ 2015-06-17 12:29       ` Peter Zijlstra
  2015-06-17 14:57         ` Paul E. McKenney
  2015-06-18 22:19         ` [tip:timers/core] seqcount: Introduce raw_write_seqcount_barrier( ) tip-bot for Peter Zijlstra
  2 siblings, 2 replies; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-17 12:29 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, Jun 11, 2015 at 02:45:57PM -0700, Paul E. McKenney wrote:
> Color me slow and stupid.  Maybe due to reviewing a patch too early in
> the morning, who knows?
> 
> There is nothing above that prevents the compiler and the CPU from
> reordering the assignments to X and Y with the increment of s->sequence++.
> One fix would be as follows:
> 
> 	static inline void raw_write_seqcount_barrier(seqcount_t *s)
> 	{
> 		smp_wmb();
> 		s->sequence++;
> 		smp_wmb();
> 		s->sequence++;
> 		smp_wmb();
> 	}
> 
> Of course, this assumes that the accesses surrounding the call to
> raw_write_seqcount_barrier() are writes.  If they can be a reads,
> the two added smp_wmb() calls need to be full barriers.

I have updated the Changelog to hopefully explain things better.

I did leave off the READ/WRITE ONCE stuff, because I could not come up
with a scenario where it makes a difference -- I appreciate paranoia,
but I also think we should not overdo the thing.

---
Subject: seqcount: Introduce raw_write_seqcount_barrier()
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Jun 11 12:35:48 CEST 2015

Introduce raw_write_seqcount_barrier(), a new construct that can be
used to provide write barrier semantics in seqcount read loops instead
of the usual consistency guarantee.

raw_write_seqcount_barrier() is equivalent to:

	raw_write_seqcount_begin();
	raw_write_seqcount_end();

But avoids issuing two back-to-back smp_wmb() instructions.

This construct works because the read side will 'stall' when observing
odd values. This means that -- referring to the example in the comment
below -- even though there is no (matching) read barrier between the
loads of X and Y, we cannot observe !x && !y, because:

 - if we observe Y == false we must observe the first sequence
   increment, which makes us loop, until

 - we observe !(seq & 1) -- the second sequence increment -- at which
   time we must also observe Y == true.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/seqlock.h |   42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -233,6 +233,47 @@ static inline void raw_write_seqcount_en
 	s->sequence++;
 }
 
+/**
+ * raw_write_seqcount_barrier - do a seq write barrier
+ * @s: pointer to seqcount_t
+ *
+ * This can be used to provide an ordering guarantee instead of the
+ * usual consistency guarantee. It is one wmb cheaper, because we can
+ * collapse the two back-to-back wmb()s.
+ *
+ *      seqcount_t seq;
+ *      bool X = true, Y = false;
+ *
+ *      void read(void)
+ *      {
+ *              bool x, y;
+ *
+ *              do {
+ *                      int s = read_seqcount_begin(&seq);
+ *
+ *                      x = X; y = Y;
+ *
+ *              } while (read_seqcount_retry(&seq, s));
+ *
+ *              BUG_ON(!x && !y);
+ *      }
+ *
+ *      void write(void)
+ *      {
+ *              Y = true;
+ *
+ *              raw_write_seqcount_barrier(seq);
+ *
+ *              X = false;
+ *      }
+ */
+static inline void raw_write_seqcount_barrier(seqcount_t *s)
+{
+	s->sequence++;
+	smp_wmb();
+	s->sequence++;
+}
+
 /*
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 12:29       ` Peter Zijlstra
@ 2015-06-17 14:57         ` Paul E. McKenney
  2015-06-17 15:11           ` Peter Zijlstra
  2015-06-17 15:49           ` Peter Zijlstra
  2015-06-18 22:19         ` [tip:timers/core] seqcount: Introduce raw_write_seqcount_barrier( ) tip-bot for Peter Zijlstra
  1 sibling, 2 replies; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-17 14:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 02:29:24PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 11, 2015 at 02:45:57PM -0700, Paul E. McKenney wrote:
> > Color me slow and stupid.  Maybe due to reviewing a patch too early in
> > the morning, who knows?
> > 
> > There is nothing above that prevents the compiler and the CPU from
> > reordering the assignments to X and Y with the increment of s->sequence++.
> > One fix would be as follows:
> > 
> > 	static inline void raw_write_seqcount_barrier(seqcount_t *s)
> > 	{
> > 		smp_wmb();
> > 		s->sequence++;
> > 		smp_wmb();
> > 		s->sequence++;
> > 		smp_wmb();
> > 	}
> > 
> > Of course, this assumes that the accesses surrounding the call to
> > raw_write_seqcount_barrier() are writes.  If they can be a reads,
> > the two added smp_wmb() calls need to be full barriers.
> 
> I have updated the Changelog to hopefully explain things better.
> 
> I did leave off the READ/WRITE ONCE stuff, because I could not come up
> with a scenario where it makes a difference -- I appreciate paranoia,
> but I also think we should not overdo the thing.

I can only conclude that you have not read this document:

	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html

Specifically, please keep in mind that unless you mark either the variable
or the memory access, the compiler is within its rights to assume that
there are no concurrent accesses to that variable.  For but one example,
if you do a normal store to a given variable, then the compiler is
within its rights to use that variable as temporary storage prior to
that store.  And yes, you can reasonably argue that no sane compiler
would store something else to s->sequence given that it could free up
a register by storing the incremented value, but the fact remains that
you have given it permission to do so if it wants.

							Thanx, Paul

> ---
> Subject: seqcount: Introduce raw_write_seqcount_barrier()
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Thu Jun 11 12:35:48 CEST 2015
> 
> Introduce raw_write_seqcount_barrier(), a new construct that can be
> used to provide write barrier semantics in seqcount read loops instead
> of the usual consistency guarantee.
> 
> raw_write_seqcount_barrier() is equivalent to:
> 
> 	raw_write_seqcount_begin();
> 	raw_write_seqcount_end();
> 
> But avoids issuing two back-to-back smp_wmb() instructions.
> 
> This construct works because the read side will 'stall' when observing
> odd values. This means that -- referring to the example in the comment
> below -- even though there is no (matching) read barrier between the
> loads of X and Y, we cannot observe !x && !y, because:
> 
>  - if we observe Y == false we must observe the first sequence
>    increment, which makes us loop, until
> 
>  - we observe !(seq & 1) -- the second sequence increment -- at which
>    time we must also observe Y == true.
> 
> Cc: Al Viro <viro@ZenIV.linux.org.uk>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
> Suggested-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/seqlock.h |   42 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 42 insertions(+)
> 
> --- a/include/linux/seqlock.h
> +++ b/include/linux/seqlock.h
> @@ -233,6 +233,47 @@ static inline void raw_write_seqcount_en
>  	s->sequence++;
>  }
> 
> +/**
> + * raw_write_seqcount_barrier - do a seq write barrier
> + * @s: pointer to seqcount_t
> + *
> + * This can be used to provide an ordering guarantee instead of the
> + * usual consistency guarantee. It is one wmb cheaper, because we can
> + * collapse the two back-to-back wmb()s.
> + *
> + *      seqcount_t seq;
> + *      bool X = true, Y = false;
> + *
> + *      void read(void)
> + *      {
> + *              bool x, y;
> + *
> + *              do {
> + *                      int s = read_seqcount_begin(&seq);
> + *
> + *                      x = X; y = Y;
> + *
> + *              } while (read_seqcount_retry(&seq, s));
> + *
> + *              BUG_ON(!x && !y);
> + *      }
> + *
> + *      void write(void)
> + *      {
> + *              Y = true;
> + *
> + *              raw_write_seqcount_barrier(seq);
> + *
> + *              X = false;
> + *      }
> + */
> +static inline void raw_write_seqcount_barrier(seqcount_t *s)
> +{
> +	s->sequence++;
> +	smp_wmb();
> +	s->sequence++;
> +}
> +
>  /*
>   * raw_write_seqcount_latch - redirect readers to even/odd copy
>   * @s: pointer to seqcount_t
> 


^ permalink raw reply	[flat|nested] 58+ messages in thread
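
A contrived illustration of the "temporary storage" point above.  No known
compiler actually does this; it only shows the kind of transformation the
data-race rules would permit for a plain, unmarked store.  The structure and
values are made up for the example:

struct foo {
	unsigned int sequence;
};

/* As written in the source: a plain, unmarked read-modify-write. */
void bump(struct foo *p)
{
	p->sequence++;
}

/*
 * A transformation the compiler would be "within its rights" to make if
 * it may assume there are no concurrent readers: briefly park an
 * unrelated value in p->sequence before storing the final result.
 */
void bump_transformed(struct foo *p)
{
	unsigned int tmp = p->sequence;

	p->sequence = 0xdeadbeef;	/* scratch use of the location */
	p->sequence = tmp + 1;		/* final, correct value */
}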

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 14:57         ` Paul E. McKenney
@ 2015-06-17 15:11           ` Peter Zijlstra
  2015-06-17 15:42             ` Paul E. McKenney
  2015-06-17 15:49           ` Peter Zijlstra
  1 sibling, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-17 15:11 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 07:57:12AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 17, 2015 at 02:29:24PM +0200, Peter Zijlstra wrote:
> > I did leave off the READ/WRITE ONCE stuff, because I could not come up
> > with a scenario where it makes a difference -- I appreciate paranoia,
> > but I also think we should not overdo the thing.
> 
> I can only conclude that you have not read this document:
> 
> 	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html

This would be correct.

> Specifically, please keep in mind that unless you mark either the variable
> or the memory access, the compiler is within its rights to assume that
> there are no concurrent accesses to that variable.  For but one example,
> if you do a normal store to a given variable, then the compiler is
> within its rights to use that variable as temporary storage prior to
> that store.  And yes, you can reasonably argue that no sane compiler
> would store something else to s->sequence given that it could free up
> a register by storing the incremented value, but the fact remains that
> you have given it permission to do so if it wants.

Argh *grmbl*, that's bloody insane!

So I get the re-loading, I get the tearing, but these random intermittent
values (somewhat related to stores out of thin air) are completely
bonkers.

I would very much prefer a compiler switch that instructs the compiler
to not do bloody stupid things like this instead of marking every other
load/store in the kernel with volatile.

Note that if GCC were to actually do something like this, the kernel
would already be broken, because I'm very sure we did not consider/audit
it for this.



^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 15:11           ` Peter Zijlstra
@ 2015-06-17 15:42             ` Paul E. McKenney
  2015-06-17 16:58               ` Peter Zijlstra
  0 siblings, 1 reply; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-17 15:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 05:11:09PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 17, 2015 at 07:57:12AM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 17, 2015 at 02:29:24PM +0200, Peter Zijlstra wrote:
> > > I did leave off the READ/WRITE ONCE stuff, because I could not come up
> > > with a scenario where it makes a difference -- I appreciate paranoia,
> > > but I also think we should not overdo the thing.
> > 
> > I can only conclude that you have not read this document:
> > 
> > 	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html
> 
> This would be correct.
> 
> > Specifically, please keep in mind that unless you mark either the variable
> > or the memory access, the compiler is within its rights to assume that
> > there are no concurrent accesses to that variable.  For but one example,
> > if you do a normal store to a given variable, then the compiler is
> > within its rights to use that variable as temporary storage prior to
> > that store.  And yes, you can reasonably argue that no sane compiler
> > would store something else to s->sequence given that it could free up
> > a register by storing the incremented value, but the fact remains that
> > you have given it permission to do so if it wants.
> 
> Argh *grmbl*, that's bloody insane!

You expected me to argue with that statement?  ;-)

> So I get the re-loading, I get the tearing, but these random intermittent
> values (somewhat related to stores out of thin air) are completely
> bonkers.
> 
> I would very much prefer a compiler switch that instructs the compiler
> to not do bloody stupid things like this instead of marking every other
> load/store in the kernel with volatile.

I would of course be good with such a compiler switch, though my earlier
attempts to negotiate one were unsuccessful.  But I don't believe that we
discussed a switch to specifically prohibit only use of to-be-stored-into
variables as temporary scratch space.  The trick is finding restrictions
that are useful, but that don't imply -O0.

Any GCC or LLVM folks on the list?

> Note that if GCC were to actually do something like this, the kernel
> would already be broken, because I'm very sure we did not consider/audit
> it for this.

An accident waiting to happen, given that both GCC and the Linux kernel
are moving targets.  :-/

							Thanx, Paul


^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 14:57         ` Paul E. McKenney
  2015-06-17 15:11           ` Peter Zijlstra
@ 2015-06-17 15:49           ` Peter Zijlstra
  2015-06-17 16:37             ` Paul E. McKenney
  1 sibling, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-17 15:49 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 07:57:12AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 17, 2015 at 02:29:24PM +0200, Peter Zijlstra wrote:
> > I did leave off the READ/WRITE ONCE stuff, because I could not come up
> > with a scenario where it makes a difference -- I appreciate paranoia,
> > but I also think we should not overdo the thing.
> 
> I can only conclude that you have not read this document:
> 
> 	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html
> 
> Specifically, please keep in mind that unless you mark either the variable
> or the memory access, the compiler is within its rights to assume that
> there are no concurrent accesses to that variable.  For but one example,
> if you do a normal store to a given variable, then the compiler is
> within its rights to use that variable as temporary storage prior to
> that store.  And yes, you can reasonably argue that no sane compiler
> would store something else to s->sequence given that it could free up
> a register by storing the incremented value, but the fact remains that
> you have given it permission to do so if it wants.

This is the "Optimizations without Atomics" section you're referring to.

It has words such as: "if the compiler can prove they are not accessed
in other threads concurrently" and "This requires escape analysis: the
compiler must see the full scope of the memory location 'p', or must
know that leaf functions don't capture 'p' and aren't used concurrently,
for this optimization to be valid."

But then it starts weasel wording and saying that the lack of
std::atomic<> usage implies a lack of concurrency, to which I strongly
object.

Esp. seeing how -ffreestanding does not have access to any of the atomic
stuff since it's library bits and not language bits (something which I've
often said was a failure in the spec).

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 15:49           ` Peter Zijlstra
@ 2015-06-17 16:37             ` Paul E. McKenney
  2015-06-17 17:11               ` Peter Zijlstra
  0 siblings, 1 reply; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-17 16:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 05:49:27PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 17, 2015 at 07:57:12AM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 17, 2015 at 02:29:24PM +0200, Peter Zijlstra wrote:
> > > I did leave off the READ/WRITE ONCE stuff, because I could not come up
> > > with a scenario where it makes a difference -- I appreciate paranoia,
> > > but I also think we should not overdo the thing.
> > 
> > I can only conclude that you have not read this document:
> > 
> > 	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html
> > 
> > Specifically, please keep in mind that unless you mark either the variable
> > or the memory access, the compiler is within its rights to assume that
> > there are no concurrent accesses to that variable.  For but one example,
> > if you do a normal store to a given variable, then the compiler is
> > within its rights to use that variable as temporary storage prior to
> > that store.  And yes, you can reasonably argue that no sane compiler
> > would store something else to s->sequence given that it could free up
> > a register by storing the incremented value, but the fact remains that
> > you have given it permission to do so if it wants.
> 
> This is the "Optimizations without Atomics" section you're referring to.
> 
> It has words such as: "if the compiler can prove they are not accessed
> in other threads concurrently" and "This requires escape analysis: the
> compiler must see the full scope of the memory location 'p', or must
> know that leaf functions don't capture 'p' and aren't used concurrently,
> for this optimization to be valid."

Yes, but...  These qualifiers apply only in cases where the code at hand
is not writing to the variable.

In this case, it is legal for other threads to be concurrently reading
the variable.  After all, if the value of the variable is not changing,
then the various read-side optimizations have no effect -- even the silly
ones, like reading the variable one bit at a time.  If the compiler
introduces a write into code that was only reading the variable, then
it is the compiler's job to ensure that no other thread is reading it.
If the compiler were to introduce a write to such a variable that other
threads might be reading, then the compiler would have introduced a data
race, which would mean that the compiler introduced undefined behavior
to correct code.  The compiler therefore must be extremely careful when
introducing a write to a variable that is only read by the code at hand.
(Though there are still some "interesting" escape clauses for the compiler
if the variable in question is allocated on the stack and the function
can return without transferring control to some function in some other
translation unit.)

Recall that a data race occurs when there are multiple concurrent normal
accesses to a given normal variable, at least one of which is a write.
Here normal accesses and normal variables are those that are not marked
as atomic (in the C11 sense).  Accesses and variables marked as volatile
also disable most (perhaps all) of the dangerous optimizations that lead
to the undefined behavior.  That said, many compiler people hate volatile,
and will therefore automatically argue that it is useless in a misguided
attempt to convince people not to use it.  :-/

On the other hand, if the code is already writing to the variable (as it
is in the s->sequence++ case), then if there are any concurrent accesses,
the code -already- contains a data race.  This data race invokes undefined
behavior in the code as written, so the compiler is within its rights to
do anything at all, even spawn the proverbial game of rogue.  A somewhat
more reasonable compiler is within its rights to assume that no one is
concurrently reading any normal variable to which the code does a normal
write.  The compiler is therefore within its rights to use that variable
as scratch storage at any time between the most recent read and the write
in question.

> But then it starts weasel wording and saying that the lack of
> std::atomic<> usage implies a lack of concurrency, to which I strongly
> object.

Heh!

The point of std::atomic<> (and of the equivalent C11 syntax) is to
force the compiler to suppress optimizations that are unsafe for shared
variables.  We get more or less the same effect with volatile, protests
from compiler people notwithstanding.

I often tell the compiler guys that they have to expect to make -some-
concessions for being 30 years late to the concurrency party, but
it nevertheless makes sense to future-proof our code where it is
reasonable to do so.

All that aside, I agree that "s->sequence++" is relatively low
priority, given that the compiler can easily free up a register by
storing the actual value.  But that might well be a failure of
imagination on my part.

> Esp. seeing how -ffreestanding does not have access to any of the atomic
> stuff since it's library bits and not language bits (something which I've
> often said was a failure in the spec).

Agreed, given that atomics can almost always be inlined, it would be
nice if -ffreestanding didn't cut off the compiler's concurrency nose
to spite its concurrency face.  The reasoning behind it is that it is
legal (but, in my opinion, stupid) to create large atomic data structures.
The compilers normally introduce locks to implement these, which is
inappropriate in free-standing environments.  I believe that a better
strategy would be for -ffreestanding to implement only those atomics
that are machine-word sized, as in ATOMIC_..._LOCK_FREE==2.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 58+ messages in thread
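
A minimal illustration of the data-race definition given above: both sides
use normal accesses and one of them writes, versus the same code with the
accesses marked via the kernel's READ_ONCE()/WRITE_ONCE() so the dangerous
optimizations are suppressed.  The variable and function names are made up
for the example:

#include <linux/compiler.h>

static int shared;	/* reachable from more than one context */

/* Data race in the C11 sense: normal accesses, one side writes. */
static void plain_writer(void)	{ shared = 1; }
static int  plain_reader(void)	{ return shared; }

/*
 * Marked accesses: the compiler may no longer invent extra loads or
 * stores to the location, tear the access, or reuse it as scratch.
 */
static void marked_writer(void)	{ WRITE_ONCE(shared, 1); }
static int  marked_reader(void)	{ return READ_ONCE(shared); }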

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 15:42             ` Paul E. McKenney
@ 2015-06-17 16:58               ` Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-17 16:58 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 08:42:45AM -0700, Paul E. McKenney wrote:
> > I would very much prefer a compiler switch that instructs the compiler
> > to not do bloody stupid things like this instead of marking every other
> > load/store in the kernel with volatile.
> 
> I would of course be good with such a compiler switch, though my earlier
> attempts to negotiate one were unsuccessful.  But I don't believe that we
> discussed a switch to specifically prohibit only use of to-be-stored-into
> variables as temporary scratch space.  The trick is finding restrictions
> that are useful, but that don't imply -O0.

I would request one that disables all the 'stores from thin air'
'optimizations'. IOW assume everything is shared memory and concurrent
unless you can prove it's not so. For example a local stack variable that
does not escape scope.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 16:37             ` Paul E. McKenney
@ 2015-06-17 17:11               ` Peter Zijlstra
  2015-06-17 18:02                 ` Paul E. McKenney
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-17 17:11 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 09:37:31AM -0700, Paul E. McKenney wrote:
> The point of std::atomic<> (and of the equivalent C11 syntax) is to
> force the compiler to suppress optimizations that are unsafe for shared
> variables.  We get more or less the same effect with volatile, protests
> from compiler people notwithstanding.
> 
> I often tell the compiler guys that they have to expect to make -some-
> concessions for being 30 years late to the concurrency party, but
> it nevertheless makes sense to future-proof our code where it is
> reasonable to do so.

Right, so in that regards I would request the compiler option (and or
#pragma) that disables all the out-of-thin-air nonsense.

Because while they hide behind their undefined behaviour, the fact is
that all of their machines for the past 30 odd years have been relying
on this 'undefined' behaviour to work. This being the machines they've
been typing their useless specs on :-)

I doubt there's a single OS kernel (that supports SMP configurations)
that does not rely on a whole host of 'undefined' behaviour.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 17:11               ` Peter Zijlstra
@ 2015-06-17 18:02                 ` Paul E. McKenney
  2015-06-18  9:15                   ` Peter Zijlstra
  0 siblings, 1 reply; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-17 18:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 07:11:40PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 17, 2015 at 09:37:31AM -0700, Paul E. McKenney wrote:
> > The point of std::atomic<> (and of the equivalent C11 syntax) is to
> > force the compiler to suppress optimizations that are unsafe for shared
> > variables.  We get more or less the same effect with volatile, protests
> > from compiler people notwithstanding.
> > 
> > I often tell the compiler guys that they have to expect to make -some-
> > concessions for being 30 years late to the concurrency party, but
> > it nevertheless makes sense to future-proof our code where it is
> > reasonable to do so.
> 
> Right, so in that regards I would request the compiler option (and or
> #pragma) that disables all the out-of-thin-air nonsense.

OK.  What is the form of the #pragma?  If it focuses on a specific
access, we are likely to get a lot of pushback.

> Because while they hide behind their undefined behaviour, the fact is
> that all of their machines for the past 30 odd years have been relying
> on this 'undefined' behaviour to work. This being the machines they've
> been typing their useless specs on :-)

Maybe I can scare them into doing all their work on UP systems.  ;-)

Interestingly enough, LLVM is taking a slightly different approach.
Rather than invoke undefined behavior, they say that data races result
in random bits being loaded.  Not that it makes much difference to the
health and well-being of the software, mind you...

> I doubt there's a single OS kernel (that supports SMP configurations)
> that does not rely on a whole host of 'undefined' behaviour.

An alternative approach would be a compiler switch (or similar) that
changed the default atomic access from SC to relaxed.  Then shared
variables could be marked atomic, and normal C code could be used to
access them, but without the compiler emitting memory barriers all over
the place (yes, even on x86).

							Thanx, Paul


^ permalink raw reply	[flat|nested] 58+ messages in thread
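
What the alternative described above could look like in plain C11 (a
userspace sketch, not kernel code): the shared variable is declared _Atomic
so the compiler knows it is shared, while every access is explicitly relaxed
so no barriers need to be emitted for it:

#include <stdatomic.h>

static _Atomic int shared;

void producer(int v)
{
	/* relaxed: atomic, but no ordering and no fences emitted */
	atomic_store_explicit(&shared, v, memory_order_relaxed);
}

int consumer(void)
{
	return atomic_load_explicit(&shared, memory_order_relaxed);
}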

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 18:02                 ` Paul E. McKenney
@ 2015-06-18  9:15                   ` Peter Zijlstra
  2015-06-18  9:40                     ` Ingo Molnar
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-18  9:15 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Wed, Jun 17, 2015 at 11:02:14AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 17, 2015 at 07:11:40PM +0200, Peter Zijlstra wrote:
> > On Wed, Jun 17, 2015 at 09:37:31AM -0700, Paul E. McKenney wrote:
> > > The point of std::atomic<> (and of the equivalent C11 syntax) is to
> > > force the compiler to suppress optimizations that are unsafe for shared
> > > variables.  We get more or less the same effect with volatile, protests
> > > from compiler people notwithstanding.
> > > 
> > > I often tell the compiler guys that they have to expect to make -some-
> > > concessions for being 30 years late to the concurrency party, but
> > > it nevertheless makes sense to future-proof our code where it is
> > > reasonable to do so.
> > 
> > Right, so in that regards I would request the compiler option (and or
> > #pragma) that disables all the out-of-thin-air nonsense.
> 
> OK.  What is the form of the #pragma?  If it focuses on a specific
> access, we are likely to get a lot of pushback.

I didn't have anything specific in mind; other than

#pragma no_speculative_stores_ever

Which would forbid all these retarded 'optimizations' for the entire
translation unit.

> > Because while they hide behind their undefined behaviour, the fact is
> > that all of their machines for the past 30 odd years have been relying
> > on this 'undefined' behaviour to work. This being the machines they've
> > been typing their useless specs on :-)
> 
> Maybe I can scare them into doing all their work on UP systems.  ;-)
> 
> Interestingly enough, LLVM is taking a slightly different approach.
> Rather than invoke undefined behavior, they say that data races result
> in random bits being loaded.  Not that it makes much difference to the
> health and well-being of the software, mind you...

I'm not sure I follow that argument.

> > I doubt there's a single OS kernel (that supports SMP configurations)
> > that does not rely on a whole host of 'undefined' behaviour.
> 
> An alternative approach would be a compiler switch (or similar) that
> changed the default atomic access from SC to relaxed.  Then shared
> variables could be marked atomic, and normal C code could be used to
> access them, but without the compiler emitting memory barriers all over
> the place (yes, even on x86).

See, I don't think that is a realistic approach. Who is going to audit
our ~16 million lines of code to mark all shared variables? Or all the
other existing code bases that rely on this behaviour?

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-18  9:15                   ` Peter Zijlstra
@ 2015-06-18  9:40                     ` Ingo Molnar
  2015-06-18 10:40                       ` Peter Zijlstra
  0 siblings, 1 reply; 58+ messages in thread
From: Ingo Molnar @ 2015-06-18  9:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> > > I doubt there's a single OS kernel (that supports SMP configurations) that 
> > > does not rely on a whole host of 'undefined' behaviour.
> > 
> > An alternative approach would be a compiler switch (or similar) that changed 
> > the default atomic access from SC to relaxed.  Then shared variables could be 
> > marked atomic, and normal C code could be used to access them, but without the 
> > compiler emitting memory barriers all over the place (yes, even on x86).
> 
> See, I don't think that is a realistic approach. Who is going to audit our ~16 
> million lines of code to mark all shared variables? Or all the other existing 
> code bases that rely on this behaviour?

Sidenote: we are well beyond 19 million lines meanwhile.

But generating speculative writes unless the compiler can prove it's not shared 
memory is crazy. Who on earth argues they are sane?

In what retarded use-case do unasked for speculative writes even make any sense 
beyond as a sadistic tool to make parallel, threaded code even more fragile??

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-18  9:40                     ` Ingo Molnar
@ 2015-06-18 10:40                       ` Peter Zijlstra
  2015-06-18 16:54                         ` Paul E. McKenney
  0 siblings, 1 reply; 58+ messages in thread
From: Peter Zijlstra @ 2015-06-18 10:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Paul E. McKenney, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, Jun 18, 2015 at 11:40:14AM +0200, Ingo Molnar wrote:

> In what retarded use-case do unasked for speculative writes even make any sense 
> beyond as a sadistic tool to make parallel, threaded code even more fragile??

So what worries me most is the "Takeaways" from the document:

  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html

Those read like a 'fuck you' to 30 years of concurrent code in C.

Sure, its nice and all that they finally have something that's
standardized, and this might be an option for new projects (in reality
it might only really be an option in another 5-10 years).

But the active encouragement to break existing code is utterly fucked.

^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-18 10:40                       ` Peter Zijlstra
@ 2015-06-18 16:54                         ` Paul E. McKenney
  2015-06-18 17:10                           ` Steven Rostedt
  0 siblings, 1 reply; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-18 16:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, umgwanakikbuti, mingo, ktkhai, rostedt, tglx,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, Jun 18, 2015 at 12:40:15PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 18, 2015 at 11:40:14AM +0200, Ingo Molnar wrote:
> 
> > In what retarded use-case do unasked for speculative writes even make any sense 
> > beyond as a sadistic tool to make parallel, threaded code even more fragile??

Well, these are the compiler guys we are talking about, so why would
you expect otherwise?  Sorry, couldn't resist...  ;-)

They believe that all parallel threaded code should mark either all
shared variables (but get memory barriers on all normal accesses) or all
accesses.  Their use case is that they would like to be able to continue
carrying out single-threaded-safe optimizations on concurrent code.  And
speculative writes of that form are just fine in single-threaded code.

And if you think -that- is bad, you should have seen the screaming and
shouting when they were prohibited from stomping on unrelated variables.
They used to do "write widening", where they might use a vector unit
to do a store, then fix up the adjacent variables that were clobbered
by a too-wide write.  You just can't make this stuff up!

Anyway, I am having an ongoing discussion with them on handling
pre-existing code.  It has been an "interesting" discussion, particularly
the parts involving rcu_dereference().

> So what worries me most is the "Takeaways" from the document:
> 
>   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4455.html
> 
> Those read like a 'fuck you' to 30 years of concurrent code in C.

Pretty much.  I already told them that, and they will hopefully do better
in the next version.  Some of them were annoyed at me for telling them
what they could do with their proposal to eliminate thread-local storage
(TLS, similar to Linux kernel per-CPU variables), though JF Bastien was
co-author with me on that paper.  But perhaps he felt the need to kiss
up to the people who were annoyed by the refusal to go along with their
TLS-removal urge.

> Sure, its nice and all that they finally have something that's
> standardized, and this might be an option for new projects (in reality
> it might only really be an option in another 5-10 years).
> 
> But the active encouragement to break existing code is utterly fucked.

Yep, I have to frequently remind them that most projects need to support
old compilers.  And I did point out that the commentary at the end
of that document would not encourage adoption of C11.  They of course
felt this was unfair of me, so I have to thank you both for proving the
correctness of my reply to them.  Although you guys didn't use quite as
many swear words as I would have expected.  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-18 16:54                         ` Paul E. McKenney
@ 2015-06-18 17:10                           ` Steven Rostedt
  2015-06-18 17:51                             ` Paul E. McKenney
  0 siblings, 1 reply; 58+ messages in thread
From: Steven Rostedt @ 2015-06-18 17:10 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Ingo Molnar, umgwanakikbuti, mingo, ktkhai, tglx,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, 18 Jun 2015 09:54:07 -0700
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

> Yep, I have to frequently remind them that most projects need to support
> old compilers.  And I did point out that the commentary at the end
> of that document would not encourage adoption of C11.  They of course
> felt this was unfair of me, so I have to thank you both for proving the
> correctness of my reply to them.  Although you guys didn't use quite as
> many swear words as I would have expected.  ;-)

I could add a few more if you would like ;-)


What's their issue? Is there some kind of benchmark war going on
between different compilers? Where they want to prove they can
produce the absolute fastest code possible, but only use single
threaded apps and screw those that must support multi-threaded
applications.

My phone and my camera have multicore systems. Single threaded is not
the way of the future. Whichever compiler makes it easier to write
multi-threaded applications is going to win, regardless of how well a
compiler can claim they optimize code the best for a single threaded
app.

-- Steve



^ permalink raw reply	[flat|nested] 58+ messages in thread

* Re: [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-18 17:10                           ` Steven Rostedt
@ 2015-06-18 17:51                             ` Paul E. McKenney
  0 siblings, 0 replies; 58+ messages in thread
From: Paul E. McKenney @ 2015-06-18 17:51 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Ingo Molnar, umgwanakikbuti, mingo, ktkhai, tglx,
	juri.lelli, pang.xunlei, oleg, wanpeng.li, linux-kernel, Al Viro,
	Linus Torvalds

On Thu, Jun 18, 2015 at 01:10:22PM -0400, Steven Rostedt wrote:
> On Thu, 18 Jun 2015 09:54:07 -0700
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> 
> > Yep, I have to frequently remind them that most projects need to support
> > old compilers.  And I did point out that the commentary at the end
> > of that document would not encourage adoption of C11.  They of course
> > felt this was unfair of me, so I have to thank you both for proving the
> > correctness of my reply to them.  Although you guys didn't use quite as
> > many swear words as I would have expected.  ;-)
> 
> I could add a few more if you would like ;-)

I bet!  ;-)

> What's their issue? Is there some kind of benchmark war going on
> between different compilers? Where they want want to prove they can
> produce the absolute fastest code possible, but only use single
> threaded apps and screw those that must support multi-threaded
> applications.
> 
> My phone and my camera have multicore systems. Single threaded is not
> the way of the future. Which ever compiler makes it easier to write
> multi-threaded applications is going to win, regardless of how well a
> compiler can claim they optimize code the best for a single threaded
> app.

Their viewpoint is that they have produced syntax for marking shared
variables and also for marking accesses to shared variables, and that for
code that is not using those markings, anything goes.  They need frequent
reminders of the need to accommodate pre-existing code, and usually don't
take such reminders very well.  Ditto for projects such as the Linux
kernel to support pre-C11 compilers, and that need production-quality
compiler support.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 58+ messages in thread

* [tip:timers/core] hrtimer: Remove HRTIMER_STATE_MIGRATE
  2015-06-11 12:46 ` [PATCH 08/18] hrtimer: Remove HRTIMER_STATE_MIGRATE Peter Zijlstra
@ 2015-06-18 22:18   ` tip-bot for Oleg Nesterov
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Oleg Nesterov @ 2015-06-18 22:18 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: peterz, oleg, mingo, hpa, linux-kernel, tglx

Commit-ID:  c04dca02bc73096435a5c36efd5ccb2171edcbe1
Gitweb:     http://git.kernel.org/tip/c04dca02bc73096435a5c36efd5ccb2171edcbe1
Author:     Oleg Nesterov <oleg@redhat.com>
AuthorDate: Thu, 11 Jun 2015 14:46:44 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:09:56 +0200

hrtimer: Remove HRTIMER_STATE_MIGRATE

I do not understand HRTIMER_STATE_MIGRATE. Unless I am totally
confused it looks buggy and simply unneeded.

migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but this
is not enough: this can fool, say, hrtimer_is_queued() in
dequeue_signal().

Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED?
This fixes the race and we can kill STATE_MIGRATE.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.072387650@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 6 +-----
 kernel/time/hrtimer.c   | 7 ++-----
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 3f82a7e..2f9e57d 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
  * the handling of the timer.
  *
  * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 278d4b3..b1b795e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1508,11 +1508,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1523,9 +1523,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:timers/core] hrtimer: Fix hrtimer_is_queued() hole
  2015-06-11 12:46 ` [PATCH 09/18] hrtimer: Fix hrtimer_is_queued() hole Peter Zijlstra
@ 2015-06-18 22:18   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 22:18 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: peterz, hpa, mingo, tglx, linux-kernel, oleg

Commit-ID:  8edfb0362e8e52dec2de08fa163af01c9da2c9d0
Gitweb:     http://git.kernel.org/tip/8edfb0362e8e52dec2de08fa163af01c9da2c9d0
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:45 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:09:56 +0200

hrtimer: Fix hrtimer_is_queued() hole

A queued hrtimer that gets restarted (hrtimer_start*() while
hrtimer_is_queued()) will briefly appear as unqueued/inactive, even
though the timer has always been active; we just moved it.

Close this hole by preserving timer->state in
hrtimer_start_range_ns()'s remove_hrtimer() call.
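
A sketch of the window being closed (left column is
hrtimer_start_range_ns() on an already queued timer, right column is any
concurrent hrtimer_active()/hrtimer_is_queued() caller):

	CPU0					CPU1
	hrtimer_start_range_ns(timer, ...)
	  remove_hrtimer(timer, base)
	    timer->state = HRTIMER_STATE_INACTIVE
						hrtimer_active(timer) == false (!)
	  enqueue_hrtimer(timer, new_base)
	    timer->state = HRTIMER_STATE_ENQUEUED

With restart == true, remove_hrtimer() now keeps HRTIMER_STATE_ENQUEUED
set across the remove/enqueue pair, so the timer never looks inactive
while it is merely being re-armed.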

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.175989138@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index b1b795e..1604157 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -889,10 +889,10 @@ static void __remove_hrtimer(struct hrtimer *timer,
  * remove hrtimer, called with base lock held
  */
 static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
+		unsigned long state = timer->state;
 		int reprogram;
 
 		/*
@@ -906,12 +906,15 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
-		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
+
+		if (!restart) {
+			/*
+			 * We must preserve the CALLBACK state flag here,
+			 * otherwise we could move the timer base in
+			 * switch_hrtimer_base.
+			 */
+			state &= HRTIMER_STATE_CALLBACK;
+		}
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -936,7 +939,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base, true);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -1005,7 +1008,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base);
+		ret = remove_hrtimer(timer, base, false);
 
 	unlock_hrtimer_base(timer, &flags);
 

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:timers/core] seqcount: Rename write_seqcount_barrier()
  2015-06-11 12:46 ` [PATCH 10/18] seqcount: Rename write_seqcount_barrier() Peter Zijlstra
@ 2015-06-18 22:19   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 22:19 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: hpa, tglx, viro, oleg, peterz, linux-kernel, torvalds, paulmck,
	mingo

Commit-ID:  a7c6f571ff51cc77d90dd54968f7c5c938c43998
Gitweb:     http://git.kernel.org/tip/a7c6f571ff51cc77d90dd54968f7c5c938c43998
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:46 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:09:56 +0200

seqcount: Rename write_seqcount_barrier()

I'll shortly be introducing another seqcount primitive that's useful
to provide ordering semantics and would like to use the
write_seqcount_barrier() name for that.

Seeing how there's only one user of the current primitive, let's rename
it to invalidate, as that appears to be what it's doing.

While there, employ lockdep_assert_held() instead of
assert_spin_locked() to not generate debug code for regular kernels.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: wanpeng.li@linux.intel.com
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.279926217@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/dcache.c             | 16 ++++++++--------
 include/linux/seqlock.h |  6 +++---
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 656ce52..b43a169 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -322,17 +322,17 @@ static void dentry_free(struct dentry *dentry)
 }
 
 /**
- * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups
  * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
  */
-static inline void dentry_rcuwalk_barrier(struct dentry *dentry)
+static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
 {
-	assert_spin_locked(&dentry->d_lock);
-	/* Go through a barrier */
-	write_seqcount_barrier(&dentry->d_seq);
+	lockdep_assert_held(&dentry->d_lock);
+	/* Go through an invalidation barrier */
+	write_seqcount_invalidate(&dentry->d_seq);
 }
 
 /*
@@ -372,7 +372,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
 	struct inode *inode = dentry->d_inode;
 	__d_clear_type_and_inode(dentry);
 	hlist_del_init(&dentry->d_u.d_alias);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&inode->i_lock);
 	if (!inode->i_nlink)
@@ -494,7 +494,7 @@ void __d_drop(struct dentry *dentry)
 		__hlist_bl_del(&dentry->d_hash);
 		dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
-		dentry_rcuwalk_barrier(dentry);
+		dentry_rcuwalk_invalidate(dentry);
 	}
 }
 EXPORT_SYMBOL(__d_drop);
@@ -1752,7 +1752,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	if (inode)
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
 	__d_set_inode_and_type(dentry, inode, add_flags);
-	dentry_rcuwalk_barrier(dentry);
+	dentry_rcuwalk_invalidate(dentry);
 	spin_unlock(&dentry->d_lock);
 	fsnotify_d_instantiate(dentry, inode);
 }
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 5f68d0a..c07e3a5 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -266,13 +266,13 @@ static inline void write_seqcount_end(seqcount_t *s)
 }
 
 /**
- * write_seqcount_barrier - invalidate in-progress read-side seq operations
+ * write_seqcount_invalidate - invalidate in-progress read-side seq operations
  * @s: pointer to seqcount_t
  *
- * After write_seqcount_barrier, no read-side seq operations will complete
+ * After write_seqcount_invalidate, no read-side seq operations will complete
  * successfully and see data older than this.
  */
-static inline void write_seqcount_barrier(seqcount_t *s)
+static inline void write_seqcount_invalidate(seqcount_t *s)
 {
 	smp_wmb();
 	s->sequence+=2;

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:timers/core] seqcount: Introduce raw_write_seqcount_barrier()
  2015-06-17 12:29       ` Peter Zijlstra
  2015-06-17 14:57         ` Paul E. McKenney
@ 2015-06-18 22:19         ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 22:19 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: torvalds, tglx, viro, linux-kernel, hpa, oleg, peterz, mingo,
	paulmck

Commit-ID:  c4bfa3f5f906aee2e084c5b1fb15caf876338ef8
Gitweb:     http://git.kernel.org/tip/c4bfa3f5f906aee2e084c5b1fb15caf876338ef8
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Wed, 17 Jun 2015 14:29:24 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:09:56 +0200

seqcount: Introduce raw_write_seqcount_barrier()

Introduce raw_write_seqcount_barrier(), a new construct that can be
used to provide write barrier semantics in seqcount read loops instead
of the usual consistency guarantee.

raw_write_seqcount_barrier() is equivalent to:

	raw_write_seqcount_begin();
	raw_write_seqcount_end();

But avoids issuing two back-to-back smp_wmb() instructions.

This construct works because the read side will 'stall' when observing
odd values. This means that -- referring to the example in the comment
below -- even though there is no (matching) read barrier between the
loads of X and Y, we cannot observe !x && !y, because:

 - if we observe Y == false we must observe the first sequence
   increment, which makes us loop, until

 - we observe !(seq & 1) -- the second sequence increment -- at which
   time we must also observe Y == true.
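
As a sketch of that argument (annotations assume the usual
smp_wmb()/smp_rmb() pairing supplied by read_seqcount_begin() and
read_seqcount_retry()):

	write()				read()
	Y = true;	(pre-wmb)
	seq++;		(1)
	smp_wmb();			s = read_seqcount_begin(&seq);	/* rmb */
	seq++;		(2)		x = X; y = Y;
	X = false;	(post-wmb)	read_seqcount_retry(&seq, s);	/* rmb */

Observing x == false means a post-wmb store was seen, so the retry read
must also see (1) and the loop restarts; once a begin observes (2), the
paired rmb guarantees Y == true is visible to the subsequent load, hence
y == true and the BUG_ON() cannot trigger.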

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: umgwanakikbuti@gmail.com
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/20150617122924.GP3644@twins.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/seqlock.h | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index c07e3a5..486e685 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -233,6 +233,47 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
 	s->sequence++;
 }
 
+/**
+ * raw_write_seqcount_barrier - do a seq write barrier
+ * @s: pointer to seqcount_t
+ *
+ * This can be used to provide an ordering guarantee instead of the
+ * usual consistency guarantee. It is one wmb cheaper, because we can
+ * collapse the two back-to-back wmb()s.
+ *
+ *      seqcount_t seq;
+ *      bool X = true, Y = false;
+ *
+ *      void read(void)
+ *      {
+ *              bool x, y;
+ *
+ *              do {
+ *                      int s = read_seqcount_begin(&seq);
+ *
+ *                      x = X; y = Y;
+ *
+ *              } while (read_seqcount_retry(&seq, s));
+ *
+ *              BUG_ON(!x && !y);
+ *      }
+ *
+ *      void write(void)
+ *      {
+ *              Y = true;
+ *
+ *              raw_write_seqcount_barrier(&seq);
+ *
+ *              X = false;
+ *      }
+ */
+static inline void raw_write_seqcount_barrier(seqcount_t *s)
+{
+	s->sequence++;
+	smp_wmb();
+	s->sequence++;
+}
+
 /*
  * raw_write_seqcount_latch - redirect readers to even/odd copy
  * @s: pointer to seqcount_t

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:timers/core] hrtimer: Allow hrtimer::function() to free the timer
  2015-06-11 12:46 ` [PATCH 12/18] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
@ 2015-06-18 22:19   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 22:19 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: paulmck, oleg, peterz, viro, torvalds, mingo, hpa, tglx,
	linux-kernel

Commit-ID:  887d9dc989eb0154492e41e7c07492edbb088ba1
Gitweb:     http://git.kernel.org/tip/887d9dc989eb0154492e41e7c07492edbb088ba1
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:48 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:09:56 +0200

hrtimer: Allow hrtimer::function() to free the timer

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_cpu_base::running.
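
As a sketch of what this permits (the structure and callback names below
are made up for illustration; it assumes nothing else still references
the timer):

	struct my_obj {
		struct hrtimer	timer;
		/* ... */
	};

	static enum hrtimer_restart my_timeout(struct hrtimer *t)
	{
		struct my_obj *obj = container_of(t, struct my_obj, timer);

		kfree(obj);			/* frees the hrtimer itself */
		return HRTIMER_NORESTART;	/* __run_hrtimer() no longer
						 * touches t->state after the
						 * callback; it only clears
						 * cpu_base->running */
	}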

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  41 +++++++----------
 kernel/time/hrtimer.c   | 114 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 107 insertions(+), 48 deletions(-)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2f9e57d..5db0558 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,30 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-	return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1604157..f026413 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -111,6 +112,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 #ifdef CONFIG_SMP
 
 /*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base	migration_cpu_base.clock_base[0]
+
+/*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +207,8 @@ again:
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
-		if (!restart) {
-			/*
-			 * We must preserve the CALLBACK state flag here,
-			 * otherwise we could move the timer base in
-			 * switch_hrtimer_base.
-			 */
-			state &= HRTIMER_STATE_CALLBACK;
-		}
+		if (!restart)
+			state = HRTIMER_STATE_INACTIVE;
+
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+
+	do {
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			return true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being run
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consecutive
+ * __run_hrtimer() invocations.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer_clock_base *base,
 			  struct hrtimer *timer, ktime_t *now)
@@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched: Replace post_schedule with a balance callback list
  2015-06-11 12:46 ` [PATCH 01/18] sched: Replace post_schedule with a balance callback list Peter Zijlstra
  2015-06-11 15:32   ` Kirill Tkhai
@ 2015-06-18 23:00   ` tip-bot for Peter Zijlstra
  1 sibling, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:00 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, peterz, tglx, hpa, mingo

Commit-ID:  e3fca9e7cbfb72694a21c886fcdf9f059cfded9c
Gitweb:     http://git.kernel.org/tip/e3fca9e7cbfb72694a21c886fcdf9f059cfded9c
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:37 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched: Replace post_schedule with a balance callback list

Generalize the post_schedule() stuff into a balance callback list.
This allows us to more easily use it outside of schedule() and cross
sched_class.
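
A condensed usage sketch, mirroring the rt/dl hunks below:

	static DEFINE_PER_CPU(struct callback_head, rt_balance_head);

	/* Called while holding rq->lock, e.g. from pick_next_task_rt() */
	static inline void queue_push_tasks(struct rq *rq)
	{
		if (!has_pushable_tasks(rq))
			return;

		queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu),
				       push_rt_tasks);
	}

__schedule() and schedule_tail() then invoke balance_callback(rq), which
re-takes rq->lock and runs each queued head->func(rq) exactly once.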

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.424032725@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c     | 36 ++++++++++++++++++++++++------------
 kernel/sched/deadline.c | 21 +++++++++++----------
 kernel/sched/rt.c       | 25 +++++++++++--------------
 kernel/sched/sched.h    | 19 +++++++++++++++++--
 4 files changed, 63 insertions(+), 38 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 41942a5..fa32bc0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-	if (rq->post_schedule) {
-		unsigned long flags;
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
 
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->curr->sched_class->post_schedule)
-			rq->curr->sched_class->post_schedule(rq);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
 
-		rq->post_schedule = 0;
+		func(rq);
 	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
@@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	/* finish_task_switch() drops rq->lock and enables preemtion */
 	preempt_disable();
 	rq = finish_task_switch(prev);
-	post_schedule(rq);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -2823,7 +2835,7 @@ static void __sched __schedule(void)
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
-	post_schedule(rq);
+	balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -7219,7 +7231,7 @@ void __init sched_init(void)
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-		rq->post_schedule = 0;
+		rq->balance_callback = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7a08d59..d80523f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return dl_task(prev);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+
+static void push_dl_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	rq->post_schedule = has_pushable_dl_tasks(rq);
+	if (!has_pushable_dl_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -296,7 +303,7 @@ static inline int pull_dl_task(struct rq *rq)
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1544,11 +1551,6 @@ skip:
 	return ret;
 }
 
-static void post_schedule_dl(struct rq *rq)
-{
-	push_dl_tasks(rq);
-}
-
 /*
  * Since the task is not running and a reschedule is not going to happen
  * anytime soon on its runqueue, we try pushing it away now.
@@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class = {
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
-	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d7093c5..4f3726f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,13 +354,16 @@ static inline int has_pushable_tasks(struct rq *rq)
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+
+static void push_rt_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	/*
-	 * We detect this state here so that we can avoid taking the RQ
-	 * lock again later if there is no need to push
-	 */
-	rq->post_schedule = has_pushable_tasks(rq);
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq *this_rq)
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -2042,11 +2045,6 @@ skip:
 	return ret;
 }
 
-static void post_schedule_rt(struct rq *rq)
-{
-	push_rt_tasks(rq);
-}
-
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
@@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class = {
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
-	.post_schedule		= post_schedule_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f10a445..62949ab 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
+	struct callback_head *balance_callback;
+
 	unsigned char idle_balance;
 	/* For active balancing */
-	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
@@ -767,6 +768,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
 
 #ifdef CONFIG_SMP
 
+static inline void
+queue_balance_callback(struct rq *rq,
+		       struct callback_head *head,
+		       void (*func)(struct rq *rq))
+{
+	lockdep_assert_held(&rq->lock);
+
+	if (unlikely(head->next))
+		return;
+
+	head->func = (void (*)(struct callback_head *))func;
+	head->next = rq->balance_callback;
+	rq->balance_callback = head;
+}
+
 extern void sched_ttwu_pending(void);
 
 #define rcu_dereference_check_sched_domain(p) \
@@ -1192,7 +1208,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
-	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched: Use replace normalize_task() with __sched_setscheduler()
  2015-06-11 12:46 ` [PATCH 02/18] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
@ 2015-06-18 23:00   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:00 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: tglx, linux-kernel, hpa, mingo, peterz

Commit-ID:  dbc7f069b93a249340e974d6e8f55656280d8701
Gitweb:     http://git.kernel.org/tip/dbc7f069b93a249340e974d6e8f55656280d8701
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:38 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched: Use replace normalize_task() with __sched_setscheduler()

Reduce duplicate logic; normalize_task() is a simplified version of
__sched_setscheduler(). Parametrize the difference and collapse.

This reduces the number of check_class_changed() sites.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.532642391@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 65 +++++++++++++++++++----------------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa32bc0..b610ef9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3438,7 +3438,7 @@ static bool dl_param_changed(struct task_struct *p,
 
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
-				bool user)
+				bool user, bool pi)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3624,18 +3624,20 @@ change:
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 
-	/*
-	 * Take priority boosted tasks into account. If the new
-	 * effective priority is unchanged, we just store the new
-	 * normal parameters and do not touch the scheduler class and
-	 * the runqueue. This will be done when the task deboost
-	 * itself.
-	 */
-	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-	if (new_effective_prio == oldprio) {
-		__setscheduler_params(p, attr);
-		task_rq_unlock(rq, p, &flags);
-		return 0;
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		if (new_effective_prio == oldprio) {
+			__setscheduler_params(p, attr);
+			task_rq_unlock(rq, p, &flags);
+			return 0;
+		}
 	}
 
 	queued = task_on_rq_queued(p);
@@ -3646,7 +3648,7 @@ change:
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr, true);
+	__setscheduler(rq, p, attr, pi);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3661,7 +3663,8 @@ change:
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
 
 	return 0;
 }
@@ -3682,7 +3685,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
 		attr.sched_policy = policy;
 	}
 
-	return __sched_setscheduler(p, &attr, check);
+	return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3703,7 +3706,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-	return __sched_setscheduler(p, attr, true);
+	return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -7361,32 +7364,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-	const struct sched_class *prev_class = p->sched_class;
+	struct task_struct *g, *p;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
-	int old_prio = p->prio;
-	int queued;
-
-	queued = task_on_rq_queued(p);
-	if (queued)
-		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr, false);
-	if (queued) {
-		enqueue_task(rq, p, 0);
-		resched_curr(rq);
-	}
-
-	check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7413,9 +7396,7 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		rq = task_rq_lock(p, &flags);
-		normalize_task(rq, p);
-		task_rq_unlock(rq, p, &flags);
+		__sched_setscheduler(p, &attr, false, false);
 	}
 	read_unlock(&tasklist_lock);
 }

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched: Allow balance callbacks for check_class_changed()
  2015-06-11 12:46 ` [PATCH 03/18] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
@ 2015-06-18 23:01   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:01 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, umgwanakikbuti, peterz, mingo, tglx

Commit-ID:  4c9a4bc89a9cca8128bce67d6bc8870d6b7ee0b2
Gitweb:     http://git.kernel.org/tip/4c9a4bc89a9cca8128bce67d6bc8870d6b7ee0b2
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:39 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched: Allow balance callbacks for check_class_changed()

In order to remove dropping rq->lock from the
switched_{to,from}()/prio_changed() sched_class methods, run the
balance callbacks after it.

We need to remove dropping rq->lock because it's buggy; suppose we use
sched_setattr()/sched_setscheduler() to change a running task from FIFO
to OTHER.

By the time we get to switched_from_rt() the task is already enqueued
on the cfs runqueues. If switched_from_rt() does pull_rt_task() and
drops rq->lock, load-balancing can come in and move our task @p to
another rq.

The subsequent switched_to_fair() still assumes @p is on @rq and bad
things will happen.

By using balance callbacks we delay the load-balancing operations
{rt,dl}x{push,pull} until we've done all the important work and the
task is fully set up.

Furthermore, the balance callbacks do not know about @p; therefore
they cannot get confused like this.
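
The failing interleaving described above, as a sketch:

	__sched_setscheduler(p)			CFS load balancer
	  check_class_changed(rq, p, ...)
	    switched_from_rt(rq, p)
	      pull_rt_task(rq)
	        double_lock_balance()	<-- drops rq->lock
						migrates @p to another rq
	    switched_to_fair(rq, p)	<-- still assumes @p is on rq: boom

With the balance callbacks the pull is merely queued at this point and
only runs via balance_callback(), after check_class_changed() has
completed and @p is fully set up.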

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Link: http://lkml.kernel.org/r/20150611124742.615343911@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b610ef9..ef546e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1001,7 +1001,11 @@ inline int task_curr(const struct task_struct *p)
 }
 
 /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
@@ -1010,7 +1014,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
-		/* Possble rq->lock 'hole'.  */
+
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1491,8 +1495,12 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * XXX can drop rq->lock; most likely ok.
+		 */
 		p->sched_class->task_woken(rq, p);
+	}
 
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -3100,7 +3108,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
+	preempt_disable(); /* avoid rq from going away on us */
 	__task_rq_unlock(rq);
+
+	balance_callback(rq);
+	preempt_enable();
 }
 #endif
 
@@ -3661,11 +3673,18 @@ change:
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
+	preempt_disable(); /* avoid rq from going away on us */
 	task_rq_unlock(rq, p, &flags);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
 
+	/*
+	 * Run balance callbacks after we've adjusted the PI chain.
+	 */
+	balance_callback(rq);
+	preempt_enable();
+
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched,rt: Remove return value from pull_rt_task()
  2015-06-11 12:46 ` [PATCH 04/18] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
@ 2015-06-18 23:01   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:01 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, linux-kernel, tglx, peterz, mingo

Commit-ID:  8046d6806247088de5725eaf8a2580b29e50ac5a
Gitweb:     http://git.kernel.org/tip/8046d6806247088de5725eaf8a2580b29e50ac5a
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:40 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched,rt: Remove return value from pull_rt_task()

In order to be able to use pull_rt_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to rt balancing.

Too many reschedules is not a correctness issue; too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.679002000@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/rt.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f3726f..c702b48 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -260,7 +260,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #ifdef CONFIG_SMP
 
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
@@ -415,9 +415,8 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 	return false;
 }
 
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1955,14 +1954,15 @@ static void push_irq_work_func(struct irq_work *work)
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
+	bool resched = false;
 	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1973,7 +1973,7 @@ static int pull_rt_task(struct rq *this_rq)
 #ifdef HAVE_RT_PUSH_IPI
 	if (sched_feat(RT_PUSH_IPI)) {
 		tell_cpu_to_push(this_rq);
-		return 0;
+		return;
 	}
 #endif
 
@@ -2026,7 +2026,7 @@ static int pull_rt_task(struct rq *this_rq)
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -2042,7 +2042,8 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -2138,8 +2139,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	if (pull_rt_task(rq))
-		resched_curr(rq);
+	pull_rt_task(rq);
 }
 
 void __init init_sched_rt_class(void)

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks
  2015-06-11 12:46 ` [PATCH 05/18] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
@ 2015-06-18 23:01   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:01 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: peterz, hpa, tglx, mingo, linux-kernel

Commit-ID:  fd7a4bed183523275279c9addbf42fce550c2e90
Gitweb:     http://git.kernel.org/tip/fd7a4bed183523275279c9addbf42fce550c2e90
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:41 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks

Remove the direct {push,pull} balancing operations from
switched_{from,to}_rt() / prio_changed_rt() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.766832367@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/rt.c | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c702b48..460f858 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,16 +354,23 @@ static inline int has_pushable_tasks(struct rq *rq)
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 
 static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
+	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -2139,7 +2146,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	pull_rt_task(rq);
+	queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2160,8 +2167,6 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	/*
 	 * If we are already running, then there's nothing
 	 * that needs to be done. But if we are not running
@@ -2171,13 +2176,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
-		    /* Don't resched if we changed runqueues */
-		    push_rt_task(rq) && rq != task_rq(p))
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched && p->prio < rq->curr->prio)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
+#endif /* CONFIG_SMP */
 	}
 }
 
@@ -2198,14 +2202,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			pull_rt_task(rq);
+			queue_pull_task(rq);
+
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule. Note, the above pull_rt_task
-		 * can release the rq lock and p could migrate.
-		 * Only reschedule if p is still on the same runqueue.
+		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched,dl: Remove return value from pull_dl_task()
  2015-06-11 12:46 ` [PATCH 06/18] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
@ 2015-06-18 23:02   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:02 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: peterz, tglx, linux-kernel, hpa, mingo

Commit-ID:  0ea60c2054fc3b0c3eb68ac4f6884f3ee78d9925
Gitweb:     http://git.kernel.org/tip/0ea60c2054fc3b0c3eb68ac4f6884f3ee78d9925
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:42 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched,dl: Remove return value from pull_dl_task()

In order to be able to use pull_dl_task() from a callback, we need to
do away with the return value.

Since the return value indicates if we should reschedule, do this
inside the function. Since not all callers currently do this, this can
increase the number of reschedules due to dl balancing.

Too many reschedules is not a correctness issue; too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.859398977@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/deadline.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d80523f..079c092 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -298,9 +298,8 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return false;
 }
 
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1041,7 +1040,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	resched_curr(rq);
 }
 
-static int pull_dl_task(struct rq *this_rq);
+static void pull_dl_task(struct rq *this_rq);
 
 #endif /* CONFIG_SMP */
 
@@ -1472,15 +1471,16 @@ static void push_dl_tasks(struct rq *rq)
 		;
 }
 
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
 	struct task_struct *p;
+	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
 
 	if (likely(!dl_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1535,7 +1535,7 @@ static int pull_dl_task(struct rq *this_rq)
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -1548,7 +1548,8 @@ skip:
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -1704,8 +1705,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	if (pull_dl_task(rq))
-		resched_curr(rq);
+	pull_dl_task(rq);
 }
 
 /*

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks
  2015-06-11 12:46 ` [PATCH 07/18] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
@ 2015-06-18 23:02   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:02 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, peterz, mingo, hpa, tglx

Commit-ID:  9916e214998a4a363b152b637245e5c958067350
Gitweb:     http://git.kernel.org/tip/9916e214998a4a363b152b637245e5c958067350
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:43 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks

Remove the direct {push,pull} balancing operations from
switched_{from,to}_dl() / prio_changed_dl() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is just annoying.
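
A minimal sketch of the queueing side this introduces (using the
queue_balance_callback() helper added earlier in the series, as in the
diff below):

	static DEFINE_PER_CPU(struct callback_head, dl_pull_head);

	static inline void queue_pull_task(struct rq *rq)
	{
		queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu),
				       pull_dl_task);
	}

	/* e.g. switched_from_dl() / prio_changed_dl() now do: */
	queue_pull_task(rq);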

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124742.968262663@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/deadline.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 079c092..69d9f50 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,16 +213,23 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 	return dl_task(prev);
 }
 
-static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 
 static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_dl_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
+	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -305,6 +312,10 @@ static inline void pull_dl_task(struct rq *rq)
 static inline void queue_push_tasks(struct rq *rq)
 {
 }
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
 #endif /* CONFIG_SMP */
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -1040,8 +1051,6 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	resched_curr(rq);
 }
 
-static void pull_dl_task(struct rq *this_rq);
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1705,7 +1714,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	pull_dl_task(rq);
+	queue_pull_task(rq);
 }
 
 /*
@@ -1714,21 +1723,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
-			push_dl_task(rq) && rq != task_rq(p))
-			/* Only reschedule if pushing failed */
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched) {
-			if (dl_task(rq->curr))
-				check_preempt_curr_dl(rq, p, 0);
-			else
-				resched_curr(rq);
-		}
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (dl_task(rq->curr))
+			check_preempt_curr_dl(rq, p, 0);
+		else
+			resched_curr(rq);
+#endif
 	}
 }
 
@@ -1748,15 +1752,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			pull_dl_task(rq);
+			queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
 		 * then reschedule, provided p is still on this
 		 * runqueue.
 		 */
-		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
-		    rq->curr == p)
+		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
 			resched_curr(rq);
 #else
 		/*

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched,dl: Fix sched class hopping CBS hole
  2015-06-11 12:46 ` [PATCH 13/18] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
@ 2015-06-18 23:02   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:02 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: luca.abeni, peterz, linux-kernel, wanpeng.li, juri.lelli, hpa,
	tglx, mingo

Commit-ID:  a649f237db18450de767d70f40a41d5dbd0291de
Gitweb:     http://git.kernel.org/tip/a649f237db18450de767d70f40a41d5dbd0291de
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:49 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched,dl: Fix sched class hopping CBS hole

We still have a few pending issues with the deadline code, one of which
is that switching between scheduling classes can 'leak' CBS state.

Close the hole by retaining the current CBS state when leaving
SCHED_DEADLINE and unconditionally programming the deadline timer.
The timer will then reset the CBS state if the task is still
!SCHED_DEADLINE by the time it hits.

If the task left SCHED_DEADLINE it will not call task_dead_dl() and
we'll not cancel the hrtimer, leaving us a pending timer pointing into
freed memory. Avoid this by giving the timer a task reference; this
avoids littering the task exit path for this rather uncommon case.

In order to do this, I had to move dl_task_offline_migration() below
the replenishment, such that the task_rq()->lock fully covers that.
While doing this, I noticed that it (was) buggy in assuming a task is
enqueued and/or that we need to enqueue the task now. Fixing this means
select_task_rq_dl() might encounter an offline rq -- look into that.

As a result this kills cancel_dl_timer() which included a rq->lock
break.
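
A rough sketch of the resulting flow (simplified from the diff below):

	/* in switched_from_dl(): keep CBS state, let the timer clean up */
	if (!start_dl_timer(p))
		__dl_clear_params(p);

	/* in dl_task_timer(): task is still !SCHED_DEADLINE when it fires */
	if (!dl_task(p)) {
		__dl_clear_params(p);
		goto unlock;
	}

	/* after task_rq_unlock(): drop the reference start_dl_timer() took */
	put_task_struct(p);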

Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling")
Cc: Wanpeng Li <wanpeng.li@linux.intel.com>
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.574192138@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/deadline.c | 152 +++++++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 66 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 69d9f50..6318f43 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -234,7 +234,7 @@ static inline void queue_pull_task(struct rq *rq)
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 {
 	struct rq *later_rq = NULL;
 	bool fallback = false;
@@ -268,14 +268,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
 		double_lock_balance(rq, later_rq);
 	}
 
+	/*
+	 * By now the task is replenished and enqueued; migrate it.
+	 */
 	deactivate_task(rq, p, 0);
 	set_task_cpu(p, later_rq->cpu);
-	activate_task(later_rq, p, ENQUEUE_REPLENISH);
+	activate_task(later_rq, p, 0);
 
 	if (!fallback)
 		resched_curr(later_rq);
 
-	double_unlock_balance(rq, later_rq);
+	double_unlock_balance(later_rq, rq);
+
+	return later_rq;
 }
 
 #else
@@ -515,22 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
  * actually started or not (i.e., the replenishment instant is in
  * the future or in the past).
  */
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
 {
-	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-	struct rq *rq = rq_of_dl_rq(dl_rq);
+	struct sched_dl_entity *dl_se = &p->dl;
+	struct hrtimer *timer = &dl_se->dl_timer;
+	struct rq *rq = task_rq(p);
 	ktime_t now, act;
 	s64 delta;
 
-	if (boosted)
-		return 0;
+	lockdep_assert_held(&rq->lock);
+
 	/*
 	 * We want the timer to fire at the deadline, but considering
 	 * that it is actually coming from rq->clock and not from
 	 * hrtimer's time base reading.
 	 */
 	act = ns_to_ktime(dl_se->deadline);
-	now = hrtimer_cb_get_time(&dl_se->dl_timer);
+	now = hrtimer_cb_get_time(timer);
 	delta = ktime_to_ns(now) - rq_clock(rq);
 	act = ktime_add_ns(act, delta);
 
@@ -542,7 +548,19 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
+	/*
+	 * !enqueued will guarantee another callback; even if one is already in
+	 * progress. This ensures a balanced {get,put}_task_struct().
+	 *
+	 * The race against __run_timer() clearing the enqueued state is
+	 * harmless because we're holding task_rq()->lock, therefore the timer
+	 * expiring after we've done the check will wait on its task_rq_lock()
+	 * and observe our state.
+	 */
+	if (!hrtimer_is_queued(timer)) {
+		get_task_struct(p);
+		hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+	}
 
 	return 1;
 }
@@ -572,35 +590,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	rq = task_rq_lock(p, &flags);
 
 	/*
-	 * We need to take care of several possible races here:
-	 *
-	 *   - the task might have changed its scheduling policy
-	 *     to something different than SCHED_DEADLINE
-	 *   - the task might have changed its reservation parameters
-	 *     (through sched_setattr())
-	 *   - the task might have been boosted by someone else and
-	 *     might be in the boosting/deboosting path
+	 * The task might have changed its scheduling policy to something
+	 * different than SCHED_DEADLINE (through switched_from_dl()).
+	 */
+	if (!dl_task(p)) {
+		__dl_clear_params(p);
+		goto unlock;
+	}
+
+	/*
+	 * This is possible if switched_from_dl() raced against a running
+	 * callback that took the above !dl_task() path and we've since then
+	 * switched back into SCHED_DEADLINE.
 	 *
-	 * In all this cases we bail out, as the task is already
-	 * in the runqueue or is going to be enqueued back anyway.
+	 * There's nothing to do except drop our task reference.
 	 */
-	if (!dl_task(p) || dl_se->dl_new ||
-	    dl_se->dl_boosted || !dl_se->dl_throttled)
+	if (dl_se->dl_new)
 		goto unlock;
 
-	sched_clock_tick();
-	update_rq_clock(rq);
+	/*
+	 * The task might have been boosted by someone else and might be in the
+	 * boosting/deboosting path, it's not throttled.
+	 */
+	if (dl_se->dl_boosted)
+		goto unlock;
 
-#ifdef CONFIG_SMP
 	/*
-	 * If we find that the rq the task was on is no longer
-	 * available, we need to select a new rq.
+	 * Spurious timer due to start_dl_timer() race; or we already received
+	 * a replenishment from rt_mutex_setprio().
 	 */
-	if (unlikely(!rq->online)) {
-		dl_task_offline_migration(rq, p);
+	if (!dl_se->dl_throttled)
 		goto unlock;
-	}
-#endif
+
+	sched_clock_tick();
+	update_rq_clock(rq);
 
 	/*
 	 * If the throttle happened during sched-out; like:
@@ -626,17 +649,38 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		check_preempt_curr_dl(rq, p, 0);
 	else
 		resched_curr(rq);
+
 #ifdef CONFIG_SMP
 	/*
-	 * Queueing this task back might have overloaded rq,
-	 * check if we need to kick someone away.
+	 * Perform balancing operations here; after the replenishments.  We
+	 * cannot drop rq->lock before this, otherwise the assertion in
+	 * start_dl_timer() about not missing updates is not true.
+	 *
+	 * If we find that the rq the task was on is no longer available, we
+	 * need to select a new rq.
+	 *
+	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
+	 */
+	if (unlikely(!rq->online))
+		rq = dl_task_offline_migration(rq, p);
+
+	/*
+	 * Queueing this task back might have overloaded rq, check if we need
+	 * to kick someone away.
 	 */
 	if (has_pushable_dl_tasks(rq))
 		push_dl_task(rq);
 #endif
+
 unlock:
 	task_rq_unlock(rq, p, &flags);
 
+	/*
+	 * This can free the task_struct, including this hrtimer, do not touch
+	 * anything related to that after this.
+	 */
+	put_task_struct(p);
+
 	return HRTIMER_NORESTART;
 }
 
@@ -696,7 +740,7 @@ static void update_curr_dl(struct rq *rq)
 	if (dl_runtime_exceeded(rq, dl_se)) {
 		dl_se->dl_throttled = 1;
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
@@ -1178,7 +1222,6 @@ static void task_fork_dl(struct task_struct *p)
 
 static void task_dead_dl(struct task_struct *p)
 {
-	struct hrtimer *timer = &p->dl.dl_timer;
 	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 	/*
@@ -1188,8 +1231,6 @@ static void task_dead_dl(struct task_struct *p)
 	/* XXX we should retain the bw until 0-lag */
 	dl_b->total_bw -= p->dl.dl_bw;
 	raw_spin_unlock_irq(&dl_b->lock);
-
-	hrtimer_cancel(timer);
 }
 
 static void set_curr_task_dl(struct rq *rq)
@@ -1674,37 +1715,16 @@ void init_sched_dl_class(void)
 
 #endif /* CONFIG_SMP */
 
-/*
- *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
-	struct hrtimer *dl_timer = &p->dl.dl_timer;
-
-	/* Nobody will change task's class if pi_lock is held */
-	lockdep_assert_held(&p->pi_lock);
-
-	if (hrtimer_active(dl_timer)) {
-		int ret = hrtimer_try_to_cancel(dl_timer);
-
-		if (unlikely(ret == -1)) {
-			/*
-			 * Note, p may migrate OR new deadline tasks
-			 * may appear in rq when we are unlocking it.
-			 * A caller of us must be fine with that.
-			 */
-			raw_spin_unlock(&rq->lock);
-			hrtimer_cancel(dl_timer);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-}
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
-	/* XXX we should retain the bw until 0-lag */
-	cancel_dl_timer(rq, p);
-	__dl_clear_params(p);
+	/*
+	 * Start the deadline timer; if we switch back to dl before this we'll
+	 * continue consuming our current CBS slice. If we stay outside of
+	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
+	 * task.
+	 */
+	if (!start_dl_timer(p))
+		__dl_clear_params(p);
 
 	/*
 	 * Since this might be the only -deadline task on the rq,

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched: Move code around
  2015-06-11 12:46 ` [PATCH 14/18] sched: Move code around Peter Zijlstra
@ 2015-06-18 23:02   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:02 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, mingo, linux-kernel, peterz, tglx

Commit-ID:  5cc389bcee088b72c8c34a01d596412cab4f3f78
Gitweb:     http://git.kernel.org/tip/5cc389bcee088b72c8c34a01d596412cab4f3f78
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:50 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:26 +0200

sched: Move code around

In preparation for reworking set_cpus_allowed_ptr(), move some code
around. This also removes some superfluous #ifdefs and adds comments
to some #endifs.

   text    data     bss     dec     hex filename
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.pre
12211532        1738144 1081344 15031020         e55aec defconfig-build/vmlinux.post

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.662086684@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 364 +++++++++++++++++++++++++---------------------------
 1 file changed, 178 insertions(+), 186 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef546e3..26637c9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1046,6 +1046,180 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_held(&rq->lock);
+
+	dequeue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_MIGRATING;
+	set_task_cpu(p, new_cpu);
+	raw_spin_unlock(&rq->lock);
+
+	rq = cpu_rq(new_cpu);
+
+	raw_spin_lock(&rq->lock);
+	BUG_ON(task_cpu(p) != new_cpu);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	enqueue_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+
+	return rq;
+}
+
+struct migration_arg {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	if (unlikely(!cpu_active(dest_cpu)))
+		return ret;
+
+	rq = cpu_rq(src_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/* Already moved. */
+	if (task_cpu(p) != src_cpu)
+		goto done;
+
+	/* Affinity changed (again). */
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+		goto fail;
+
+	/*
+	 * If we're not on a rq, the next wake-up will ensure we're
+	 * placed properly.
+	 */
+	if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+done:
+	ret = 1;
+fail:
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+	return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+	struct migration_arg *arg = data;
+
+	/*
+	 * The original target cpu might have gone down and we might
+	 * be on another cpu but it doesn't matter.
+	 */
+	local_irq_disable();
+	/*
+	 * We need to explicitly wake pending tasks before running
+	 * __migrate_task() such that we will not miss enforcing cpus_allowed
+	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+	 */
+	sched_ttwu_pending();
+	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+	local_irq_enable();
+	return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask);
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	unsigned long flags;
+	struct rq *rq;
+	unsigned int dest_cpu;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &flags);
+
+	if (cpumask_equal(&p->cpus_allowed, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+		struct migration_arg arg = { p, dest_cpu };
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, p, &flags);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		tlb_migrate_finish(p->mm);
+		return 0;
+	} else if (task_on_rq_queued(p))
+		rq = move_queued_task(p, dest_cpu);
+out:
+	task_rq_unlock(rq, p, &flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -1186,13 +1360,6 @@ out:
 	return ret;
 }
 
-struct migration_arg {
-	struct task_struct *task;
-	int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -1325,9 +1492,7 @@ void kick_process(struct task_struct *p)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
 
-#ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
  */
@@ -1432,7 +1597,7 @@ static void update_avg(u64 *avg, u64 sample)
 	s64 diff = sample - *avg;
 	*avg += diff >> 3;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -4773,149 +4938,6 @@ out:
 }
 
 #ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
-	struct rq *rq = task_rq(p);
-
-	lockdep_assert_held(&rq->lock);
-
-	dequeue_task(rq, p, 0);
-	p->on_rq = TASK_ON_RQ_MIGRATING;
-	set_task_cpu(p, new_cpu);
-	raw_spin_unlock(&rq->lock);
-
-	rq = cpu_rq(new_cpu);
-
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_cpu(p) != new_cpu);
-	p->on_rq = TASK_ON_RQ_QUEUED;
-	enqueue_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-
-	return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-	if (p->sched_class->set_cpus_allowed)
-		p->sched_class->set_cpus_allowed(p, new_mask);
-
-	cpumask_copy(&p->cpus_allowed, new_mask);
-	p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- *    stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- *    off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- *    it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- *    is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	unsigned long flags;
-	struct rq *rq;
-	unsigned int dest_cpu;
-	int ret = 0;
-
-	rq = task_rq_lock(p, &flags);
-
-	if (cpumask_equal(&p->cpus_allowed, new_mask))
-		goto out;
-
-	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	do_set_cpus_allowed(p, new_mask);
-
-	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
-		goto out;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		tlb_migrate_finish(p->mm);
-		return 0;
-	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-out:
-	task_rq_unlock(rq, p, &flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-	struct rq *rq;
-	int ret = 0;
-
-	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
-
-	/* Affinity changed (again). */
-	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
-
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
-}
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
@@ -4963,35 +4985,9 @@ void sched_setnuma(struct task_struct *p, int nid)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
-	struct migration_arg *arg = data;
-
-	/*
-	 * The original target cpu might have gone down and we might
-	 * be on another cpu but it doesn't matter.
-	 */
-	local_irq_disable();
-	/*
-	 * We need to explicitly wake pending tasks before running
-	 * __migrate_task() such that we will not miss enforcing cpus_allowed
-	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
-	 */
-	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
-	local_irq_enable();
-	return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -5094,7 +5090,6 @@ static void migrate_tasks(unsigned int dead_cpu)
 
 	rq->stop = stop;
 }
-
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5273,7 +5268,7 @@ static void register_sched_domain_sysctl(void)
 static void unregister_sched_domain_sysctl(void)
 {
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
 
 static void set_rq_online(struct rq *rq)
 {
@@ -5420,9 +5415,6 @@ static int __init migration_init(void)
 	return 0;
 }
 early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
 
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
@@ -6648,7 +6640,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
-		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sd)
 				return -ENOMEM;

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched: Streamline the task migration locking a little
  2015-06-11 12:46 ` [PATCH 15/18] sched: Streamline the task migration locking a little Peter Zijlstra
@ 2015-06-18 23:03   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:03 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, mingo, peterz, linux-kernel, tglx

Commit-ID:  5e16bbc2fb4053755705da5dd3557bbc0e5ccef6
Gitweb:     http://git.kernel.org/tip/5e16bbc2fb4053755705da5dd3557bbc0e5ccef6
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:51 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:27 +0200

sched: Streamline the task migration locking a little

The whole migrate_task{,s}() locking seems a little shaky; there's a
lot of dropping and re-acquiring happening. Pull the locking up into
the callers as far as possible to streamline the lot.
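
A minimal sketch of the resulting shape in migration_cpu_stop() (as in
the diff below): the caller holds the locks and __migrate_task() simply
returns the (possibly new) runqueue:

	raw_spin_lock(&p->pi_lock);
	raw_spin_lock(&rq->lock);
	if (task_rq(p) == rq && task_on_rq_queued(p))
		rq = __migrate_task(rq, p, arg->dest_cpu);
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock(&p->pi_lock);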

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.755256708@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 76 ++++++++++++++++++++++++-----------------------------
 1 file changed, 34 insertions(+), 42 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26637c9..1ddc129 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1065,10 +1065,8 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
 {
-	struct rq *rq = task_rq(p);
-
 	lockdep_assert_held(&rq->lock);
 
 	dequeue_task(rq, p, 0);
@@ -1100,41 +1098,19 @@ struct migration_arg {
  *
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
  */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
 {
-	struct rq *rq;
-	int ret = 0;
-
 	if (unlikely(!cpu_active(dest_cpu)))
-		return ret;
-
-	rq = cpu_rq(src_cpu);
-
-	raw_spin_lock(&p->pi_lock);
-	raw_spin_lock(&rq->lock);
-	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
-		goto done;
+		return rq;
 
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-		goto fail;
+		return rq;
 
-	/*
-	 * If we're not on a rq, the next wake-up will ensure we're
-	 * placed properly.
-	 */
-	if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
-done:
-	ret = 1;
-fail:
-	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock(&p->pi_lock);
-	return ret;
+	rq = move_queued_task(rq, p, dest_cpu);
+
+	return rq;
 }
 
 /*
@@ -1145,6 +1121,8 @@ fail:
 static int migration_cpu_stop(void *data)
 {
 	struct migration_arg *arg = data;
+	struct task_struct *p = arg->task;
+	struct rq *rq = this_rq();
 
 	/*
 	 * The original target cpu might have gone down and we might
@@ -1157,7 +1135,19 @@ static int migration_cpu_stop(void *data)
 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 	 */
 	sched_ttwu_pending();
-	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+
+	raw_spin_lock(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+	/*
+	 * If task_rq(p) != rq, it cannot be migrated here, because we're
+	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+	 * we're holding p->pi_lock.
+	 */
+	if (task_rq(p) == rq && task_on_rq_queued(p))
+		rq = __migrate_task(rq, p, arg->dest_cpu);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock(&p->pi_lock);
+
 	local_irq_enable();
 	return 0;
 }
@@ -1212,7 +1202,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		tlb_migrate_finish(p->mm);
 		return 0;
 	} else if (task_on_rq_queued(p))
-		rq = move_queued_task(p, dest_cpu);
+		rq = move_queued_task(rq, p, dest_cpu);
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -5043,9 +5033,9 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
+	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	int dest_cpu;
 
@@ -5067,7 +5057,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 	 */
 	update_rq_clock(rq);
 
-	for ( ; ; ) {
+	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
 		 * remaining thread.
@@ -5080,12 +5070,14 @@ static void migrate_tasks(unsigned int dead_cpu)
 		next->sched_class->put_prev_task(rq, next);
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_cpu, next);
-		raw_spin_unlock(&rq->lock);
-
-		__migrate_task(next, dead_cpu, dest_cpu);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-		raw_spin_lock(&rq->lock);
+		rq = __migrate_task(rq, next, dest_cpu);
+		if (rq != dead_rq) {
+			raw_spin_unlock(&rq->lock);
+			rq = dead_rq;
+			raw_spin_lock(&rq->lock);
+		}
 	}
 
 	rq->stop = stop;
@@ -5337,7 +5329,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
-		migrate_tasks(cpu);
+		migrate_tasks(rq);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] lockdep: Simplify lock_release()
  2015-06-11 12:46 ` [PATCH 16/18] lockdep: Simplify lock_release() Peter Zijlstra
@ 2015-06-18 23:03   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:03 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: mingo, peterz, tglx, linux-kernel, hpa

Commit-ID:  e0f56fd7066f35ae3765d080e036fa676a9d4128
Gitweb:     http://git.kernel.org/tip/e0f56fd7066f35ae3765d080e036fa676a9d4128
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:52 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:27 +0200

lockdep: Simplify lock_release()

lock_release() takes this nested argument that's mostly pointless
these days; remove the implementation but leave the argument in place
as a rudiment for now.
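
The nested and non-nested unlock paths collapse into a single
__lock_release(); a minimal sketch of the resulting caller (as in the
diff below):

	if (__lock_release(lock, nested, ip))
		check_chain_key(current);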

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.840411606@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/lockdep.c | 119 +++++++----------------------------------------
 1 file changed, 18 insertions(+), 101 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a0831e1..a266d51 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3260,26 +3260,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	return 0;
 }
 
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
-			unsigned long ip)
-{
-	if (unlikely(!debug_locks))
-		return 0;
-	/*
-	 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
-		return 0;
-
-	if (curr->lockdep_depth <= 0)
-		return print_unlock_imbalance_bug(curr, lock, ip);
-
-	return 1;
-}
-
 static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 {
 	if (hlock->instance == lock)
@@ -3376,31 +3356,35 @@ found_it:
 }
 
 /*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
  */
 static int
-lock_release_non_nested(struct task_struct *curr,
-			struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 {
+	struct task_struct *curr = current;
 	struct held_lock *hlock, *prev_hlock;
 	unsigned int depth;
 	int i;
 
-	/*
-	 * Check whether the lock exists in the current stack
-	 * of held locks:
-	 */
+	if (unlikely(!debug_locks))
+		return 0;
+
 	depth = curr->lockdep_depth;
 	/*
 	 * So we're all set to release this lock.. wait what lock? We don't
 	 * own any locks, you've been drinking again?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth))
-		return 0;
+	if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+		 return print_unlock_imbalance_bug(curr, lock, ip);
 
+	/*
+	 * Check whether the lock exists in the current stack
+	 * of held locks:
+	 */
 	prev_hlock = NULL;
 	for (i = depth-1; i >= 0; i--) {
 		hlock = curr->held_locks + i;
@@ -3456,78 +3440,10 @@ found_it:
 	 */
 	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
 		return 0;
-	return 1;
-}
-
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
-			       struct lockdep_map *lock, unsigned long ip)
-{
-	struct held_lock *hlock;
-	unsigned int depth;
-
-	/*
-	 * Pop off the top of the lock stack:
-	 */
-	depth = curr->lockdep_depth - 1;
-	hlock = curr->held_locks + depth;
-
-	/*
-	 * Is the unlock non-nested:
-	 */
-	if (hlock->instance != lock || hlock->references)
-		return lock_release_non_nested(curr, lock, ip);
-	curr->lockdep_depth--;
-
-	/*
-	 * No more locks, but somehow we've got hash left over, who left it?
-	 */
-	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
-		return 0;
-
-	curr->curr_chain_key = hlock->prev_chain_key;
-
-	lock_release_holdtime(hlock);
 
-#ifdef CONFIG_DEBUG_LOCKDEP
-	hlock->prev_chain_key = 0;
-	hlock->class_idx = 0;
-	hlock->acquire_ip = 0;
-	hlock->irq_context = 0;
-#endif
 	return 1;
 }
 
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
-{
-	struct task_struct *curr = current;
-
-	if (!check_unlock(curr, lock, ip))
-		return;
-
-	if (nested) {
-		if (!lock_release_nested(curr, lock, ip))
-			return;
-	} else {
-		if (!lock_release_non_nested(curr, lock, ip))
-			return;
-	}
-
-	check_chain_key(curr);
-}
-
 static int __lock_is_held(struct lockdep_map *lock)
 {
 	struct task_struct *curr = current;
@@ -3639,7 +3555,8 @@ void lock_release(struct lockdep_map *lock, int nested,
 	check_flags(flags);
 	current->lockdep_recursion = 1;
 	trace_lock_release(lock, ip);
-	__lock_release(lock, nested, ip);
+	if (__lock_release(lock, nested, ip))
+		check_chain_key(current);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] lockdep: Implement lock pinning
  2015-06-11 12:46 ` [PATCH 17/18] lockdep: Implement lock pinning Peter Zijlstra
@ 2015-06-18 23:03   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:03 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, peterz, linux-kernel, mingo, tglx

Commit-ID:  a24fc60d63da2b0b31bf7c876d12a51ed4b778bd
Gitweb:     http://git.kernel.org/tip/a24fc60d63da2b0b31bf7c876d12a51ed4b778bd
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:53 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:27 +0200

lockdep: Implement lock pinning

Add a lockdep annotation that WARNs if you 'accidentally' unlock a
lock.

This is especially helpful for code with callbacks, where the upper
layer assumes a lock remains taken, but a lower layer thinks it may
drop and reacquire the lock.

By unwittingly breaking up the lock, races can be introduced.

Lock pinning is a lockdep annotation that helps with this, when you
lockdep_pin_lock() a held lock, any unlock without a
lockdep_unpin_lock() will produce a WARN. Think of this as a relative
of lockdep_assert_held(), except you don't only assert it is held now,
but ensure it stays held until you release your assertion.
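
A minimal usage sketch:

	raw_spin_lock(&rq->lock);
	lockdep_pin_lock(&rq->lock);

	/* any unlock of rq->lock in here will now WARN */

	lockdep_unpin_lock(&rq->lock);
	raw_spin_unlock(&rq->lock);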

RFC: a possible alternative API would be something like:

  int cookie = lockdep_pin_lock(&foo);
  ...
  lockdep_unpin_lock(&foo, cookie);

Where we pick a random number for the pin_count; this makes it
impossible to sneak a lock break in without also passing the right
cookie along.

I've not done this because it ends up generating code for !LOCKDEP,
esp. if you need to pass the cookie around for some reason.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.906731065@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/lockdep.h  | 10 ++++++
 kernel/locking/lockdep.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 066ba41..c5b6b58 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -255,6 +255,7 @@ struct held_lock {
 	unsigned int check:1;       /* see lock_acquire() comment */
 	unsigned int hardirqs_off:1;
 	unsigned int references:12;					/* 32 bits */
+	unsigned int pin_count;
 };
 
 /*
@@ -354,6 +355,9 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
 extern void lockdep_clear_current_reclaim_state(void);
 extern void lockdep_trace_alloc(gfp_t mask);
 
+extern void lock_pin_lock(struct lockdep_map *lock);
+extern void lock_unpin_lock(struct lockdep_map *lock);
+
 # define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
 
 #define lockdep_depth(tsk)	(debug_locks ? (tsk)->lockdep_depth : 0)
@@ -368,6 +372,9 @@ extern void lockdep_trace_alloc(gfp_t mask);
 
 #define lockdep_recursing(tsk)	((tsk)->lockdep_recursion)
 
+#define lockdep_pin_lock(l)		lock_pin_lock(&(l)->dep_map)
+#define lockdep_unpin_lock(l)	lock_unpin_lock(&(l)->dep_map)
+
 #else /* !CONFIG_LOCKDEP */
 
 static inline void lockdep_off(void)
@@ -420,6 +427,9 @@ struct lock_class_key { };
 
 #define lockdep_recursing(tsk)			(0)
 
+#define lockdep_pin_lock(l)				do { (void)(l); } while (0)
+#define lockdep_unpin_lock(l)			do { (void)(l); } while (0)
+
 #endif /* !LOCKDEP */
 
 #ifdef CONFIG_LOCK_STAT
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a266d51..18f9f43 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
+	hlock->pin_count = 0;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3403,6 +3404,8 @@ found_it:
 	if (hlock->instance == lock)
 		lock_release_holdtime(hlock);
 
+	WARN(hlock->pin_count, "releasing a pinned lock\n");
+
 	if (hlock->references) {
 		hlock->references--;
 		if (hlock->references) {
@@ -3459,6 +3462,49 @@ static int __lock_is_held(struct lockdep_map *lock)
 	return 0;
 }
 
+static void __lock_pin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			hlock->pin_count++;
+			return;
+		}
+	}
+
+	WARN(1, "pinning an unheld lock\n");
+}
+
+static void __lock_unpin_lock(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+				return;
+
+			hlock->pin_count--;
+			return;
+		}
+	}
+
+	WARN(1, "unpinning an unheld lock\n");
+}
+
 /*
  * Check whether we follow the irq-flags state precisely:
  */
@@ -3582,6 +3628,40 @@ int lock_is_held(struct lockdep_map *lock)
 }
 EXPORT_SYMBOL_GPL(lock_is_held);
 
+void lock_pin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_pin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_unpin_lock(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
 	current->lockdep_reclaim_gfp = gfp_mask;

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* [tip:sched/hrtimers] sched,lockdep: Employ lock pinning
  2015-06-11 12:46 ` [PATCH 18/18] sched,lockdep: Employ " Peter Zijlstra
@ 2015-06-18 23:04   ` tip-bot for Peter Zijlstra
  0 siblings, 0 replies; 58+ messages in thread
From: tip-bot for Peter Zijlstra @ 2015-06-18 23:04 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: hpa, mingo, peterz, linux-kernel, tglx

Commit-ID:  cbce1a686700595de65ee363b9b3283ae85d8fc5
Gitweb:     http://git.kernel.org/tip/cbce1a686700595de65ee363b9b3283ae85d8fc5
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Thu, 11 Jun 2015 14:46:54 +0200
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 19 Jun 2015 00:25:27 +0200

sched,lockdep: Employ lock pinning

Employ the new lockdep lock pinning annotation to ensure no
'accidental' lock-breaks happen with rq->lock.
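
Where a callback is still allowed to drop rq->lock (the pull paths and
idle_balance()), the pin is released around the call; a minimal sketch
(as in the diff below):

	lockdep_unpin_lock(&rq->lock);
	pull_dl_task(rq);	/* may drop and re-acquire rq->lock */
	lockdep_pin_lock(&rq->lock);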

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: oleg@redhat.com
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124744.003233193@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c     | 42 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/deadline.c |  8 ++++++++
 kernel/sched/fair.c     | 11 ++++++++---
 kernel/sched/rt.c       |  8 ++++++++
 kernel/sched/sched.h    | 10 ++++++++--
 5 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1ddc129..c74191a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1201,8 +1201,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
-	} else if (task_on_rq_queued(p))
+	} else if (task_on_rq_queued(p)) {
+		/*
+		 * OK, since we're going to drop the lock immediately
+		 * afterwards anyway.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		rq = move_queued_task(rq, p, dest_cpu);
+		lockdep_pin_lock(&rq->lock);
+	}
 out:
 	task_rq_unlock(rq, p, &flags);
 
@@ -1562,6 +1569,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	lockdep_assert_held(&p->pi_lock);
+
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
@@ -1652,9 +1661,12 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
-		 * XXX can drop rq->lock; most likely ok.
+		 * Our task @p is fully woken up and running; so it's safe to
+		 * drop the rq->lock, hereafter rq is only used for statistics.
 		 */
+		lockdep_unpin_lock(&rq->lock);
 		p->sched_class->task_woken(rq, p);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (rq->idle_stamp) {
@@ -1674,6 +1686,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
+	lockdep_assert_held(&rq->lock);
+
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
@@ -1718,6 +1732,7 @@ void sched_ttwu_pending(void)
 		return;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
+	lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1725,6 +1740,7 @@ void sched_ttwu_pending(void)
 		ttwu_do_activate(rq, p, 0);
 	}
 
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1821,7 +1837,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 #endif
 
 	raw_spin_lock(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 	ttwu_do_activate(rq, p, 0);
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1916,9 +1934,17 @@ static void try_to_wake_up_local(struct task_struct *p)
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet picked a replacement task.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
+		lockdep_pin_lock(&rq->lock);
 	}
 
 	if (!(p->state & TASK_NORMAL))
@@ -2530,6 +2556,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
+	lockdep_unpin_lock(&rq->lock);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
 	context_tracking_task_switch(prev, next);
@@ -2953,6 +2980,7 @@ static void __sched __schedule(void)
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
+	lockdep_pin_lock(&rq->lock);
 
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
@@ -2995,8 +3023,10 @@ static void __sched __schedule(void)
 
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
 		cpu = cpu_of(rq);
-	} else
+	} else {
+		lockdep_unpin_lock(&rq->lock);
 		raw_spin_unlock_irq(&rq->lock);
+	}
 
 	balance_callback(rq);
 }
@@ -5065,6 +5095,11 @@ static void migrate_tasks(struct rq *dead_rq)
 		if (rq->nr_running == 1)
 			break;
 
+		/*
+		 * Ensure rq->lock covers the entire task selection
+		 * until the migration.
+		 */
+		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
@@ -5072,6 +5107,7 @@ static void migrate_tasks(struct rq *dead_rq)
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
+		lockdep_unpin_lock(&rq->lock);
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
 			raw_spin_unlock(&rq->lock);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6318f43..e814641 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1151,7 +1151,15 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 	dl_rq = &rq->dl;
 
 	if (need_pull_dl_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_dl_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a stop task can slip in, in which case we need to
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7210ae8..509ef63 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5392,7 +5392,15 @@ simple:
 	return p;
 
 idle:
+	/*
+	 * This is OK, because current is on_cpu, which avoids it being picked
+	 * for load-balance and preemption/IRQs are still disabled avoiding
+	 * further scheduler activity on it and we're being very careful to
+	 * re-start the picking loop.
+	 */
+	lockdep_unpin_lock(&rq->lock);
 	new_tasks = idle_balance(rq);
+	lockdep_pin_lock(&rq->lock);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
@@ -7426,9 +7434,6 @@ static int idle_balance(struct rq *this_rq)
 		goto out;
 	}
 
-	/*
-	 * Drop the rq->lock, but keep IRQ/preempt disabled.
-	 */
 	raw_spin_unlock(&this_rq->lock);
 
 	update_blocked_averages(this_cpu);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 460f858..0d193a24 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1478,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 	struct rt_rq *rt_rq = &rq->rt;
 
 	if (need_pull_rt_task(rq, prev)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we're
+		 * being very careful to re-start the picking loop.
+		 */
+		lockdep_unpin_lock(&rq->lock);
 		pull_rt_task(rq);
+		lockdep_pin_lock(&rq->lock);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a dl or stop task can slip in, in which case we need
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62949ab..ef02d11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1439,8 +1439,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 
 		while (unlikely(task_on_rq_migrating(p)))
@@ -1477,8 +1479,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
 		 * If we observe the new cpu in task_rq_lock, the acquire will
 		 * pair with the WMB to ensure we must then also see migrating.
 		 */
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
 			return rq;
+		}
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 
@@ -1490,6 +1494,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
 static inline void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1498,6 +1503,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
+	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }

^ permalink raw reply related	[flat|nested] 58+ messages in thread

* Re: [PATCH 00/18] sched: balance callbacks v4
  2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
                   ` (17 preceding siblings ...)
  2015-06-11 12:46 ` [PATCH 18/18] sched,lockdep: Employ " Peter Zijlstra
@ 2015-12-29  5:41 ` Byungchul Park
  18 siblings, 0 replies; 58+ messages in thread
From: Byungchul Park @ 2015-12-29  5:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, tglx, juri.lelli,
	pang.xunlei, oleg, wanpeng.li, linux-kernel

On Thu, Jun 11, 2015 at 02:46:36PM +0200, Peter Zijlstra wrote:
> Mike stumbled over a cute bug where the RT/DL balancing ops caused a bug.

Hello.

We also hit this bug in our embedded product, which is based on a stable tree.
Eventually we found that this patch set deals with exactly that problem. I am
curious whether you have any plans to merge it into the stable trees, to make
them more stable.

Hm?

> 
> The exact scenario is __sched_setscheduler() changing a (runnable) task from
> FIFO to OTHER. In switched_from_rt(), where we do pull_rt_task(), we temporarily
> drop rq->lock. This gap allows regular cfs load-balancing to step in and
> migrate our task.
> 
> However, check_class_changed() will happily continue with switched_to_fair()
> which assumes our task is still on the old rq and makes the kernel go boom.
> 
> Instead of trying to patch this up and make things complicated; simply disallow
> these methods to drop rq->lock and extend the current post_schedule stuff into
> a balancing callback list, and use that.
> 
> This survives Mike's testcase.
> 
> Changes since -v3:
>  - reworked the hrtimer stuff, again. -- Kirill, Oleg
>  - small changes to the new lockdep stuff
> 
> Changes since -v2:
>  - reworked the hrtimer patch. -- Kirill, tglx
>  - added lock pinning
> 
> Changes since -v1:
>  - make SMP=n build,
>  - cured switched_from_dl()'s cancel_dl_timer().
> 
> no real tests on the new parts other than booting / building kernels.
> 

^ permalink raw reply	[flat|nested] 58+ messages in thread

end of thread, other threads:[~2015-12-29  5:41 UTC | newest]

Thread overview: 58+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-11 12:46 [PATCH 00/18] sched: balance callbacks v4 Peter Zijlstra
2015-06-11 12:46 ` [PATCH 01/18] sched: Replace post_schedule with a balance callback list Peter Zijlstra
2015-06-11 15:32   ` Kirill Tkhai
2015-06-18 23:00   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 02/18] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
2015-06-18 23:00   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 03/18] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
2015-06-18 23:01   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 04/18] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
2015-06-18 23:01   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 05/18] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
2015-06-18 23:01   ` [tip:sched/hrtimers] sched, rt: Convert switched_{from, to}_rt() " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 06/18] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 07/18] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] sched, dl: Convert switched_{from, to}_dl() " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 08/18] hrtimer: Remove HRTIMER_STATE_MIGRATE Peter Zijlstra
2015-06-18 22:18   ` [tip:timers/core] " tip-bot for Oleg Nesterov
2015-06-11 12:46 ` [PATCH 09/18] hrtimer: Fix hrtimer_is_queued() hole Peter Zijlstra
2015-06-18 22:18   ` [tip:timers/core] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 10/18] seqcount: Rename write_seqcount_barrier() Peter Zijlstra
2015-06-18 22:19   ` [tip:timers/core] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 11/18] seqcount: Introduce raw_write_seqcount_barrier() Peter Zijlstra
2015-06-11 15:33   ` Paul E. McKenney
2015-06-11 21:45     ` Paul E. McKenney
2015-06-12  7:08       ` Peter Zijlstra
2015-06-12 18:59       ` Oleg Nesterov
2015-06-17 12:29       ` Peter Zijlstra
2015-06-17 14:57         ` Paul E. McKenney
2015-06-17 15:11           ` Peter Zijlstra
2015-06-17 15:42             ` Paul E. McKenney
2015-06-17 16:58               ` Peter Zijlstra
2015-06-17 15:49           ` Peter Zijlstra
2015-06-17 16:37             ` Paul E. McKenney
2015-06-17 17:11               ` Peter Zijlstra
2015-06-17 18:02                 ` Paul E. McKenney
2015-06-18  9:15                   ` Peter Zijlstra
2015-06-18  9:40                     ` Ingo Molnar
2015-06-18 10:40                       ` Peter Zijlstra
2015-06-18 16:54                         ` Paul E. McKenney
2015-06-18 17:10                           ` Steven Rostedt
2015-06-18 17:51                             ` Paul E. McKenney
2015-06-18 22:19         ` [tip:timers/core] seqcount: Introduce raw_write_seqcount_barrier() tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 12/18] hrtimer: Allow hrtimer::function() to free the timer Peter Zijlstra
2015-06-18 22:19   ` [tip:timers/core] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 13/18] sched,dl: Fix sched class hopping CBS hole Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 14/18] sched: Move code around Peter Zijlstra
2015-06-18 23:02   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 15/18] sched: Streamline the task migration locking a little Peter Zijlstra
2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 16/18] lockdep: Simplify lock_release() Peter Zijlstra
2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 17/18] lockdep: Implement lock pinning Peter Zijlstra
2015-06-18 23:03   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-06-11 12:46 ` [PATCH 18/18] sched,lockdep: Employ " Peter Zijlstra
2015-06-18 23:04   ` [tip:sched/hrtimers] " tip-bot for Peter Zijlstra
2015-12-29  5:41 ` [PATCH 00/18] sched: balance callbacks v4 Byungchul Park
