[rt] Update to 4.14-rt1 and reenable (Closes: #882192)

Ben Hutchings 2017-11-20 14:16:49 +00:00
parent 94964d2c86
commit 1dffc3c5d6
477 changed files with 16457 additions and 19018 deletions

debian/changelog

@@ -1,3 +1,9 @@
+linux (4.14-1~exp2) UNRELEASED; urgency=medium
+  * [rt] Update to 4.14-rt1 and reenable (Closes: #882192)
+ -- Ben Hutchings <ben@decadent.org.uk>  Mon, 20 Nov 2017 14:16:28 +0000
linux (4.14-1~exp1) experimental; urgency=medium
  * New upstream release: https://kernelnewbies.org/Linux_4.14

@@ -110,7 +110,7 @@ debug-info: true
signed-modules: false
[featureset-rt_base]
-enabled: false
+enabled: true
[description]
part-long-up: This kernel is not suitable for SMP (multi-processor,

@@ -1,51 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 10 Apr 2017 18:03:36 +0200
Subject: [PATCH 1/4] futex: Avoid freeing an active timer
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Alexander reported a hrtimer debug_object splat:
ODEBUG: free active (active state 0) object type: hrtimer hint: hrtimer_wakeup (kernel/time/hrtimer.c:1423)
debug_object_free (lib/debugobjects.c:603)
destroy_hrtimer_on_stack (kernel/time/hrtimer.c:427)
futex_lock_pi (kernel/futex.c:2740)
do_futex (kernel/futex.c:3399)
SyS_futex (kernel/futex.c:3447 kernel/futex.c:3415)
do_syscall_64 (arch/x86/entry/common.c:284)
entry_SYSCALL64_slow_path (arch/x86/entry/entry_64.S:249)
Which was caused by commit:
cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
... losing the hrtimer_cancel() in the shuffle. Where previously the
hrtimer_cancel() was done by rt_mutex_slowlock() we now need to do it
manually.
Reported-by: Alexander Levin <alexander.levin@verizon.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704101802370.2906@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/futex.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2736,8 +2736,10 @@ static int futex_lock_pi(u32 __user *uad
out_put_key:
put_futex_key(&q.key);
out:
- if (to)
+ if (to) {
+ hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
+ }
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
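
For context, a minimal sketch of the on-stack hrtimer pattern this fix
restores (illustrative only; the helper name, 'expires' and the blocking
step are placeholders, not the actual futex code):

    static void wait_with_timeout(ktime_t expires)
    {
            struct hrtimer_sleeper to;

            hrtimer_init_on_stack(&to.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
            hrtimer_init_sleeper(&to, current);
            hrtimer_set_expires(&to.timer, expires);
            hrtimer_start_expires(&to.timer, HRTIMER_MODE_ABS);

            /* ... sleep until woken or until the timer fires ... */

            /*
             * The timer may still be queued or running its callback here.
             * It must be cancelled before destroy_hrtimer_on_stack()
             * releases the debug object, otherwise ODEBUG reports a free
             * of an active object -- exactly the splat fixed above.
             */
            hrtimer_cancel(&to.timer);
            destroy_hrtimer_on_stack(&to.timer);
    }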

@@ -1,118 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:48 +0100
Subject: [PATCH] futex: Cleanup variable names for futex_top_waiter()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 499f5aca2cdd5e958b27e2655e7e7f82524f46b1
futex_top_waiter() returns the top-waiter on the pi_mutex. Assigning
this to a variable 'match' totally obscures the code.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.554710645@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 30 +++++++++++++++---------------
1 file changed, 15 insertions(+), 15 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1122,14 +1122,14 @@ static int attach_to_pi_owner(u32 uval,
static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
- struct futex_q *match = futex_top_waiter(hb, key);
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
/*
* If there is a waiter on that futex, validate it and
* attach to the pi_state when the validation succeeds.
*/
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ if (top_waiter)
+ return attach_to_pi_state(uval, top_waiter->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1176,7 +1176,7 @@ static int futex_lock_pi_atomic(u32 __us
struct task_struct *task, int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
/*
@@ -1202,9 +1202,9 @@ static int futex_lock_pi_atomic(u32 __us
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
*/
- match = futex_top_waiter(hb, key);
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uval, top_waiter->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1294,11 +1294,11 @@ static void mark_wake_futex(struct wake_
q->lock_ptr = NULL;
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
struct futex_hash_bucket *hb)
{
struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
u32 uninitialized_var(curval), newval;
DEFINE_WAKE_Q(wake_q);
bool deboost;
@@ -1319,11 +1319,11 @@ static int wake_futex_pi(u32 __user *uad
/*
* It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
+ * top_waiter owner to the kernel) timed out and is no longer
* waiting on the lock.
*/
if (!new_owner)
- new_owner = this->task;
+ new_owner = top_waiter->task;
/*
* We pass it to the next owner. The WAITERS bit is always
@@ -2633,7 +2633,7 @@ static int futex_unlock_pi(u32 __user *u
u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
retry:
@@ -2657,9 +2657,9 @@ static int futex_unlock_pi(u32 __user *u
* all and we at least want to know if user space fiddled
* with the futex value instead of blindly unlocking.
*/
- match = futex_top_waiter(hb, &key);
- if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ ret = wake_futex_pi(uaddr, uval, top_waiter, hb);
/*
* In case of success wake_futex_pi dropped the hash
* bucket lock.

@@ -1,53 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:27 +0200
Subject: [PATCH 01/13] ia64/topology: Remove cpus_allowed manipulation
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The CPU hotplug callback fiddles with the cpus_allowed pointer to pin the
calling thread on the plugged CPU. That's already guaranteed by the hotplug
core code.
Remove it.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-ia64@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.174518069@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/ia64/kernel/topology.c | 6 ------
1 file changed, 6 deletions(-)
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cp
unsigned long i, j;
struct cache_info *this_object;
int retval = 0;
- cpumask_t oldmask;
if (all_cpu_cache_info[cpu].kobj.parent)
return 0;
- oldmask = current->cpus_allowed;
- retval = set_cpus_allowed_ptr(current, cpumask_of(cpu));
- if (unlikely(retval))
- return retval;
retval = cpu_cache_sysfs_init(cpu);
- set_cpus_allowed_ptr(current, &oldmask);
if (unlikely(retval < 0))
return retval;

@@ -1,74 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:32 +0200
Subject: [PATCH 01/17] init: Pin init task to the boot CPU, initially
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Some of the boot code in kernel_init_freeable() which runs before SMP
bringup assumes (rightfully) that it runs on the boot CPU and therefore can
use smp_processor_id() in preemptible context.
That works so far because the smp_processor_id() check starts to be
effective after smp bringup. That's just wrong. Starting with SMP bringup
and the ability to move threads around, smp_processor_id() in preemptible
context is broken.
Aside from that, it does not make sense to allow init to run on all CPUs
before sched_init_smp() has been run.
Pin init to the boot CPU so the existing code can continue to use
smp_processor_id() without triggering the checks once the enabling of
those checks is moved earlier in boot.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184734.943149935@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
init/main.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
--- a/init/main.c
+++ b/init/main.c
@@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kth
static noinline void __ref rest_init(void)
{
+ struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
@@ -397,7 +398,17 @@ static noinline void __ref rest_init(voi
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
- kernel_thread(kernel_init, NULL, CLONE_FS);
+ pid = kernel_thread(kernel_init, NULL, CLONE_FS);
+ /*
+ * Pin init on the boot CPU. Task migration is not properly working
+ * until sched_init_smp() has been run. It will set the allowed
+ * CPUs for init to the non isolated CPUs.
+ */
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
+ rcu_read_unlock();
+
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
@@ -1011,10 +1022,6 @@ static noinline void __init kernel_init_
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
- /*
- * init can run on any cpu.
- */
- set_cpus_allowed_ptr(current, cpu_all_mask);
cad_pid = task_pid(current);
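
A sketch of the invariant the earlier-enabled checks enforce (generic
pattern, not from this patch): once tasks can migrate, smp_processor_id()
is only stable while preemption is disabled.

    static int current_cpu_safely(void)
    {
            int cpu;

            cpu = get_cpu();        /* disables preemption */
            /* ... 'cpu' is stable here; the task cannot migrate ... */
            put_cpu();              /* re-enables preemption */
            return cpu;
    }

A bare smp_processor_id() in preemptible context may be stale by the time
it is used, which is what the debug check reports.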

@@ -1,178 +0,0 @@
From: Xunlei Pang <xlpang@redhat.com>
Date: Thu, 23 Mar 2017 15:56:07 +0100
Subject: [PATCH 1/9] rtmutex: Deboost before waking up the top waiter
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
We should deboost before waking the high-priority task, such that we
don't run two tasks with the same "state" (priority, deadline,
sched_class, etc).
In order to make sure the boosting task doesn't start running between
unlock and deboost (due to 'spurious' wakeup), we move the deboost
under the wait_lock; that way it is serialized against the wait loop in
__rt_mutex_slowlock().
Doing the deboost early can however lead to priority inversion if
current gets preempted after the deboost but before waking our
high-prio task, hence we disable preemption before the deboost and
enable it again after the wakeup is done.
This gets us the right semantic order; most importantly, however,
this change ensures pointer stability for the next patch, where we
have rt_mutex_setprio() cache a pointer to the top-most waiter task.
If we, as before this change, do the wakeup first and then deboost,
this pointer might point into thin air.
[peterz: Changelog + patch munging]
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.110065320@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/futex.c | 5 ---
kernel/locking/rtmutex.c | 59 +++++++++++++++++++++-------------------
kernel/locking/rtmutex_common.h | 2 -
3 files changed, 34 insertions(+), 32 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1460,10 +1460,7 @@ static int wake_futex_pi(u32 __user *uad
out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- if (deboost) {
- wake_up_q(&wake_q);
- rt_mutex_adjust_prio(current);
- }
+ rt_mutex_postunlock(&wake_q, deboost);
return ret;
}
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -373,24 +373,6 @@ static void __rt_mutex_adjust_prio(struc
}
/*
- * Adjust task priority (undo boosting). Called from the exit path of
- * rt_mutex_slowunlock() and rt_mutex_slowlock().
- *
- * (Note: We do this outside of the protection of lock->wait_lock to
- * allow the lock to be taken while or before we readjust the priority
- * of task. We do not use the spin_xx_mutex() variants here as we are
- * outside of the debug path.)
- */
-void rt_mutex_adjust_prio(struct task_struct *task)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- __rt_mutex_adjust_prio(task);
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-}
-
-/*
* Deadlock detection is conditional:
*
* If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
@@ -1051,6 +1033,7 @@ static void mark_wakeup_next_waiter(stru
* lock->wait_lock.
*/
rt_mutex_dequeue_pi(current, waiter);
+ __rt_mutex_adjust_prio(current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1393,6 +1376,16 @@ static bool __sched rt_mutex_slowunlock(
*/
mark_wakeup_next_waiter(wake_q, lock);
+ /*
+ * We should deboost before waking the top waiter task such that
+ * we don't run two tasks with the 'same' priority. This however
+ * can lead to prio-inversion if we would get preempted after
+ * the deboost but before waking our high-prio task, hence the
+ * preempt_disable before unlock. Pairs with preempt_enable() in
+ * rt_mutex_postunlock();
+ */
+ preempt_disable();
+
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* check PI boosting */
@@ -1442,6 +1435,18 @@ rt_mutex_fasttrylock(struct rt_mutex *lo
return slowfn(lock);
}
+/*
+ * Undo pi boosting (if necessary) and wake top waiter.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost)
+{
+ wake_up_q(wake_q);
+
+ /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
+ if (deboost)
+ preempt_enable();
+}
+
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
@@ -1455,11 +1460,7 @@ rt_mutex_fastunlock(struct rt_mutex *loc
deboost = slowfn(lock, &wake_q);
- wake_up_q(&wake_q);
-
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
+ rt_mutex_postunlock(&wake_q, deboost);
}
/**
@@ -1572,6 +1573,13 @@ bool __sched __rt_mutex_futex_unlock(str
}
mark_wakeup_next_waiter(wake_q, lock);
+ /*
+ * We've already deboosted, retain preempt_disabled when dropping
+ * the wait_lock to avoid inversion until the wakeup. Matched
+ * by rt_mutex_postunlock();
+ */
+ preempt_disable();
+
return true; /* deboost and wakeups */
}
@@ -1584,10 +1592,7 @@ void __sched rt_mutex_futex_unlock(struc
deboost = __rt_mutex_futex_unlock(lock, &wake_q);
raw_spin_unlock_irq(&lock->wait_lock);
- if (deboost) {
- wake_up_q(&wake_q);
- rt_mutex_adjust_prio(current);
- }
+ rt_mutex_postunlock(&wake_q, deboost);
}
/**
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -122,7 +122,7 @@ extern void rt_mutex_futex_unlock(struct
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh);
-extern void rt_mutex_adjust_prio(struct task_struct *task);
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
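
The resulting unlock ordering, as a simplified sketch (not the verbatim
rtmutex code; mark_wakeup_next_waiter() also deboosts current after this
series):

    static void slowunlock_sketch(struct rt_mutex *lock)
    {
            DEFINE_WAKE_Q(wake_q);
            unsigned long flags;

            raw_spin_lock_irqsave(&lock->wait_lock, flags);
            mark_wakeup_next_waiter(&wake_q, lock); /* dequeue + deboost */
            preempt_disable();      /* cover the window up to the wakeup */
            raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

            wake_up_q(&wake_q);     /* wake the top waiter */
            preempt_enable();       /* first point we may be preempted */
    }

Deboosting under wait_lock serializes against the wait loop, and the
preempt_disable()/preempt_enable() pair closes the preemption window
between deboost and wakeup.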

@@ -1,62 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 May 2017 08:52:02 +0200
Subject: [PATCH] sched/clock: Fix early boot preempt assumption in
__set_sched_clock_stable()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The more strict early boot preemption warnings found that
__set_sched_clock_stable() was incorrectly assuming we'd still be
running on a single CPU:
BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
caller is debug_smp_processor_id+0x1c/0x1e
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc2-00108-g1c3c5ea #1
Call Trace:
dump_stack+0x110/0x192
check_preemption_disabled+0x10c/0x128
? set_debug_rodata+0x25/0x25
debug_smp_processor_id+0x1c/0x1e
sched_clock_init_late+0x27/0x87
[...]
Fix it by disabling IRQs.
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: lkp@01.org
Cc: tipbuild@zytor.com
Link: http://lkml.kernel.org/r/20170524065202.v25vyu7pvba5mhpd@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/sched/clock.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -126,12 +126,19 @@ int sched_clock_stable(void)
static void __set_sched_clock_stable(void)
{
- struct sched_clock_data *scd = this_scd();
+ struct sched_clock_data *scd;
/*
+ * Since we're still unstable and the tick is already running, we have
+ * to disable IRQs in order to get a consistent scd->tick* reading.
+ */
+ local_irq_disable();
+ scd = this_scd();
+ /*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+ local_irq_enable();
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
scd->tick_gtod, __gtod_offset,
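
The underlying pattern, sketched generically: per-CPU fields updated by
the tick must be read with IRQs off, or the tick can fire between the
reads and tear the snapshot.

    static void snapshot_sketch(void)
    {
            struct sched_clock_data *scd;

            local_irq_disable();
            scd = this_scd();       /* per-CPU data of the current CPU */
            /* ... read scd->tick_raw and scd->tick_gtod as one pair ... */
            local_irq_enable();
    }

Disabling IRQs also keeps the task on one CPU, so the preemptible
smp_processor_id() warning disappears as a side effect.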

@@ -0,0 +1,308 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 13 Nov 2017 20:23:44 +0100
Subject: [PATCH 01/36] timers: Use static keys for migrate_enable and
nohz_active
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The migration_enabled and nohz_active fields will later be moved into a
bitfield. In a bitfield, a change to one bit causes a read-modify-write
operation, and without holding a lock a concurrent change on a second
CPU could cause the loss of an update.
To avoid that, change both fields to static branches.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 4 -
kernel/time/hrtimer.c | 17 ++------
kernel/time/tick-internal.h | 21 +++++++---
kernel/time/tick-sched.c | 2
kernel/time/timer.c | 91 ++++++++++++++++++++++----------------------
5 files changed, 69 insertions(+), 66 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -153,8 +153,6 @@ enum hrtimer_base_type {
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
- * @migration_enabled: The migration of hrtimers to other cpus is enabled
- * @nohz_active: The nohz functionality is enabled
* @expires_next: absolute time of the next event which was scheduled
* via clock_set_next_event()
* @next_timer: Pointer to the first expiring timer
@@ -178,8 +176,6 @@ struct hrtimer_cpu_base {
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
- bool migration_enabled;
- bool nohz_active;
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int in_hrtirq : 1,
hres_active : 1,
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -178,23 +178,16 @@ hrtimer_check_target(struct hrtimer *tim
#endif
}
-#ifdef CONFIG_NO_HZ_COMMON
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
-{
- if (pinned || !base->migration_enabled)
- return base;
- return &per_cpu(hrtimer_bases, get_nohz_timer_target());
-}
-#else
static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_unlikely(&timers_migration_enabled) && !pinned)
+ return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+#endif
return base;
}
-#endif
/*
* We switch the timer base to a power-optimized selected CPU target,
@@ -971,7 +964,7 @@ void hrtimer_start_range_ns(struct hrtim
* Kick to reschedule the next tick to handle the new timer
* on dynticks target.
*/
- if (new_base->cpu_base->nohz_active)
+ if (is_timers_nohz_active())
wake_up_nohz_cpu(new_base->cpu_base->cpu);
} else {
hrtimer_reprogram(timer, new_base);
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -150,14 +150,25 @@ static inline void tick_nohz_init(void)
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
+extern void timers_update_nohz(void);
+extern struct static_key_false timers_nohz_active;
+
+static inline bool is_timers_nohz_active(void)
+{
+ return static_branch_unlikely(&timers_nohz_active);
+}
+
+#ifdef CONFIG_SMP
+extern struct static_key_false timers_migration_enabled;
+#endif
#else
+static inline void timers_update_nohz(void) { }
#define tick_nohz_active (0)
-#endif
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void timers_update_migration(bool update_nohz);
-#else
-static inline void timers_update_migration(bool update_nohz) { }
+static inline bool is_timers_nohz_active(void)
+{
+ return false;
+}
#endif
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1103,7 +1103,7 @@ static inline void tick_nohz_activate(st
ts->nohz_mode = mode;
/* One update is enough */
if (!test_and_set_bit(0, &tick_nohz_active))
- timers_update_migration(true);
+ timers_update_nohz();
}
/**
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -200,8 +200,6 @@ struct timer_base {
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
- bool migration_enabled;
- bool nohz_active;
bool is_idle;
bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
@@ -210,45 +208,59 @@ struct timer_base {
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
+
+DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
+static DEFINE_MUTEX(timer_keys_mutex);
+
+static void timer_update_keys(struct work_struct *work);
+static DECLARE_WORK(timer_update_work, timer_update_keys);
+
+#ifdef CONFIG_SMP
unsigned int sysctl_timer_migration = 1;
-void timers_update_migration(bool update_nohz)
+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
+
+static void timers_update_migration(void)
{
bool on = sysctl_timer_migration && tick_nohz_active;
- unsigned int cpu;
- /* Avoid the loop, if nothing to update */
- if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
- return;
+ if (on)
+ static_branch_enable(&timers_migration_enabled);
+ else
+ static_branch_disable(&timers_migration_enabled);
+}
+#else
+static inline void timers_update_migration(void) { }
+#endif /* !CONFIG_SMP */
- for_each_possible_cpu(cpu) {
- per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
- per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
- per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
- if (!update_nohz)
- continue;
- per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
- per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
- per_cpu(hrtimer_bases.nohz_active, cpu) = true;
- }
+static void timer_update_keys(struct work_struct *work)
+{
+ mutex_lock(&timer_keys_mutex);
+ timers_update_migration();
+ static_branch_enable(&timers_nohz_active);
+ mutex_unlock(&timer_keys_mutex);
+}
+
+void timers_update_nohz(void)
+{
+ schedule_work(&timer_update_work);
}
int timer_migration_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- static DEFINE_MUTEX(mutex);
int ret;
- mutex_lock(&mutex);
+ mutex_lock(&timer_keys_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
- timers_update_migration(false);
- mutex_unlock(&mutex);
+ timers_update_migration();
+ mutex_unlock(&timer_keys_mutex);
return ret;
}
-#endif
+#endif /* NO_HZ_COMMON */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
bool force_up)
@@ -534,7 +546,7 @@ static void
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!is_timers_nohz_active())
return;
/*
@@ -817,7 +829,7 @@ static inline struct timer_base *get_tim
* If the timer is deferrable and nohz is active then we need to use
* the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+ if (is_timers_nohz_active() &&
(tflags & TIMER_DEFERRABLE))
base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
return base;
@@ -831,7 +843,7 @@ static inline struct timer_base *get_tim
* If the timer is deferrable and nohz is active then we need to use
* the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
+ if (is_timers_nohz_active() &&
(tflags & TIMER_DEFERRABLE))
base = this_cpu_ptr(&timer_bases[BASE_DEF]);
return base;
@@ -842,21 +854,20 @@ static inline struct timer_base *get_tim
return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
}
-#ifdef CONFIG_NO_HZ_COMMON
static inline struct timer_base *
get_target_base(struct timer_base *base, unsigned tflags)
{
-#ifdef CONFIG_SMP
- if ((tflags & TIMER_PINNED) || !base->migration_enabled)
- return get_timer_this_cpu_base(tflags);
- return get_timer_cpu_base(tflags, get_nohz_timer_target());
-#else
- return get_timer_this_cpu_base(tflags);
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+ if (static_branch_unlikely(&timers_migration_enabled) &&
+ !(tflags & TIMER_PINNED))
+ return get_timer_cpu_base(tflags, get_nohz_timer_target());
#endif
+ return get_timer_this_cpu_base(tflags);
}
static inline void forward_timer_base(struct timer_base *base)
{
+#ifdef CONFIG_NO_HZ_COMMON
unsigned long jnow;
/*
@@ -880,16 +891,8 @@ static inline void forward_timer_base(st
base->clk = jnow;
else
base->clk = base->next_expiry;
-}
-#else
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
- return get_timer_this_cpu_base(tflags);
-}
-
-static inline void forward_timer_base(struct timer_base *base) { }
#endif
+}
/*
@@ -1644,7 +1647,7 @@ static __latent_entropy void run_timer_s
base->must_forward_clk = false;
__run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ if (is_timers_nohz_active())
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
@@ -1658,7 +1661,7 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
if (time_before(jiffies, base->clk)) {
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!is_timers_nohz_active())
return;
/* CPU is awake, so check the deferrable base. */
base++;
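
The static-key API the patch switches to, as a minimal sketch (the names
here are illustrative, not the patched ones):

    #include <linux/jump_label.h>

    DEFINE_STATIC_KEY_FALSE(example_active);

    static inline bool example_is_active(void)
    {
            /* compiles to a patched NOP/jump; no data load, no RMW */
            return static_branch_unlikely(&example_active);
    }

    static void example_enable(void)
    {
            /*
             * Rewrites every branch site and may sleep, which is why the
             * patch defers enabling to a workqueue via schedule_work()
             * rather than flipping the key directly from atomic context.
             */
            static_branch_enable(&example_active);
    }

Unlike the per-CPU bool fields being removed, a static key needs no
locking against concurrent read-modify-write of neighbouring bits.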

@@ -0,0 +1,128 @@
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 22 Sep 2017 14:58:15 -0500
Subject: [PATCH 01/42] tracing: Steve's unofficial trace_recursive_lock()
patch
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
On Tue, 5 Sep 2017 16:57:52 -0500
Tom Zanussi <tom.zanussi@linux.intel.com> wrote:
> Synthetic event generation requires the reservation of a second event
> while the reservation of a previous event is still in progress. The
> trace_recursive_lock() check in ring_buffer_lock_reserve() prevents
> this however.
>
> This sets up a special reserve pathway for this particular case,
> leaving existing pathways untouched, other than an additional check in
> ring_buffer_lock_reserve() and trace_event_buffer_reserve(). These
> checks could be gotten rid of as well, with copies of those functions,
> but for now try to avoid that unless necessary.
>
> Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
I've been planning on changing that lock, which may help you here
without having to mess around with parameters. That is, to simply add a
counter. Would this patch help you? You can add a patch to increment
the count to 5 with an explanation of handling synthetic events, but
even getting to 4 is extremely unlikely.
I'll make this into an official patch if this works for you, and then
you can include it in your series.
-- Steve
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/ring_buffer.c | 66 ++++++++++++---------------------------------
1 file changed, 18 insertions(+), 48 deletions(-)
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2538,61 +2538,29 @@ rb_wakeups(struct ring_buffer *buffer, s
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. To pass this
- * information from the lock to the unlock without having to
- * access the 'in_interrupt()' functions again (which do show
- * a bit of overhead in something as critical as function tracing,
- * we use a bitmask trick.
+ * be modified more than once via an interrupt. There are four
+ * different contexts that we need to consider.
*
- * bit 0 = NMI context
- * bit 1 = IRQ context
- * bit 2 = SoftIRQ context
- * bit 3 = normal context.
- *
- * This works because this is the order of contexts that can
- * preempt other contexts. A SoftIRQ never preempts an IRQ
- * context.
- *
- * When the context is determined, the corresponding bit is
- * checked and set (if it was set, then a recursion of that context
- * happened).
- *
- * On unlock, we need to clear this bit. To do so, just subtract
- * 1 from the current_context and AND it to itself.
- *
- * (binary)
- * 101 - 1 = 100
- * 101 & 100 = 100 (clearing bit zero)
- *
- * 1010 - 1 = 1001
- * 1010 & 1001 = 1000 (clearing bit 1)
- *
- * The least significant bit can be cleared this way, and it
- * just so happens that it is the same bit corresponding to
- * the current context.
+ * Normal context.
+ * SoftIRQ context
+ * IRQ context
+ * NMI context
+ *
+ * If for some reason the ring buffer starts to recurse, we
+ * only allow that to happen at most 4 times (one for each
+ * context). If it happens 5 times, then we consider this a
+ * recursive loop and do not let it go further.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned int val = cpu_buffer->current_context;
- int bit;
-
- if (in_interrupt()) {
- if (in_nmi())
- bit = RB_CTX_NMI;
- else if (in_irq())
- bit = RB_CTX_IRQ;
- else
- bit = RB_CTX_SOFTIRQ;
- } else
- bit = RB_CTX_NORMAL;
-
- if (unlikely(val & (1 << bit)))
+ if (cpu_buffer->current_context >= 4)
return 1;
- val |= (1 << bit);
- cpu_buffer->current_context = val;
+ cpu_buffer->current_context++;
+ /* Interrupts must see this update */
+ barrier();
return 0;
}
@@ -2600,7 +2568,9 @@ trace_recursive_lock(struct ring_buffer_
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+ /* Don't let the dec leak out */
+ barrier();
+ cpu_buffer->current_context--;
}
/**
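
Why a plain counter suffices, as a worked trace (assuming the strict
nesting order normal -> softirq -> irq -> NMI):

    /*
     * normal context:  lock, count 0 -> 1
     *   softirq:       lock, count 1 -> 2
     *     hard irq:    lock, count 2 -> 3
     *       NMI:       lock, count 3 -> 4
     *         anything deeper sees count >= 4 and is rejected
     *
     * Each context unlocks (count--) as it returns, so the counter
     * tracks nesting depth without recording which context owns a slot.
     */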

@@ -1,37 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:33 +0200
Subject: [PATCH 02/17] arm: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in ipi_cpu_stop() to handle the extra states.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/20170516184735.020718977@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/arm/kernel/smp.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -555,8 +555,7 @@ static DEFINE_RAW_SPINLOCK(stop_lock);
*/
static void ipi_cpu_stop(unsigned int cpu)
{
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
raw_spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();
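
The range check works because system_state is an ordered enum and the
series inserts the new state between SYSTEM_BOOTING and SYSTEM_RUNNING;
a sketch of the resulting order (the related upstream series adds
SYSTEM_SCHEDULING; treat the exact layout as an assumption here):

    enum system_states {
            SYSTEM_BOOTING,
            SYSTEM_SCHEDULING,      /* new: scheduler works, not RUNNING yet */
            SYSTEM_RUNNING,
            SYSTEM_HALT,
            SYSTEM_POWER_OFF,
            SYSTEM_RESTART,
    };

    /* "booting or running" collapses into a single comparison: */
    if (system_state <= SYSTEM_RUNNING)
            pr_crit("CPU%u: stopping\n", cpu);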

@@ -1,55 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 7 Apr 2017 09:04:07 +0200
Subject: [PATCH 2/4] futex: Fix small (and harmless looking) inconsistencies
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
During (post-commit) review Darren spotted a few minor things: one
(harmless, AFAICT) type inconsistency, and a comment that wasn't as
clear as hoped.
Reported-by: Darren Hart (VMWare) <dvhart@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Darren Hart (VMware) <dvhart@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/futex.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1025,7 +1025,8 @@ static int attach_to_pi_state(u32 __user
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
- int ret, uval2;
+ u32 uval2;
+ int ret;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -1441,6 +1442,11 @@ static int wake_futex_pi(u32 __user *uad
if (ret)
goto out_unlock;
+ /*
+ * This is a point of no return; once we modify the uval there is no
+ * going back and subsequent operations must not fail.
+ */
+
raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
@@ -1452,9 +1458,6 @@ static int wake_futex_pi(u32 __user *uad
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
- /*
- * We've updated the uservalue, this unlock cannot fail.
- */
postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
out_unlock:

@@ -1,39 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:49 +0100
Subject: [PATCH] futex: Use smp_store_release() in mark_wake_futex()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 1b367ece0d7e696cab1c8501bab282cc6a538b3f
Since the futex_q can disappear the instruction after assigning NULL,
this really should be a RELEASE barrier. That stops loads from hitting
dead memory too.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.604296452@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1290,8 +1290,7 @@ static void mark_wake_futex(struct wake_
* memory barrier is required here to prevent the following
* store to lock_ptr from getting ahead of the plist_del.
*/
- smp_wmb();
- q->lock_ptr = NULL;
+ smp_store_release(&q->lock_ptr, NULL);
}
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
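
The semantic difference, side by side (generic memory-model sketch, not
additional patched code):

    /* Before: smp_wmb() orders only earlier *stores* against the store. */
    smp_wmb();
    q->lock_ptr = NULL;

    /*
     * After: a RELEASE store. Earlier loads *and* stores on this CPU are
     * ordered before the NULL becomes visible, so no access to the
     * futex_q can be reordered past the point where the waiter may free it.
     */
    smp_store_release(&q->lock_ptr, NULL);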

@@ -0,0 +1,37 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:39 +0200
Subject: [PATCH 02/36] hrtimer: Correct blatantly wrong comment
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The protection of a hrtimer which runs its callback against migration to a
different CPU has nothing to do with hard interrupt context.
The protection against migration of a hrtimer running the expiry callback
is the pointer in the cpu_base which holds a pointer to the currently
running timer. This pointer is evaluated in the code which potentially
switches the timer base and makes sure it's kept on the CPU on which the
callback is running.
Reported-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/hrtimer.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1197,9 +1197,9 @@ static void __run_hrtimer(struct hrtimer
timer->is_rel = false;
/*
- * Because we run timers from hardirq context, there is no chance
- * they get migrated to another cpu, therefore its safe to unlock
- * the timer base.
+ * The timer is marked as running in the cpu base, so it is
+ * protected against migration to a different CPU even if the lock
+ * is dropped.
*/
raw_spin_unlock(&cpu_base->lock);
trace_hrtimer_expire_entry(timer, now);

@@ -1,167 +0,0 @@
From: Xunlei Pang <xlpang@redhat.com>
Date: Thu, 23 Mar 2017 15:56:08 +0100
Subject: [PATCH 2/9] sched/rtmutex/deadline: Fix a PI crash for deadline tasks
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
A crash happened while I was playing with deadline PI rtmutex.
BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
PGD 232a75067 PUD 230947067 PMD 0
Oops: 0000 [#1] SMP
CPU: 1 PID: 10994 Comm: a.out Not tainted
Call Trace:
[<ffffffff810b658c>] enqueue_task+0x2c/0x80
[<ffffffff810ba763>] activate_task+0x23/0x30
[<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
[<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
[<ffffffff8164e783>] __schedule+0xd3/0x900
[<ffffffff8164efd9>] schedule+0x29/0x70
[<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
[<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
[<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
[<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
[<ffffffff810ed8b0>] do_futex+0x190/0x5b0
[<ffffffff810edd50>] SyS_futex+0x80/0x180
This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating on pi waiters, while
rt_mutex_get_top_task() will access them with the rq lock held but
without holding pi_lock.
In order to tackle this, we introduce a new "pi_top_task" pointer
cached in task_struct, and add a new rt_mutex_update_top_task() to
update its value. It can be called by rt_mutex_setprio(), which holds
both the owner's pi_lock and the rq lock, so "pi_top_task" can be
safely accessed by enqueue_task_dl() under the rq lock.
Originally-From: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.157682758@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/init_task.h | 1 +
include/linux/sched.h | 2 ++
include/linux/sched/rt.h | 1 +
kernel/fork.c | 1 +
kernel/locking/rtmutex.c | 29 +++++++++++++++++++++--------
kernel/sched/core.c | 2 ++
6 files changed, 28 insertions(+), 8 deletions(-)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -181,6 +181,7 @@ extern struct cred init_cred;
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
+ .pi_top_task = NULL, \
.pi_waiters_leftmost = NULL,
#else
# define INIT_RT_MUTEXES(tsk)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -779,6 +779,8 @@ struct task_struct {
/* PI waiters blocked on a rt_mutex held by this task: */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
+ /* Updated under owner's pi_lock and rq lock */
+ struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling: */
struct rt_mutex_waiter *pi_blocked_on;
#endif
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -21,6 +21,7 @@ static inline int rt_task(struct task_st
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
+extern void rt_mutex_update_top_task(struct task_struct *p);
extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct ta
#ifdef CONFIG_RT_MUTEXES
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
+ p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
}
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -323,6 +323,19 @@ rt_mutex_dequeue_pi(struct task_struct *
}
/*
+ * Must hold both p->pi_lock and task_rq(p)->lock.
+ */
+void rt_mutex_update_top_task(struct task_struct *p)
+{
+ if (!task_has_pi_waiters(p)) {
+ p->pi_top_task = NULL;
+ return;
+ }
+
+ p->pi_top_task = task_top_pi_waiter(p)->task;
+}
+
+/*
* Calculate task priority from the waiter tree priority
*
* Return task->normal_prio when the waiter tree is empty or when
@@ -337,12 +350,12 @@ int rt_mutex_getprio(struct task_struct
task->normal_prio);
}
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
+ */
struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
-
- return task_top_pi_waiter(task)->task;
+ return task->pi_top_task;
}
/*
@@ -351,12 +364,12 @@ struct task_struct *rt_mutex_get_top_tas
*/
int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
{
- if (!task_has_pi_waiters(task))
+ struct task_struct *top_task = rt_mutex_get_top_task(task);
+
+ if (!top_task)
return newprio;
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
+ return min(top_task->prio, newprio);
}
/*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3712,6 +3712,8 @@ void rt_mutex_setprio(struct task_struct
goto out_unlock;
}
+ rt_mutex_update_top_task(p);
+
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
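
The locking rule for the new cache, as a sketch: p->pi_top_task is only
written with both the owner's pi_lock and the rq lock held, so a reader
holding either lock sees a stable pointer.

    /* Writer: rt_mutex_setprio() holds p->pi_lock and task_rq(p)->lock */
    rt_mutex_update_top_task(p);            /* refresh p->pi_top_task */

    /* Reader: e.g. enqueue_task_dl() under the rq lock, no pi_lock */
    struct task_struct *top = rt_mutex_get_top_task(p);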

@@ -0,0 +1,189 @@
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 22 Sep 2017 14:58:16 -0500
Subject: [PATCH 02/42] tracing: Reverse the order of trace_types_lock and
event_mutex
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
In order to make future changes where we need to call
tracing_set_clock() from within an event command, the order of
trace_types_lock and event_mutex must be reversed, as the event command
will hold event_mutex and the trace_types_lock is taken from within
tracing_set_clock().
Requested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace.c | 5 +++++
kernel/trace/trace_events.c | 31 +++++++++++++++----------------
2 files changed, 20 insertions(+), 16 deletions(-)
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *na
struct trace_array *tr;
int ret;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -EEXIST;
@@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *na
list_add(&tr->list, &ftrace_trace_arrays);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
@@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *na
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
@@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *na
int ret;
int i;
+ mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
ret = -ENODEV;
@@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *na
out_unlock:
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *
return -ENODEV;
/* Make sure the system still exists */
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
list_for_each_entry(dir, &tr->systems, list) {
if (dir == inode->i_private) {
@@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *
}
}
exit_loop:
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
if (!system)
return -ENODEV;
@@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struc
int trace_add_event_call(struct trace_event_call *call)
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
ret = __register_event(call, NULL);
if (ret >= 0)
__add_event_to_tracers(call);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace
{
int ret;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
down_write(&trace_event_sem);
ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return ret;
}
@@ -2424,8 +2424,8 @@ static int trace_module_notify(struct no
{
struct module *mod = data;
- mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
switch (val) {
case MODULE_STATE_COMING:
trace_module_add_events(mod);
@@ -2434,8 +2434,8 @@ static int trace_module_notify(struct no
trace_module_remove_events(mod);
break;
}
- mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
return 0;
}
@@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentr
* creates the event hierarchy in the @parent/events directory.
*
* Returns 0 on success.
+ *
+ * Must be called with event_mutex held.
*/
int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ goto out;
down_write(&trace_event_sem);
__trace_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
+ out:
return ret;
}
@@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *pa
return ret;
}
+/* Must be called with event_mutex held */
int event_trace_del_tracer(struct trace_array *tr)
{
- mutex_lock(&event_mutex);
+ lockdep_assert_held(&event_mutex);
/* Disable any event triggers and associated soft-disabled events */
clear_event_triggers(tr);
@@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_
tr->event_dir = NULL;
- mutex_unlock(&event_mutex);
-
return 0;
}
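
After this change every path takes the two locks in one global order,
event_mutex first and trace_types_lock second; that is what lets a
future event command call tracing_set_clock() while holding event_mutex.
A sketch of the invariant (the tracing_set_clock() arguments are
illustrative):

    /* Path A, e.g. instance_mkdir()/instance_rmdir() after this patch: */
    mutex_lock(&event_mutex);
    mutex_lock(&trace_types_lock);
    /* ... */
    mutex_unlock(&trace_types_lock);
    mutex_unlock(&event_mutex);

    /* Path B, a future event command: */
    mutex_lock(&event_mutex);
    tracing_set_clock(tr, "global");        /* takes trace_types_lock inside */
    mutex_unlock(&event_mutex);

Both paths acquire in the same order, so no ABBA deadlock is possible.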

@@ -1,84 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:28 +0200
Subject: [PATCH 02/13] workqueue: Provide work_on_cpu_safe()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
work_on_cpu() is not protected against CPU hotplug. For code which requires
to be either executed on an online CPU or to fail if the CPU is not
available the callsite would have to protect against CPU hotplug.
Provide a function which does get/put_online_cpus() around the call to
work_on_cpu() and fails the call with -ENODEV if the target CPU is not
online.
Preparatory patch to convert several racy task affinity manipulations.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.262610721@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/workqueue.h | 5 +++++
kernel/workqueue.c | 23 +++++++++++++++++++++++
2 files changed, 28 insertions(+)
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu,
{
return fn(arg);
}
+static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+ return fn(arg);
+}
#else
long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4735,6 +4735,29 @@ long work_on_cpu(int cpu, long (*fn)(voi
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
+
+/**
+ * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function argument
+ *
+ * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
+ * any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+ long ret = -ENODEV;
+
+ get_online_cpus();
+ if (cpu_online(cpu))
+ ret = work_on_cpu(cpu, fn, arg);
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu_safe);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
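
A usage sketch (the callback and caller are hypothetical):

    static long percpu_probe(void *arg)
    {
            /* Runs in a kworker bound to the requested CPU. */
            return 0;
    }

    static long probe_cpu(int cpu)
    {
            long ret = work_on_cpu_safe(cpu, percpu_probe, NULL);

            /* ret is -ENODEV if the CPU was not online at the call. */
            return ret;
    }

Unlike open-coded set_cpus_allowed_ptr() games, the helper cannot leave
the caller with a modified affinity mask and cannot race with the target
CPU going offline mid-call.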

@@ -1,38 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:34 +0200
Subject: [PATCH 03/17] arm64: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in smp_send_stop() to handle the extra states.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/20170516184735.112589728@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/arm64/kernel/smp.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -915,8 +915,7 @@ void smp_send_stop(void)
cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING)
+ if (system_state <= SYSTEM_RUNNING)
pr_crit("SMP: stopping secondary CPUs\n");
smp_cross_call(&mask, IPI_CPU_STOP);
}

@@ -1,36 +0,0 @@
From: "Darren Hart (VMware)" <dvhart@infradead.org>
Date: Fri, 14 Apr 2017 15:31:38 -0700
Subject: [PATCH 3/4] futex: Clarify mark_wake_futex memory barrier usage
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Clarify the scenario described in mark_wake_futex requiring the
smp_store_release(). Update the comment to explicitly refer to the
plist_del now under __unqueue_futex() (previously plist_del was in the
same function as the comment).
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170414223138.GA4222@fury
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/futex.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1380,10 +1380,11 @@ static void mark_wake_futex(struct wake_
wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
- * The waiting task can free the futex_q as soon as
- * q->lock_ptr = NULL is written, without taking any locks. A
- * memory barrier is required here to prevent the following
- * store to lock_ptr from getting ahead of the plist_del.
+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+ * is written, without taking any locks. This is possible in the event
+ * of a spurious wakeup, for example. A memory barrier is required here
+ * to prevent the following store to lock_ptr from getting ahead of the
+ * plist_del in __unqueue_futex().
*/
smp_store_release(&q->lock_ptr, NULL);
}

@@ -1,185 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:50 +0100
Subject: [PATCH] futex: Remove rt_mutex_deadlock_account_*()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit fffa954fb528963c2fb7b0c0084eb77e2be7ab52
These are unused and clutter up the code.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.652692478@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/locking/rtmutex-debug.c | 9 -------
kernel/locking/rtmutex-debug.h | 3 --
kernel/locking/rtmutex.c | 47 +++++++++++++++--------------------------
kernel/locking/rtmutex.h | 2 -
4 files changed, 18 insertions(+), 43 deletions(-)
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -174,12 +174,3 @@ void debug_rt_mutex_init(struct rt_mutex
lock->name = name;
}
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
* This file contains macros used solely by rtmutex.c. Debug version.
*/
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -938,8 +938,6 @@ static int try_to_take_rt_mutex(struct r
*/
rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, task);
-
return 1;
}
@@ -1342,8 +1340,6 @@ static bool __sched rt_mutex_slowunlock(
debug_rt_mutex_unlock(lock);
- rt_mutex_deadlock_account_unlock(current);
-
/*
* We must be careful here if the fast path is enabled. If we
* have no waiters queued we cannot set owner to NULL here
@@ -1409,11 +1405,10 @@ rt_mutex_fastlock(struct rt_mutex *lock,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
@@ -1425,21 +1420,19 @@ rt_mutex_timed_fastlock(struct rt_mutex
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, timeout, chwalk);
+
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 1;
- }
+
return slowfn(lock);
}
@@ -1449,19 +1442,18 @@ rt_mutex_fastunlock(struct rt_mutex *loc
struct wake_q_head *wqh))
{
DEFINE_WAKE_Q(wake_q);
+ bool deboost;
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- } else {
- bool deboost = slowfn(lock, &wake_q);
+ deboost = slowfn(lock, &wake_q);
- wake_up_q(&wake_q);
+ wake_up_q(&wake_q);
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
- }
+ /* Undo pi boosting if necessary: */
+ if (deboost)
+ rt_mutex_adjust_prio(current);
}
/**
@@ -1572,10 +1564,9 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
return false;
- }
+
return rt_mutex_slowunlock(lock, wqh);
}
@@ -1637,7 +1628,6 @@ void rt_mutex_init_proxy_locked(struct r
__rt_mutex_init(lock, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
/**
@@ -1657,7 +1647,6 @@ void rt_mutex_proxy_unlock(struct rt_mut
{
debug_rt_mutex_proxy_unlock(lock);
rt_mutex_set_owner(lock, NULL);
- rt_mutex_deadlock_account_unlock(proxy_owner);
}
/**
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
*/
#define rt_mutex_deadlock_check(l) (0)
-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
#define debug_rt_mutex_init_waiter(w) do { } while (0)
#define debug_rt_mutex_free_waiter(w) do { } while (0)
#define debug_rt_mutex_lock(l) do { } while (0)

View File

@ -0,0 +1,43 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:40 +0200
Subject: [PATCH 03/36] hrtimer: Fix kerneldoc for struct hrtimer_cpu_base
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The sequence '/**' marks the start of a struct description. Add the
missing second asterisk. While at it, adapt the ordering of the member
descriptions to the struct definition and document the purpose of
expires_next more precisely.
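For reference, the convention being fixed (generic sketch, not from this
patch); kernel-doc only parses comments opened with two asterisks:

/**
 * struct foo - short description picked up by kernel-doc
 * @bar: purpose of the bar member
 */
struct foo {
	int bar;
};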
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -144,7 +144,7 @@ enum hrtimer_base_type {
HRTIMER_MAX_CLOCK_BASES,
};
-/*
+/**
* struct hrtimer_cpu_base - the per cpu clock bases
* @lock: lock protecting the base and associated clock bases
* and timers
@@ -153,12 +153,12 @@ enum hrtimer_base_type {
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
- * @expires_next: absolute time of the next event which was scheduled
- * via clock_set_next_event()
- * @next_timer: Pointer to the first expiring timer
* @in_hrtirq: hrtimer_interrupt() is currently executing
* @hres_active: State of high resolution mode
* @hang_detected: The last hrtimer interrupt detected a hang
+ * @expires_next: absolute time of the next event, is required for remote
+ * hrtimer enqueue
+ * @next_timer: Pointer to the first expiring timer
* @nr_events: Total number of hrtimer interrupt events
* @nr_retries: Total number of hrtimer interrupt retries
* @nr_hangs: Total number of hrtimer interrupt hangs

View File

@ -1,129 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:29 +0200
Subject: [PATCH 03/13] ia64/salinfo: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Some of the file operations in /proc/sal require running code on the
requested CPU. This is achieved by temporarily setting the affinity of
the calling user space thread to the requested CPU and resetting it to
the original affinity afterwards.
That's racy vs. CPU hotplug and concurrent affinity settings for that
thread resulting in code executing on the wrong CPU and overwriting the
new affinity setting.
Replace it by using work_on_cpu_safe() which guarantees to run the code on
the requested CPU or to fail in case the CPU is offline.
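As a hedged sketch of the replacement pattern (the callback prototype
matches the hunks below; the exact error value for an offline CPU is an
assumption about work_on_cpu_safe()):

static long run_on_cpu_example(void *arg)
{
	/* Runs on the requested CPU via a workqueue, or not at all. */
	return 0;
}

static long caller_example(int cpu)
{
	/* Returns the callback's value, or an error (assumed -ENODEV)
	 * if the CPU is offline -- no affinity juggling required. */
	return work_on_cpu_safe(cpu, run_on_cpu_example, NULL);
}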
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-ia64@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.341863457@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/ia64/kernel/salinfo.c | 31 ++++++++++++-------------------
1 file changed, 12 insertions(+), 19 deletions(-)
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms {
const u8 *efi_guid;
u8 **oemdata;
u64 *oemdata_size;
- int ret;
};
-static void
+static long
salinfo_platform_oemdata_cpu(void *context)
{
struct salinfo_platform_oemdata_parms *parms = context;
- parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
+
+ return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, parms->oemdata_size);
}
static void
@@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode,
return 0;
}
-static void
-call_on_cpu(int cpu, void (*fn)(void *), void *arg)
-{
- cpumask_t save_cpus_allowed = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- (*fn)(arg);
- set_cpus_allowed_ptr(current, &save_cpus_allowed);
-}
-
-static void
+static long
salinfo_log_read_cpu(void *context)
{
struct salinfo_data *data = context;
@@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context)
/* Clear corrected errors as they are read from SAL */
if (rh->severity == sal_log_severity_corrected)
ia64_sal_clear_state_info(data->type);
+ return 0;
}
static void
@@ -430,7 +422,7 @@ salinfo_log_new_read(int cpu, struct sal
spin_unlock_irqrestore(&data_saved_lock, flags);
if (!data->saved_num)
- call_on_cpu(cpu, salinfo_log_read_cpu, data);
+ work_on_cpu_safe(cpu, salinfo_log_read_cpu, data);
if (!data->log_size) {
data->state = STATE_NO_DATA;
cpumask_clear_cpu(cpu, &data->cpu_event);
@@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char
return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
}
-static void
+static long
salinfo_log_clear_cpu(void *context)
{
struct salinfo_data *data = context;
+
ia64_sal_clear_state_info(data->type);
+ return 0;
}
static int
@@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *d
rh = (sal_log_record_header_t *)(data->log_buffer);
/* Corrected errors have already been cleared from SAL */
if (rh->severity != sal_log_severity_corrected)
- call_on_cpu(cpu, salinfo_log_clear_cpu, data);
+ work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data);
/* clearing a record may make a new record visible */
salinfo_log_new_read(cpu, data);
if (data->state == STATE_LOG_RECORD) {
@@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, con
.oemdata = &data->oemdata,
.oemdata_size = &data->oemdata_size
};
- call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms);
- if (parms.ret)
- count = parms.ret;
+ count = work_on_cpu_safe(cpu, salinfo_platform_oemdata_cpu,
+ &parms);
} else
data->oemdata_size = 0;
} else

View File

@ -1,52 +0,0 @@
From: Xunlei Pang <xlpang@redhat.com>
Date: Thu, 23 Mar 2017 15:56:09 +0100
Subject: [PATCH 3/9] sched/deadline/rtmutex: Dont miss the
dl_runtime/dl_period update
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Currently dl tasks will actually return at the very beginning
of rt_mutex_adjust_prio_chain() in !detect_deadlock cases:
if (waiter->prio == task->prio) {
if (!detect_deadlock)
goto out_unlock_pi; // out here
else
requeue = false;
}
As the deadline value of blocked deadline tasks (waiters) never changes
unless they change their sched_class (and thus their prio), this seems
reasonable, but it actually misses the chance of updating the
rt_mutex_waiter's "dl_runtime(period)_copy" if a waiter updates its
deadline parameters (dl_runtime, dl_period) or a boosted waiter changes
to a !deadline class.
Thus, force deadline tasks not to bail out early by adding the
!dl_prio() condition.
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/1460633827-345-7-git-send-email-xlpang@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.206577901@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/locking/rtmutex.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -605,7 +605,7 @@ static int rt_mutex_adjust_prio_chain(st
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (waiter->prio == task->prio) {
+ if (waiter->prio == task->prio && !dl_task(task)) {
if (!detect_deadlock)
goto out_unlock_pi;
else

View File

@ -0,0 +1,38 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:17 -0500
Subject: [PATCH 03/42] tracing: Exclude 'generic fields' from histograms
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
There are a small number of 'generic fields' (comm/COMM/cpu/CPU) that
are found by trace_find_event_field() but are only meant for
filtering. Specifically, unlike normal fields, they have a size of 0
and thus wreak havoc when used as a histogram key.
Exclude these (return -EINVAL) when used as histogram keys.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace_events_hist.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -450,7 +450,7 @@ static int create_val_field(struct hist_
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -548,7 +548,7 @@ static int create_key_field(struct hist_
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}

View File

@ -1,48 +0,0 @@
From: "Darren Hart (VMware)" <dvhart@infradead.org>
Date: Fri, 14 Apr 2017 15:46:08 -0700
Subject: [PATCH 4/4] MAINTAINERS: Add FUTEX SUBSYSTEM
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Add a MAINTAINERS block for the FUTEX SUBSYSTEM which includes the core
kernel code, include headers, testing code, and Documentation. Excludes
arch files, and higher level test code.
I added tglx and mingo as M, as they have made the tip commits, and
peterz and myself as R.
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Link: http://lkml.kernel.org/r/20170414224608.GA5180@fury
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
MAINTAINERS | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5420,6 +5420,23 @@ F: fs/fuse/
F: include/uapi/linux/fuse.h
F: Documentation/filesystems/fuse.txt
+FUTEX SUBSYSTEM
+M: Thomas Gleixner <tglx@linutronix.de>
+M: Ingo Molnar <mingo@redhat.com>
+R: Peter Zijlstra <peterz@infradead.org>
+R: Darren Hart <dvhart@infradead.org>
+L: linux-kernel@vger.kernel.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
+S: Maintained
+F: kernel/futex.c
+F: kernel/futex_compat.c
+F: include/asm-generic/futex.h
+F: include/linux/futex.h
+F: include/uapi/linux/futex.h
+F: tools/testing/selftests/futex/
+F: tools/perf/bench/futex*
+F: Documentation/*futex*
+
FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit)
M: Rik Faith <faith@cs.unc.edu>
L: linux-scsi@vger.kernel.org

View File

@ -1,221 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:51 +0100
Subject: [PATCH] futex,rt_mutex: Provide futex specific rt_mutex API
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 5293c2efda37775346885c7e924d4ef7018ea60b
Part of what makes futex_unlock_pi() intricate is that
rt_mutex_futex_unlock() -> rt_mutex_slowunlock() can drop
rt_mutex::wait_lock.
This means it cannot rely on the atomicity of wait_lock, which would be
preferred in order to not rely on hb->lock so much.
rt_mutex_slowunlock() needs to drop wait_lock because it can race with
the rt_mutex fastpath; futexes, however, have their own fast path.
Since futexes already have a bunch of separate rt_mutex accessors,
complete that set and implement an rt_mutex variant without fastpath
for them.
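Condensed from the hunks below, the intended no-fastpath unlock pattern
(sketch; names as in the patch):

DEFINE_WAKE_Q(wake_q);
bool deboost;

raw_spin_lock_irq(&lock->wait_lock);
deboost = __rt_mutex_futex_unlock(lock, &wake_q);
raw_spin_unlock_irq(&lock->wait_lock);

if (deboost) {
	wake_up_q(&wake_q);		/* wake first ... */
	rt_mutex_adjust_prio(current);	/* ... then undo the PI boost */
}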
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.702962446@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 30 ++++++++++-----------
kernel/locking/rtmutex.c | 55 +++++++++++++++++++++++++++++-----------
kernel/locking/rtmutex_common.h | 9 +++++-
3 files changed, 62 insertions(+), 32 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -916,7 +916,7 @@ void exit_pi_state_list(struct task_stru
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
spin_unlock(&hb->lock);
@@ -1364,20 +1364,18 @@ static int wake_futex_pi(u32 __user *uad
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-
- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
-
/*
- * First unlock HB so the waiter does not spin on it once he got woken
- * up. Second wake up the waiter before the priority is adjusted. If we
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
+ * We've updated the uservalue, this unlock cannot fail.
*/
+ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
+
+ if (deboost) {
+ wake_up_q(&wake_q);
rt_mutex_adjust_prio(current);
+ }
return 0;
}
@@ -2253,7 +2251,7 @@ static int fixup_owner(u32 __user *uaddr
* task acquired the rt_mutex after we removed ourself from the
* rt_mutex waiters list.
*/
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+ if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
locked = 1;
goto out;
}
@@ -2568,7 +2566,7 @@ static int futex_lock_pi(u32 __user *uad
if (!trylock) {
ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
} else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
}
@@ -2591,7 +2589,7 @@ static int futex_lock_pi(u32 __user *uad
* it and return the fault to userspace.
*/
if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
@@ -2898,7 +2896,7 @@ static int futex_wait_requeue_pi(u32 __u
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2938,7 +2936,7 @@ static int futex_wait_requeue_pi(u32 __u
* userspace.
*/
if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
+ rt_mutex_futex_unlock(pi_mutex);
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1488,15 +1488,23 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interrup
/*
* Futex variant with full deadlock detection.
+ * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
*/
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *timeout)
{
might_sleep();
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK,
- rt_mutex_slowlock);
+ return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
+ timeout, RT_MUTEX_FULL_CHAINWALK);
+}
+
+/*
+ * Futex variant, must not use fastpath.
+ */
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return rt_mutex_slowtrylock(lock);
}
/**
@@ -1555,19 +1563,38 @@ void __sched rt_mutex_unlock(struct rt_m
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
*/
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
+{
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ mark_wakeup_next_waiter(wake_q, lock);
+ return true; /* deboost and wakeups */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
- return false;
+ DEFINE_WAKE_Q(wake_q);
+ bool deboost;
- return rt_mutex_slowunlock(lock, wqh);
+ raw_spin_lock_irq(&lock->wait_lock);
+ deboost = __rt_mutex_futex_unlock(lock, &wake_q);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (deboost) {
+ wake_up_q(&wake_q);
+ rt_mutex_adjust_prio(current);
+ }
}
/**
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -109,9 +109,14 @@ extern int rt_mutex_start_proxy_lock(str
extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter);
+
extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+
extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES

View File

@ -0,0 +1,81 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:41 +0200
Subject: [PATCH 04/36] hrtimer: Cleanup clock argument in
schedule_hrtimeout_range_clock()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
schedule_hrtimeout_range_clock() uses a plain integer for the clock id
instead of the predefined type "clockid_t". In the hrtimer code the
clock id is consistently named clock_id, so rename the variable as well
to match.
While at it, clean up the description for the function parameters clock_id
and mode. The clock modes and the clock ids are not restricted as the
comment suggests. Fix the mode description as well for the callers of
schedule_hrtimeout_range_clock().
No functional change.
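A hedged caller-side sketch of the cleaned-up signature (setting the
task state first is the usual schedule_hrtimeout*() contract):

ktime_t expires = ktime_add_ns(ktime_get(), 5 * NSEC_PER_MSEC);

set_current_state(TASK_UNINTERRUPTIBLE);
/* Sleep ~5ms, with 100us slack, on an explicit clock: */
schedule_hrtimeout_range_clock(&expires, 100 * NSEC_PER_USEC,
			       HRTIMER_MODE_ABS, CLOCK_MONOTONIC);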
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 2 +-
kernel/time/hrtimer.c | 12 ++++++------
2 files changed, 7 insertions(+), 7 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -462,7 +462,7 @@ extern int schedule_hrtimeout_range(ktim
extern int schedule_hrtimeout_range_clock(ktime_t *expires,
u64 delta,
const enum hrtimer_mode mode,
- int clock);
+ clockid_t clock_id);
extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
/* Soft interrupt function to run the hrtimer queues: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1664,12 +1664,12 @@ void __init hrtimers_init(void)
* schedule_hrtimeout_range_clock - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
- * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
+ * @mode: timer mode
+ * @clock_id: timer clock to be used
*/
int __sched
schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
- const enum hrtimer_mode mode, int clock)
+ const enum hrtimer_mode mode, clockid_t clock_id)
{
struct hrtimer_sleeper t;
@@ -1690,7 +1690,7 @@ schedule_hrtimeout_range_clock(ktime_t *
return -EINTR;
}
- hrtimer_init_on_stack(&t.timer, clock, mode);
+ hrtimer_init_on_stack(&t.timer, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
hrtimer_init_sleeper(&t, current);
@@ -1712,7 +1712,7 @@ schedule_hrtimeout_range_clock(ktime_t *
* schedule_hrtimeout_range - sleep until timeout
* @expires: timeout value (ktime_t)
* @delta: slack in expires timeout (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless
@@ -1751,7 +1751,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_ran
/**
* schedule_hrtimeout - sleep until timeout
* @expires: timeout value (ktime_t)
- * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
+ * @mode: timer mode
*
* Make the current task sleep until the given expiry time has
* elapsed. The routine will return immediately unless

View File

@ -1,76 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 6 Apr 2017 14:56:18 +0200
Subject: [PATCH 04/13] ia64/sn/hwperf: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
sn_hwperf_op_cpu(), which is invoked from an ioctl, requires running
code on the requested CPU. This is achieved by temporarily setting the
affinity of the calling user space thread to the requested CPU and
resetting it to the original affinity afterwards.
That's racy vs. CPU hotplug and concurrent affinity settings for that
thread resulting in code executing on the wrong CPU and overwriting the
new affinity setting.
Replace it by using work_on_cpu_safe() which guarantees to run the code on
the requested CPU or to fail in case the CPU is offline.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-ia64@vger.kernel.org
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704122251450.2548@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/ia64/sn/kernel/sn2/sn_hwperf.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *inf
op_info->ret = r;
}
+static long sn_hwperf_call_sal_work(void *info)
+{
+ sn_hwperf_call_sal(info);
+ return 0;
+}
+
static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
{
u32 cpu;
u32 use_ipi;
int r = 0;
- cpumask_t save_allowed;
cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32;
use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK;
@@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hw
/* use an interprocessor interrupt to call SAL */
smp_call_function_single(cpu, sn_hwperf_call_sal,
op_info, 1);
- }
- else {
- /* migrate the task before calling SAL */
- save_allowed = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- sn_hwperf_call_sal(op_info);
- set_cpus_allowed_ptr(current, &save_allowed);
+ } else {
+ /* Call on the target CPU */
+ work_on_cpu_safe(cpu, sn_hwperf_call_sal_work, op_info);
}
}
r = op_info->ret;

View File

@ -1,145 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Mar 2017 15:56:10 +0100
Subject: [PATCH 4/9] rtmutex: Clean up
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Previous patches changed the meaning of the return value of
rt_mutex_slowunlock(); update comments and code to reflect this.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.255058238@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/futex.c | 7 ++++---
kernel/locking/rtmutex.c | 28 +++++++++++++---------------
kernel/locking/rtmutex_common.h | 2 +-
3 files changed, 18 insertions(+), 19 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1394,7 +1394,7 @@ static int wake_futex_pi(u32 __user *uad
{
u32 uninitialized_var(curval), newval;
struct task_struct *new_owner;
- bool deboost = false;
+ bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
int ret = 0;
@@ -1455,12 +1455,13 @@ static int wake_futex_pi(u32 __user *uad
/*
* We've updated the uservalue, this unlock cannot fail.
*/
- deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- rt_mutex_postunlock(&wake_q, deboost);
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
return ret;
}
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1330,7 +1330,8 @@ static inline int rt_mutex_slowtrylock(s
/*
* Slow path to release a rt-mutex.
- * Return whether the current task needs to undo a potential priority boosting.
+ *
+ * Return whether the current task needs to call rt_mutex_postunlock().
*/
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
@@ -1401,8 +1402,7 @@ static bool __sched rt_mutex_slowunlock(
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* check PI boosting */
- return true;
+ return true; /* call rt_mutex_postunlock() */
}
/*
@@ -1449,15 +1449,14 @@ rt_mutex_fasttrylock(struct rt_mutex *lo
}
/*
- * Undo pi boosting (if necessary) and wake top waiter.
+ * Performs the wakeup of the the top-waiter and re-enables preemption.
*/
-void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost)
+void rt_mutex_postunlock(struct wake_q_head *wake_q)
{
wake_up_q(wake_q);
/* Pairs with preempt_disable() in rt_mutex_slowunlock() */
- if (deboost)
- preempt_enable();
+ preempt_enable();
}
static inline void
@@ -1466,14 +1465,12 @@ rt_mutex_fastunlock(struct rt_mutex *loc
struct wake_q_head *wqh))
{
DEFINE_WAKE_Q(wake_q);
- bool deboost;
if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
return;
- deboost = slowfn(lock, &wake_q);
-
- rt_mutex_postunlock(&wake_q, deboost);
+ if (slowfn(lock, &wake_q))
+ rt_mutex_postunlock(&wake_q);
}
/**
@@ -1593,19 +1590,20 @@ bool __sched __rt_mutex_futex_unlock(str
*/
preempt_disable();
- return true; /* deboost and wakeups */
+ return true; /* call postunlock() */
}
void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
{
DEFINE_WAKE_Q(wake_q);
- bool deboost;
+ bool postunlock;
raw_spin_lock_irq(&lock->wait_lock);
- deboost = __rt_mutex_futex_unlock(lock, &wake_q);
+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
raw_spin_unlock_irq(&lock->wait_lock);
- rt_mutex_postunlock(&wake_q, deboost);
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
}
/**
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -122,7 +122,7 @@ extern void rt_mutex_futex_unlock(struct
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh);
-extern void rt_mutex_postunlock(struct wake_q_head *wake_q, bool deboost);
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"

View File

@ -0,0 +1,27 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:18 -0500
Subject: [PATCH 04/42] tracing: Remove lookups from tracing_map hitcount
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
Lookups inflate the hitcount, making it essentially useless. Only
inserts and updates should really affect the hitcount anyway, so
explicitly filter lookups out.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/tracing_map.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -428,7 +428,8 @@ static inline struct tracing_map_elt *
if (test_key && test_key == key_hash && entry->val &&
keys_match(key, entry->val->key, map->key_size)) {
- atomic64_inc(&map->hits);
+ if (!lookup_only)
+ atomic64_inc(&map->hits);
return entry->val;
}

View File

@ -1,34 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:35 +0200
Subject: [PATCH 04/17] x86/smp: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in announce_cpu() to handle the extra states.
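The check relies on the enum being ordered; a sketch of the assumed
layout once the series inserts the intermediate state (the
SYSTEM_SCHEDULING name comes from the companion patch and is an
assumption here):

enum system_states {
	SYSTEM_BOOTING,
	SYSTEM_SCHEDULING,	/* new: scheduler up, still booting */
	SYSTEM_RUNNING,
	SYSTEM_HALT,
	SYSTEM_POWER_OFF,
	SYSTEM_RESTART,
};
/* so "system_state < SYSTEM_RUNNING" covers both boot-time states */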
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170516184735.191715856@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/x86/kernel/smpboot.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int ap
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");

View File

@ -1,371 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:52 +0100
Subject: [PATCH] futex: Change locking rules
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 734009e96d1983ad739e5b656e03430b3660c913
Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
creates another set of problems, especially priority inversions on RT where
hb->lock becomes a rt_mutex itself.
The rt_mutex::wait_lock is the most obvious protection for keeping the
futex user space value and the kernel internal pi_state in sync.
Rework and document the locking so rt_mutex::wait_lock is held across
all operations which modify the user space value and the pi state.
This allows, as a next step, invoking rt_mutex_unlock() (including the
deboost) without holding hb->lock.
Nothing yet relies on the new locking rules.
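Compressed from the documentation hunk below, the resulting nesting as
a sketch:

spin_lock(&hb->lock);					/* hb -> futex_q */
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);	/* {uval, pi_state} */
raw_spin_lock(&owner->pi_lock);				/* pi_state->list */

/* ... modify the user space value and pi_state consistently ... */

raw_spin_unlock(&owner->pi_lock);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);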
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 165 +++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 132 insertions(+), 33 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -973,6 +973,39 @@ void exit_pi_state_list(struct task_stru
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -980,10 +1013,12 @@ void exit_pi_state_list(struct task_stru
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ int ret, uval2;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1026,34 @@ static int attach_to_pi_state(u32 uval,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1068,11 @@ static int attach_to_pi_state(u32 uval,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1024,14 +1084,14 @@ static int attach_to_pi_state(u32 uval,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1040,11 +1100,29 @@ static int attach_to_pi_state(u32 uval,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
+ goto out_einval;
+
+out_attach:
atomic_inc(&pi_state->refcount);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
/*
@@ -1095,6 +1173,9 @@ static int attach_to_pi_owner(u32 uval,
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1119,7 +1200,8 @@ static int attach_to_pi_owner(u32 uval,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
struct futex_q *top_waiter = futex_top_waiter(hb, key);
@@ -1129,7 +1211,7 @@ static int lookup_pi_state(u32 uval, str
* attach to the pi_state when the validation succeeds.
*/
if (top_waiter)
- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1148,7 +1230,7 @@ static int lock_pi_update_atomic(u32 __u
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1204,7 +1286,7 @@ static int futex_lock_pi_atomic(u32 __us
*/
top_waiter = futex_top_waiter(hb, key);
if (top_waiter)
- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1336,6 +1418,7 @@ static int wake_futex_pi(u32 __user *uad
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1348,6 +1431,7 @@ static int wake_futex_pi(u32 __user *uad
else
ret = -EINVAL;
}
+
if (ret) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
@@ -1823,7 +1907,7 @@ static int futex_requeue(u32 __user *uad
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -2122,10 +2206,13 @@ static int fixup_pi_state_owner(u32 __us
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval;
+ struct task_struct *oldowner;
int ret;
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ oldowner = pi_state->owner;
/* Owner died? */
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
@@ -2141,11 +2228,10 @@ static int fixup_pi_state_owner(u32 __us
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (get_futex_value_locked(&uval, uaddr))
@@ -2166,47 +2252,60 @@ static int fixup_pi_state_owner(u32 __us
* itself.
*/
if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
}
pi_state->owner = newowner;
- raw_spin_lock_irq(&newowner->pi_lock);
+ raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
+ raw_spin_unlock(&newowner->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
return 0;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
*/
- if (pi_state->owner != oldowner)
- return 0;
+ if (pi_state->owner != oldowner) {
+ ret = 0;
+ goto out_unlock;
+ }
if (ret)
- return ret;
+ goto out_unlock;
goto retry;
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
static long futex_wait_restart(struct restart_block *restart);

View File

@ -0,0 +1,61 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:42 +0200
Subject: [PATCH 05/36] hrtimer: Fix hrtimer function description
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
hrtimer_start[_range_ns]() starts a timer reliably on the current CPU
only when HRTIMER_MODE_PINNED is set. Furthermore, the
HRTIMER_MODE_PINNED mode is not considered when a hrtimer is
initialized.
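In other words (hedged usage sketch), the pinned flag only takes effect
when the timer is started, not when it is initialized:

static struct hrtimer t;

hrtimer_init(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); /* PINNED ignored here */
/* Pinned to the current CPU only because start passes the flag: */
hrtimer_start(&t, ms_to_ktime(10), HRTIMER_MODE_REL_PINNED);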
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 6 +++---
kernel/time/hrtimer.c | 9 +++++----
2 files changed, 8 insertions(+), 7 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -361,11 +361,11 @@ extern void hrtimer_start_range_ns(struc
u64 range_ns, const enum hrtimer_mode mode);
/**
- * hrtimer_start - (re)start an hrtimer on the current CPU
+ * hrtimer_start - (re)start an hrtimer
* @timer: the timer to be added
* @tim: expiry time
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED)
*/
static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
const enum hrtimer_mode mode)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -926,12 +926,12 @@ static inline ktime_t hrtimer_update_low
}
/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * hrtimer_start_range_ns - (re)start an hrtimer
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
- * relative (HRTIMER_MODE_REL)
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED)
*/
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode)
@@ -1109,7 +1109,8 @@ static void __hrtimer_init(struct hrtime
* hrtimer_init - initialize a timer to the given clock
* @timer: the timer to be initialized
* @clock_id: the clock to be used
- * @mode: timer mode abs/rel
+ * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL); pinned is not considered here!
*/
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)

View File

@ -1,36 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:36 +0200
Subject: [PATCH 05/17] metag: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in stop_this_cpu() to handle the extra states.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184735.283420315@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/metag/kernel/smp.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -567,8 +567,7 @@ static void stop_this_cpu(void *data)
{
unsigned int cpu = smp_processor_id();
- if (system_state == SYSTEM_BOOTING ||
- system_state == SYSTEM_RUNNING) {
+ if (system_state <= SYSTEM_RUNNING) {
spin_lock(&stop_lock);
pr_crit("CPU%u: stopping\n", cpu);
dump_stack();

View File

@ -1,89 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:31 +0200
Subject: [PATCH 05/13] powerpc/smp: Replace open coded task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Init task invokes smp_ops->setup_cpu() from smp_cpus_done(). Init task
can run on any online CPU at this point, but the setup_cpu() callback
must be invoked on the boot CPU. This is achieved by temporarily setting
the affinity of the calling user space thread to the requested CPU and
resetting it to the original affinity afterwards.
That's racy vs. CPU hotplug and concurrent affinity settings for that
thread resulting in code executing on the wrong CPU and overwriting the
new affinity setting.
That's actually not a problem in this context, as neither CPU hotplug
nor affinity settings can happen, but access to
task_struct::cpus_allowed is about to be restricted.
Replace it with a call to work_on_cpu_safe() which achieves the same result.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.518053336@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/powerpc/kernel/smp.c | 26 +++++++++++---------------
1 file changed, 11 insertions(+), 15 deletions(-)
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -787,24 +787,21 @@ static struct sched_domain_topology_leve
{ NULL, },
};
-void __init smp_cpus_done(unsigned int max_cpus)
+static __init long smp_setup_cpu_workfn(void *data __always_unused)
{
- cpumask_var_t old_mask;
+ smp_ops->setup_cpu(boot_cpuid);
+ return 0;
+}
- /* We want the setup_cpu() here to be called from CPU 0, but our
- * init thread may have been "borrowed" by another CPU in the meantime
- * se we pin us down to CPU 0 for a short while
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+ /*
+ * We want the setup_cpu() here to be called on the boot CPU, but
+ * init might run on any CPU, so make sure it's invoked on the boot
+ * CPU.
*/
- alloc_cpumask_var(&old_mask, GFP_NOWAIT);
- cpumask_copy(old_mask, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid));
-
if (smp_ops && smp_ops->setup_cpu)
- smp_ops->setup_cpu(boot_cpuid);
-
- set_cpus_allowed_ptr(current, old_mask);
-
- free_cpumask_var(old_mask);
+ work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
if (smp_ops && smp_ops->bringup_done)
smp_ops->bringup_done();
@@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int m
dump_numa_cpu_topology();
set_sched_topology(powerpc_topology);
-
}
#ifdef CONFIG_HOTPLUG_CPU

View File

@ -1,392 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Mar 2017 15:56:11 +0100
Subject: [PATCH 5/9] sched/rtmutex: Refactor rt_mutex_setprio()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
With the introduction of SCHED_DEADLINE the whole notion that priority
is a single number is gone, therefore the @prio argument to
rt_mutex_setprio() doesn't make sense anymore.
So rework the code to pass a pi_task instead.
Note this also fixes a problem with pi_top_task caching; previously we
would not set the pointer (call rt_mutex_update_top_task) if the
priority didn't change, which could lead to a stale pointer.
As for the XXX, I think it's fine to use pi_task->prio, because if it
differs from waiter->prio, a PI chain update is imminent.
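The new calling convention, condensed from rt_mutex_adjust_prio() in
the hunks below:

struct task_struct *pi_task = NULL;

lockdep_assert_held(&p->pi_lock);

if (task_has_pi_waiters(p))
	pi_task = task_top_pi_waiter(p)->task;

rt_mutex_setprio(p, pi_task);	/* pass the donor task, not a prio */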
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.303827095@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/sched/rt.h | 24 +++-------
kernel/locking/rtmutex.c | 112 ++++++++++++-----------------------------------
kernel/sched/core.c | 66 ++++++++++++++++++++++-----
3 files changed, 91 insertions(+), 111 deletions(-)
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,28 +18,20 @@ static inline int rt_task(struct task_st
}
#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern void rt_mutex_update_top_task(struct task_struct *p);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
+ */
+static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
+{
+ return p->pi_top_task;
+}
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
- return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
- int newprio)
-{
- return newprio;
-}
-
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -322,67 +322,16 @@ rt_mutex_dequeue_pi(struct task_struct *
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
-/*
- * Must hold both p->pi_lock and task_rq(p)->lock.
- */
-void rt_mutex_update_top_task(struct task_struct *p)
-{
- if (!task_has_pi_waiters(p)) {
- p->pi_top_task = NULL;
- return;
- }
-
- p->pi_top_task = task_top_pi_waiter(p)->task;
-}
-
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
-
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
-
-/*
- * Must hold either p->pi_lock or task_rq(p)->lock.
- */
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
- return task->pi_top_task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
+static void rt_mutex_adjust_prio(struct task_struct *p)
{
- struct task_struct *top_task = rt_mutex_get_top_task(task);
+ struct task_struct *pi_task = NULL;
- if (!top_task)
- return newprio;
+ lockdep_assert_held(&p->pi_lock);
- return min(top_task->prio, newprio);
-}
+ if (task_has_pi_waiters(p))
+ pi_task = task_top_pi_waiter(p)->task;
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
- int prio = rt_mutex_getprio(task);
-
- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
+ rt_mutex_setprio(p, pi_task);
}
/*
@@ -742,7 +691,7 @@ static int rt_mutex_adjust_prio_chain(st
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -758,7 +707,7 @@ static int rt_mutex_adjust_prio_chain(st
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -966,7 +915,7 @@ static int task_blocks_on_rt_mutex(struc
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
@@ -988,7 +937,7 @@ static int task_blocks_on_rt_mutex(struc
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1040,13 +989,14 @@ static void mark_wakeup_next_waiter(stru
waiter = rt_mutex_top_waiter(lock);
/*
- * Remove it from current->pi_waiters. We do not adjust a
- * possible priority boost right now. We execute wakeup in the
- * boosted mode and go back to normal after releasing
- * lock->wait_lock.
+ * Remove it from current->pi_waiters and deboost.
+ *
+ * We must in fact deboost here in order to ensure we call
+ * rt_mutex_setprio() to update p->pi_top_task before the
+ * task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- __rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1058,9 +1008,19 @@ static void mark_wakeup_next_waiter(stru
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock(&current->pi_lock);
-
+ /*
+ * We deboosted before waking the top waiter task such that we don't
+ * run two tasks with the 'same' priority (and ensure the
+ * p->pi_top_task pointer points to a blocked task). This however can
+ * lead to priority inversion if we would get preempted after the
+ * deboost but before waking our donor task, hence the preempt_disable()
+ * before unlock.
+ *
+ * Pairs with preempt_enable() in rt_mutex_postunlock();
+ */
+ preempt_disable();
wake_q_add(wake_q, waiter->task);
+ raw_spin_unlock(&current->pi_lock);
}
/*
@@ -1095,7 +1055,7 @@ static void remove_waiter(struct rt_mute
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
@@ -1134,8 +1094,7 @@ void rt_mutex_adjust_pi(struct task_stru
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio &&
- !dl_prio(task->prio))) {
+ if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
@@ -1389,17 +1348,6 @@ static bool __sched rt_mutex_slowunlock(
* Queue the next waiter for wakeup once we release the wait_lock.
*/
mark_wakeup_next_waiter(wake_q, lock);
-
- /*
- * We should deboost before waking the top waiter task such that
- * we don't run two tasks with the 'same' priority. This however
- * can lead to prio-inversion if we would get preempted after
- * the deboost but before waking our high-prio task, hence the
- * preempt_disable before unlock. Pairs with preempt_enable() in
- * rt_mutex_postunlock();
- */
- preempt_disable();
-
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true; /* call rt_mutex_postunlock() */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function);
#ifdef CONFIG_RT_MUTEXES
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
/*
* rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3682,17 +3697,41 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *prev_class;
struct rq_flags rf;
struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+ return;
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+ * Note that there is a lot of trickiness in making this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guarantees the task is present.
+ */
+ p->pi_top_task = pi_task;
+
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio && !dl_prio(prio))
+ goto out_unlock;
/*
* Idle task boosting is a nono in general. There is one
@@ -3712,9 +3751,7 @@ void rt_mutex_setprio(struct task_struct
goto out_unlock;
}
- rt_mutex_update_top_task(p);
-
- trace_sched_pi_setprio(p, prio);
+ trace_sched_pi_setprio(p, prio); /* broken */
oldprio = p->prio;
if (oldprio == prio)
@@ -3738,7 +3775,6 @@ void rt_mutex_setprio(struct task_struct
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
@@ -3776,6 +3812,11 @@ void rt_mutex_setprio(struct task_struct
balance_callback(rq);
preempt_enable();
}
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
#endif
void set_user_nice(struct task_struct *p, long nice)
@@ -4022,10 +4063,9 @@ static void __setscheduler(struct rq *rq
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
+ p->prio = normal_prio(p);
if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = rt_effective_prio(p, p->prio);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -4312,7 +4352,7 @@ static int __sched_setscheduler(struct t
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
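For illustration: the computation the sched/core.c hunks above converge on can be
modelled in plain user-space C. The names mirror rt_effective_prio() but this is a
sketch, not the kernel implementation (lower value means higher priority, as in the
kernel):

#include <stdio.h>

struct task { int prio; };

/* Clamp our normal priority to the donor's priority, if any. */
static int effective_prio(const struct task *donor, int normal_prio)
{
	return donor && donor->prio < normal_prio ? donor->prio : normal_prio;
}

int main(void)
{
	struct task donor = { .prio = 10 };

	printf("%d\n", effective_prio(&donor, 20));	/* boosted to 10 */
	printf("%d\n", effective_prio(NULL, 20));	/* stays at 20 */
	return 0;
}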

View File

@ -1,7 +1,7 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:08 -0500
Subject: [PATCH 07/32] tracing: Increase tracing map KEYS_MAX size
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Date: Fri, 22 Sep 2017 14:58:19 -0500
Subject: [PATCH 05/42] tracing: Increase tracing map KEYS_MAX size
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The current default for the number of subkeys in a compound key is 2,
which is too restrictive. Increase it to a more realistic value of 3.
@ -14,7 +14,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -5,7 +5,7 @@
@@ -6,7 +6,7 @@
#define TRACING_MAP_BITS_MAX 17
#define TRACING_MAP_BITS_MIN 7

View File

@ -1,76 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:53 +0100
Subject: [PATCH] futex: Cleanup refcounting
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit bf92cf3a5100f5a0d5f9834787b130159397cb22
Add a put_pi_state() as counterpart for get_pi_state() so the refcounting
becomes consistent.
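The discipline the pairing enforces can be modelled in user space with C11 atomics
(a sketch of the get/put contract only; the kernel version additionally refuses to
resurrect a zero refcount via atomic_inc_not_zero()):

#include <stdatomic.h>
#include <stdlib.h>

struct obj { atomic_int refcount; };

static void get_obj(struct obj *o)
{
	atomic_fetch_add(&o->refcount, 1);
}

static void put_obj(struct obj *o)
{
	/* Free on the last reference, as put_pi_state() frees or caches. */
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}

Every get_obj() must be matched by exactly one put_obj(); that symmetry is what the
renamed helpers make visible at the call sites below.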
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.801778516@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
return 0;
}
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_
return pi_state;
}
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
@@ -1103,7 +1108,7 @@ static int attach_to_pi_state(u32 __user
goto out_einval;
out_attach:
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
@@ -1990,7 +1995,7 @@ static int futex_requeue(u32 __user *uad
* refcount on the pi_state and store the pointer in
* the futex_q object of the waiter.
*/
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,

View File

@ -0,0 +1,42 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:43 +0200
Subject: [PATCH 06/36] hrtimer: Ensure POSIX compliance (relative
CLOCK_REALTIME hrtimers)
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The POSIX specification defines that relative CLOCK_REALTIME timers are not
affected by clock modifications. Those timers have to use CLOCK_MONOTONIC
to ensure POSIX compliance.
The introduction of the additional mode HRTIMER_MODE_PINNED broke this
requirement for pinned timers. There is no user space visible impact
because user space timers are not using the pinned mode, but for
consistency reasons this needs to be fixed.
Check whether the mode has the HRTIMER_MODE_REL bit set instead of
comparing with HRTIMER_MODE_ABS.
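For reference, the user-space guarantee at stake can be seen with clock_nanosleep();
this is an illustrative example, not part of the patch:

#include <time.h>

int main(void)
{
	struct timespec rel = { .tv_sec = 2, .tv_nsec = 0 };

	/* flags == 0 means a relative sleep: POSIX requires roughly two
	 * elapsed seconds here even if the wall clock is stepped with
	 * settimeofday() meanwhile.  With TIMER_ABSTIME the sleep must
	 * honour such clock changes instead. */
	clock_nanosleep(CLOCK_REALTIME, 0, &rel, NULL);
	return 0;
}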
Fixes: 597d0275736d ("timers: Framework for identifying pinned timers")
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/hrtimer.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1097,7 +1097,12 @@ static void __hrtimer_init(struct hrtime
cpu_base = raw_cpu_ptr(&hrtimer_bases);
- if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
+ /*
+ * Posix magic: Relative CLOCK_REALTIME timers are not affected by
+ * clock modifications, so they need to become CLOCK_MONOTONIC to
+ * ensure Posix compliance.
+ */
+ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
base = hrtimer_clockid_to_base(clock_id);

View File

@ -1,39 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:37 +0200
Subject: [PATCH 06/17] powerpc: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in smp_generic_cpu_bootable() to handle the
extra states.
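The check relies only on the relative order of the enum values, so it stays correct
as states are inserted; a compact sketch of that assumption (names and values
illustrative, not the kernel's actual enum):

enum system_states {
	SYSTEM_BOOTING,
	SYSTEM_SCHEDULING,	/* intermediate state(s) inserted here */
	SYSTEM_RUNNING,
	SYSTEM_HALT,
};

/* "< SYSTEM_RUNNING" covers SYSTEM_BOOTING plus anything added before
 * SYSTEM_RUNNING, which "== SYSTEM_BOOTING" would silently miss. */
static inline int early_boot(enum system_states s)
{
	return s < SYSTEM_RUNNING;
}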
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20170516184735.359536998@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
arch/powerpc/kernel/smp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -98,7 +98,7 @@ int smp_generic_cpu_bootable(unsigned in
/* Special case - we inhibit secondary thread startup
* during boot if the user requests it.
*/
- if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
return 0;
if (smt_enabled_at_boot

View File

@ -1,107 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Mar 2017 15:56:12 +0100
Subject: [PATCH 6/9] sched,tracing: Update trace_sched_pi_setprio()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Pass the PI donor task, instead of a numerical priority.
Numerical priorities are not sufficient to describe state ever since
SCHED_DEADLINE.
Annotate all sched tracepoints that are currently broken; fixing them
will bork userspace. *hate*.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.353599881@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/trace/events/sched.h | 16 +++++++++-------
kernel/sched/core.c | 2 +-
2 files changed, 10 insertions(+), 8 deletions(-)
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_templat
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->prio = p->prio;
+ __entry->prio = p->prio; /* XXX SCHED_DEADLINE */
__entry->success = 1; /* rudiment, kill when possible */
__entry->target_cpu = task_cpu(p);
),
@@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch,
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
__entry->next_pid = next->pid;
__entry->next_prio = next->prio;
+ /* XXX SCHED_DEADLINE */
),
TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
@@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task,
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->prio = p->prio;
+ __entry->prio = p->prio; /* XXX SCHED_DEADLINE */
__entry->orig_cpu = task_cpu(p);
__entry->dest_cpu = dest_cpu;
),
@@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_templa
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->prio = p->prio;
+ __entry->prio = p->prio; /* XXX SCHED_DEADLINE */
),
TP_printk("comm=%s pid=%d prio=%d",
@@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait,
TP_fast_assign(
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
__entry->pid = pid_nr(pid);
- __entry->prio = current->prio;
+ __entry->prio = current->prio; /* XXX SCHED_DEADLINE */
),
TP_printk("comm=%s pid=%d prio=%d",
@@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_s
*/
TRACE_EVENT(sched_pi_setprio,
- TP_PROTO(struct task_struct *tsk, int newprio),
+ TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),
- TP_ARGS(tsk, newprio),
+ TP_ARGS(tsk, pi_task),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
@@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio,
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->oldprio = tsk->prio;
- __entry->newprio = newprio;
+ __entry->newprio = pi_task ? pi_task->prio : tsk->prio;
+ /* XXX SCHED_DEADLINE bits missing */
),
TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3751,7 +3751,7 @@ void rt_mutex_setprio(struct task_struct
goto out_unlock;
}
- trace_sched_pi_setprio(p, prio); /* broken */
+ trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
if (oldprio == prio)

View File

@ -1,118 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 13 Apr 2017 10:17:07 +0200
Subject: [PATCH 06/13] sparc/sysfs: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The mmustat_enable sysfs file accessor functions must run code on the
target CPU. This is achieved by temporarily setting the affinity of the
calling user space thread to the requested CPU and resetting it to the original
affinity afterwards.
That's racy vs. concurrent affinity settings for that thread, resulting in
code executing on the wrong CPU and overwriting the new affinity setting.
Replace it by using work_on_cpu() which guarantees to run the code on the
requested CPU.
Protection against CPU hotplug is not required as the open sysfs file
already prevents the removal from the CPU offline callback. Using the
hotplug-protected version would actually be wrong, because it would deadlock
against a CPU hotplug operation on the CPU associated with the sysfs file in
progress.
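The window being closed can be reproduced in user space; a sketch of the racy
pattern using the GNU affinity API (illustrative only):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static long run_on_cpu_racy(int cpu, long (*fn)(void *), void *arg)
{
	cpu_set_t old, new;
	long ret;

	pthread_getaffinity_np(pthread_self(), sizeof(old), &old);
	CPU_ZERO(&new);
	CPU_SET(cpu, &new);
	pthread_setaffinity_np(pthread_self(), sizeof(new), &new);
	/* <-- a concurrent affinity change landing here defeats the
	 *     pinning, and fn() runs on the wrong CPU */
	ret = fn(arg);
	pthread_setaffinity_np(pthread_self(), sizeof(old), &old);
	return ret;
}

work_on_cpu() avoids the window entirely by queuing the function on a worker
that is bound to the target CPU, rather than migrating the caller there.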
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: fenghua.yu@intel.com
Cc: tony.luck@intel.com
Cc: herbert@gondor.apana.org.au
Cc: rjw@rjwysocki.net
Cc: peterz@infradead.org
Cc: benh@kernel.crashing.org
Cc: bigeasy@linutronix.de
Cc: jiangshanlai@gmail.com
Cc: sparclinux@vger.kernel.org
Cc: viresh.kumar@linaro.org
Cc: mpe@ellerman.id.au
Cc: tj@kernel.org
Cc: lenb@kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704131001270.2408@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/sparc/kernel/sysfs.c | 39 +++++++++++----------------------------
1 file changed, 11 insertions(+), 28 deletions(-)
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_g
.name = "mmu_stats",
};
-/* XXX convert to rusty's on_one_cpu */
-static unsigned long run_on_cpu(unsigned long cpu,
- unsigned long (*func)(unsigned long),
- unsigned long arg)
-{
- cpumask_t old_affinity;
- unsigned long ret;
-
- cpumask_copy(&old_affinity, &current->cpus_allowed);
- /* should return -EINVAL to userspace */
- if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
- return 0;
-
- ret = func(arg);
-
- set_cpus_allowed_ptr(current, &old_affinity);
-
- return ret;
-}
-
-static unsigned long read_mmustat_enable(unsigned long junk)
+static long read_mmustat_enable(void *data __maybe_unused)
{
unsigned long ra = 0;
@@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable
return ra != 0;
}
-static unsigned long write_mmustat_enable(unsigned long val)
+static long write_mmustat_enable(void *data)
{
- unsigned long ra, orig_ra;
+ unsigned long ra, orig_ra, *val = data;
- if (val)
+ if (*val)
ra = __pa(&per_cpu(mmu_stats, smp_processor_id()));
else
ra = 0UL;
@@ -142,7 +122,8 @@ static unsigned long write_mmustat_enabl
static ssize_t show_mmustat_enable(struct device *s,
struct device_attribute *attr, char *buf)
{
- unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
+ long val = work_on_cpu(s->id, read_mmustat_enable, NULL);
+
return sprintf(buf, "%lx\n", val);
}
@@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(stru
struct device_attribute *attr, const char *buf,
size_t count)
{
- unsigned long val, err;
- int ret = sscanf(buf, "%lu", &val);
+ unsigned long val;
+ long err;
+ int ret;
+ ret = sscanf(buf, "%lu", &val);
if (ret != 1)
return -EINVAL;
- err = run_on_cpu(s->id, write_mmustat_enable, val);
+ err = work_on_cpu(s->id, write_mmustat_enable, &val);
if (err)
return -EIO;

View File

@ -1,7 +1,7 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:10 -0500
Subject: [PATCH 09/32] tracing: Make traceprobe parsing code reusable
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Date: Fri, 22 Sep 2017 14:58:20 -0500
Subject: [PATCH 06/42] tracing: Make traceprobe parsing code reusable
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
traceprobe_probes_write() and traceprobe_command() actually contain
nothing that ties them to kprobes - the code is generically useful for
@ -25,7 +25,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7907,6 +7907,92 @@ void ftrace_dump(enum ftrace_dump_mode o
@@ -8281,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode o
}
EXPORT_SYMBOL_GPL(ftrace_dump);
@ -120,7 +120,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
int ring_buf_size;
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1650,6 +1650,13 @@ void trace_printk_start_comm(void);
@@ -1755,6 +1755,13 @@ void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
@ -136,7 +136,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
* to do the manipulation, as well as saves the print formats
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -878,8 +878,8 @@ static int probes_open(struct inode *ino
@@ -907,8 +907,8 @@ static int probes_open(struct inode *ino
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
@ -147,7 +147,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
static const struct file_operations kprobe_events_ops = {
@@ -1404,9 +1404,9 @@ static __init int kprobe_trace_self_test
@@ -1433,9 +1433,9 @@ static __init int kprobe_trace_self_test
pr_info("Testing kprobe tracing: ");
@ -160,7 +160,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function entry.\n");
warn++;
@@ -1426,8 +1426,8 @@ static __init int kprobe_trace_self_test
@@ -1455,8 +1455,8 @@ static __init int kprobe_trace_self_test
}
}
@ -171,7 +171,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (WARN_ON_ONCE(ret)) {
pr_warn("error on probing function return.\n");
warn++;
@@ -1497,13 +1497,13 @@ static __init int kprobe_trace_self_test
@@ -1526,13 +1526,13 @@ static __init int kprobe_trace_self_test
disable_trace_kprobe(tk, file);
}

View File

@ -1,38 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:38 +0200
Subject: [PATCH 07/17] ACPI: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Make the decision whether a pci root is hotplugged depend on SYSTEM_RUNNING
instead of !SYSTEM_BOOTING. It makes no sense to cover states greater than
SYSTEM_RUNNING as there are no hotplug events on reboot and poweroff.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Link: http://lkml.kernel.org/r/20170516184735.446455652@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
drivers/acpi/pci_root.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -523,7 +523,7 @@ static int acpi_pci_root_add(struct acpi
struct acpi_pci_root *root;
acpi_handle handle = device->handle;
int no_aspm = 0;
- bool hotadd = system_state != SYSTEM_BOOTING;
+ bool hotadd = system_state == SYSTEM_RUNNING;
root = kzalloc(sizeof(struct acpi_pci_root), GFP_KERNEL);
if (!root)

View File

@ -1,45 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:33 +0200
Subject: [PATCH 07/13] ACPI/processor: Fix error handling in
__acpi_processor_start()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
When acpi_install_notify_handler() fails, the cooling device stays
registered, the sysfs files created via acpi_pss_perf_init() are
leaked, and the function returns success.
Undo acpi_pss_perf_init() and return a proper error code.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: linux-acpi@vger.kernel.org
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.695499645@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/acpi/processor_driver.c | 3 +++
1 file changed, 3 insertions(+)
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -251,6 +251,9 @@ static int __acpi_processor_start(struct
if (ACPI_SUCCESS(status))
return 0;
+ result = -ENODEV;
+ acpi_pss_perf_exit(pr, device);
+
err_power_exit:
acpi_processor_power_exit(pr);
return result;

View File

@ -1,140 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:54 +0100
Subject: [PATCH] futex: Rework inconsistent rt_mutex/futex_q state
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 73d786bd043ebc855f349c81ea805f6b11cbf2aa
There is a weird state in the futex_unlock_pi() path when it interleaves
with a concurrent futex_lock_pi() at the point where it drops hb->lock.
In this case, it can happen that the rt_mutex wait_list and the futex_q
disagree on pending waiters, in particular rt_mutex will find no pending
waiters where futex_q thinks there are. In this case the rt_mutex unlock
code cannot assign an owner.
The futex side fixup code has to clean up the inconsistencies with quite a
bunch of interesting corner cases.
Simplify all this by changing wake_futex_pi() to return -EAGAIN when this
situation occurs. This then gives the futex_lock_pi() code the opportunity
to continue and the retried futex_unlock_pi() will now observe a coherent
state.
The only problem is that this breaks RT timeliness guarantees. That
is, consider the following scenario:
T1 and T2 are both pinned to CPU0. prio(T2) > prio(T1)

    CPU0

    T1
      lock_pi()
      queue_me()  <- Waiter is visible

    preemption

    T2
      unlock_pi()
        loops with -EAGAIN forever
Which is undesirable for PI primitives. Future patches will rectify
this.
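From user space, the PI paths patched here are reached through priority
inheritance mutexes; a minimal, hypothetical usage example (not part of the
patch):

#include <pthread.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t m;

	pthread_mutexattr_init(&attr);
	/* Lock and unlock on such a mutex go through FUTEX_LOCK_PI /
	 * FUTEX_UNLOCK_PI, i.e. futex_lock_pi() and futex_unlock_pi(). */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&m, &attr);

	pthread_mutex_lock(&m);
	pthread_mutex_unlock(&m);

	pthread_mutex_destroy(&m);
	pthread_mutexattr_destroy(&attr);
	return 0;
}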
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.850383690@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 50 ++++++++++++++------------------------------------
1 file changed, 14 insertions(+), 36 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1404,12 +1404,19 @@ static int wake_futex_pi(u32 __user *uad
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
- * It is possible that the next waiter (the one that brought
- * top_waiter owner to the kernel) timed out and is no longer
- * waiting on the lock.
+ * When we interleave with futex_lock_pi() where it does
+ * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
+ * but the rt_mutex's wait_list can be empty (either still, or again,
+ * depending on which side we land).
+ *
+ * When this happens, give up our locks and try again, giving the
+ * futex_lock_pi() instance time to complete, either by waiting on the
+ * rtmutex or removing itself from the futex queue.
*/
- if (!new_owner)
- new_owner = top_waiter->task;
+ if (!new_owner) {
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return -EAGAIN;
+ }
/*
* We pass it to the next owner. The WAITERS bit is always
@@ -2332,7 +2339,6 @@ static long futex_wait_restart(struct re
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- struct task_struct *owner;
int ret = 0;
if (locked) {
@@ -2346,43 +2352,15 @@ static int fixup_owner(u32 __user *uaddr
}
/*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
- */
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
-
- /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
"pi-state %p\n", ret,
q->pi_state->pi_mutex.owner,
q->pi_state->owner);
+ }
out:
return ret ? ret : locked;

View File

@ -0,0 +1,46 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:44 +0200
Subject: [PATCH 07/36] hrtimer: Cleanup hrtimer_mode enum
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
It's not obvious that the HRTIMER_MODE variants are bit combinations
because all modes are hard-coded constants.
Change it so the bit meanings are clear and use the symbols for creating
modes which combine bits.
While at it get rid of the ugly tail comments.
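Call sites can then compose modes from the named bits; a kernel-context fragment
(assumes a struct hrtimer t and a ktime_t kt in scope, shown only for the
composition):

	/* Bit-identical after this cleanup: */
	hrtimer_start(&t, kt, HRTIMER_MODE_REL | HRTIMER_MODE_PINNED);
	hrtimer_start(&t, kt, HRTIMER_MODE_REL_PINNED);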
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -28,13 +28,19 @@ struct hrtimer_cpu_base;
/*
* Mode arguments of xxx_hrtimer functions:
+ *
+ * HRTIMER_MODE_ABS - Time value is absolute
+ * HRTIMER_MODE_REL - Time value is relative to now
+ * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered
+ * when starting the timer)
*/
enum hrtimer_mode {
- HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */
- HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */
- HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */
- HRTIMER_MODE_ABS_PINNED = 0x02,
- HRTIMER_MODE_REL_PINNED = 0x03,
+ HRTIMER_MODE_ABS = 0x00,
+ HRTIMER_MODE_REL = 0x01,
+ HRTIMER_MODE_PINNED = 0x02,
+
+ HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
+ HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
};
/*

View File

@ -1,120 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Mar 2017 15:56:13 +0100
Subject: [PATCH 7/9] rtmutex: Fix PI chain order integrity
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
rt_mutex_waiter::prio is a copy of task_struct::prio which is updated
during the PI chain walk, such that the PI chain order isn't messed up
by (asynchronous) task state updates.
Currently rt_mutex_waiter_less() uses task state for deadline tasks;
this is broken, since the task state can, as said above, change
asynchronously, causing the RB tree order to change without actual
tree update -> FAIL.
Fix this by also copying the deadline into the rt_mutex_waiter state
and updating it along with its prio field.
Ideally we would also force PI chain updates whenever DL tasks update
their deadline parameter, but as a first approximation this is less
broken than it was.
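The tree invariant at stake: a node's sort key must not change while the node is
enqueued. A toy sketch of why the key is cached at enqueue time (hypothetical
names, plain C, not the kernel code):

struct waiter { int key; };	/* snapshot taken when the waiter is enqueued */

/* Compares cached keys only.  If this read the live task state instead,
 * a concurrent priority or deadline update would reorder nodes underneath
 * the RB tree without rebalancing it. */
static int waiter_less(const struct waiter *l, const struct waiter *r)
{
	return l->key < r->key;
}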
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.403992539@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/locking/rtmutex.c | 29 +++++++++++++++++++++++++++--
kernel/locking/rtmutex_common.h | 1 +
2 files changed, 28 insertions(+), 2 deletions(-)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -238,8 +238,7 @@ rt_mutex_waiter_less(struct rt_mutex_wai
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
- return dl_time_before(left->task->dl.deadline,
- right->task->dl.deadline);
+ return dl_time_before(left->deadline, right->deadline);
return 0;
}
@@ -650,7 +649,26 @@ static int rt_mutex_adjust_prio_chain(st
/* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
+
+ /*
+ * Update the waiter prio fields now that we're dequeued.
+ *
+ * These values can have changed through either:
+ *
+ * sys_sched_set_scheduler() / sys_sched_setattr()
+ *
+ * or
+ *
+ * DL CBS enforcement advancing the effective deadline.
+ *
+ * Even though pi_waiters also uses these fields, and that tree is only
+ * updated in [11], we can do this here, since we hold [L], which
+ * serializes all pi_waiters access and rb_erase() does not care about
+ * the values of the node being removed.
+ */
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
+
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
@@ -777,6 +795,8 @@ static int rt_mutex_adjust_prio_chain(st
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -902,6 +922,8 @@ static int task_blocks_on_rt_mutex(struc
struct rt_mutex *next_lock;
int chain_walk = 0, res;
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Early deadlock detection. We really don't want the task to
* enqueue on itself just to untangle the mess later. It's not
@@ -919,6 +941,7 @@ static int task_blocks_on_rt_mutex(struc
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -1036,6 +1059,8 @@ static void remove_waiter(struct rt_mute
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
struct rt_mutex *deadlock_lock;
#endif
int prio;
+ u64 deadline;
};
/*

View File

@ -0,0 +1,44 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:21 -0500
Subject: [PATCH 07/42] tracing: Clean up hist_field_flags enum
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
As we add more flags, specifying explicit integers for the flag values
becomes more unwieldy and error-prone - switch them over to left-shift
values.
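A self-contained illustration of the pattern (hypothetical flag names):

#include <stdio.h>

enum flags {
	FL_A = 1 << 0,
	FL_B = 1 << 1,
	FL_C = 1 << 2,	/* adding a flag is just the next shift */
};

int main(void)
{
	unsigned int f = FL_A | FL_C;

	/* Each value is a distinct bit by construction, so tests and
	 * combinations cannot silently overlap. */
	printf("A=%d B=%d C=%d\n", !!(f & FL_A), !!(f & FL_B), !!(f & FL_C));
	return 0;
}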
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace_events_hist.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -110,16 +110,16 @@ DEFINE_HIST_FIELD_FN(u8);
#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
enum hist_field_flags {
- HIST_FIELD_FL_HITCOUNT = 1,
- HIST_FIELD_FL_KEY = 2,
- HIST_FIELD_FL_STRING = 4,
- HIST_FIELD_FL_HEX = 8,
- HIST_FIELD_FL_SYM = 16,
- HIST_FIELD_FL_SYM_OFFSET = 32,
- HIST_FIELD_FL_EXECNAME = 64,
- HIST_FIELD_FL_SYSCALL = 128,
- HIST_FIELD_FL_STACKTRACE = 256,
- HIST_FIELD_FL_LOG2 = 512,
+ HIST_FIELD_FL_HITCOUNT = 1 << 0,
+ HIST_FIELD_FL_KEY = 1 << 1,
+ HIST_FIELD_FL_STRING = 1 << 2,
+ HIST_FIELD_FL_HEX = 1 << 3,
+ HIST_FIELD_FL_SYM = 1 << 4,
+ HIST_FIELD_FL_SYM_OFFSET = 1 << 5,
+ HIST_FIELD_FL_EXECNAME = 1 << 6,
+ HIST_FIELD_FL_SYSCALL = 1 << 7,
+ HIST_FIELD_FL_STACKTRACE = 1 << 8,
+ HIST_FIELD_FL_LOG2 = 1 << 9,
};
struct hist_trigger_attrs {

View File

@ -1,193 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:34 +0200
Subject: [PATCH 08/13] ACPI/processor: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
acpi_processor_get_throttling() requires invoking the getter function on
the target CPU. This is achieved by temporarily setting the affinity of the
calling user space thread to the requested CPU and resetting it to the original
affinity afterwards.
That's racy vs. CPU hotplug and concurrent affinity settings for that
thread, resulting in code executing on the wrong CPU and overwriting the
new affinity setting.
acpi_processor_get_throttling() is invoked in two ways:
1) The CPU online callback, which is already running on the target CPU and
obviously protected against hotplug and not affected by affinity
settings.
2) The ACPI driver probe function, which is not protected against hotplug
during modprobe.
Switch it over to work_on_cpu() and protect the probe function against CPU
hotplug.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: linux-acpi@vger.kernel.org
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.785920903@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/acpi/processor_driver.c | 7 +++-
drivers/acpi/processor_throttling.c | 62 ++++++++++++++++++++----------------
2 files changed, 42 insertions(+), 27 deletions(-)
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -262,11 +262,16 @@ static int __acpi_processor_start(struct
static int acpi_processor_start(struct device *dev)
{
struct acpi_device *device = ACPI_COMPANION(dev);
+ int ret;
if (!device)
return -ENODEV;
- return __acpi_processor_start(device);
+ /* Protect against concurrent CPU hotplug operations */
+ get_online_cpus();
+ ret = __acpi_processor_start(device);
+ put_online_cpus();
+ return ret;
}
static int acpi_processor_stop(struct device *dev)
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg {
#define THROTTLING_POSTCHANGE (2)
static int acpi_processor_get_throttling(struct acpi_processor *pr);
-int acpi_processor_set_throttling(struct acpi_processor *pr,
- int state, bool force);
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
+ int state, bool force, bool direct);
static int acpi_processor_update_tsd_coord(void)
{
@@ -891,7 +891,8 @@ static int acpi_processor_get_throttling
ACPI_DEBUG_PRINT((ACPI_DB_INFO,
"Invalid throttling state, reset\n"));
state = 0;
- ret = acpi_processor_set_throttling(pr, state, true);
+ ret = __acpi_processor_set_throttling(pr, state, true,
+ true);
if (ret)
return ret;
}
@@ -901,36 +902,31 @@ static int acpi_processor_get_throttling
return 0;
}
-static int acpi_processor_get_throttling(struct acpi_processor *pr)
+static long __acpi_processor_get_throttling(void *data)
{
- cpumask_var_t saved_mask;
- int ret;
+ struct acpi_processor *pr = data;
+
+ return pr->throttling.acpi_processor_get_throttling(pr);
+}
+static int acpi_processor_get_throttling(struct acpi_processor *pr)
+{
if (!pr)
return -EINVAL;
if (!pr->flags.throttling)
return -ENODEV;
- if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL))
- return -ENOMEM;
-
/*
- * Migrate task to the cpu pointed by pr.
+ * This is either called from the CPU hotplug callback of
+ * processor_driver or via the ACPI probe function. In the latter
+ * case the CPU is not guaranteed to be online. Both call sites are
+ * protected against CPU hotplug.
*/
- cpumask_copy(saved_mask, &current->cpus_allowed);
- /* FIXME: use work_on_cpu() */
- if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) {
- /* Can't migrate to the target pr->id CPU. Exit */
- free_cpumask_var(saved_mask);
+ if (!cpu_online(pr->id))
return -ENODEV;
- }
- ret = pr->throttling.acpi_processor_get_throttling(pr);
- /* restore the previous state */
- set_cpus_allowed_ptr(current, saved_mask);
- free_cpumask_var(saved_mask);
- return ret;
+ return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr);
}
static int acpi_processor_get_fadt_info(struct acpi_processor *pr)
@@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn
arg->target_state, arg->force);
}
-int acpi_processor_set_throttling(struct acpi_processor *pr,
- int state, bool force)
+static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct)
+{
+ if (direct)
+ return fn(arg);
+ return work_on_cpu(cpu, fn, arg);
+}
+
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
+ int state, bool force, bool direct)
{
int ret = 0;
unsigned int i;
@@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct
arg.pr = pr;
arg.target_state = state;
arg.force = force;
- ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, &arg);
+ ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, &arg,
+ direct);
} else {
/*
* When the T-state coordination is SW_ALL or HW_ALL,
@@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct
arg.pr = match_pr;
arg.target_state = state;
arg.force = force;
- ret = work_on_cpu(pr->id, acpi_processor_throttling_fn,
- &arg);
+ ret = call_on_cpu(pr->id, acpi_processor_throttling_fn,
+ &arg, direct);
}
}
/*
@@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct
return ret;
}
+int acpi_processor_set_throttling(struct acpi_processor *pr, int state,
+ bool force)
+{
+ return __acpi_processor_set_throttling(pr, state, force, false);
+}
+
int acpi_processor_get_throttling_info(struct acpi_processor *pr)
{
int result = 0;

View File

@ -1,358 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:55 +0100
Subject: [PATCH] futex: Pull rt_mutex_futex_unlock() out from under hb->lock
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 16ffa12d742534d4ff73e8b3a4e81c1de39196f0
There are a number of 'interesting' problems, all caused by holding
hb->lock while doing the rt_mutex_unlock() equivalent.
Notably:
- a PI inversion on hb->lock; and,
- a SCHED_DEADLINE crash because of pointer instability.
The previous changes:
- changed the locking rules to cover {uval,pi_state} with wait_lock.
- allowed doing rt_mutex_futex_unlock() without dropping wait_lock, which in
turn allows relying on wait_lock atomicity completely.
- simplified the waiter conundrum.
It's now sufficient to hold rtmutex::wait_lock and a reference on the
pi_state to protect the state consistency, so hb->lock can be dropped
before calling rt_mutex_futex_unlock().
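The resulting shape, pin the object and only then drop the outer lock, is a
standard refcount idiom; modelled in user-space C11 (a sketch, not the kernel
code):

#include <stdatomic.h>
#include <pthread.h>

struct obj { atomic_int refcount; };

static void unlock_path(pthread_mutex_t *bucket_lock, struct obj *o)
{
	atomic_fetch_add(&o->refcount, 1);	/* get_pi_state() analogue */
	pthread_mutex_unlock(bucket_lock);	/* hb->lock dropped early */

	/* ... the rt_mutex_futex_unlock() equivalent runs here, kept
	 * alive by the reference rather than by the bucket lock ... */

	atomic_fetch_sub(&o->refcount, 1);	/* put_pi_state() analogue */
}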
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 154 +++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 100 insertions(+), 54 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -921,10 +921,12 @@ void exit_pi_state_list(struct task_stru
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_futex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -1037,6 +1039,11 @@ static int attach_to_pi_state(u32 __user
* has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
* which in turn means that futex_lock_pi() still has a reference on
* our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
*/
WARN_ON(!atomic_read(&pi_state->refcount));
@@ -1380,48 +1387,40 @@ static void mark_wake_futex(struct wake_
smp_store_release(&q->lock_ptr, NULL);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = top_waiter->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool deboost = false;
DEFINE_WAKE_Q(wake_q);
- bool deboost;
int ret = 0;
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-
- /*
- * When we interleave with futex_lock_pi() where it does
- * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
- * but the rt_mutex's wait_list can be empty (either still, or again,
- * depending on which side we land).
- *
- * When this happens, give up our locks and try again, giving the
- * futex_lock_pi() instance time to complete, either by waiting on the
- * rtmutex or removing itself from the futex queue.
- */
if (!new_owner) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return -EAGAIN;
+ /*
+ * Since we held neither hb->lock nor wait_lock when coming
+ * into this function, we could have raced with futex_lock_pi()
+ * such that we might observe @this futex_q waiter, but the
+ * rt_mutex's wait_list can be empty (either still, or again,
+ * depending on which side we land).
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
}
/*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
@@ -1444,10 +1443,8 @@ static int wake_futex_pi(u32 __user *uad
ret = -EINVAL;
}
- if (ret) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+ if (ret)
+ goto out_unlock;
raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
@@ -1465,15 +1462,15 @@ static int wake_futex_pi(u32 __user *uad
*/
deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
if (deboost) {
wake_up_q(&wake_q);
rt_mutex_adjust_prio(current);
}
- return 0;
+ return ret;
}
/*
@@ -2232,7 +2229,8 @@ static int fixup_pi_state_owner(u32 __us
/*
* We are here either because we stole the rtmutex from the
* previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
+ * waiter but have failed to get the rtmutex the first time.
+ *
* We have to replace the newowner TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
@@ -2249,7 +2247,7 @@ static int fixup_pi_state_owner(u32 __us
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2345,6 +2343,10 @@ static int fixup_owner(u32 __user *uaddr
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
+ *
+ * We can safely read pi_state->owner without holding wait_lock
+ * because we now own the rt_mutex, only the owner will attempt
+ * to change it.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2584,6 +2586,7 @@ static int futex_lock_pi(u32 __user *uad
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2670,12 +2673,19 @@ static int futex_lock_pi(u32 __user *uad
* If fixup_owner() faulted and was unable to handle the fault, unlock
* it and return the fault to userspace.
*/
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
goto out_put_key;
out_unlock_put_key:
@@ -2738,10 +2748,36 @@ static int futex_unlock_pi(u32 __user *u
*/
top_waiter = futex_top_waiter(hb, &key);
if (top_waiter) {
- ret = wake_futex_pi(uaddr, uval, top_waiter, hb);
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ /*
+ * Grab a reference on the pi_state and drop hb->lock.
+ *
+ * The reference ensures pi_state lives, dropping the hb->lock
+ * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
+ * close the races against futex_lock_pi(), but in case of
+ * _any_ fail we'll abort and retry the whole deal.
+ */
+ get_pi_state(pi_state);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2756,7 +2792,6 @@ static int futex_unlock_pi(u32 __user *u
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2764,7 +2799,7 @@ static int futex_unlock_pi(u32 __user *u
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}
/*
@@ -2774,8 +2809,10 @@ static int futex_unlock_pi(u32 __user *u
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }
/*
* If uval has changed, let user space handle it.
@@ -2789,7 +2826,6 @@ static int futex_unlock_pi(u32 __user *u
return ret;
pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
@@ -2893,6 +2929,7 @@ static int futex_wait_requeue_pi(u32 __u
u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
@@ -2977,8 +3014,10 @@ static int futex_wait_requeue_pi(u32 __u
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -3017,13 +3056,20 @@ static int futex_wait_requeue_pi(u32 __u
* the fault, unlock the rt_mutex and return the fault to
* userspace.
*/
- if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_futex_unlock(pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling

View File

@ -1,42 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:39 +0200
Subject: [PATCH 08/17] mm: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
get_nid_for_pfn() checks for system_state == BOOTING to decide whether to
use early_pfn_to_nid() when CONFIG_DEFERRED_STRUCT_PAGE_INIT=y.
That check is dubious, because the switch to state RUNNING happens way after
page_alloc_init_late() has been invoked.
Change the check to 'less than SYSTEM_RUNNING' so it covers the new
intermediate states as well.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184735.528279534@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
drivers/base/node.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -377,7 +377,7 @@ static int __ref get_nid_for_pfn(unsigne
if (!pfn_valid_within(pfn))
return -1;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- if (system_state == SYSTEM_BOOTING)
+ if (system_state < SYSTEM_RUNNING)
return early_pfn_to_nid(pfn);
#endif
page = pfn_to_page(pfn);

View File

@ -1,100 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Mar 2017 15:56:14 +0100
Subject: [PATCH 8/9] rtmutex: Fix more prio comparisons
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
There was a pure ->prio comparison left in try_to_take_rt_mutex(),
convert it to use rt_mutex_waiter_less(), noting that greater-or-equal
is not-less (both in kernel priority view).
This necessitated the introduction of task_to_waiter(), which creates a
pointer to an unnamed stack variable of struct rt_mutex_waiter type to
compare against tasks.
With this, we can now also create and employ rt_mutex_waiter_equal().
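The compound-literal trick in isolation (self-contained, hypothetical struct
names):

#include <stdio.h>

struct task   { int prio; long deadline; };
struct waiter { int prio; long deadline; };

/* &(struct waiter){ ... } yields a pointer to an unnamed object that
 * lives until the end of the enclosing block. */
#define task_to_waiter(p) \
	(&(struct waiter){ .prio = (p)->prio, .deadline = (p)->deadline })

int main(void)
{
	struct task t = { .prio = 5, .deadline = 100 };

	printf("%d %ld\n", task_to_waiter(&t)->prio,
	       task_to_waiter(&t)->deadline);
	return 0;
}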
Reviewed-and-tested-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.455584638@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/locking/rtmutex.c | 32 +++++++++++++++++++++++++++++---
1 file changed, 29 insertions(+), 3 deletions(-)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(
}
#endif
+/*
+ * Only use with rt_mutex_waiter_{less,equal}()
+ */
+#define task_to_waiter(p) \
+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+
static inline int
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
@@ -243,6 +249,25 @@ rt_mutex_waiter_less(struct rt_mutex_wai
return 0;
}
+static inline int
+rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio != right->prio)
+ return 0;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 0 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return left->deadline == right->deadline;
+
+ return 1;
+}
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
@@ -553,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(st
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (waiter->prio == task->prio && !dl_task(task)) {
+ if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -856,7 +881,8 @@ static int try_to_take_rt_mutex(struct r
* the top waiter priority (kernel view),
* @task lost.
*/
- if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ if (!rt_mutex_waiter_less(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -1119,7 +1145,7 @@ void rt_mutex_adjust_pi(struct task_stru
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) {
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
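
The task_to_waiter() macro above relies on C99 compound literals; here is a
minimal stand-alone sketch of the same idiom with hypothetical names (plain
user-space C, not kernel code):

    #include <stdio.h>

    struct waiter {
            int prio;
            unsigned long long deadline;
    };

    /* Like task_to_waiter(): take the address of an unnamed object that
     * lives on the stack until the end of the enclosing block. */
    #define to_waiter(p, d) (&(struct waiter){ .prio = (p), .deadline = (d) })

    static int waiter_less(struct waiter *left, struct waiter *right)
    {
            return left->prio < right->prio;
    }

    int main(void)
    {
            struct waiter top = { .prio = 10 };

            /* compare a (prio, deadline) pair without a named temporary */
            printf("%d\n", waiter_less(to_waiter(5, 0ULL), &top)); /* 1 */
            return 0;
    }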

View File

@@ -1,7 +1,7 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
-Date: Mon, 26 Jun 2017 17:49:02 -0500
-Subject: [PATCH 01/32] tracing: Add hist_field_name() accessor
-Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
+Date: Fri, 22 Sep 2017 14:58:22 -0500
+Subject: [PATCH 08/42] tracing: Add hist_field_name() accessor
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
In preparation for hist_fields that won't be strictly based on
trace_event_fields, add a new hist_field_name() accessor to allow that
@@ -10,8 +10,8 @@ flexibility and update associated users.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
-kernel/trace/trace_events_hist.c | 68 ++++++++++++++++++++++++++-------------
-1 file changed, 46 insertions(+), 22 deletions(-)
+kernel/trace/trace_events_hist.c | 67 ++++++++++++++++++++++++++-------------
+1 file changed, 45 insertions(+), 22 deletions(-)
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -57,7 +57,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
sort_key = &hist_data->sort_keys[i];
-@@ -703,8 +721,11 @@ static int create_sort_keys(struct hist_
+@@ -703,8 +721,10 @@ static int create_sort_keys(struct hist_
}
for (j = 1; j < hist_data->n_fields; j++) {
@@ -65,13 +65,12 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
- if (field && (strcmp(field_name, field->name) == 0)) {
+ hist_field = hist_data->fields[j];
+ test_name = hist_field_name(hist_field, 0);
+ if (test_name == NULL)
+ continue;
+
+ if (strcmp(field_name, test_name) == 0) {
sort_key->field_idx = j;
descending = is_descending(field_str);
if (descending < 0) {
-@@ -952,6 +973,7 @@ hist_trigger_entry_print(struct seq_file
+@@ -952,6 +972,7 @@ hist_trigger_entry_print(struct seq_file
struct hist_field *key_field;
char str[KSYM_SYMBOL_LEN];
bool multiline = false;
@ -79,7 +78,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
unsigned int i;
u64 uval;
-@@ -963,26 +985,27 @@ hist_trigger_entry_print(struct seq_file
+@@ -963,26 +984,27 @@ hist_trigger_entry_print(struct seq_file
if (i > hist_data->n_vals)
seq_puts(m, ", ");
@ -115,7 +114,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
const char *syscall_name;
-@@ -991,8 +1014,8 @@ hist_trigger_entry_print(struct seq_file
+@@ -991,8 +1013,8 @@ hist_trigger_entry_print(struct seq_file
if (!syscall_name)
syscall_name = "unknown_syscall";
@ -126,7 +125,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
seq_puts(m, "stacktrace:\n");
hist_trigger_stacktrace_print(m,
-@@ -1000,15 +1023,14 @@ hist_trigger_entry_print(struct seq_file
+@@ -1000,15 +1022,14 @@ hist_trigger_entry_print(struct seq_file
HIST_STACKTRACE_DEPTH);
multiline = true;
} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
@ -145,7 +144,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
}
-@@ -1021,13 +1043,13 @@ hist_trigger_entry_print(struct seq_file
+@@ -1021,13 +1042,13 @@ hist_trigger_entry_print(struct seq_file
tracing_map_read_sum(elt, HITCOUNT_IDX));
for (i = 1; i < hist_data->n_vals; i++) {
@ -163,7 +162,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
tracing_map_read_sum(elt, i));
}
}
-@@ -1142,7 +1164,9 @@ static const char *get_hist_field_flags(
+@@ -1142,7 +1163,9 @@ static const char *get_hist_field_flags(
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{

View File

@ -0,0 +1,56 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:45 +0200
Subject: [PATCH 08/36] tracing: hrtimer: Take all clock bases and modes into
account
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
So far only CLOCK_MONOTONIC and CLOCK_REALTIME were taken into account as
well as HRTIMER_MODE_ABS/REL in the hrtimer_init tracepoint. The check used
to distinguish ABS from REL mode has been invalid since the introduction of
HRTIMER_MODE_PINNED.
HRTIMER_MODE_PINNED is not evaluated in the hrtimer_init() call, but for the
sake of completeness print all given modes.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/trace/events/timer.h | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -136,6 +136,20 @@ DEFINE_EVENT(timer_class, timer_cancel,
TP_ARGS(timer)
);
+#define decode_clockid(type) \
+ __print_symbolic(type, \
+ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \
+ { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" }, \
+ { CLOCK_BOOTTIME, "CLOCK_BOOTTIME" }, \
+ { CLOCK_TAI, "CLOCK_TAI" })
+
+#define decode_hrtimer_mode(mode) \
+ __print_symbolic(mode, \
+ { HRTIMER_MODE_ABS, "ABS" }, \
+ { HRTIMER_MODE_REL, "REL" }, \
+ { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \
+ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" })
+
/**
* hrtimer_init - called when the hrtimer is initialized
* @hrtimer: pointer to struct hrtimer
@@ -162,10 +176,8 @@ TRACE_EVENT(hrtimer_init,
),
TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
- __entry->clockid == CLOCK_REALTIME ?
- "CLOCK_REALTIME" : "CLOCK_MONOTONIC",
- __entry->mode == HRTIMER_MODE_ABS ?
- "HRTIMER_MODE_ABS" : "HRTIMER_MODE_REL")
+ decode_clockid(__entry->clockid),
+ decode_hrtimer_mode(__entry->mode))
);
/**
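
For illustration, the effect of the table-based decode in plain user-space C,
outside the tracepoint machinery (the numeric values mirror enum hrtimer_mode
in 4.14 and are assumptions of this sketch):

    #include <stdio.h>

    enum mode { ABS = 0, REL = 1, ABS_PINNED = 2, REL_PINNED = 3 };

    static const char *old_decode(enum mode m)
    {
            /* the removed ternary: anything other than ABS prints as REL */
            return m == ABS ? "HRTIMER_MODE_ABS" : "HRTIMER_MODE_REL";
    }

    static const char *new_decode(enum mode m)
    {
            switch (m) {    /* effectively what __print_symbolic() emits */
            case ABS:        return "ABS";
            case REL:        return "REL";
            case ABS_PINNED: return "ABS|PINNED";
            case REL_PINNED: return "REL|PINNED";
            }
            return "?";
    }

    int main(void)
    {
            /* a pinned absolute timer used to be reported as plain REL */
            printf("%s -> %s\n", old_decode(ABS_PINNED),
                   new_decode(ABS_PINNED));
            return 0;
    }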

View File

@ -1,209 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:55:03 +0200
Subject: [PATCH 09/13] cpufreq/ia64: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The get() and target() callbacks must run on the affected cpu. This is
achieved by temporarily setting the affinity of the calling thread to the
requested CPU and reset it to the original affinity afterwards.
That's racy vs. concurrent affinity settings for that thread resulting in
code executing on the wrong CPU and overwriting the new affinity setting.
Replace it by work_on_cpu(). All call paths which invoke the callbacks are
already protected against CPU hotplug.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: linux-pm@vger.kernel.org
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704122231100.2548@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/cpufreq/ia64-acpi-cpufreq.c | 92 +++++++++++++++---------------------
1 file changed, 39 insertions(+), 53 deletions(-)
--- a/drivers/cpufreq/ia64-acpi-cpufreq.c
+++ b/drivers/cpufreq/ia64-acpi-cpufreq.c
@@ -34,6 +34,11 @@ struct cpufreq_acpi_io {
unsigned int resume;
};
+struct cpufreq_acpi_req {
+ unsigned int cpu;
+ unsigned int state;
+};
+
static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS];
static struct cpufreq_driver acpi_cpufreq_driver;
@@ -83,8 +88,7 @@ processor_get_pstate (
static unsigned
extract_clock (
struct cpufreq_acpi_io *data,
- unsigned value,
- unsigned int cpu)
+ unsigned value)
{
unsigned long i;
@@ -98,60 +102,43 @@ extract_clock (
}
-static unsigned int
+static long
processor_get_freq (
- struct cpufreq_acpi_io *data,
- unsigned int cpu)
+ void *arg)
{
- int ret = 0;
- u32 value = 0;
- cpumask_t saved_mask;
- unsigned long clock_freq;
+ struct cpufreq_acpi_req *req = arg;
+ unsigned int cpu = req->cpu;
+ struct cpufreq_acpi_io *data = acpi_io_data[cpu];
+ u32 value;
+ int ret;
pr_debug("processor_get_freq\n");
-
- saved_mask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
if (smp_processor_id() != cpu)
- goto migrate_end;
+ return -EAGAIN;
/* processor_get_pstate gets the instantaneous frequency */
ret = processor_get_pstate(&value);
-
if (ret) {
- set_cpus_allowed_ptr(current, &saved_mask);
pr_warn("get performance failed with error %d\n", ret);
- ret = 0;
- goto migrate_end;
+ return ret;
}
- clock_freq = extract_clock(data, value, cpu);
- ret = (clock_freq*1000);
-
-migrate_end:
- set_cpus_allowed_ptr(current, &saved_mask);
- return ret;
+ return 1000 * extract_clock(data, value);
}
-static int
+static long
processor_set_freq (
- struct cpufreq_acpi_io *data,
- struct cpufreq_policy *policy,
- int state)
+ void *arg)
{
- int ret = 0;
- u32 value = 0;
- cpumask_t saved_mask;
- int retval;
+ struct cpufreq_acpi_req *req = arg;
+ unsigned int cpu = req->cpu;
+ struct cpufreq_acpi_io *data = acpi_io_data[cpu];
+ int ret, state = req->state;
+ u32 value;
pr_debug("processor_set_freq\n");
-
- saved_mask = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpumask_of(policy->cpu));
- if (smp_processor_id() != policy->cpu) {
- retval = -EAGAIN;
- goto migrate_end;
- }
+ if (smp_processor_id() != cpu)
+ return -EAGAIN;
if (state == data->acpi_data.state) {
if (unlikely(data->resume)) {
@@ -159,8 +146,7 @@ processor_set_freq (
data->resume = 0;
} else {
pr_debug("Already at target state (P%d)\n", state);
- retval = 0;
- goto migrate_end;
+ return 0;
}
}
@@ -171,7 +157,6 @@ processor_set_freq (
* First we write the target state's 'control' value to the
* control_register.
*/
-
value = (u32) data->acpi_data.states[state].control;
pr_debug("Transitioning to state: 0x%08x\n", value);
@@ -179,17 +164,11 @@ processor_set_freq (
ret = processor_set_pstate(value);
if (ret) {
pr_warn("Transition failed with error %d\n", ret);
- retval = -ENODEV;
- goto migrate_end;
+ return -ENODEV;
}
data->acpi_data.state = state;
-
- retval = 0;
-
-migrate_end:
- set_cpus_allowed_ptr(current, &saved_mask);
- return (retval);
+ return 0;
}
@@ -197,11 +176,13 @@ static unsigned int
acpi_cpufreq_get (
unsigned int cpu)
{
- struct cpufreq_acpi_io *data = acpi_io_data[cpu];
+ struct cpufreq_acpi_req req;
+ long ret;
- pr_debug("acpi_cpufreq_get\n");
+ req.cpu = cpu;
+ ret = work_on_cpu(cpu, processor_get_freq, &req);
- return processor_get_freq(data, cpu);
+ return ret > 0 ? (unsigned int) ret : 0;
}
@@ -210,7 +191,12 @@ acpi_cpufreq_target (
struct cpufreq_policy *policy,
unsigned int index)
{
- return processor_set_freq(acpi_io_data[policy->cpu], policy, index);
+ struct cpufreq_acpi_req req;
+
+ req.cpu = policy->cpu;
+ req.state = index;
+
+ return work_on_cpu(req.cpu, processor_set_freq, &req);
}
static int
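
The shape of the conversion, reduced to a sketch: work_on_cpu() is the real
API from <linux/workqueue.h>, while the request struct and the frequency read
below are illustrative stand-ins, not the driver code.

    #include <linux/workqueue.h>
    #include <linux/smp.h>

    struct freq_req {
            unsigned int cpu;
    };

    /* Runs in a kworker already bound to req->cpu, so no affinity
     * juggling on the calling thread is needed. */
    static long do_get_freq(void *arg)
    {
            struct freq_req *req = arg;

            if (smp_processor_id() != req->cpu)
                    return -EAGAIN;
            /* ... read the per-CPU frequency registers here ... */
            return 1000000; /* illustrative value in kHz */
    }

    static unsigned int get_freq(unsigned int cpu)
    {
            struct freq_req req = { .cpu = cpu };
            long ret = work_on_cpu(cpu, do_get_freq, &req);

            return ret > 0 ? (unsigned int)ret : 0;
    }

The sh-cpufreq conversion later in this series follows the same pattern.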

View File

@ -1,38 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:40 +0200
Subject: [PATCH 09/17] cpufreq/pasemi: Adjust system_state check
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in pas_cpufreq_cpu_exit() to handle the extra
states.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20170516184735.620023128@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
drivers/cpufreq/pasemi-cpufreq.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -226,7 +226,7 @@ static int pas_cpufreq_cpu_exit(struct c
* We don't support CPU hotplug. Don't unmap after the system
* has already made it to a running state.
*/
- if (system_state != SYSTEM_BOOTING)
+ if (system_state >= SYSTEM_RUNNING)
return 0;
if (sdcasr_mapbase)

View File

@ -1,80 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:56 +0100
Subject: [PATCH] futex,rt_mutex: Introduce rt_mutex_init_waiter()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 50809358dd7199aa7ce232f6877dd09ec30ef374
Since there are already two copies of this code, introduce a helper now
before adding a third one.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 5 +----
kernel/locking/rtmutex.c | 12 +++++++++---
kernel/locking/rtmutex_common.h | 1 +
3 files changed, 11 insertions(+), 7 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2956,10 +2956,7 @@ static int futex_wait_requeue_pi(u32 __u
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1153,6 +1153,14 @@ void rt_mutex_adjust_pi(struct task_stru
next_lock, NULL, task);
}
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
@@ -1235,9 +1243,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
unsigned long flags;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ rt_mutex_init_waiter(&waiter);
/*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -103,6 +103,7 @@ extern void rt_mutex_init_proxy_locked(s
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);

View File

@ -1,41 +0,0 @@
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 5 Apr 2017 10:08:27 +0200
Subject: [PATCH 9/9] rtmutex: Plug preempt count leak in
rt_mutex_futex_unlock()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
mark_wakeup_next_waiter() already disables preemption, doing so again
leaves us with an unpaired preempt_disable().
Fixes: 2a1c60299406 ("rtmutex: Deboost before waking up the top waiter")
Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/1491379707.6538.2.camel@gmx.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/locking/rtmutex.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1581,13 +1581,13 @@ bool __sched __rt_mutex_futex_unlock(str
return false; /* done */
}
- mark_wakeup_next_waiter(wake_q, lock);
/*
- * We've already deboosted, retain preempt_disabled when dropping
- * the wait_lock to avoid inversion until the wakeup. Matched
- * by rt_mutex_postunlock();
+ * We've already deboosted, mark_wakeup_next_waiter() will
+ * retain preempt_disabled when we drop the wait_lock, to
+ * avoid inversion prior to the wakeup. preempt_disable()
+ * therein pairs with rt_mutex_postunlock().
*/
- preempt_disable();
+ mark_wakeup_next_waiter(wake_q, lock);
return true; /* call postunlock() */
}

View File

@@ -1,7 +1,7 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
-Date: Mon, 26 Jun 2017 17:49:03 -0500
-Subject: [PATCH 02/32] tracing: Reimplement log2
-Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
+Date: Fri, 22 Sep 2017 14:58:23 -0500
+Subject: [PATCH 09/42] tracing: Reimplement log2
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
log2 as currently implemented applies only to u64 trace_event_field
derived fields, and assumes that anything it's applied to is a u64
@@ -79,7 +79,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ return;
+
+ for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
-+	destroy_hist_field(hist_field->operands[i], ++level);
++	destroy_hist_field(hist_field->operands[i], level + 1);
+
kfree(hist_field);
}

View File

@ -0,0 +1,114 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:46 +0200
Subject: [PATCH 09/36] tracing: hrtimer: Print hrtimer mode in hrtimer_start
tracepoint
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The hrtimer_start tracepoint lacks the mode information. The mode is
important because consecutive starts can switch from ABS to REL or from
PINNED to non-PINNED.
Add the mode information.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/trace/events/timer.h | 13 ++++++++-----
kernel/time/hrtimer.c | 16 +++++++++-------
2 files changed, 17 insertions(+), 12 deletions(-)
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -186,15 +186,16 @@ TRACE_EVENT(hrtimer_init,
*/
TRACE_EVENT(hrtimer_start,
- TP_PROTO(struct hrtimer *hrtimer),
+ TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),
- TP_ARGS(hrtimer),
+ TP_ARGS(hrtimer, mode),
TP_STRUCT__entry(
__field( void *, hrtimer )
__field( void *, function )
__field( s64, expires )
__field( s64, softexpires )
+ __field( enum hrtimer_mode, mode )
),
TP_fast_assign(
@@ -202,12 +203,14 @@ TRACE_EVENT(hrtimer_start,
__entry->function = hrtimer->function;
__entry->expires = hrtimer_get_expires(hrtimer);
__entry->softexpires = hrtimer_get_softexpires(hrtimer);
+ __entry->mode = mode;
),
- TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu",
- __entry->hrtimer, __entry->function,
+ TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu "
+ "mode=%s", __entry->hrtimer, __entry->function,
(unsigned long long) __entry->expires,
- (unsigned long long) __entry->softexpires)
+ (unsigned long long) __entry->softexpires,
+ decode_hrtimer_mode(__entry->mode))
);
/**
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -435,10 +435,11 @@ debug_init(struct hrtimer *timer, clocki
trace_hrtimer_init(timer, clockid, mode);
}
-static inline void debug_activate(struct hrtimer *timer)
+static inline void debug_activate(struct hrtimer *timer,
+ enum hrtimer_mode mode)
{
debug_hrtimer_activate(timer);
- trace_hrtimer_start(timer);
+ trace_hrtimer_start(timer, mode);
}
static inline void debug_deactivate(struct hrtimer *timer)
@@ -830,9 +831,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* Returns 1 when the new timer is the leftmost timer in the tree.
*/
static int enqueue_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
+ struct hrtimer_clock_base *base,
+ enum hrtimer_mode mode)
{
- debug_activate(timer);
+ debug_activate(timer, mode);
base->cpu_base->active_bases |= 1 << base->index;
@@ -955,7 +957,7 @@ void hrtimer_start_range_ns(struct hrtim
/* Switch the timer base, if necessary: */
new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
- leftmost = enqueue_hrtimer(timer, new_base);
+ leftmost = enqueue_hrtimer(timer, new_base, mode);
if (!leftmost)
goto unlock;
@@ -1224,7 +1226,7 @@ static void __run_hrtimer(struct hrtimer
*/
if (restart != HRTIMER_NORESTART &&
!(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base);
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
/*
* Separate the ->running assignment from the ->state assignment.
@@ -1623,7 +1625,7 @@ static void migrate_hrtimer_list(struct
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_base);
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
}
}

View File

@ -1,120 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:36 +0200
Subject: [PATCH 10/13] cpufreq/sh: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The target() callback must run on the affected cpu. This is achieved by
temporarily setting the affinity of the calling thread to the requested CPU
and reset it to the original affinity afterwards.
That's racy vs. concurrent affinity settings for that thread resulting in
code executing on the wrong CPU.
Replace it by work_on_cpu(). All call paths which invoke the callbacks are
already protected against CPU hotplug.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: linux-pm@vger.kernel.org
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.958216363@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/cpufreq/sh-cpufreq.c | 45 +++++++++++++++++++++++++------------------
1 file changed, 27 insertions(+), 18 deletions(-)
--- a/drivers/cpufreq/sh-cpufreq.c
+++ b/drivers/cpufreq/sh-cpufreq.c
@@ -30,54 +30,63 @@
static DEFINE_PER_CPU(struct clk, sh_cpuclk);
+struct cpufreq_target {
+ struct cpufreq_policy *policy;
+ unsigned int freq;
+};
+
static unsigned int sh_cpufreq_get(unsigned int cpu)
{
return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000;
}
-/*
- * Here we notify other drivers of the proposed change and the final change.
- */
-static int sh_cpufreq_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
+static long __sh_cpufreq_target(void *arg)
{
- unsigned int cpu = policy->cpu;
+ struct cpufreq_target *target = arg;
+ struct cpufreq_policy *policy = target->policy;
+ int cpu = policy->cpu;
struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu);
- cpumask_t cpus_allowed;
struct cpufreq_freqs freqs;
struct device *dev;
long freq;
- cpus_allowed = current->cpus_allowed;
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
- BUG_ON(smp_processor_id() != cpu);
+ if (smp_processor_id() != cpu)
+ return -ENODEV;
dev = get_cpu_device(cpu);
/* Convert target_freq from kHz to Hz */
- freq = clk_round_rate(cpuclk, target_freq * 1000);
+ freq = clk_round_rate(cpuclk, target->freq * 1000);
if (freq < (policy->min * 1000) || freq > (policy->max * 1000))
return -EINVAL;
- dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000);
+ dev_dbg(dev, "requested frequency %u Hz\n", target->freq * 1000);
freqs.old = sh_cpufreq_get(cpu);
freqs.new = (freq + 500) / 1000;
freqs.flags = 0;
- cpufreq_freq_transition_begin(policy, &freqs);
- set_cpus_allowed_ptr(current, &cpus_allowed);
+ cpufreq_freq_transition_begin(target->policy, &freqs);
clk_set_rate(cpuclk, freq);
- cpufreq_freq_transition_end(policy, &freqs, 0);
+ cpufreq_freq_transition_end(target->policy, &freqs, 0);
dev_dbg(dev, "set frequency %lu Hz\n", freq);
-
return 0;
}
+/*
+ * Here we notify other drivers of the proposed change and the final change.
+ */
+static int sh_cpufreq_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ struct cpufreq_target data = { .policy = policy, .freq = target_freq };
+
+ return work_on_cpu(policy->cpu, __sh_cpufreq_target, &data);
+}
+
static int sh_cpufreq_verify(struct cpufreq_policy *policy)
{
struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu);

View File

@ -1,159 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:57 +0100
Subject: [PATCH] futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 38d589f2fd08f1296aea3ce62bebd185125c6d81
With the ultimate goal of keeping rt_mutex wait_list and futex_q waiters
consistent it's necessary to split 'rt_mutex_futex_lock()' into finer
parts, such that only the actual blocking can be done without hb->lock
held.
Split rt_mutex_finish_proxy_lock() into two parts, one that does the
blocking and one that does remove_waiter() when the lock acquire failed.
When the rtmutex was acquired successfully the waiter can be removed in the
acquisition path safely, since there is no concurrency on the lock owner.
This means that, except for futex_lock_pi(), all wait_list modifications
are done with both hb->lock and wait_lock held.
[bigeasy@linutronix.de: fix for futex_requeue_pi_signal_restart]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104152.001659630@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 7 +++--
kernel/locking/rtmutex.c | 52 ++++++++++++++++++++++++++++++++++------
kernel/locking/rtmutex_common.h | 8 +++---
3 files changed, 55 insertions(+), 12 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3032,10 +3032,13 @@ static int futex_wait_requeue_pi(u32 __u
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
spin_lock(q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1753,21 +1753,23 @@ struct task_struct *rt_mutex_next_owner(
}
/**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
* been started.
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
*
* Returns:
* 0 - success
* <0 - error, one of -EINTR, -ETIMEDOUT
*
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
*/
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter)
{
@@ -1780,9 +1782,6 @@ int rt_mutex_finish_proxy_lock(struct rt
/* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
-
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
@@ -1793,3 +1792,42 @@ int rt_mutex_finish_proxy_lock(struct rt
return ret;
}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+}
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -107,9 +107,11 @@ extern void rt_mutex_init_waiter(struct
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
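
Condensed to the intended calling convention, the split looks roughly like
this; hb_lock() stands in for spin_lock(q.lock_ptr), and the wrapper
function is purely illustrative:

    /* hb_lock() stands in for taking q.lock_ptr (the hash-bucket lock) */
    static void hb_lock(void);

    static int wait_for_pi_lock(struct rt_mutex *pi_mutex,
                                struct hrtimer_sleeper *to,
                                struct rt_mutex_waiter *waiter)
    {
            int ret;

            /* block without holding hb->lock */
            ret = rt_mutex_wait_proxy_lock(pi_mutex, to, waiter);

            /* retake hb->lock before touching the wait list, so the
             * hb and rt_mutex wait lists stay consistent */
            hb_lock();
            if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, waiter))
                    ret = 0;        /* we became the owner after all */

            return ret;             /* caller drops hb->lock */
    }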

View File

@ -0,0 +1,80 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:47 +0200
Subject: [PATCH 10/36] hrtimer: Switch for loop to _ffs() evaluation
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
Looping over all clock bases to find active bits is suboptimal if not all
bases are active.
Avoid this by converting it to a __ffs() evaluation. The functionality is
moved out into its own function, which is called via a macro as suggested by
Peter Zijlstra.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/hrtimer.c | 31 +++++++++++++++++++++----------
1 file changed, 21 insertions(+), 10 deletions(-)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -457,20 +457,34 @@ static inline void hrtimer_update_next_t
#endif
}
+static struct hrtimer_clock_base *
+__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
+{
+ struct hrtimer_clock_base *base = NULL;
+
+ if (*active) {
+ unsigned int idx = __ffs(*active);
+ *active &= ~(1U << idx);
+ base = &cpu_base->clock_base[idx];
+ }
+
+ return base;
+}
+
+#define for_each_active_base(base, cpu_base, active) \
+ while ((base = __next_base((cpu_base), &(active))))
+
static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
+ struct hrtimer_clock_base *base;
unsigned int active = cpu_base->active_bases;
ktime_t expires, expires_next = KTIME_MAX;
hrtimer_update_next_timer(cpu_base, NULL);
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
struct hrtimer *timer;
- if (!(active & 0x01))
- continue;
-
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
@@ -1243,16 +1257,13 @@ static void __run_hrtimer(struct hrtimer
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
{
- struct hrtimer_clock_base *base = cpu_base->clock_base;
+ struct hrtimer_clock_base *base;
unsigned int active = cpu_base->active_bases;
- for (; active; base++, active >>= 1) {
+ for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *node;
ktime_t basenow;
- if (!(active & 0x01))
- continue;
-
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
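
The bit-walk itself as a stand-alone user-space sketch, where __builtin_ctz
plays the role the kernel's __ffs() plays above:

    #include <stdio.h>

    static void walk_active(unsigned int active)
    {
            /* same shape as for_each_active_base(): visit set bits only */
            while (active) {
                    unsigned int idx = __builtin_ctz(active);

                    active &= ~(1U << idx);
                    printf("clock base %u is active\n", idx);
            }
    }

    int main(void)
    {
            walk_active(0x5);       /* visits bases 0 and 2, skips 1 and 3 */
            return 0;
    }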

View File

@ -1,47 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:41 +0200
Subject: [PATCH 10/17] iommu/vt-d: Adjust system_state checks
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state checks in dmar_parse_one_atsr() and
dmar_iommu_notify_scope_dev() to handle the extra states.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Joerg Roedel <joro@8bytes.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: iommu@lists.linux-foundation.org
Link: http://lkml.kernel.org/r/20170516184735.712365947@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
drivers/iommu/intel-iommu.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -4310,7 +4310,7 @@ int dmar_parse_one_atsr(struct acpi_dmar
struct acpi_dmar_atsr *atsr;
struct dmar_atsr_unit *atsru;
- if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
+ if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
return 0;
atsr = container_of(hdr, struct acpi_dmar_atsr, header);
@@ -4560,7 +4560,7 @@ int dmar_iommu_notify_scope_dev(struct d
struct acpi_dmar_atsr *atsr;
struct acpi_dmar_reserved_memory *rmrr;
- if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
+ if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
return 0;
list_for_each_entry(rmrru, &dmar_rmrr_units, list) {

View File

@ -1,106 +0,0 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:11 -0500
Subject: [PATCH 10/32] tracing: Add NO_DISCARD event file flag
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Whenever an event_command has a post-trigger that needs access to the
event record, the event record can't be discarded, or the post-trigger
will eventually see bogus data.
In order to allow the discard check to treat this case separately, add
an EVENT_FILE_FL_NO_DISCARD flag to the event file flags, along with
code in the discard check that checks the flag and avoids the discard
when the flag is set.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/trace_events.h | 3 +++
kernel/trace/trace.h | 13 ++++++++++---
kernel/trace/trace_events_trigger.c | 16 +++++++++++++---
3 files changed, 26 insertions(+), 6 deletions(-)
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -306,6 +306,7 @@ enum {
EVENT_FILE_FL_TRIGGER_MODE_BIT,
EVENT_FILE_FL_TRIGGER_COND_BIT,
EVENT_FILE_FL_PID_FILTER_BIT,
+ EVENT_FILE_FL_NO_DISCARD_BIT,
};
/*
@@ -320,6 +321,7 @@ enum {
* TRIGGER_MODE - When set, invoke the triggers associated with the event
* TRIGGER_COND - When set, one or more triggers has an associated filter
* PID_FILTER - When set, the event is filtered based on pid
+ * NO_DISCARD - When set, do not discard events, something needs them later
*/
enum {
EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT),
@@ -331,6 +333,7 @@ enum {
EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT),
EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT),
+ EVENT_FILE_FL_NO_DISCARD = (1 << EVENT_FILE_FL_NO_DISCARD_BIT),
};
struct trace_event_file {
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1191,9 +1191,16 @@ static inline bool
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
*tt = event_triggers_call(file, entry, event);
- if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
- (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
- !filter_match_preds(file->filter, entry))) {
+ if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
+ !filter_match_preds(file->filter, entry)) {
+ __trace_event_discard_commit(buffer, event);
+ return true;
+ }
+
+ if (test_bit(EVENT_FILE_FL_NO_DISCARD_BIT, &file->flags))
+ return false;
+
+ if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) {
__trace_event_discard_commit(buffer, event);
return true;
}
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -505,20 +505,30 @@ clear_event_triggers(struct trace_array
void update_cond_flag(struct trace_event_file *file)
{
struct event_trigger_data *data;
- bool set_cond = false;
+ bool set_cond = false, set_no_discard = false;
list_for_each_entry_rcu(data, &file->triggers, list) {
if (data->filter || event_command_post_trigger(data->cmd_ops) ||
- event_command_needs_rec(data->cmd_ops)) {
+ event_command_needs_rec(data->cmd_ops))
set_cond = true;
+
+ if (event_command_post_trigger(data->cmd_ops) &&
+ event_command_needs_rec(data->cmd_ops))
+ set_no_discard = true;
+
+ if (set_cond && set_no_discard)
break;
- }
}
if (set_cond)
set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
else
clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags);
+
+ if (set_no_discard)
+ set_bit(EVENT_FILE_FL_NO_DISCARD_BIT, &file->flags);
+ else
+ clear_bit(EVENT_FILE_FL_NO_DISCARD_BIT, &file->flags);
}
/**
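
Restated as a compact sketch, the discard check after this patch orders its
tests as follows (the wrapper function is illustrative; the flag and helper
names are the ones from the patch):

    /* discard order: filter first, then honour NO_DISCARD, then the
     * soft-disable check */
    static bool should_discard(struct trace_event_file *file, void *entry)
    {
            if ((file->flags & EVENT_FILE_FL_FILTERED) &&
                !filter_match_preds(file->filter, entry))
                    return true;    /* filtered out: always discard */

            if (file->flags & EVENT_FILE_FL_NO_DISCARD)
                    return false;   /* a post-trigger still needs it */

            return test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
    }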

View File

@ -0,0 +1,114 @@
From: Vedang Patel <vedang.patel@intel.com>
Date: Fri, 22 Sep 2017 14:59:41 -0500
Subject: [PATCH 10/42] tracing: Add support to detect and avoid duplicates
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
A duplicate in the tracing_map hash table is when 2 different entries
have the same key and, as a result, the key_hash. This is possible due
to a race condition in the algorithm. This race condition is inherent to
the algorithm and not a bug. This was fine because, until now, we were
only interested in the sum of all the values related to a particular
key (the duplicates are dealt with in tracing_map_sort_entries()). But,
with the inclusion of variables[1], we are interested in individual
values, so it will not be clear what value to choose when
there are duplicates. Hence the duplicates need to be removed.
The duplicates can occur in the code in the following scenarios:
- A thread is in the process of adding a new element. It has
successfully executed cmpxchg() and inserted the key. But, it is still
not done acquiring the trace_map_elt struct, populating it and storing
the pointer to the struct in the value field of tracing_map hash table.
If another thread comes in at this time and wants to add an element with
the same key, it will not see the current element and add a new one.
- There are multiple threads trying to execute cmpxchg at the same time,
one of the threads will succeed and the others will fail. The ones which
fail will go ahead and increment 'idx' and add a new element there, creating
a duplicate.
This patch detects and avoids the first condition by asking the thread
which detects the duplicate to loop one more time. There is also a
possibility of an infinite loop if the thread which is trying to insert
goes to sleep indefinitely while the one which is trying to insert a new
element detects a duplicate, which is why the thread loops for
map_size iterations before returning NULL.
The second scenario is avoided by preventing the threads which failed
cmpxchg() from incrementing idx. This way, they will loop
around and check if the thread which succeeded in executing cmpxchg()
had the same key.
[1] http://lkml.kernel.org/r/cover.1498510759.git.tom.zanussi@linux.intel.com
Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/tracing_map.c | 41 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 5 deletions(-)
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -414,7 +414,9 @@ static inline struct tracing_map_elt *
__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
{
u32 idx, key_hash, test_key;
+ int dup_try = 0;
struct tracing_map_entry *entry;
+ struct tracing_map_elt *val;
key_hash = jhash(key, map->key_size, 0);
if (key_hash == 0)
@@ -426,11 +428,33 @@ static inline struct tracing_map_elt *
entry = TRACING_MAP_ENTRY(map->map, idx);
test_key = entry->key;
- if (test_key && test_key == key_hash && entry->val &&
- keys_match(key, entry->val->key, map->key_size)) {
- if (!lookup_only)
- atomic64_inc(&map->hits);
- return entry->val;
+ if (test_key && test_key == key_hash) {
+ val = READ_ONCE(entry->val);
+ if (val &&
+ keys_match(key, val->key, map->key_size)) {
+ if (!lookup_only)
+ atomic64_inc(&map->hits);
+ return val;
+ } else if (unlikely(!val)) {
+ /*
+ * The key is present. But, val (pointer to elt
+ * struct) is still NULL. which means some other
+ * thread is in the process of inserting an
+ * element.
+ *
+ * On top of that, it's key_hash is same as the
+ * one being inserted right now. So, it's
+ * possible that the element has the same
+ * key as well.
+ */
+
+ dup_try++;
+ if (dup_try > map->map_size) {
+ atomic64_inc(&map->drops);
+ break;
+ }
+ continue;
+ }
}
if (!test_key) {
@@ -452,6 +476,13 @@ static inline struct tracing_map_elt *
atomic64_inc(&map->hits);
return entry->val;
+ } else {
+ /*
+ * cmpxchg() failed. Loop around once
+ * more to check what key was inserted.
+ */
+ dup_try++;
+ continue;
}
}
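
Both windows can be reproduced with a toy user-space version of the insert
path; this is a sketch in C11 stdatomic with illustrative names and sizes,
not the kernel code:

    #include <stdatomic.h>
    #include <stdio.h>

    #define MAP_SIZE 8

    struct entry {
            _Atomic unsigned int key;   /* claimed via compare-exchange */
            _Atomic(void *) val;        /* published strictly afterwards */
    };

    static struct entry map[MAP_SIZE];

    static void *insert(unsigned int key_hash, void *elt)
    {
            unsigned int idx = key_hash % MAP_SIZE;
            int dup_try = 0;

            for (;;) {
                    struct entry *e = &map[idx];
                    unsigned int expect = 0;

                    if (atomic_load(&e->key) == key_hash) {
                            void *val = atomic_load(&e->val);

                            if (val)
                                    return val;     /* existing element */
                            /* key claimed but val not yet published:
                             * another inserter is mid-flight, so retry a
                             * bounded number of times instead of adding
                             * a duplicate */
                            if (++dup_try > MAP_SIZE)
                                    return NULL;    /* counted as a drop */
                            continue;
                    }
                    if (atomic_compare_exchange_strong(&e->key, &expect,
                                                       key_hash)) {
                            atomic_store(&e->val, elt); /* publish last */
                            return elt;
                    }
                    /* cmpxchg lost: if the winner used our key, re-check
                     * this slot without advancing idx */
                    if (expect == key_hash)
                            continue;
                    idx = (idx + 1) % MAP_SIZE; /* slot owned by other key */
            }
    }

    int main(void)
    {
            int elt = 42;

            printf("inserted: %p\n", insert(5u, &elt));
            return 0;
    }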

View File

@ -1,124 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:37 +0200
Subject: [PATCH 11/13] cpufreq/sparc-us3: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The access to the safari config register in the CPU frequency functions
must be executed on the target CPU. This is achieved by temporarily setting
the affinity of the calling user space thread to the requested CPU and
reset it to the original affinity afterwards.
That's racy vs. CPU hotplug and concurrent affinity settings for that
thread resulting in code executing on the wrong CPU and overwriting the
new affinity setting.
Replace it by a straightforward smp function call.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: linux-pm@vger.kernel.org
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201043.047558840@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/cpufreq/sparc-us3-cpufreq.c | 46 ++++++++++++------------------------
1 file changed, 16 insertions(+), 30 deletions(-)
--- a/drivers/cpufreq/sparc-us3-cpufreq.c
+++ b/drivers/cpufreq/sparc-us3-cpufreq.c
@@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_
#define SAFARI_CFG_DIV_32 0x0000000080000000UL
#define SAFARI_CFG_DIV_MASK 0x00000000C0000000UL
-static unsigned long read_safari_cfg(void)
+static void read_safari_cfg(void *arg)
{
- unsigned long ret;
+ unsigned long ret, *val = arg;
__asm__ __volatile__("ldxa [%%g0] %1, %0"
: "=&r" (ret)
: "i" (ASI_SAFARI_CONFIG));
- return ret;
+ *val = ret;
}
-static void write_safari_cfg(unsigned long val)
+static void update_safari_cfg(void *arg)
{
+ unsigned long reg, *new_bits = arg;
+
+ read_safari_cfg(&reg);
+ reg &= ~SAFARI_CFG_DIV_MASK;
+ reg |= *new_bits;
+
__asm__ __volatile__("stxa %0, [%%g0] %1\n\t"
"membar #Sync"
: /* no outputs */
- : "r" (val), "i" (ASI_SAFARI_CONFIG)
+ : "r" (reg), "i" (ASI_SAFARI_CONFIG)
: "memory");
}
@@ -78,29 +84,17 @@ static unsigned long get_current_freq(un
static unsigned int us3_freq_get(unsigned int cpu)
{
- cpumask_t cpus_allowed;
unsigned long reg;
- unsigned int ret;
-
- cpumask_copy(&cpus_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
- reg = read_safari_cfg();
- ret = get_current_freq(cpu, reg);
- set_cpus_allowed_ptr(current, &cpus_allowed);
-
- return ret;
+ if (smp_call_function_single(cpu, read_safari_cfg, &reg, 1))
+ return 0;
+ return get_current_freq(cpu, reg);
}
static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
{
unsigned int cpu = policy->cpu;
- unsigned long new_bits, new_freq, reg;
- cpumask_t cpus_allowed;
-
- cpumask_copy(&cpus_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ unsigned long new_bits, new_freq;
new_freq = sparc64_get_clock_tick(cpu) / 1000;
switch (index) {
@@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufre
BUG();
}
- reg = read_safari_cfg();
-
- reg &= ~SAFARI_CFG_DIV_MASK;
- reg |= new_bits;
- write_safari_cfg(reg);
-
- set_cpus_allowed_ptr(current, &cpus_allowed);
-
- return 0;
+ return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1);
}
static int __init us3_freq_cpu_init(struct cpufreq_policy *policy)
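
The replacement pattern in isolation: smp_call_function_single() is the real
API from <linux/smp.h>, while the config-register read is an illustrative
stand-in.

    #include <linux/smp.h>

    /* runs on the target CPU in interrupt context; must not sleep */
    static void read_cfg_on_cpu(void *arg)
    {
            unsigned long *val = arg;

            *val = 0;   /* ... ldxa from the Safari config register ... */
    }

    static unsigned long read_cfg(unsigned int cpu)
    {
            unsigned long reg = 0;

            /* wait=1: returns only after read_cfg_on_cpu() ran on @cpu */
            if (smp_call_function_single(cpu, read_cfg_on_cpu, &reg, 1))
                    return 0;
            return reg;
    }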

View File

@ -1,267 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:58 +0100
Subject: [PATCH] futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit cfafcd117da0216520568c195cb2f6cd1980c4bb
By changing futex_lock_pi() to use rt_mutex_*_proxy_lock() all wait_list
modifications are done under both hb->lock and wait_lock.
This closes the obvious interleave pattern between futex_lock_pi() and
futex_unlock_pi(), but not entirely so. See below:
Before:
futex_lock_pi()                 futex_unlock_pi()
  unlock hb->lock

                                lock hb->lock
                                unlock hb->lock

                                lock rt_mutex->wait_lock
                                unlock rt_mutex_wait_lock
                                  -EAGAIN

  lock rt_mutex->wait_lock
  list_add
  unlock rt_mutex->wait_lock

  schedule()

  lock rt_mutex->wait_lock
  list_del
  unlock rt_mutex->wait_lock

                                <idem>
                                  -EAGAIN

  lock hb->lock

After:
futex_lock_pi()                 futex_unlock_pi()

  lock hb->lock
  lock rt_mutex->wait_lock
  list_add
  unlock rt_mutex->wait_lock
  unlock hb->lock

  schedule()
  lock hb->lock
  unlock hb->lock
                                lock hb->lock
                                lock rt_mutex->wait_lock
                                list_del
                                unlock rt_mutex->wait_lock

                                lock rt_mutex->wait_lock
                                unlock rt_mutex_wait_lock
                                  -EAGAIN
  unlock hb->lock
It does however solve the earlier starvation/live-lock scenario which got
introduced with the -EAGAIN: unlike the before scenario, where the
-EAGAIN happens while futex_unlock_pi() doesn't hold any locks, in the
after scenario it happens while futex_unlock_pi() actually holds a lock,
and then it is serialized on that lock.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 77 ++++++++++++++++++++++++++++------------
kernel/locking/rtmutex.c | 26 +++----------
kernel/locking/rtmutex_common.h | 1
3 files changed, 62 insertions(+), 42 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2099,20 +2099,7 @@ queue_unlock(struct futex_hash_bucket *h
hb_waiters_dec(hb);
}
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -2129,6 +2116,24 @@ static inline void queue_me(struct futex
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}
@@ -2587,6 +2592,7 @@ static int futex_lock_pi(u32 __user *uad
{
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2639,25 +2645,52 @@ static int futex_lock_pi(u32 __user *uad
}
}
+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
+ if (trylock) {
ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}
+ /*
+ * We must add ourselves to the rt_mutex waitlist while holding hb->lock
+ * such that the hb and rt_mutex wait lists match.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ goto no_block;
+ }
+
+ spin_unlock(q.lock_ptr);
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
/*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1493,19 +1493,6 @@ int __sched rt_mutex_lock_interruptible(
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/*
- * Futex variant with full deadlock detection.
- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
- */
-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
-{
- might_sleep();
-
- return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
- timeout, RT_MUTEX_FULL_CHAINWALK);
-}
-
-/*
* Futex variant, must not use fastpath.
*/
int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
@@ -1782,12 +1769,6 @@ int rt_mutex_wait_proxy_lock(struct rt_m
/* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- /*
- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
- * have to fix that up.
- */
- fixup_rt_mutex_waiters(lock);
-
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -1827,6 +1808,13 @@ bool rt_mutex_cleanup_proxy_lock(struct
fixup_rt_mutex_waiters(lock);
cleanup = true;
}
+
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
raw_spin_unlock_irq(&lock->wait_lock);
return cleanup;
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -113,7 +113,6 @@ extern int rt_mutex_wait_proxy_lock(stru
extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);

View File

@ -0,0 +1,192 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:48 +0200
Subject: [PATCH 11/36] hrtimer: Store running timer in hrtimer_clock_base
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The pointer to the currently running timer is stored in hrtimer_cpu_base
before the base lock is dropped and the callback is invoked.
This results in two levels of indirection, and the upcoming support for
softirq based hrtimers requires splitting the "running" storage into soft
and hard irq context expiry.
Storing both in the cpu base would require conditionals in all code paths
accessing that information.
It's possible to have a per clock base sequence count and running pointer
without changing the semantics of the related mechanisms because the timer
base pointer cannot be changed while a timer is running the callback.
Unfortunately this makes the clock base larger than 32 bytes on 32bit
kernels. Instead of having huge gaps due to alignment, remove the alignment
and let the compiler pack the cpu base for 32bit. The resulting cache access
patterns are fortunately not really different from the current
behaviour. On 64bit kernels the 64byte alignment stays and the behaviour is
unchanged. This was determined by analyzing the resulting layout and
looking at the number of cache lines involved for the frequently used
clocks.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 20 +++++++++-----------
kernel/time/hrtimer.c | 28 +++++++++++++---------------
2 files changed, 22 insertions(+), 26 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -118,9 +118,9 @@ struct hrtimer_sleeper {
};
#ifdef CONFIG_64BIT
-# define HRTIMER_CLOCK_BASE_ALIGN 64
+# define __hrtimer_clock_base_align ____cacheline_aligned
#else
-# define HRTIMER_CLOCK_BASE_ALIGN 32
+# define __hrtimer_clock_base_align
#endif
/**
@@ -129,18 +129,22 @@ struct hrtimer_sleeper {
* @index: clock type index for per_cpu support when moving a
* timer to a base on another cpu.
* @clockid: clock id for per_cpu support
+ * @seq: seqcount around __run_hrtimer
+ * @running: pointer to the currently running hrtimer
* @active: red black tree root node for the active timers
* @get_time: function to retrieve the current time of the clock
* @offset: offset of this clock to the monotonic base
*/
struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
- int index;
+ unsigned int index;
clockid_t clockid;
+ seqcount_t seq;
+ struct hrtimer *running;
struct timerqueue_head active;
ktime_t (*get_time)(void);
ktime_t offset;
-} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
+} __hrtimer_clock_base_align;
enum hrtimer_base_type {
HRTIMER_BASE_MONOTONIC,
@@ -154,8 +158,6 @@ enum hrtimer_base_type {
* struct hrtimer_cpu_base - the per cpu clock bases
* @lock: lock protecting the base and associated clock bases
* and timers
- * @seq: seqcount around __run_hrtimer
- * @running: pointer to the currently running hrtimer
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
@@ -177,8 +179,6 @@ enum hrtimer_base_type {
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
- seqcount_t seq;
- struct hrtimer *running;
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
@@ -198,8 +198,6 @@ struct hrtimer_cpu_base {
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
- BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN);
-
timer->node.expires = time;
timer->_softexpires = time;
}
@@ -424,7 +422,7 @@ static inline int hrtimer_is_queued(stru
*/
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
- return timer->base->cpu_base->running == timer;
+ return timer->base->running == timer;
}
/* Forward a hrtimer so it expires after now: */
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -70,7 +70,6 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
- .seq = SEQCNT_ZERO(hrtimer_bases.seq),
.clock_base =
{
{
@@ -118,7 +117,6 @@ static const int hrtimer_clock_to_base_t
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .seq = SEQCNT_ZERO(migration_cpu_base),
.clock_base = { { .cpu_base = &migration_cpu_base, }, },
};
@@ -1150,19 +1148,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
*/
bool hrtimer_active(const struct hrtimer *timer)
{
- struct hrtimer_cpu_base *cpu_base;
+ struct hrtimer_clock_base *base;
unsigned int seq;
do {
- cpu_base = READ_ONCE(timer->base->cpu_base);
- seq = raw_read_seqcount_begin(&cpu_base->seq);
+ base = READ_ONCE(timer->base);
+ seq = raw_read_seqcount_begin(&base->seq);
if (timer->state != HRTIMER_STATE_INACTIVE ||
- cpu_base->running == timer)
+ base->running == timer)
return true;
- } while (read_seqcount_retry(&cpu_base->seq, seq) ||
- cpu_base != READ_ONCE(timer->base->cpu_base));
+ } while (read_seqcount_retry(&base->seq, seq) ||
+ base != READ_ONCE(timer->base));
return false;
}
@@ -1196,16 +1194,16 @@ static void __run_hrtimer(struct hrtimer
lockdep_assert_held(&cpu_base->lock);
debug_deactivate(timer);
- cpu_base->running = timer;
+ base->running = timer;
/*
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
fn = timer->function;
@@ -1246,13 +1244,13 @@ static void __run_hrtimer(struct hrtimer
* Separate the ->running assignment from the ->state assignment.
*
* As with a regular write barrier, this ensures the read side in
- * hrtimer_active() cannot observe cpu_base->running == NULL &&
+ * hrtimer_active() cannot observe base->running.timer == NULL &&
* timer->state == INACTIVE.
*/
- raw_write_seqcount_barrier(&cpu_base->seq);
+ raw_write_seqcount_barrier(&base->seq);
- WARN_ON_ONCE(cpu_base->running != timer);
- cpu_base->running = NULL;
+ WARN_ON_ONCE(base->running != timer);
+ base->running = NULL;
}
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
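
The read side this per-clock-base seq/running pair serves can be modelled in userspace; a minimal C11 sketch of the seqcount-protected "active" check (illustrative names and a simplified seqcount, not the kernel API):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct clock_base {
		atomic_uint seq;		/* bumped around the callback */
		void *_Atomic running;		/* timer currently in its callback */
	};

	struct timer {
		struct clock_base *_Atomic base;	/* may migrate between bases */
		atomic_int state;			/* 0 == inactive */
	};

	static bool timer_active(struct timer *t)
	{
		struct clock_base *base;
		unsigned int seq;

		do {
			base = atomic_load(&t->base);
			seq = atomic_load(&base->seq);
			if (atomic_load(&t->state) != 0 ||
			    atomic_load(&base->running) == t)
				return true;
			/* retry if the writer moved or the timer migrated */
		} while (atomic_load(&base->seq) != seq ||
			 base != atomic_load(&t->base));

		return false;
	}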


@@ -1,29 +0,0 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:12 -0500
Subject: [PATCH 11/32] tracing: Add post-trigger flag to hist trigger command
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Add EVENT_CMD_FL_POST_TRIGGER to the hist trigger cmd - it doesn't
affect the hist trigger results, and allows further events such as
synthetic events to be generated from a hist trigger.
Without this change, generating an event from a hist trigger will
cause the generated event to fail a ring buffer trace_recursive_lock()
check and return without actually logging the event.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace_events_hist.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1676,7 +1676,7 @@ static int event_hist_trigger_func(struc
static struct event_command trigger_hist_cmd = {
.name = "hist",
.trigger_type = ETT_EVENT_HIST,
- .flags = EVENT_CMD_FL_NEEDS_REC,
+ .flags = EVENT_CMD_FL_NEEDS_REC | EVENT_CMD_FL_POST_TRIGGER,
.func = event_hist_trigger_func,
.reg = hist_register_trigger,
.unreg = hist_unregister_trigger,


@@ -0,0 +1,188 @@
From: Vedang Patel <vedang.patel@intel.com>
Date: Fri, 22 Sep 2017 14:59:42 -0500
Subject: [PATCH 11/42] tracing: Remove code which merges duplicates
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
We now have the logic to detect and remove duplicates in the
tracing_map hash table. The code which merges duplicates in the
histogram is redundant now. So, modify this code just to detect
duplicates. The duplication detection code is still kept to ensure
that any rare race condition which might cause duplicates does not go
unnoticed.
Signed-off-by: Vedang Patel <vedang.patel@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace_events_hist.c | 11 -----
kernel/trace/tracing_map.c | 83 ++-------------------------------------
kernel/trace/tracing_map.h | 7 ---
3 files changed, 6 insertions(+), 95 deletions(-)
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -340,16 +340,6 @@ static int hist_trigger_elt_comm_alloc(s
return 0;
}
-static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
- struct tracing_map_elt *from)
-{
- char *comm_from = from->private_data;
- char *comm_to = to->private_data;
-
- if (comm_from)
- memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
-}
-
static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
{
char *comm = elt->private_data;
@@ -360,7 +350,6 @@ static void hist_trigger_elt_comm_init(s
static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
.elt_alloc = hist_trigger_elt_comm_alloc,
- .elt_copy = hist_trigger_elt_comm_copy,
.elt_free = hist_trigger_elt_comm_free,
.elt_init = hist_trigger_elt_comm_init,
};
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -847,67 +847,15 @@ create_sort_entry(void *key, struct trac
return sort_entry;
}
-static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
-{
- struct tracing_map_elt *dup_elt;
- unsigned int i;
-
- dup_elt = tracing_map_elt_alloc(elt->map);
- if (IS_ERR(dup_elt))
- return NULL;
-
- if (elt->map->ops && elt->map->ops->elt_copy)
- elt->map->ops->elt_copy(dup_elt, elt);
-
- dup_elt->private_data = elt->private_data;
- memcpy(dup_elt->key, elt->key, elt->map->key_size);
-
- for (i = 0; i < elt->map->n_fields; i++) {
- atomic64_set(&dup_elt->fields[i].sum,
- atomic64_read(&elt->fields[i].sum));
- dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
- }
-
- return dup_elt;
-}
-
-static int merge_dup(struct tracing_map_sort_entry **sort_entries,
- unsigned int target, unsigned int dup)
-{
- struct tracing_map_elt *target_elt, *elt;
- bool first_dup = (target - dup) == 1;
- int i;
-
- if (first_dup) {
- elt = sort_entries[target]->elt;
- target_elt = copy_elt(elt);
- if (!target_elt)
- return -ENOMEM;
- sort_entries[target]->elt = target_elt;
- sort_entries[target]->elt_copied = true;
- } else
- target_elt = sort_entries[target]->elt;
-
- elt = sort_entries[dup]->elt;
-
- for (i = 0; i < elt->map->n_fields; i++)
- atomic64_add(atomic64_read(&elt->fields[i].sum),
- &target_elt->fields[i].sum);
-
- sort_entries[dup]->dup = true;
-
- return 0;
-}
-
-static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+static void detect_dups(struct tracing_map_sort_entry **sort_entries,
int n_entries, unsigned int key_size)
{
unsigned int dups = 0, total_dups = 0;
- int err, i, j;
+ int i;
void *key;
if (n_entries < 2)
- return total_dups;
+ return;
sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
(int (*)(const void *, const void *))cmp_entries_dup, NULL);
@@ -916,30 +864,14 @@ static int merge_dups(struct tracing_map
for (i = 1; i < n_entries; i++) {
if (!memcmp(sort_entries[i]->key, key, key_size)) {
dups++; total_dups++;
- err = merge_dup(sort_entries, i - dups, i);
- if (err)
- return err;
continue;
}
key = sort_entries[i]->key;
dups = 0;
}
- if (!total_dups)
- return total_dups;
-
- for (i = 0, j = 0; i < n_entries; i++) {
- if (!sort_entries[i]->dup) {
- sort_entries[j] = sort_entries[i];
- if (j++ != i)
- sort_entries[i] = NULL;
- } else {
- destroy_sort_entry(sort_entries[i]);
- sort_entries[i] = NULL;
- }
- }
-
- return total_dups;
+ WARN_ONCE(total_dups > 0,
+ "Duplicates detected: %d\n", total_dups);
}
static bool is_key(struct tracing_map *map, unsigned int field_idx)
@@ -1065,10 +997,7 @@ int tracing_map_sort_entries(struct trac
return 1;
}
- ret = merge_dups(entries, n_entries, map->key_size);
- if (ret < 0)
- goto free;
- n_entries -= ret;
+ detect_dups(entries, n_entries, map->key_size);
if (is_key(map, sort_keys[0].field_idx))
cmp_entries_fn = cmp_entries_key;
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -215,11 +215,6 @@ struct tracing_map {
* Element allocation occurs before tracing begins, when the
* tracing_map_init() call is made by client code.
*
- * @elt_copy: At certain points in the lifetime of an element, it may
- * need to be copied. The copy should include a copy of the
- * client-allocated data, which can be copied into the 'to'
- * element from the 'from' element.
- *
* @elt_free: When a tracing_map_elt is freed, this function is called
* and allows client-allocated per-element data to be freed.
*
@@ -233,8 +228,6 @@ struct tracing_map {
*/
struct tracing_map_ops {
int (*elt_alloc)(struct tracing_map_elt *elt);
- void (*elt_copy)(struct tracing_map_elt *to,
- struct tracing_map_elt *from);
void (*elt_free)(struct tracing_map_elt *elt);
void (*elt_clear)(struct tracing_map_elt *elt);
void (*elt_init)(struct tracing_map_elt *elt);
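
The detect-only pass is the classic sort-then-compare-adjacent idiom; a runnable userspace sketch with plain int keys standing in for tracing_map keys:

	#include <stdlib.h>

	static int cmp_int(const void *a, const void *b)
	{
		int x = *(const int *)a, y = *(const int *)b;

		return (x > y) - (x < y);
	}

	/* returns the number of duplicate keys; callers may warn if != 0 */
	static int detect_dups(int *keys, int n)
	{
		int i, dups = 0;

		if (n < 2)
			return 0;
		qsort(keys, n, sizeof(*keys), cmp_int);
		for (i = 1; i < n; i++)
			if (keys[i] == keys[i - 1])
				dups++;
		return dups;
	}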


@@ -1,61 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:43 +0200
Subject: [PATCH 12/17] async: Adjust system_state checks
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in async_run_entry_fn() and
async_synchronize_cookie_domain() to handle the extra states.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184735.865155020@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/async.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -114,14 +114,14 @@ static void async_run_entry_fn(struct wo
ktime_t uninitialized_var(calltime), delta, rettime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
@@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(asy
{
ktime_t uninitialized_var(starttime), delta, endtime;
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state == SYSTEM_BOOTING) {
+ if (initcall_debug && system_state < SYSTEM_RUNNING) {
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
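
Why "< SYSTEM_RUNNING" is the robust form follows from the enum ordering; a sketch, where SYSTEM_SCHEDULING stands for an early state this series inserts between BOOTING and RUNNING:

	enum system_states {
		SYSTEM_BOOTING,
		SYSTEM_SCHEDULING,	/* inserted early state */
		SYSTEM_RUNNING,
		SYSTEM_HALT,
	};

	static int in_early_boot(enum system_states s)
	{
		/* true for BOOTING and every state inserted before RUNNING */
		return s < SYSTEM_RUNNING;
	}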


@@ -1,129 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 13 Apr 2017 10:22:43 +0200
Subject: [PATCH 12/13] cpufreq/sparc-us2e: Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The access to the HBIRD_ESTAR_MODE register in the cpu frequency control
functions must happen on the target CPU. This is achieved by temporarily
setting the affinity of the calling user space thread to the requested CPU
and reset it to the original affinity afterwards.
That's racy vs. CPU hotplug and concurrent affinity settings for that
thread, resulting in code executing on the wrong CPU and overwriting the
new affinity setting.
Replace it with a straightforward smp function call.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: linux-pm@vger.kernel.org
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704131020280.2408@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/cpufreq/sparc-us2e-cpufreq.c | 45 ++++++++++++++++-------------------
1 file changed, 21 insertions(+), 24 deletions(-)
--- a/drivers/cpufreq/sparc-us2e-cpufreq.c
+++ b/drivers/cpufreq/sparc-us2e-cpufreq.c
@@ -118,10 +118,6 @@ static void us2e_transition(unsigned lon
unsigned long clock_tick,
unsigned long old_divisor, unsigned long divisor)
{
- unsigned long flags;
-
- local_irq_save(flags);
-
estar &= ~ESTAR_MODE_DIV_MASK;
/* This is based upon the state transition diagram in the IIe manual. */
@@ -152,8 +148,6 @@ static void us2e_transition(unsigned lon
} else {
BUG();
}
-
- local_irq_restore(flags);
}
static unsigned long index_to_estar_mode(unsigned int index)
@@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(un
return ret;
}
+static void __us2e_freq_get(void *arg)
+{
+ unsigned long *estar = arg;
+
+ *estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
+}
+
static unsigned int us2e_freq_get(unsigned int cpu)
{
- cpumask_t cpus_allowed;
unsigned long clock_tick, estar;
- cpumask_copy(&cpus_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
clock_tick = sparc64_get_clock_tick(cpu) / 1000;
- estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
-
- set_cpus_allowed_ptr(current, &cpus_allowed);
+ if (smp_call_function_single(cpu, __us2e_freq_get, &estar, 1))
+ return 0;
return clock_tick / estar_to_divisor(estar);
}
-static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+static void __us2e_freq_target(void *arg)
{
- unsigned int cpu = policy->cpu;
+ unsigned int cpu = smp_processor_id();
+ unsigned int *index = arg;
unsigned long new_bits, new_freq;
unsigned long clock_tick, divisor, old_divisor, estar;
- cpumask_t cpus_allowed;
-
- cpumask_copy(&cpus_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000;
- new_bits = index_to_estar_mode(index);
- divisor = index_to_divisor(index);
+ new_bits = index_to_estar_mode(*index);
+ divisor = index_to_divisor(*index);
new_freq /= divisor;
estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
old_divisor = estar_to_divisor(estar);
- if (old_divisor != divisor)
+ if (old_divisor != divisor) {
us2e_transition(estar, new_bits, clock_tick * 1000,
old_divisor, divisor);
+ }
+}
- set_cpus_allowed_ptr(current, &cpus_allowed);
+static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+{
+ unsigned int cpu = policy->cpu;
- return 0;
+ return smp_call_function_single(cpu, __us2e_freq_target, &index, 1);
}
static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy)
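
The shape of the conversion, as a kernel-style sketch (read_target_reg() is a placeholder, not a real helper): hand a small function to smp_call_function_single() and let the IPI run it on the right CPU, instead of rebinding current:

	static void __query_cpu(void *arg)
	{
		unsigned long *val = arg;

		*val = read_target_reg();	/* executes on the target CPU */
	}

	static int query_cpu(unsigned int cpu, unsigned long *val)
	{
		/* wait == 1: return only after the helper ran remotely */
		return smp_call_function_single(cpu, __query_cpu, val, 1);
	}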


@@ -1,81 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:35:59 +0100
Subject: [PATCH] futex: Futex_unlock_pi() determinism
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit bebe5b514345f09be2c15e414d076b02ecb9cce8
The problem with returning -EAGAIN when the waiter state mismatches is that
it becomes very hard to prove a bounded execution time for the
operation. And seeing that this is an RT operation, this is somewhat
important.
While in practice, given the previous patch, it is very unlikely to ever
take more than one or two rounds, proving so becomes rather hard.
However, now that modifying wait_list is done while holding both hb->lock
and wait_lock, the scenario can be avoided entirely by acquiring wait_lock
while still holding hb->lock. Doing a hand-over, without leaving a hole.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104152.112378812@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1398,15 +1398,10 @@ static int wake_futex_pi(u32 __user *uad
DEFINE_WAKE_Q(wake_q);
int ret = 0;
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
- if (!new_owner) {
+ if (WARN_ON_ONCE(!new_owner)) {
/*
- * Since we held neither hb->lock nor wait_lock when coming
- * into this function, we could have raced with futex_lock_pi()
- * such that we might observe @this futex_q waiter, but the
- * rt_mutex's wait_list can be empty (either still, or again,
- * depending on which side we land).
+ * As per the comment in futex_unlock_pi() this should not happen.
*
* When this happens, give up our locks and try again, giving
* the futex_lock_pi() instance time to complete, either by
@@ -2794,15 +2789,18 @@ static int futex_unlock_pi(u32 __user *u
if (pi_state->owner != current)
goto out_unlock;
+ get_pi_state(pi_state);
/*
- * Grab a reference on the pi_state and drop hb->lock.
+ * Since modifying the wait_list is done while holding both
+ * hb->lock and wait_lock, holding either is sufficient to
+ * observe it.
*
- * The reference ensures pi_state lives, dropping the hb->lock
- * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
- * close the races against futex_lock_pi(), but in case of
- * _any_ fail we'll abort and retry the whole deal.
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
*/
- get_pi_state(pi_state);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
ret = wake_futex_pi(uaddr, uval, pi_state);
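
The hand-over itself is independent of futexes; a runnable userspace sketch with pthread mutexes standing in for hb->lock and pi_mutex.wait_lock:

	#include <pthread.h>

	static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* hb->lock */
	static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* wait_lock */

	static void handover_update(void (*update)(void))
	{
		pthread_mutex_lock(&outer);
		pthread_mutex_lock(&inner);	/* take inner before ... */
		pthread_mutex_unlock(&outer);	/* ... dropping outer: no hole */

		update();	/* peers observe a consistent state */
		pthread_mutex_unlock(&inner);
	}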


@@ -0,0 +1,34 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:50 +0200
Subject: [PATCH 12/36] hrtimer: Make room in struct hrtimer_cpu_base
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The upcoming softirq based hrtimers support requires an additional field in
the hrtimer_cpu_base struct, which would grow the struct size beyond a
cache line.
The struct members nr_retries and nr_hangs of hrtimer_cpu_base are solely
used for diagnostic output and have no requirement to be unsigned int.
Make them unsigned short to create room for the new struct member. No
functional change.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -189,8 +189,8 @@ struct hrtimer_cpu_base {
ktime_t expires_next;
struct hrtimer *next_timer;
unsigned int nr_events;
- unsigned int nr_retries;
- unsigned int nr_hangs;
+ unsigned short nr_retries;
+ unsigned short nr_hangs;
unsigned int max_hang_time;
#endif
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
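
The packing effect is easy to verify in isolation; a runnable userspace sketch (field names borrowed for illustration only):

	#include <stdio.h>

	struct before { unsigned int   nr_retries, nr_hangs; unsigned int max_hang_time; };
	struct after  { unsigned short nr_retries, nr_hangs; unsigned int max_hang_time; };

	int main(void)
	{
		/* typically prints "12 -> 8": the two shorts share one slot */
		printf("%zu -> %zu\n", sizeof(struct before), sizeof(struct after));
		return 0;
	}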


@@ -1,8 +1,8 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:04 -0500
Subject: [PATCH 03/32] ring-buffer: Add interface for setting absolute time
Date: Fri, 22 Sep 2017 14:59:43 -0500
Subject: [PATCH 12/42] ring-buffer: Add interface for setting absolute time
stamps
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
Define a new function, tracing_set_time_stamp_abs(), which can be used
to enable or disable the use of absolute timestamps rather than time
@@ -15,17 +15,18 @@ Only the interface is added here; a subsequent patch will add the
underlying implementation.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Baohong Liu <baohong.liu@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/ring_buffer.h | 2 ++
kernel/trace/ring_buffer.c | 11 +++++++++++
kernel/trace/trace.c | 25 ++++++++++++++++++++++++-
kernel/trace/trace.h | 2 ++
4 files changed, 39 insertions(+), 1 deletion(-)
kernel/trace/trace.c | 40 +++++++++++++++++++++++++++++++++++++++-
kernel/trace/trace.h | 3 +++
4 files changed, 55 insertions(+), 1 deletion(-)
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -180,6 +180,8 @@ void ring_buffer_normalize_time_stamp(st
@@ -181,6 +181,8 @@ void ring_buffer_normalize_time_stamp(st
int cpu, u64 *ts);
void ring_buffer_set_clock(struct ring_buffer *buffer,
u64 (*clock)(void));
@@ -36,7 +37,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -484,6 +484,7 @@ struct ring_buffer {
@@ -485,6 +485,7 @@ struct ring_buffer {
u64 (*clock)(void);
struct rb_irq_work irq_work;
@@ -44,7 +45,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
};
struct ring_buffer_iter {
@@ -1378,6 +1379,16 @@ void ring_buffer_set_clock(struct ring_b
@@ -1379,6 +1380,16 @@ void ring_buffer_set_clock(struct ring_b
buffer->clock = clock;
}
@@ -63,7 +64,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2082,7 +2082,7 @@ trace_event_buffer_lock_reserve(struct r
@@ -2269,7 +2269,7 @@ trace_event_buffer_lock_reserve(struct r
*current_rb = trace_file->tr->trace_buffer.buffer;
@@ -72,14 +73,30 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
(EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
(entry = this_cpu_read(trace_buffered_event))) {
/* Try to use the per cpu buffer first */
@@ -5959,6 +5959,29 @@ static int tracing_clock_open(struct ino
@@ -6297,6 +6297,44 @@ static int tracing_clock_open(struct ino
return ret;
}
+
+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+{
+ int ret = 0;
+
+ mutex_lock(&trace_types_lock);
+
+ if (abs && tr->time_stamp_abs_ref++)
+ goto out;
+
+ if (!abs) {
+ if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (--tr->time_stamp_abs_ref)
+ goto out;
+ }
+
+ ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
+
+ /*
@@ -89,22 +106,29 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ if (tr->max_buffer.buffer)
+ ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
+ tracing_reset_online_cpus(&tr->max_buffer);
+#endif
+
+ out:
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+ return ret;
+}
+
struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -278,6 +278,8 @@ extern struct mutex trace_types_lock;
@@ -273,6 +273,7 @@ struct trace_array {
/* function tracing enabled */
int function_enabled;
#endif
+ int time_stamp_abs_ref;
};
enum {
@@ -286,6 +287,8 @@ extern struct mutex trace_types_lock;
extern int trace_array_get(struct trace_array *tr);
extern void trace_array_put(struct trace_array *tr);
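
tracing_set_time_stamp_abs() above is an instance of the refcounted on/off idiom: only the first enable and the last disable touch the underlying setting. A minimal userspace sketch (the mutex is omitted for brevity):

	static int abs_ref;

	static int set_abs(int abs, void (*apply)(int))
	{
		if (abs && abs_ref++)
			return 0;		/* already switched on */
		if (!abs) {
			if (!abs_ref)
				return -1;	/* unbalanced disable */
			if (--abs_ref)
				return 0;	/* still needed by others */
		}
		apply(abs);	/* first enable or last disable only */
		return 0;
	}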


@@ -1,95 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 13 Apr 2017 10:20:23 +0200
Subject: [PATCH 13/13] crypto: N2 - Replace racy task affinity logic
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
spu_queue_register() needs to invoke setup functions on a particular
CPU. This is achieved by temporarily setting the affinity of the
calling user space thread to the requested CPU and reset it to the original
affinity afterwards.
That's racy vs. CPU hotplug and concurrent affinity settings for that
thread, resulting in code executing on the wrong CPU and overwriting the
new affinity setting.
Replace it by using work_on_cpu_safe() which guarantees to run the code on
the requested CPU or to fail in case the CPU is offline.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linux-crypto@vger.kernel.org
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1704131019420.2408@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
drivers/crypto/n2_core.c | 31 ++++++++++++++++---------------
1 file changed, 16 insertions(+), 15 deletions(-)
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -65,6 +65,11 @@ struct spu_queue {
struct list_head list;
};
+struct spu_qreg {
+ struct spu_queue *queue;
+ unsigned long type;
+};
+
static struct spu_queue **cpu_to_cwq;
static struct spu_queue **cpu_to_mau;
@@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void)
kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
}
-static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+static long spu_queue_register_workfn(void *arg)
{
- cpumask_var_t old_allowed;
+ struct spu_qreg *qr = arg;
+ struct spu_queue *p = qr->queue;
+ unsigned long q_type = qr->type;
unsigned long hv_ret;
- if (cpumask_empty(&p->sharing))
- return -EINVAL;
-
- if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
- return -ENOMEM;
-
- cpumask_copy(old_allowed, &current->cpus_allowed);
-
- set_cpus_allowed_ptr(current, &p->sharing);
-
hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q),
CWQ_NUM_ENTRIES, &p->qhandle);
if (!hv_ret)
sun4v_ncs_sethead_marker(p->qhandle, 0);
- set_cpus_allowed_ptr(current, old_allowed);
+ return hv_ret ? -EINVAL : 0;
+}
- free_cpumask_var(old_allowed);
+static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+{
+ int cpu = cpumask_any_and(&p->sharing, cpu_online_mask);
+ struct spu_qreg qr = { .queue = p, .type = q_type };
- return (hv_ret ? -EINVAL : 0);
+ return work_on_cpu_safe(cpu, spu_queue_register_workfn, &qr);
}
static int spu_queue_setup(struct spu_queue *p)
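
The same recipe as a kernel-style sketch (do_setup() and setup_hw() are placeholders for the per-CPU work, not the driver's actual code):

	static long do_setup(void *arg)
	{
		/* runs on the chosen CPU; work_on_cpu_safe() pins it online */
		return setup_hw(arg) ? -EINVAL : 0;
	}

	static long setup_on_mask(const struct cpumask *mask, void *arg)
	{
		int cpu = cpumask_any_and(mask, cpu_online_mask);

		return work_on_cpu_safe(cpu, do_setup, arg);
	}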


@@ -1,36 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:44 +0200
Subject: [PATCH 13/17] extable: Adjust system_state checks
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in core_kernel_text() to handle the extra
states, i.e. to cover init text up to the point where the system switches
to state RUNNING.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170516184735.949992741@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/extable.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr)
addr < (unsigned long)_etext)
return 1;
- if (system_state == SYSTEM_BOOTING &&
+ if (system_state < SYSTEM_RUNNING &&
init_kernel_text(addr))
return 1;
return 0;


@@ -1,204 +0,0 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 22 Mar 2017 11:36:00 +0100
Subject: [PATCH] futex: Drop hb->lock before enqueueing on the rtmutex
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Upstream commit 56222b212e8edb1cf51f5dd73ff645809b082b40
When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution the PI
chain code will (falsely) report a deadlock and BUG.
The problem is that it holds hb->lock (now an rt_mutex) while doing
task_blocks_on_rt_mutex() on the futex's pi_state::rtmutex. This, when
interleaved just right with futex_unlock_pi(), leads it to believe it sees
an AB-BA deadlock.
Task1 (holds rt_mutex, Task2 (does FUTEX_LOCK_PI)
does FUTEX_UNLOCK_PI)
lock hb->lock
lock rt_mutex (as per start_proxy)
lock hb->lock
Which is a trivial AB-BA.
It is not an actual deadlock, because it won't be holding hb->lock by the
time it actually blocks on the rt_mutex, but the chainwalk code doesn't
know that and it would be a nightmare to handle this gracefully.
To avoid this problem, do the same as in futex_unlock_pi() and drop
hb->lock after acquiring wait_lock. This still fully serializes against
futex_unlock_pi(), since adding to the wait_list does the very same lock
dance, and removing it holds both locks.
Aside from solving the RT problem, this makes the lock and unlock
mechanisms symmetric and reduces the hb->lock hold time.
Reported-and-tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104152.161341537@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/futex.c | 30 +++++++++++++++++-------
kernel/locking/rtmutex.c | 49 ++++++++++++++++++++++------------------
kernel/locking/rtmutex_common.h | 3 ++
3 files changed, 52 insertions(+), 30 deletions(-)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2654,20 +2654,33 @@ static int futex_lock_pi(u32 __user *uad
goto no_block;
}
+ rt_mutex_init_waiter(&rt_waiter);
+
/*
- * We must add ourselves to the rt_mutex waitlist while holding hb->lock
- * such that the hb and rt_mutex wait lists match.
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even though we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling rt_mutex_start_proxy_lock(). This still fully
+ * serializes against futex_unlock_pi() as that does the exact same
+ * lock handoff sequence.
*/
- rt_mutex_init_waiter(&rt_waiter);
- ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
if (ret) {
if (ret == 1)
ret = 0;
+ spin_lock(q.lock_ptr);
goto no_block;
}
- spin_unlock(q.lock_ptr);
if (unlikely(to))
hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
@@ -2680,6 +2693,9 @@ static int futex_lock_pi(u32 __user *uad
* first acquire the hb->lock before removing the lock from the
* rt_mutex waitqueue, such that we can keep the hb and rt_mutex
* wait lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
*/
if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
ret = 0;
@@ -2791,10 +2807,6 @@ static int futex_unlock_pi(u32 __user *u
get_pi_state(pi_state);
/*
- * Since modifying the wait_list is done while holding both
- * hb->lock and wait_lock, holding either is sufficient to
- * observe it.
- *
* By taking wait_lock while still holding hb->lock, we ensure
* there is no point where we hold neither; and therefore
* wake_futex_pi() must observe a state consistent with what we
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1669,31 +1669,14 @@ void rt_mutex_proxy_unlock(struct rt_mut
rt_mutex_set_owner(lock, NULL);
}
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
- raw_spin_lock_irq(&lock->wait_lock);
-
- if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock_irq(&lock->wait_lock);
+ if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
- }
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1712,12 +1695,36 @@ int rt_mutex_start_proxy_lock(struct rt_
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
/**
* rt_mutex_next_owner - return the next owner of the lock
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
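
The __rt_mutex_start_proxy_lock()/rt_mutex_start_proxy_lock() split follows the usual kernel convention: the double-underscore variant expects the caller to hold the lock. Schematically (illustrative, not the patched code):

	/* __start_op(): caller holds lock->wait_lock */

	int start_op(struct rt_mutex *lock)
	{
		int ret;

		raw_spin_lock_irq(&lock->wait_lock);
		ret = __start_op(lock);
		raw_spin_unlock_irq(&lock->wait_lock);

		return ret;
	}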


@@ -0,0 +1,150 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:51 +0200
Subject: [PATCH 13/36] hrtimer: Reduce conditional code (hres_active)
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The hrtimer_cpu_base struct has the CONFIG_HIGH_RES_TIMERS conditional
struct member hres_active. All functions related to this member are
conditional as well.
There is no functional change when the hres_active member is made
unconditional, together with all related functions, and is set to zero
during initialization.
The conditional code sections can be avoided by adding IS_ENABLED(HIGHRES)
conditionals into common functions, which ensures dead code elimination.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 20 ++++++++------------
kernel/time/hrtimer.c | 31 +++++++++++++++----------------
2 files changed, 23 insertions(+), 28 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -161,8 +161,8 @@ enum hrtimer_base_type {
* @cpu: cpu number
* @active_bases: Bitfield to mark bases with active timers
* @clock_was_set_seq: Sequence counter of clock was set events
- * @in_hrtirq: hrtimer_interrupt() is currently executing
* @hres_active: State of high resolution mode
+ * @in_hrtirq: hrtimer_interrupt() is currently executing
* @hang_detected: The last hrtimer interrupt detected a hang
* @expires_next: absolute time of the next event, is required for remote
* hrtimer enqueue
@@ -182,9 +182,9 @@ struct hrtimer_cpu_base {
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
+ unsigned int hres_active : 1;
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int in_hrtirq : 1,
- hres_active : 1,
hang_detected : 1;
ktime_t expires_next;
struct hrtimer *next_timer;
@@ -266,16 +266,17 @@ static inline ktime_t hrtimer_cb_get_tim
return timer->base->get_time();
}
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
+ timer->base->cpu_base->hres_active : 0;
+}
+
#ifdef CONFIG_HIGH_RES_TIMERS
struct clock_event_device;
extern void hrtimer_interrupt(struct clock_event_device *dev);
-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
- return timer->base->cpu_base->hres_active;
-}
-
/*
* The resolution of the clocks. The resolution value is returned in
* the clock_getres() system call to give application programmers an
@@ -298,11 +299,6 @@ extern unsigned int hrtimer_resolution;
#define hrtimer_resolution (unsigned int)LOW_RES_NSEC
-static inline int hrtimer_is_hres_active(struct hrtimer *timer)
-{
- return 0;
-}
-
static inline void clock_was_set_delayed(void) { }
#endif
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -512,6 +512,20 @@ static inline ktime_t hrtimer_update_bas
offs_real, offs_boot, offs_tai);
}
+/*
+ * Is the high resolution mode active ?
+ */
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+ return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ?
+ cpu_base->hres_active : 0;
+}
+
+static inline int hrtimer_hres_active(void)
+{
+ return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
+}
+
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -541,19 +555,6 @@ static inline int hrtimer_is_hres_enable
}
/*
- * Is the high resolution mode active ?
- */
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
-{
- return cpu_base->hres_active;
-}
-
-static inline int hrtimer_hres_active(void)
-{
- return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
-}
-
-/*
* Reprogram the event source with checking both queues for the
* next event
* Called with interrupts disabled and base->lock held
@@ -661,7 +662,6 @@ static void hrtimer_reprogram(struct hrt
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next = KTIME_MAX;
- base->hres_active = 0;
}
/*
@@ -720,8 +720,6 @@ void clock_was_set_delayed(void)
#else
-static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
-static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
static inline void
@@ -1602,6 +1600,7 @@ int hrtimers_prepare_cpu(unsigned int cp
}
cpu_base->cpu = cpu;
+ cpu_base->hres_active = 0;
hrtimer_init_hres(cpu_base);
return 0;
}


@@ -1,8 +1,8 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:05 -0500
Subject: [PATCH 04/32] ring-buffer: Redefine the unimplemented
Date: Fri, 22 Sep 2017 14:59:44 -0500
Subject: [PATCH 13/42] ring-buffer: Redefine the unimplemented
RINGBUF_TYPE_TIME_STAMP
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
RINGBUF_TYPE_TIME_STAMP is defined but not used, and from what I can
gather was reserved for something like an absolute timestamp feature
@@ -24,13 +24,13 @@ previous interface patch.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/ring_buffer.h | 12 ++--
kernel/trace/ring_buffer.c | 107 +++++++++++++++++++++++++++++++-------------
2 files changed, 83 insertions(+), 36 deletions(-)
include/linux/ring_buffer.h | 12 ++---
kernel/trace/ring_buffer.c | 105 +++++++++++++++++++++++++++++++-------------
2 files changed, 83 insertions(+), 34 deletions(-)
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -36,10 +36,12 @@ struct ring_buffer_event {
@@ -37,10 +37,12 @@ struct ring_buffer_event {
* array[0] = time delta (28 .. 59)
* size = 8 bytes
*
@ -47,7 +47,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
*
* <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
* Data record
@@ -56,12 +58,12 @@ enum ring_buffer_type {
@@ -57,12 +59,12 @@ enum ring_buffer_type {
RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
RINGBUF_TYPE_PADDING,
RINGBUF_TYPE_TIME_EXTEND,
@@ -72,7 +72,14 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
trace_seq_printf(s, "\tdata max type_len == %d\n",
RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
@@ -147,6 +149,9 @@ enum {
@@ -141,12 +143,15 @@ int ring_buffer_print_entry_header(struc
enum {
RB_LEN_TIME_EXTEND = 8,
- RB_LEN_TIME_STAMP = 16,
+ RB_LEN_TIME_STAMP = 8,
};
#define skip_time_extend(event) \
((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
@@ -82,19 +89,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -187,10 +192,8 @@ rb_event_length(struct ring_buffer_event
return event->array[0] + RB_EVNT_HDR_SIZE;
case RINGBUF_TYPE_TIME_EXTEND:
- return RB_LEN_TIME_EXTEND;
-
case RINGBUF_TYPE_TIME_STAMP:
- return RB_LEN_TIME_STAMP;
+ return RB_LEN_TIME_EXTEND;
case RINGBUF_TYPE_DATA:
return rb_event_data_length(event);
@@ -210,7 +213,7 @@ rb_event_ts_length(struct ring_buffer_ev
@@ -210,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_ev
{
unsigned len = 0;
@ -103,7 +98,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/* time extends include the data event after it */
len = RB_LEN_TIME_EXTEND;
event = skip_time_extend(event);
@@ -232,7 +235,7 @@ unsigned ring_buffer_event_length(struct
@@ -232,7 +237,7 @@ unsigned ring_buffer_event_length(struct
{
unsigned length;
@ -112,7 +107,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
event = skip_time_extend(event);
length = rb_event_length(event);
@@ -249,7 +252,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_leng
@@ -249,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_leng
static __always_inline void *
rb_event_data(struct ring_buffer_event *event)
{
@ -121,7 +116,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
@@ -276,6 +279,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data
@@ -276,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
@ -149,7 +144,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/* Flag when events were overwritten */
#define RB_MISSED_EVENTS (1 << 31)
/* Missed count stored at end */
@@ -2219,13 +2243,16 @@ rb_move_tail(struct ring_buffer_per_cpu
@@ -2220,13 +2246,16 @@ rb_move_tail(struct ring_buffer_per_cpu
}
/* Slow path, do not inline */
@ -171,7 +166,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2268,7 +2295,9 @@ rb_update_event(struct ring_buffer_per_c
@@ -2269,7 +2298,9 @@ rb_update_event(struct ring_buffer_per_c
* add it to the start of the resevered space.
*/
if (unlikely(info->add_timestamp)) {
@ -182,7 +177,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
@@ -2456,7 +2485,7 @@ static __always_inline void rb_end_commi
@@ -2457,7 +2488,7 @@ static __always_inline void rb_end_commi
static inline void rb_event_discard(struct ring_buffer_event *event)
{
@ -191,7 +186,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
event = skip_time_extend(event);
/* array[0] holds the actual length for the discarded event */
@@ -2487,6 +2516,10 @@ rb_update_write_stamp(struct ring_buffer
@@ -2488,6 +2519,10 @@ rb_update_write_stamp(struct ring_buffer
{
u64 delta;
@ -202,7 +197,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
* The event first in the commit queue updates the
* time stamp.
@@ -2500,9 +2533,7 @@ rb_update_write_stamp(struct ring_buffer
@@ -2501,9 +2536,7 @@ rb_update_write_stamp(struct ring_buffer
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
@ -213,7 +208,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
cpu_buffer->write_stamp += delta;
} else
cpu_buffer->write_stamp += event->time_delta;
@@ -2686,7 +2717,7 @@ static struct ring_buffer_event *
@@ -2657,7 +2690,7 @@ static struct ring_buffer_event *
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
@ -222,7 +217,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
info->delta = 0;
/* See if we shot pass the end of this buffer page */
@@ -2764,8 +2795,11 @@ rb_reserve_next_event(struct ring_buffer
@@ -2735,8 +2768,11 @@ rb_reserve_next_event(struct ring_buffer
/* make sure this diff is calculated here */
barrier();
@ -236,7 +231,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
info.delta = diff;
if (unlikely(test_time_stamp(info.delta)))
rb_handle_timestamp(cpu_buffer, &info);
@@ -3447,14 +3481,12 @@ rb_update_read_stamp(struct ring_buffer_
@@ -3418,14 +3454,12 @@ rb_update_read_stamp(struct ring_buffer_
return;
case RINGBUF_TYPE_TIME_EXTEND:
@ -253,7 +248,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
return;
case RINGBUF_TYPE_DATA:
@@ -3478,14 +3510,12 @@ rb_update_iter_read_stamp(struct ring_bu
@@ -3449,14 +3483,12 @@ rb_update_iter_read_stamp(struct ring_bu
return;
case RINGBUF_TYPE_TIME_EXTEND:
@ -270,7 +265,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
return;
case RINGBUF_TYPE_DATA:
@@ -3709,6 +3739,8 @@ rb_buffer_peek(struct ring_buffer_per_cp
@@ -3680,6 +3712,8 @@ rb_buffer_peek(struct ring_buffer_per_cp
struct buffer_page *reader;
int nr_loops = 0;
@ -279,7 +274,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
again:
/*
* We repeat when a time extend is encountered.
@@ -3745,12 +3777,17 @@ rb_buffer_peek(struct ring_buffer_per_cp
@@ -3716,12 +3750,17 @@ rb_buffer_peek(struct ring_buffer_per_cp
goto again;
case RINGBUF_TYPE_TIME_STAMP:
@ -299,7 +294,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
*ts = cpu_buffer->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
@@ -3775,6 +3812,9 @@ rb_iter_peek(struct ring_buffer_iter *it
@@ -3746,6 +3785,9 @@ rb_iter_peek(struct ring_buffer_iter *it
struct ring_buffer_event *event;
int nr_loops = 0;
@ -309,7 +304,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
@@ -3827,12 +3867,17 @@ rb_iter_peek(struct ring_buffer_iter *it
@@ -3798,12 +3840,17 @@ rb_iter_peek(struct ring_buffer_iter *it
goto again;
case RINGBUF_TYPE_TIME_STAMP:


@@ -0,0 +1,36 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:52 +0200
Subject: [PATCH 14/36] hrtimer: Use accessor functions instead of direct access
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
__hrtimer_hres_active() is now available unconditionally. Replace the
direct access to hrtimer_cpu_base.hres_active.
No functional change.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/hrtimer.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -564,7 +564,7 @@ hrtimer_force_reprogram(struct hrtimer_c
{
ktime_t expires_next;
- if (!cpu_base->hres_active)
+ if (!__hrtimer_hres_active(cpu_base))
return;
expires_next = __hrtimer_get_next_event(cpu_base);
@@ -673,7 +673,7 @@ static void retrigger_next_event(void *a
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
- if (!base->hres_active)
+ if (!__hrtimer_hres_active(base))
return;
raw_spin_lock(&base->lock);


@@ -1,35 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:45 +0200
Subject: [PATCH 14/17] printk: Adjust system_state checks
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in boot_delay_msec() to handle the extra
states.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170516184736.027534895@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/printk/printk.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1176,7 +1176,7 @@ static void boot_delay_msec(int level)
unsigned long long k;
unsigned long timeout;
- if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+ if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
|| suppress_message_printing(level)) {
return;
}


@@ -1,8 +1,8 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:06 -0500
Subject: [PATCH 05/32] tracing: Give event triggers access to
Date: Fri, 22 Sep 2017 14:59:45 -0500
Subject: [PATCH 14/42] tracing: Give event triggers access to
ring_buffer_event
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
The ring_buffer event can provide a timestamp that may be useful to
various triggers - pass it into the handlers for that purpose.
@@ -18,7 +18,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -400,11 +400,13 @@ enum event_trigger_type {
@@ -402,11 +402,13 @@ enum event_trigger_type {
extern int filter_match_preds(struct event_filter *filter, void *rec);
@@ -37,7 +37,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
@@ -424,7 +426,7 @@ trace_trigger_soft_disabled(struct trace
@@ -426,7 +428,7 @@ trace_trigger_soft_disabled(struct trace
if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
@@ -48,7 +48,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (eflags & EVENT_FILE_FL_PID_FILTER)
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1189,7 +1189,7 @@ static inline bool
@@ -1296,7 +1296,7 @@ static inline bool
unsigned long eflags = file->flags;
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
@@ -57,7 +57,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
(unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1226,7 +1226,7 @@ event_trigger_unlock_commit(struct trace
@@ -1333,7 +1333,7 @@ event_trigger_unlock_commit(struct trace
trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
if (tt)
@@ -66,7 +66,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
/**
@@ -1259,7 +1259,7 @@ event_trigger_unlock_commit_regs(struct
@@ -1366,7 +1366,7 @@ event_trigger_unlock_commit_regs(struct
irq_flags, pc, regs);
if (tt)
@@ -75,7 +75,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -1482,7 +1482,8 @@ extern int register_trigger_hist_enable_
@@ -1591,7 +1591,8 @@ extern int register_trigger_hist_enable_
*/
struct event_trigger_ops {
void (*func)(struct event_trigger_data *data,
@ -87,7 +87,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
void (*free)(struct event_trigger_ops *ops,
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -921,7 +921,8 @@ static inline void add_to_key(char *comp
@@ -909,7 +909,8 @@ static inline void add_to_key(char *comp
memcpy(compound_key + key_field->offset, key, size);
}
@ -97,7 +97,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
@@ -1672,7 +1673,8 @@ static struct event_command trigger_hist
@@ -1660,7 +1661,8 @@ static struct event_command trigger_hist
}
static void
@ -107,7 +107,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
struct enable_trigger_data *enable_data = data->private_data;
struct event_trigger_data *test;
@@ -1688,7 +1690,8 @@ hist_enable_trigger(struct event_trigger
@@ -1676,7 +1678,8 @@ hist_enable_trigger(struct event_trigger
}
static void
@ -117,7 +117,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
if (!data->count)
return;
@@ -1696,7 +1699,7 @@ hist_enable_count_trigger(struct event_t
@@ -1684,7 +1687,7 @@ hist_enable_count_trigger(struct event_t
if (data->count != -1)
(data->count)--;
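
The shape of the interface change, as a compilable sketch (stub declarations stand in for the kernel types; the extra parameter is the point):

/* Stub declarations standing in for the kernel's types. */
struct event_trigger_data;
struct ring_buffer_event;

struct event_trigger_ops {
        /* After this change the trigger callback also receives the
         * ring_buffer_event, so handlers can use per-event metadata
         * such as the ring buffer timestamp. */
        void (*func)(struct event_trigger_data *data, void *rec,
                     struct ring_buffer_event *rbe);
};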

View File

@ -0,0 +1,129 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:53 +0200
Subject: [PATCH 15/36] hrtimer: Make the remote enqueue check unconditional
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
hrtimer_cpu_base.expires_next is used to cache the next event armed in the
timer hardware. The value is used to check whether an hrtimer can be
enqueued remotely. If the new hrtimer is expiring before expires_next, then
remote enqueue is not possible as the remote hrtimer hardware cannot be
accessed for reprogramming to an earlier expiry time.
The remote enqueue check is currently conditional on
CONFIG_HIGH_RES_TIMERS=y and hrtimer_cpu_base.hres_active. There is no
compelling reason to make this conditional.
Move hrtimer_cpu_base.expires_next out of the CONFIG_HIGH_RES_TIMERS=y
guarded area and remove the conditionals in hrtimer_check_target().
The check is currently a NOOP for the CONFIG_HIGH_RES_TIMERS=n and the
!hrtimer_cpu_base.hres_active case because in these cases nothing updates
hrtimer_cpu_base.expires_next yet. This will be changed with later patches
which further reduce the #ifdef zoo in this code.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 6 +++---
kernel/time/hrtimer.c | 26 ++++++--------------------
2 files changed, 9 insertions(+), 23 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -164,13 +164,13 @@ enum hrtimer_base_type {
* @hres_active: State of high resolution mode
* @in_hrtirq: hrtimer_interrupt() is currently executing
* @hang_detected: The last hrtimer interrupt detected a hang
- * @expires_next: absolute time of the next event, is required for remote
- * hrtimer enqueue
* @next_timer: Pointer to the first expiring timer
* @nr_events: Total number of hrtimer interrupt events
* @nr_retries: Total number of hrtimer interrupt retries
* @nr_hangs: Total number of hrtimer interrupt hangs
* @max_hang_time: Maximum time spent in hrtimer_interrupt
+ * @expires_next: absolute time of the next event, is required for remote
+ * hrtimer enqueue
* @clock_base: array of clock bases for this cpu
*
* Note: next_timer is just an optimization for __remove_hrtimer().
@@ -186,13 +186,13 @@ struct hrtimer_cpu_base {
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int in_hrtirq : 1,
hang_detected : 1;
- ktime_t expires_next;
struct hrtimer *next_timer;
unsigned int nr_events;
unsigned short nr_retries;
unsigned short nr_hangs;
unsigned int max_hang_time;
#endif
+ ktime_t expires_next;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -154,26 +154,21 @@ struct hrtimer_clock_base *lock_hrtimer_
}
/*
- * With HIGHRES=y we do not migrate the timer when it is expiring
- * before the next event on the target cpu because we cannot reprogram
- * the target cpu hardware and we would cause it to fire late.
+ * We do not migrate the timer when it is expiring before the next
+ * event on the target cpu. When high resolution is enabled, we cannot
+ * reprogram the target cpu hardware and we would cause it to fire
+ * late. To keep it simple, we handle the high resolution enabled and
+ * disabled case similar.
*
* Called with cpu_base->lock of target cpu held.
*/
static int
hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
{
-#ifdef CONFIG_HIGH_RES_TIMERS
ktime_t expires;
- if (!new_base->cpu_base->hres_active)
- return 0;
-
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
return expires <= new_base->cpu_base->expires_next;
-#else
- return 0;
-#endif
}
static inline
@@ -657,14 +652,6 @@ static void hrtimer_reprogram(struct hrt
}
/*
- * Initialize the high resolution related parts of cpu_base
- */
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
-{
- base->expires_next = KTIME_MAX;
-}
-
-/*
* Retrigger next event is called after clock was set
*
* Called with interrupts disabled via on_each_cpu()
@@ -729,7 +716,6 @@ static inline int hrtimer_reprogram(stru
{
return 0;
}
-static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
@@ -1601,7 +1587,7 @@ int hrtimers_prepare_cpu(unsigned int cp
cpu_base->cpu = cpu;
cpu_base->hres_active = 0;
- hrtimer_init_hres(cpu_base);
+ cpu_base->expires_next = KTIME_MAX;
return 0;
}
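
A small userspace model of the now-unconditional decision (ktime_t reduced to a signed 64-bit nanosecond count; the helper name is illustrative):

#include <stdint.h>
#include <stdio.h>

typedef int64_t ktime_t;        /* nanoseconds, as in the kernel */

/* A timer may only be enqueued on a remote CPU if it expires after the
 * event already programmed there: the remote clock event device cannot
 * be reprogrammed from this CPU, so an earlier expiry would fire late. */
static int remote_enqueue_blocked(ktime_t expires, ktime_t expires_next)
{
        return expires <= expires_next;
}

int main(void)
{
        printf("%d\n", remote_enqueue_blocked(100, 200)); /* 1: would fire late */
        printf("%d\n", remote_enqueue_blocked(300, 200)); /* 0: safe to enqueue */
        return 0;
}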

View File

@ -1,39 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:46 +0200
Subject: [PATCH 15/17] mm/vmscan: Adjust system_state checks
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.
Adjust the system_state check in kswapd_run() to handle the extra states.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170516184736.119158930@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3654,7 +3654,7 @@ int kswapd_run(int nid)
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
- BUG_ON(system_state == SYSTEM_BOOTING);
+ BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;

View File

@ -1,8 +1,8 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:07 -0500
Subject: [PATCH 06/32] tracing: Add ring buffer event param to hist field
Date: Fri, 22 Sep 2017 14:59:46 -0500
Subject: [PATCH 15/42] tracing: Add ring buffer event param to hist field
functions
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
Some events such as timestamps require access to a ring_buffer_event
struct; add a param so that hist field functions can access that.
@ -91,7 +91,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{ \
type *addr = (type *)(event + hist_field->field->offset); \
\
@@ -883,8 +892,8 @@ create_hist_data(unsigned int map_bits,
@@ -871,8 +880,8 @@ create_hist_data(unsigned int map_bits,
}
static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
@ -102,7 +102,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
struct hist_field *hist_field;
unsigned int i;
@@ -892,7 +901,7 @@ static void hist_trigger_elt_update(stru
@@ -880,7 +889,7 @@ static void hist_trigger_elt_update(stru
for_each_hist_val_field(i, hist_data) {
hist_field = hist_data->fields[i];
@ -111,7 +111,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
tracing_map_update_sum(elt, i, hist_val);
}
}
@@ -922,7 +931,7 @@ static inline void add_to_key(char *comp
@@ -910,7 +919,7 @@ static inline void add_to_key(char *comp
}
static void event_hist_trigger(struct event_trigger_data *data, void *rec,
@ -120,7 +120,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
@@ -951,7 +960,7 @@ static void event_hist_trigger(struct ev
@@ -939,7 +948,7 @@ static void event_hist_trigger(struct ev
key = entries;
} else {
@ -129,7 +129,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (key_field->flags & HIST_FIELD_FL_STRING) {
key = (void *)(unsigned long)field_contents;
use_compound_key = true;
@@ -968,7 +977,7 @@ static void event_hist_trigger(struct ev
@@ -956,7 +965,7 @@ static void event_hist_trigger(struct ev
elt = tracing_map_insert(hist_data->map, key);
if (elt)

View File

@ -0,0 +1,99 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:54 +0200
Subject: [PATCH 16/36] hrtimer: Make hrtimer_cpu_base.next_timer handling
unconditional
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
hrtimer_cpu_base.next_timer stores the pointer to the next expiring timer
in a cpu base.
This pointer cannot be dereferenced and is solely used to check whether a
hrtimer which is removed is the hrtimer which is the first to expire in the
CPU base. If this is the case, then the timer hardware needs to be
reprogrammed to avoid an extra interrupt for nothing.
Again, this is conditional functionality, but there is no compelling reason
to make this conditional. As a preparation, hrtimer_cpu_base.next_timer
needs to be available unconditionally. Aside from that, the upcoming support for
softirq based hrtimers requires access to this pointer unconditionally.
Make the update of hrtimer_cpu_base.next_timer unconditional and remove the
ifdef cruft. The impact on CONFIG_HIGH_RES_TIMERS=n && CONFIG_NOHZ=n is
marginal as it's just a store on an already dirtied cacheline.
No functional change.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 4 ++--
kernel/time/hrtimer.c | 12 ++----------
2 files changed, 4 insertions(+), 12 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -164,13 +164,13 @@ enum hrtimer_base_type {
* @hres_active: State of high resolution mode
* @in_hrtirq: hrtimer_interrupt() is currently executing
* @hang_detected: The last hrtimer interrupt detected a hang
- * @next_timer: Pointer to the first expiring timer
* @nr_events: Total number of hrtimer interrupt events
* @nr_retries: Total number of hrtimer interrupt retries
* @nr_hangs: Total number of hrtimer interrupt hangs
* @max_hang_time: Maximum time spent in hrtimer_interrupt
* @expires_next: absolute time of the next event, is required for remote
* hrtimer enqueue
+ * @next_timer: Pointer to the first expiring timer
* @clock_base: array of clock bases for this cpu
*
* Note: next_timer is just an optimization for __remove_hrtimer().
@@ -186,13 +186,13 @@ struct hrtimer_cpu_base {
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int in_hrtirq : 1,
hang_detected : 1;
- struct hrtimer *next_timer;
unsigned int nr_events;
unsigned short nr_retries;
unsigned short nr_hangs;
unsigned int max_hang_time;
#endif
ktime_t expires_next;
+ struct hrtimer *next_timer;
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -442,14 +442,6 @@ static inline void debug_deactivate(stru
}
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
-static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *timer)
-{
-#ifdef CONFIG_HIGH_RES_TIMERS
- cpu_base->next_timer = timer;
-#endif
-}
-
static struct hrtimer_clock_base *
__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
@@ -473,7 +465,7 @@ static ktime_t __hrtimer_get_next_event(
unsigned int active = cpu_base->active_bases;
ktime_t expires, expires_next = KTIME_MAX;
- hrtimer_update_next_timer(cpu_base, NULL);
+ cpu_base->next_timer = NULL;
for_each_active_base(base, cpu_base, active) {
struct timerqueue_node *next;
struct hrtimer *timer;
@@ -483,7 +475,7 @@ static ktime_t __hrtimer_get_next_event(
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
- hrtimer_update_next_timer(cpu_base, timer);
+ cpu_base->next_timer = timer;
}
}
/*
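
A hedged userspace model of the scan that now caches the next expiring timer unconditionally (struct and function names are illustrative, not the kernel's):

#include <limits.h>
#include <stddef.h>

struct model_timer { long long expires; };

static struct model_timer *next_timer; /* models hrtimer_cpu_base.next_timer */

/* Scan all timers, returning the earliest expiry and caching a pointer
 * to the timer that owns it - a plain store, no #ifdef needed. */
static long long next_event(struct model_timer **timers, int n)
{
        long long expires_next = LLONG_MAX;
        int i;

        next_timer = NULL;
        for (i = 0; i < n; i++) {
                if (timers[i]->expires < expires_next) {
                        expires_next = timers[i]->expires;
                        next_timer = timers[i];
                }
        }
        return expires_next;
}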

View File

@ -1,60 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:47 +0200
Subject: [PATCH 16/17] init: Introduce SYSTEM_SCHEDULING state
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
might_sleep() debugging and smp_processor_id() debugging should be active
right after the scheduler starts working. The init task can invoke
smp_processor_id() from preemptible context as it is pinned on the boot cpu
until sched_smp_init() removes the pinning and lets it schedule on all non
isolated cpus.
Add a new state which allows to enable those checks earlier and add it to
the xen do_poweroff() function.
No functional change.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184736.196214622@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
drivers/xen/manage.c | 1 +
include/linux/kernel.h | 6 +++++-
2 files changed, 6 insertions(+), 1 deletion(-)
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -190,6 +190,7 @@ static void do_poweroff(void)
{
switch (system_state) {
case SYSTEM_BOOTING:
+ case SYSTEM_SCHEDULING:
orderly_poweroff(true);
break;
case SYSTEM_RUNNING:
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -488,9 +488,13 @@ extern int root_mountflags;
extern bool early_boot_irqs_disabled;
-/* Values used for system_state */
+/*
+ * Values used for system_state. Ordering of the states must not be changed
+ * as code checks for <, <=, >, >= STATE.
+ */
extern enum system_states {
SYSTEM_BOOTING,
+ SYSTEM_SCHEDULING,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,

View File

@ -1,20 +1,21 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:09 -0500
Subject: [PATCH 08/32] tracing: Break out hist trigger assignment parsing
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Date: Fri, 22 Sep 2017 14:59:47 -0500
Subject: [PATCH 16/42] tracing: Break out hist trigger assignment parsing
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
This will make it easier to add variables, and makes the parsing code
cleaner regardless.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Rajvi Jingar <rajvi.jingar@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace_events_hist.c | 56 ++++++++++++++++++++++++---------------
1 file changed, 35 insertions(+), 21 deletions(-)
kernel/trace/trace_events_hist.c | 72 +++++++++++++++++++++++++++------------
1 file changed, 51 insertions(+), 21 deletions(-)
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -251,6 +251,35 @@ static void destroy_hist_trigger_attrs(s
@@ -251,6 +251,51 @@ static void destroy_hist_trigger_attrs(s
kfree(attrs);
}
@ -23,17 +24,33 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ int ret = 0;
+
+ if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+ (strncmp(str, "keys=", strlen("keys=")) == 0))
+ (strncmp(str, "keys=", strlen("keys=")) == 0)) {
+ attrs->keys_str = kstrdup(str, GFP_KERNEL);
+ else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+ if (!attrs->keys_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+ (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+ (strncmp(str, "values=", strlen("values=")) == 0))
+ (strncmp(str, "values=", strlen("values=")) == 0)) {
+ attrs->vals_str = kstrdup(str, GFP_KERNEL);
+ else if (strncmp(str, "sort=", strlen("sort=")) == 0)
+ if (!attrs->vals_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
+ attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+ else if (strncmp(str, "name=", strlen("name=")) == 0)
+ if (!attrs->sort_key_str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "name=", strlen("name=")) == 0) {
+ attrs->name = kstrdup(str, GFP_KERNEL);
+ else if (strncmp(str, "size=", strlen("size=")) == 0) {
+ if (!attrs->name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (strncmp(str, "size=", strlen("size=")) == 0) {
+ int map_bits = parse_map_size(str);
+
+ if (map_bits < 0) {
@ -50,7 +67,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
{
struct hist_trigger_attrs *attrs;
@@ -263,33 +292,18 @@ static struct hist_trigger_attrs *parse_
@@ -263,33 +308,18 @@ static struct hist_trigger_attrs *parse_
while (trigger_str) {
char *str = strsep(&trigger_str, ":");
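
A hedged userspace model of the broken-out assignment parser (strdup stands in for kstrdup, and the attrs struct is trimmed to two fields):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct attrs { char *keys_str; char *vals_str; };

/* Each recognized "name=value" assignment gets its own duplicated
 * string; allocation failures funnel into a single error return. */
static int parse_assignment(const char *str, struct attrs *attrs)
{
        if (!strncmp(str, "key=", 4) || !strncmp(str, "keys=", 5)) {
                attrs->keys_str = strdup(str);
                if (!attrs->keys_str)
                        return -ENOMEM;
        } else if (!strncmp(str, "val=", 4) || !strncmp(str, "vals=", 5)) {
                attrs->vals_str = strdup(str);
                if (!attrs->vals_str)
                        return -ENOMEM;
        } else {
                return -EINVAL;
        }
        return 0;
}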

View File

@ -0,0 +1,187 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:55 +0200
Subject: [PATCH 17/36] hrtimer: Make hrtimer_reprogramm() unconditional
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
hrtimer_reprogram() needs to be available unconditionally for softirq based
hrtimers. Move the function and all required struct members out of the
CONFIG_HIGH_RES_TIMERS #ifdef.
There is no functional change because hrtimer_reprogram() is only invoked
when hrtimer_cpu_base.hres_active is true. Making it unconditional
increases the text size for the CONFIG_HIGH_RES_TIMERS=n case, but avoids
replication of that code for the upcoming softirq based hrtimers support.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/hrtimer.h | 6 +-
kernel/time/hrtimer.c | 129 +++++++++++++++++++++++-------------------------
2 files changed, 65 insertions(+), 70 deletions(-)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -182,10 +182,10 @@ struct hrtimer_cpu_base {
unsigned int cpu;
unsigned int active_bases;
unsigned int clock_was_set_seq;
- unsigned int hres_active : 1;
-#ifdef CONFIG_HIGH_RES_TIMERS
- unsigned int in_hrtirq : 1,
+ unsigned int hres_active : 1,
+ in_hrtirq : 1,
hang_detected : 1;
+#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int nr_events;
unsigned short nr_retries;
unsigned short nr_hangs;
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -582,68 +582,6 @@ hrtimer_force_reprogram(struct hrtimer_c
}
/*
- * When a timer is enqueued and expires earlier than the already enqueued
- * timers, we have to check, whether it expires earlier than the timer for
- * which the clock event device was armed.
- *
- * Called with interrupts disabled and base->cpu_base.lock held
- */
-static void hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-
- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
-
- /*
- * If the timer is not on the current cpu, we cannot reprogram
- * the other cpus clock event device.
- */
- if (base->cpu_base != cpu_base)
- return;
-
- /*
- * If the hrtimer interrupt is running, then it will
- * reevaluate the clock bases and reprogram the clock event
- * device. The callbacks are always executed in hard interrupt
- * context so we don't need an extra check for a running
- * callback.
- */
- if (cpu_base->in_hrtirq)
- return;
-
- /*
- * CLOCK_REALTIME timer might be requested with an absolute
- * expiry time which is less than base->offset. Set it to 0.
- */
- if (expires < 0)
- expires = 0;
-
- if (expires >= cpu_base->expires_next)
- return;
-
- /* Update the pointer to the next expiring timer */
- cpu_base->next_timer = timer;
-
- /*
- * If a hang was detected in the last timer interrupt then we
- * do not schedule a timer which is earlier than the expiry
- * which we enforced in the hang detection. We want the system
- * to make progress.
- */
- if (cpu_base->hang_detected)
- return;
-
- /*
- * Program the timer hardware. We enforce the expiry for
- * events which are already in the past.
- */
- cpu_base->expires_next = expires;
- tick_program_event(expires, 1);
-}
-
-/*
* Retrigger next event is called after clock was set
*
* Called with interrupts disabled via on_each_cpu()
@@ -703,16 +641,73 @@ static inline int hrtimer_is_hres_enable
static inline void hrtimer_switch_to_hres(void) { }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
-static inline int hrtimer_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base)
-{
- return 0;
-}
static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
/*
+ * When a timer is enqueued and expires earlier than the already enqueued
+ * timers, we have to check, whether it expires earlier than the timer for
+ * which the clock event device was armed.
+ *
+ * Called with interrupts disabled and base->cpu_base.lock held
+ */
+static void hrtimer_reprogram(struct hrtimer *timer,
+ struct hrtimer_clock_base *base)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+
+ WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+
+ /*
+ * If the timer is not on the current cpu, we cannot reprogram
+ * the other cpus clock event device.
+ */
+ if (base->cpu_base != cpu_base)
+ return;
+
+ /*
+ * If the hrtimer interrupt is running, then it will
+ * reevaluate the clock bases and reprogram the clock event
+ * device. The callbacks are always executed in hard interrupt
+ * context so we don't need an extra check for a running
+ * callback.
+ */
+ if (cpu_base->in_hrtirq)
+ return;
+
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Set it to 0.
+ */
+ if (expires < 0)
+ expires = 0;
+
+ if (expires >= cpu_base->expires_next)
+ return;
+
+ /* Update the pointer to the next expiring timer */
+ cpu_base->next_timer = timer;
+
+ /*
+ * If a hang was detected in the last timer interrupt then we
+ * do not schedule a timer which is earlier than the expiry
+ * which we enforced in the hang detection. We want the system
+ * to make progress.
+ */
+ if (cpu_base->hang_detected)
+ return;
+
+ /*
+ * Program the timer hardware. We enforce the expiry for
+ * events which are already in the past.
+ */
+ cpu_base->expires_next = expires;
+ tick_program_event(expires, 1);
+}
+
+/*
* Clock realtime was set
*
* Change the offset of the realtime clock vs. the monotonic

View File

@ -1,74 +0,0 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:48 +0200
Subject: [PATCH 17/17] sched/core: Enable might_sleep() and smp_processor_id()
checks early
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
might_sleep() and smp_processor_id() checks are enabled after the boot
process is done. That hides bugs in the SMP bringup and driver
initialization code.
Enable it right when the scheduler starts working, i.e. when init task and
kthreadd have been created and right before the idle task enables
preemption.
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184736.272225698@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
init/main.c | 10 ++++++++++
kernel/sched/core.c | 4 +++-
lib/smp_processor_id.c | 2 +-
3 files changed, 14 insertions(+), 2 deletions(-)
--- a/init/main.c
+++ b/init/main.c
@@ -414,6 +414,16 @@ static noinline void __ref rest_init(voi
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
+
+ /*
+ * Enable might_sleep() and smp_processor_id() checks.
+ * They cannot be enabled earlier because with CONFIG_PREEMPT=y
+ * kernel_thread() would trigger might_sleep() splats. With
+ * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
+ * already, but it's stuck on the kthreadd_done completion.
+ */
+ system_state = SYSTEM_SCHEDULING;
+
complete(&kthreadd_done);
/*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6223,8 +6223,10 @@ void ___might_sleep(const char *file, in
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
!is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
return;
+
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
prev_jiffy = jiffies;
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -28,7 +28,7 @@ notrace static unsigned int check_preemp
/*
* It is valid to assume CPU-locality during early bootup:
*/
- if (system_state != SYSTEM_RUNNING)
+ if (system_state < SYSTEM_SCHEDULING)
goto out;
/*
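
A minimal model of the relaxed early-boot rule (illustrative; the real check lives in lib/smp_processor_id.c):

enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_RUNNING };

static int cpu_locality_assumed(enum system_states state)
{
        /* Only the boot CPU runs before SYSTEM_SCHEDULING, so calling
         * smp_processor_id() from preemptible context is still safe. */
        return state < SYSTEM_SCHEDULING;
}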

View File

@ -1,7 +1,7 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:13 -0500
Subject: [PATCH 12/32] tracing: Add hist trigger timestamp support
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Date: Fri, 22 Sep 2017 14:59:48 -0500
Subject: [PATCH 17/42] tracing: Add hist trigger timestamp support
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
Add support for a timestamp event field. This is actually a 'pseudo-'
event field in that it behaves like it's part of the event record, but
@ -22,10 +22,11 @@ event rather than an offset. This mode will be enabled if and only if
a histogram makes use of the "$common_timestamp" field.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Baohong Liu <baohong.liu@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/trace_events_hist.c | 90 ++++++++++++++++++++++++++++-----------
1 file changed, 66 insertions(+), 24 deletions(-)
kernel/trace/trace_events_hist.c | 90 +++++++++++++++++++++++++++++----------
1 file changed, 67 insertions(+), 23 deletions(-)
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@ -43,10 +44,10 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
static u64 hist_field_##type(struct hist_field *hist_field, \
void *event, \
@@ -135,6 +141,7 @@ enum hist_field_flags {
HIST_FIELD_FL_SYSCALL = 128,
HIST_FIELD_FL_STACKTRACE = 256,
HIST_FIELD_FL_LOG2 = 512,
+ HIST_FIELD_FL_TIMESTAMP = 1024,
HIST_FIELD_FL_SYSCALL = 1 << 7,
HIST_FIELD_FL_STACKTRACE = 1 << 8,
HIST_FIELD_FL_LOG2 = 1 << 9,
+ HIST_FIELD_FL_TIMESTAMP = 1 << 10,
};
struct hist_trigger_attrs {
@ -67,7 +68,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (field_name == NULL)
field_name = "";
@@ -435,6 +445,12 @@ static struct hist_field *create_hist_fi
@@ -440,6 +450,12 @@ static struct hist_field *create_hist_fi
goto out;
}
@ -80,12 +81,12 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (WARN_ON_ONCE(!field))
goto out;
@@ -512,10 +528,15 @@ static int create_val_field(struct hist_
@@ -517,10 +533,15 @@ static int create_val_field(struct hist_
}
}
- field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
- if (!field || !field->size) {
- ret = -EINVAL;
- goto out;
+ if (strcmp(field_name, "$common_timestamp") == 0) {
@ -93,19 +94,19 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ hist_data->enable_timestamps = true;
+ } else {
+ field = trace_find_event_field(file->event_call, field_name);
+ if (!field) {
+ if (!field || !field->size) {
+ ret = -EINVAL;
+ goto out;
+ }
}
hist_data->fields[val_idx] = create_hist_field(field, flags);
@@ -610,16 +631,22 @@ static int create_key_field(struct hist_
@@ -615,16 +636,22 @@ static int create_key_field(struct hist_
}
}
- field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
- if (!field || !field->size) {
- ret = -EINVAL;
- goto out;
- }
@ -115,7 +116,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ key_size = sizeof(u64);
+ } else {
+ field = trace_find_event_field(file->event_call, field_name);
+ if (!field) {
+ if (!field || !field->size) {
+ ret = -EINVAL;
+ goto out;
+ }
@ -132,16 +133,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
hist_data->fields[key_idx] = create_hist_field(field, flags);
@@ -756,7 +783,7 @@ static int create_sort_keys(struct hist_
break;
}
- if (strcmp(field_name, "hitcount") == 0) {
+ if ((strcmp(field_name, "hitcount") == 0)) {
descending = is_descending(field_str);
if (descending < 0) {
ret = descending;
@@ -816,6 +843,9 @@ static int create_tracing_map_fields(str
@@ -820,6 +847,9 @@ static int create_tracing_map_fields(str
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
@ -151,7 +143,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
else if (is_string_field(field))
cmp_fn = tracing_map_cmp_string;
else
@@ -1213,7 +1243,11 @@ static void hist_field_print(struct seq_
@@ -1217,7 +1247,11 @@ static void hist_field_print(struct seq_
{
const char *field_name = hist_field_name(hist_field, 0);
@ -164,7 +156,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (hist_field->flags) {
const char *flags_str = get_hist_field_flags(hist_field);
@@ -1264,27 +1298,25 @@ static int event_hist_trigger_print(stru
@@ -1268,27 +1302,25 @@ static int event_hist_trigger_print(stru
for (i = 0; i < hist_data->n_sort_keys; i++) {
struct tracing_map_sort_key *sort_key;
@ -199,7 +191,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
if (data->filter_str)
@@ -1452,6 +1484,10 @@ static bool hist_trigger_match(struct ev
@@ -1456,6 +1488,10 @@ static bool hist_trigger_match(struct ev
return false;
if (key_field->offset != key_field_test->offset)
return false;
@ -210,7 +202,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
}
for (i = 0; i < hist_data->n_sort_keys; i++) {
@@ -1534,6 +1570,9 @@ static int hist_register_trigger(char *g
@@ -1538,6 +1574,9 @@ static int hist_register_trigger(char *g
update_cond_flag(file);
@ -220,13 +212,15 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (trace_event_trigger_enable_disable(file, 1) < 0) {
list_del_rcu(&data->list);
update_cond_flag(file);
@@ -1568,6 +1607,9 @@ static void hist_unregister_trigger(char
@@ -1572,6 +1611,11 @@ static void hist_unregister_trigger(char
if (unregistered && test->ops->free)
test->ops->free(test->ops, test);
+
+ if (hist_data->enable_timestamps)
+ tracing_set_time_stamp_abs(file->tr, false);
+ if (hist_data->enable_timestamps) {
+ if (unregistered)
+ tracing_set_time_stamp_abs(file->tr, false);
+ }
}
static void hist_unreg_all(struct trace_event_file *file)
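
A hedged sketch of the field-routing idea (names illustrative; the real code sets HIST_FIELD_FL_TIMESTAMP and skips the event-field lookup):

#include <string.h>

/* "$common_timestamp" is a pseudo-field: it is not looked up in the
 * event's field list but resolved from the ring buffer event instead. */
static int is_timestamp_pseudo_field(const char *field_name)
{
        return strcmp(field_name, "$common_timestamp") == 0;
}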

View File

@ -0,0 +1,105 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:56 +0200
Subject: [PATCH 18/36] hrtimer: Reduce conditional code and make
hrtimer_force_reprogramm() unconditional
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
hrtimer_force_reprogram() needs to be available unconditionally for softirq
based hrtimers. Move the function and all required struct members out of
the CONFIG_HIGH_RES_TIMERS #ifdef.
There is no functional change because hrtimer_force_reprogram() is
only invoked when hrtimer_cpu_base.hres_active is true and
CONFIG_HIGH_RES_TIMERS=y. Making it unconditional increases the text
size for the CONFIG_HIGH_RES_TIMERS=n case slightly, but avoids
replication of that code for the upcoming softirq based hrtimers
support. Most of the code gets eliminated in the
CONFIG_HIGH_RES_TIMERS=n case by the compiler.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/hrtimer.c | 58 ++++++++++++++++++++++++--------------------------
1 file changed, 28 insertions(+), 30 deletions(-)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -513,34 +513,6 @@ static inline int hrtimer_hres_active(vo
return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
}
-/* High resolution timer related functions */
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer enabled ?
- */
-static bool hrtimer_hres_enabled __read_mostly = true;
-unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
-EXPORT_SYMBOL_GPL(hrtimer_resolution);
-
-/*
- * Enable / Disable high resolution mode
- */
-static int __init setup_hrtimer_hres(char *str)
-{
- return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
-}
-
-__setup("highres=", setup_hrtimer_hres);
-
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
-{
- return hrtimer_hres_enabled;
-}
-
/*
* Reprogram the event source with checking both queues for the
* next event
@@ -581,6 +553,34 @@ hrtimer_force_reprogram(struct hrtimer_c
tick_program_event(cpu_base->expires_next, 1);
}
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer enabled ?
+ */
+static bool hrtimer_hres_enabled __read_mostly = true;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
+
+/*
+ * Enable / Disable high resolution mode
+ */
+static int __init setup_hrtimer_hres(char *str)
+{
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
+}
+
+__setup("highres=", setup_hrtimer_hres);
+
+/*
+ * hrtimer_high_res_enabled - query, if the highres mode is enabled
+ */
+static inline int hrtimer_is_hres_enabled(void)
+{
+ return hrtimer_hres_enabled;
+}
+
/*
* Retrigger next event is called after clock was set
*
@@ -639,8 +639,6 @@ void clock_was_set_delayed(void)
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
-static inline void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */

View File

@ -1,8 +1,8 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:14 -0500
Subject: [PATCH 13/32] tracing: Add per-element variable support to
Date: Fri, 22 Sep 2017 14:59:49 -0500
Subject: [PATCH 18/42] tracing: Add per-element variable support to
tracing_map
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
In order to allow information to be passed between trace events, add
support for per-element variables to tracing_map. This provides a
@ -21,9 +21,9 @@ important for event-matching uses.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/trace/tracing_map.c | 113 +++++++++++++++++++++++++++++++++++++++++++++
kernel/trace/tracing_map.c | 108 +++++++++++++++++++++++++++++++++++++++++++++
kernel/trace/tracing_map.h | 11 ++++
2 files changed, 124 insertions(+)
2 files changed, 119 insertions(+)
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@ -130,7 +130,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
* tracing_map_add_key_field - Add a field describing a tracing_map key
* @map: The tracing_map
* @offset: The offset within the key
@@ -277,6 +366,11 @@ static void tracing_map_elt_clear(struct
@@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct
if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
atomic64_set(&elt->fields[i].sum, 0);
@ -142,7 +142,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (elt->map->ops && elt->map->ops->elt_clear)
elt->map->ops->elt_clear(elt);
}
@@ -303,6 +397,8 @@ static void tracing_map_elt_free(struct
@@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct
if (elt->map->ops && elt->map->ops->elt_free)
elt->map->ops->elt_free(elt);
kfree(elt->fields);
@ -151,10 +151,11 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
kfree(elt->key);
kfree(elt);
}
@@ -330,6 +426,18 @@ static struct tracing_map_elt *tracing_m
@@ -332,6 +428,18 @@ static struct tracing_map_elt *tracing_m
err = -ENOMEM;
goto free;
}
+
+ elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
+ if (!elt->vars) {
+ err = -ENOMEM;
@ -166,25 +167,12 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ err = -ENOMEM;
+ goto free;
+ }
+
tracing_map_elt_init_fields(elt);
if (map->ops && map->ops->elt_alloc) {
@@ -833,6 +941,11 @@ static struct tracing_map_elt *copy_elt(
dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
}
+ for (i = 0; i < elt->map->n_vars; i++) {
+ atomic64_set(&dup_elt->vars[i], atomic64_read(&elt->vars[i]));
+ dup_elt->var_set[i] = elt->var_set[i];
+ }
+
return dup_elt;
}
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -9,6 +9,7 @@
@@ -10,6 +10,7 @@
#define TRACING_MAP_VALS_MAX 3
#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
TRACING_MAP_VALS_MAX)
@ -192,7 +180,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
#define TRACING_MAP_SORT_KEYS_MAX 2
typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
@@ -136,6 +137,8 @@ struct tracing_map_field {
@@ -137,6 +138,8 @@ struct tracing_map_field {
struct tracing_map_elt {
struct tracing_map *map;
struct tracing_map_field *fields;
@ -201,7 +189,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
void *key;
void *private_data;
};
@@ -191,6 +194,7 @@ struct tracing_map {
@@ -192,6 +195,7 @@ struct tracing_map {
int key_idx[TRACING_MAP_KEYS_MAX];
unsigned int n_keys;
struct tracing_map_sort_key sort_key;
@ -209,7 +197,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
atomic64_t hits;
atomic64_t drops;
};
@@ -247,6 +251,7 @@ tracing_map_create(unsigned int map_bits
@@ -241,6 +245,7 @@ tracing_map_create(unsigned int map_bits
extern int tracing_map_init(struct tracing_map *map);
extern int tracing_map_add_sum_field(struct tracing_map *map);
@ -217,7 +205,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
extern int tracing_map_add_key_field(struct tracing_map *map,
unsigned int offset,
tracing_map_cmp_fn_t cmp_fn);
@@ -266,7 +271,13 @@ extern int tracing_map_cmp_none(void *va
@@ -260,7 +265,13 @@ extern int tracing_map_cmp_none(void *va
extern void tracing_map_update_sum(struct tracing_map_elt *elt,
unsigned int i, u64 n);
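
A hedged userspace model of the per-element variable slots (fixed-size arrays replace the kernel's kcalloc'd ones); the var_set flag is the key detail, distinguishing "never assigned" from "assigned zero":

#include <stdbool.h>
#include <stdint.h>

#define N_VARS 4

struct elt_vars {
        int64_t vars[N_VARS];
        bool    var_set[N_VARS];
};

static void elt_var_set(struct elt_vars *e, int i, int64_t val)
{
        e->vars[i] = val;
        e->var_set[i] = true;
}

/* Returns false if the variable was never set for this element, which
 * matters for event-matching uses. */
static bool elt_var_get(const struct elt_vars *e, int i, int64_t *val)
{
        if (!e->var_set[i])
                return false;
        *val = e->vars[i];
        return true;
}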

View File

@ -0,0 +1,89 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:57 +0200
Subject: [PATCH 19/36] hrtimer: Unify handling of hrtimer remove
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
When the first hrtimer on the current CPU is removed,
hrtimer_force_reprogram() is invoked but only when
CONFIG_HIGH_RES_TIMERS=y and hrtimer_cpu_base.hres_active is set.
hrtimer_force_reprogram() updates hrtimer_cpu_base.expires_next and
reprograms the clock event device. When CONFIG_HIGH_RES_TIMERS=y and
hrtimer_cpu_base.hres_active is set, a pointless hrtimer interrupt can be
prevented.
hrtimer_check_target() makes the 'can remote enqueue' decision. As soon as
hrtimer_check_target() is unconditionally available and
hrtimer_cpu_base.expires_next is updated by hrtimer_reprogram(),
hrtimer_force_reprogram() needs to be available unconditionally as well to
prevent the following scenario with CONFIG_HIGH_RES_TIMERS=n:
- the first hrtimer on this CPU is removed and hrtimer_force_reprogram() is
not executed
- CPU goes idle (next timer is calculated and hrtimers are taken into
account)
- a hrtimer is enqueued remote on the idle CPU: hrtimer_check_target()
compares expiry value and hrtimer_cpu_base.expires_next. The expiry value
is after expires_next, so the hrtimer is enqueued. This timer will fire
late, if it expires before the effective first hrtimer on this CPU and
the comparison was with an outdated expires_next value.
To prevent this scenario, make hrtimer_force_reprogram() unconditional
except the effective reprogramming part, which gets eliminated by the
compiler in the CONFIG_HIGH_RES_TIMERS=n case.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/hrtimer.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -523,9 +523,6 @@ hrtimer_force_reprogram(struct hrtimer_c
{
ktime_t expires_next;
- if (!__hrtimer_hres_active(cpu_base))
- return;
-
expires_next = __hrtimer_get_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
@@ -534,6 +531,9 @@ hrtimer_force_reprogram(struct hrtimer_c
cpu_base->expires_next = expires_next;
/*
+ * If hres is not active, hardware does not have to be
+ * reprogrammed yet.
+ *
* If a hang was detected in the last timer interrupt then we
* leave the hang delay active in the hardware. We want the
* system to make progress. That also prevents the following
@@ -547,7 +547,7 @@ hrtimer_force_reprogram(struct hrtimer_c
* set. So we'd effectivly block all timers until the T2 event
* fires.
*/
- if (cpu_base->hang_detected)
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
tick_program_event(cpu_base->expires_next, 1);
@@ -848,7 +848,6 @@ static void __remove_hrtimer(struct hrti
if (!timerqueue_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
-#ifdef CONFIG_HIGH_RES_TIMERS
/*
* Note: If reprogram is false we do not update
* cpu_base->next_timer. This happens when we remove the first
@@ -859,7 +858,6 @@ static void __remove_hrtimer(struct hrti
*/
if (reprogram && timer == cpu_base->next_timer)
hrtimer_force_reprogram(cpu_base, 1);
-#endif
}
/*

View File

@ -1,7 +1,7 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:15 -0500
Subject: [PATCH 14/32] tracing: Add hist_data member to hist_field
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
Date: Fri, 22 Sep 2017 14:59:50 -0500
Subject: [PATCH 19/42] tracing: Add hist_data member to hist_field
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
Allow hist_data access via hist_field. Some users of hist_fields
require or will require more access to the associated hist_data.
@ -22,7 +22,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
};
static u64 hist_field_none(struct hist_field *field, void *event,
@@ -415,7 +416,8 @@ static void destroy_hist_field(struct hi
@@ -420,7 +421,8 @@ static void destroy_hist_field(struct hi
kfree(hist_field);
}
@ -32,7 +32,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
unsigned long flags)
{
struct hist_field *hist_field;
@@ -427,6 +429,8 @@ static struct hist_field *create_hist_fi
@@ -432,6 +434,8 @@ static struct hist_field *create_hist_fi
if (!hist_field)
return NULL;
@ -41,7 +41,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (flags & HIST_FIELD_FL_HITCOUNT) {
hist_field->fn = hist_field_counter;
goto out;
@@ -440,7 +444,7 @@ static struct hist_field *create_hist_fi
@@ -445,7 +449,7 @@ static struct hist_field *create_hist_fi
if (flags & HIST_FIELD_FL_LOG2) {
unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
hist_field->fn = hist_field_log2;
@ -50,7 +50,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
hist_field->size = hist_field->operands[0]->size;
goto out;
}
@@ -493,7 +497,7 @@ static void destroy_hist_fields(struct h
@@ -498,7 +502,7 @@ static void destroy_hist_fields(struct h
static int create_hitcount_val(struct hist_trigger_data *hist_data)
{
hist_data->fields[HITCOUNT_IDX] =
@ -59,7 +59,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (!hist_data->fields[HITCOUNT_IDX])
return -ENOMEM;
@@ -539,7 +543,7 @@ static int create_val_field(struct hist_
@@ -544,7 +548,7 @@ static int create_val_field(struct hist_
}
}
@ -68,7 +68,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (!hist_data->fields[val_idx]) {
ret = -ENOMEM;
goto out;
@@ -649,7 +653,7 @@ static int create_key_field(struct hist_
@@ -654,7 +658,7 @@ static int create_key_field(struct hist_
}
}

View File

@ -0,0 +1,158 @@
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 22 Oct 2017 23:39:58 +0200
Subject: [PATCH 20/36] hrtimer: Unify handling of remote enqueue
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/older/patches-4.14-rt1.tar.xz
hrtimer_reprogram() is conditionally invoked from hrtimer_start_range_ns()
when hrtimer_cpu_base.hres_active is true.
In the !hres_active case there is a special condition for the nohz_active
case:
If the newly enqueued timer expires before the first expiring timer on a
remote CPU then the remote CPU needs to be notified and woken up from a
NOHZ idle sleep to take the new first expiring timer into account.
Previous changes have already established the prerequisites to make the
remote enqueue behaviour the same whether high resolution mode is active or
not:
If the to be enqueued timer expires before the first expiring timer on a
remote CPU, then it cannot be enqueued there.
This was done for the high resolution mode because there is no way to
access the remote CPU timer hardware. The same is true for NOHZ, but was
handled differently by unconditionally enqueuing the timer and waking up
the remote CPU so it can reprogram its timer. Again there is no compelling
reason for this difference.
hrtimer_check_target(), which makes the 'can remote enqueue' decision, is
already unconditional, but not yet functional because nothing updates
hrtimer_cpu_base.expires_next in the !hres_active case.
To unify this the following changes are required:
1) Make the store of the new first expiry time unconditional in
hrtimer_reprogram() and check __hrtimer_hres_active() before proceeding
to the actual hardware access. This check also lets the compiler
eliminate the rest of the function in case of CONFIG_HIGH_RES_TIMERS=n.
2) Invoke hrtimer_reprogram() unconditionally from
hrtimer_start_range_ns()
3) Remove the remote wakeup special case for the !high_res && nohz_active
case.
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/hrtimer.c | 18 ++++++------------
kernel/time/tick-internal.h | 11 -----------
kernel/time/timer.c | 15 ++++++++++++++-
3 files changed, 20 insertions(+), 24 deletions(-)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -687,21 +687,24 @@ static void hrtimer_reprogram(struct hrt
/* Update the pointer to the next expiring timer */
cpu_base->next_timer = timer;
+ cpu_base->expires_next = expires;
/*
+ * If hres is not active, hardware does not have to be
+ * programmed yet.
+ *
* If a hang was detected in the last timer interrupt then we
* do not schedule a timer which is earlier than the expiry
* which we enforced in the hang detection. We want the system
* to make progress.
*/
- if (cpu_base->hang_detected)
+ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
/*
* Program the timer hardware. We enforce the expiry for
* events which are already in the past.
*/
- cpu_base->expires_next = expires;
tick_program_event(expires, 1);
}
@@ -940,16 +943,7 @@ void hrtimer_start_range_ns(struct hrtim
if (!leftmost)
goto unlock;
- if (!hrtimer_is_hres_active(timer)) {
- /*
- * Kick to reschedule the next tick to handle the new timer
- * on dynticks target.
- */
- if (is_timers_nohz_active())
- wake_up_nohz_cpu(new_base->cpu_base->cpu);
- } else {
- hrtimer_reprogram(timer, new_base);
- }
+ hrtimer_reprogram(timer, new_base);
unlock:
unlock_hrtimer_base(timer, &flags);
}
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -151,12 +151,6 @@ static inline void tick_nohz_init(void)
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
extern void timers_update_nohz(void);
-extern struct static_key_false timers_nohz_active;
-
-static inline bool is_timers_nohz_active(void)
-{
- return static_branch_unlikely(&timers_nohz_active);
-}
#ifdef CONFIG_SMP
extern struct static_key_false timers_migration_enabled;
@@ -164,11 +158,6 @@ extern struct static_key_false timers_mi
#else
static inline void timers_update_nohz(void) { }
#define tick_nohz_active (0)
-
-static inline bool is_timers_nohz_active(void)
-{
- return false;
-}
#endif
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -210,7 +210,7 @@ static DEFINE_PER_CPU(struct timer_base,
#ifdef CONFIG_NO_HZ_COMMON
-DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
static DEFINE_MUTEX(timer_keys_mutex);
static void timer_update_keys(struct work_struct *work);
@@ -260,6 +260,19 @@ int timer_migration_handler(struct ctl_t
mutex_unlock(&timer_keys_mutex);
return ret;
}
+
+static inline bool is_timers_nohz_active(void)
+{
+ return static_branch_unlikely(&timers_nohz_active);
+}
+
+#else
+
+static inline bool is_timers_nohz_active(void)
+{
+ return false;
+}
+
#endif /* NO_HZ_COMMON */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
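
A hedged model of the unified path described above (stub types; program_event() stands in for tick_program_event()):

struct model_cpu_base {
        long long expires_next;
        int hres_active;
        int hang_detected;
};

static void program_event(long long expires) { (void)expires; /* hw stub */ }

static void reprogram(struct model_cpu_base *base, long long expires)
{
        /* Store the new first expiry unconditionally, so the remote
         * enqueue check always compares against fresh data. */
        base->expires_next = expires;

        /* Hardware is only touched in high resolution mode, and not
         * while hang throttling is in effect. */
        if (!base->hres_active || base->hang_detected)
                return;

        program_event(expires);
}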

View File

@ -1,196 +0,0 @@
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 26 Jun 2017 17:49:21 -0500
Subject: [PATCH 20/32] tracing: Add support for dynamic tracepoints
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/4.11/older/patches-4.11.9-rt7.tar.xz
The tracepoint infrastructure assumes statically-defined tracepoints
and uses static_keys for tracepoint enablement. In order to define
tracepoints on the fly, we need to have a dynamic counterpart.
Add a dynamic_tracepoint_probe_register() and a dynamic param onto
tracepoint_probe_unregister() for this purpose.
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/tracepoint.h | 11 +++++++----
kernel/trace/trace_events.c | 4 ++--
kernel/tracepoint.c | 42 ++++++++++++++++++++++++++++++------------
3 files changed, 39 insertions(+), 18 deletions(-)
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -37,9 +37,12 @@ extern int
tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data);
extern int
tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data,
- int prio);
+ int prio, bool dynamic);
+extern int dynamic_tracepoint_probe_register(struct tracepoint *tp,
+ void *probe, void *data);
extern int
-tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data);
+tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data,
+ bool dynamic);
extern void
for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
void *priv);
@@ -206,13 +209,13 @@ extern void syscall_unregfunc(void);
int prio) \
{ \
return tracepoint_probe_register_prio(&__tracepoint_##name, \
- (void *)probe, data, prio); \
+ (void *)probe, data, prio, false); \
} \
static inline int \
unregister_trace_##name(void (*probe)(data_proto), void *data) \
{ \
return tracepoint_probe_unregister(&__tracepoint_##name,\
- (void *)probe, data); \
+ (void *)probe, data, false); \
} \
static inline void \
check_trace_callback_type_##name(void (*cb)(data_proto)) \
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -297,7 +297,7 @@ int trace_event_reg(struct trace_event_c
case TRACE_REG_UNREGISTER:
tracepoint_probe_unregister(call->tp,
call->class->probe,
- file);
+ file, false);
return 0;
#ifdef CONFIG_PERF_EVENTS
@@ -308,7 +308,7 @@ int trace_event_reg(struct trace_event_c
case TRACE_REG_PERF_UNREGISTER:
tracepoint_probe_unregister(call->tp,
call->class->perf_probe,
- call);
+ call, false);
return 0;
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -192,12 +192,15 @@ static void *func_remove(struct tracepoi
* Add the probe function to a tracepoint.
*/
static int tracepoint_add_func(struct tracepoint *tp,
- struct tracepoint_func *func, int prio)
+ struct tracepoint_func *func, int prio,
+ bool dynamic)
{
struct tracepoint_func *old, *tp_funcs;
int ret;
- if (tp->regfunc && !static_key_enabled(&tp->key)) {
+ if (tp->regfunc &&
+ ((dynamic && !(atomic_read(&tp->key.enabled) > 0)) ||
+ !static_key_enabled(&tp->key))) {
ret = tp->regfunc();
if (ret < 0)
return ret;
@@ -219,7 +222,9 @@ static int tracepoint_add_func(struct tr
* is used.
*/
rcu_assign_pointer(tp->funcs, tp_funcs);
- if (!static_key_enabled(&tp->key))
+ if (dynamic && !(atomic_read(&tp->key.enabled) > 0))
+ atomic_inc(&tp->key.enabled);
+ else if (!dynamic && !static_key_enabled(&tp->key))
static_key_slow_inc(&tp->key);
release_probes(old);
return 0;
@@ -232,7 +237,7 @@ static int tracepoint_add_func(struct tr
* by preempt_disable around the call site.
*/
static int tracepoint_remove_func(struct tracepoint *tp,
- struct tracepoint_func *func)
+ struct tracepoint_func *func, bool dynamic)
{
struct tracepoint_func *old, *tp_funcs;
@@ -246,10 +251,14 @@ static int tracepoint_remove_func(struct
if (!tp_funcs) {
/* Removed last function */
- if (tp->unregfunc && static_key_enabled(&tp->key))
+ if (tp->unregfunc &&
+ ((dynamic && (atomic_read(&tp->key.enabled) > 0)) ||
+ static_key_enabled(&tp->key)))
tp->unregfunc();
- if (static_key_enabled(&tp->key))
+ if (dynamic && (atomic_read(&tp->key.enabled) > 0))
+ atomic_dec(&tp->key.enabled);
+ else if (!dynamic && static_key_enabled(&tp->key))
static_key_slow_dec(&tp->key);
}
rcu_assign_pointer(tp->funcs, tp_funcs);
@@ -258,7 +267,7 @@ static int tracepoint_remove_func(struct
}
/**
- * tracepoint_probe_register - Connect a probe to a tracepoint
+ * tracepoint_probe_register_prio - Connect a probe to a tracepoint
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
@@ -271,7 +280,7 @@ static int tracepoint_remove_func(struct
* within module exit functions.
*/
int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
- void *data, int prio)
+ void *data, int prio, bool dynamic)
{
struct tracepoint_func tp_func;
int ret;
@@ -280,7 +289,7 @@ int tracepoint_probe_register_prio(struc
tp_func.func = probe;
tp_func.data = data;
tp_func.prio = prio;
- ret = tracepoint_add_func(tp, &tp_func, prio);
+ ret = tracepoint_add_func(tp, &tp_func, prio, dynamic);
mutex_unlock(&tracepoints_mutex);
return ret;
}
@@ -301,10 +310,18 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_regis
*/
int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
{
- return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO);
+ return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO, false);
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
+int dynamic_tracepoint_probe_register(struct tracepoint *tp, void *probe,
+ void *data)
+{
+ return tracepoint_probe_register_prio(tp, probe, data,
+ TRACEPOINT_DEFAULT_PRIO, true);
+}
+EXPORT_SYMBOL_GPL(dynamic_tracepoint_probe_register);
+
/**
* tracepoint_probe_unregister - Disconnect a probe from a tracepoint
* @tp: tracepoint
@@ -313,7 +330,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_regis
*
* Returns 0 if ok, error value on error.
*/
-int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data)
+int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data,
+ bool dynamic)
{
struct tracepoint_func tp_func;
int ret;
@@ -321,7 +339,7 @@ int tracepoint_probe_unregister(struct t
mutex_lock(&tracepoints_mutex);
tp_func.func = probe;
tp_func.data = data;
- ret = tracepoint_remove_func(tp, &tp_func);
+ ret = tracepoint_remove_func(tp, &tp_func, dynamic);
mutex_unlock(&tracepoints_mutex);
return ret;
}
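
For reference, usage of the API this (now dropped) patch added would have looked roughly like the following kernel-context sketch (assumes <linux/tracepoint.h>; my_tp, my_probe and attach_dynamic are hypothetical names, not part of the patch):

/* Kernel-context sketch, not standalone code. */
static void my_probe(void *data, int value)
{
        (void)data;
        (void)value;    /* ... handle the dynamically attached event ... */
}

static int attach_dynamic(struct tracepoint *my_tp)
{
        int ret;

        /* Registers at default priority with dynamic semantics, bumping
         * tp->key.enabled atomically instead of via static_key. */
        ret = dynamic_tracepoint_probe_register(my_tp, (void *)my_probe, NULL);
        if (ret)
                return ret;

        /* ... later, tear down with the matching dynamic flag ... */
        return tracepoint_probe_unregister(my_tp, (void *)my_probe, NULL, true);
}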

Some files were not shown because too many files have changed in this diff.