Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git (synced 2025-01-24 09:13:20 -05:00)
perf/core: Fix perf_cgroup_switch()
There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp) in
perf_cgroup_switch():

	CPU1					CPU2
	perf_cgroup_sched_out(prev, next)
	  cgrp1 = perf_cgroup_from_task(prev)
	  cgrp2 = perf_cgroup_from_task(next)
	  if (cgrp1 != cgrp2)
	    perf_cgroup_switch(prev, PERF_CGROUP_SWOUT)
						cgroup_migrate_execute()
						  task->cgroups = ?
						  perf_cgroup_attach()
						    task_function_call(task, __perf_cgroup_move)
	perf_cgroup_sched_in(prev, next)
	  cgrp1 = perf_cgroup_from_task(prev)
	  cgrp2 = perf_cgroup_from_task(next)
	  if (cgrp1 != cgrp2)
	    perf_cgroup_switch(next, PERF_CGROUP_SWIN)
						__perf_cgroup_move()
						  perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN)

Commit a8d757ef07 ("perf events: Fix slow and broken cgroup context
switch code") wanted to skip perf_cgroup_switch() when the perf_cgroup
of "prev" and "next" are the same.

But task->cgroups can change concurrently with context_switch(), in
cgroup_migrate_execute(). If cgrp1 == cgrp2 during sched_out(), cpuctx
won't do a sched_out. If task->cgroups then changes so that
cgrp1 != cgrp2 during sched_in(), cpuctx will do a sched_in, which
triggers WARN_ON_ONCE(cpuctx->cgrp).

Even though __perf_cgroup_move() is serialized against the context
switch (the context switch runs with interrupts disabled),
context_switch() can still observe task->cgroups mid-change, since
task->cgroups is updated before the IPI is sent.

So perf_cgroup_sched_in() has to be combined with
perf_cgroup_sched_out(), unified into perf_cgroup_switch(), to fix the
inconsistency between perf_cgroup_sched_out() and
perf_cgroup_sched_in().

But we can't just compare prev->cgroups with next->cgroups to decide
whether to skip the cpuctx sched_out/in, since prev->cgroups can be
changing too. For example:

	CPU1					CPU2
						cgroup_migrate_execute()
						  prev->cgroups = ?
						  perf_cgroup_attach()
						    task_function_call(task, __perf_cgroup_move)
	perf_cgroup_switch(task)
	  cgrp1 = perf_cgroup_from_task(prev)
	  cgrp2 = perf_cgroup_from_task(next)
	  if (cgrp1 != cgrp2)
	    cpuctx sched_out/in ...
						task_function_call() will return -ESRCH

In this example, the concurrent change of prev->cgroups can make
(cgrp1 == cgrp2) true, so the cpuctx sched_out/in is skipped. The later
task_function_call() then returns -ESRCH, since the prev task is no
longer running on that CPU. The result is that the perf_events of the
old prev->cgroups are left scheduled on the CPU, which is wrong.

The solution is to compare cpuctx->cgrp with the next task's
perf_cgroup instead. Since cpuctx->cgrp can only be changed on the
local CPU, and interrupts are disabled, cpuctx->cgrp can be read for
the comparison without holding the ctx lock.

Fixes: a8d757ef07 ("perf events: Fix slow and broken cgroup context switch code")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220329154523.86438-4-zhouchengming@bytedance.com
This commit is contained in:
parent 6875186aea
commit 96492a6c55
1 changed file with 25 additions and 107 deletions (kernel/events/core.c)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -824,17 +824,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 
-#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
-
 /*
  * reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
  */
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
 {
+	struct perf_cgroup *cgrp;
 	struct perf_cpu_context *cpuctx, *tmp;
 	struct list_head *list;
 	unsigned long flags;
@@ -845,35 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	 */
 	local_irq_save(flags);
 
+	cgrp = perf_cgroup_from_task(task, NULL);
+
 	list = this_cpu_ptr(&cgrp_cpuctx_list);
 	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
 		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+		if (READ_ONCE(cpuctx->cgrp) == cgrp)
+			continue;
 
 		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		if (mode & PERF_CGROUP_SWOUT) {
-			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-			/*
-			 * must not be done before ctxswout due
-			 * to event_filter_match() in event_sched_out()
-			 */
-			cpuctx->cgrp = NULL;
-		}
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		/*
+		 * must not be done before ctxswout due
+		 * to update_cgrp_time_from_cpuctx() in
+		 * ctx_sched_out()
+		 */
+		cpuctx->cgrp = cgrp;
 
-		if (mode & PERF_CGROUP_SWIN) {
-			WARN_ON_ONCE(cpuctx->cgrp);
-			/*
-			 * set cgrp before ctxsw in to allow
-			 * perf_cgroup_set_timestamp() in ctx_sched_in()
-			 * to not have to pass task around
-			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-			 * because cgorup events are only per-cpu
-			 */
-			cpuctx->cgrp = perf_cgroup_from_task(task,
-							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL);
-		}
+		/*
+		 * set cgrp before ctxsw in to allow
+		 * perf_cgroup_set_timestamp() in ctx_sched_in()
+		 * to not have to pass task around
+		 */
+		cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -881,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	local_irq_restore(flags);
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(next, NULL);
-
-	/*
-	 * only schedule out current cgroup events if we know
-	 * that we are switching to a different cgroup. Otherwise,
-	 * do no touch the cgroup events.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
-
-	rcu_read_unlock();
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(prev, NULL);
-
-	/*
-	 * only need to schedule in cgroup events if we are changing
-	 * cgroup during ctxsw. Cgroup events were not scheduled
-	 * out of ctxsw out if that was not the case.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
-
-	rcu_read_unlock();
-}
-
 static int perf_cgroup_ensure_storage(struct perf_event *event,
 				      struct cgroup_subsys_state *css)
 {
@@ -1096,16 +1035,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 {
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-}
-
 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 				      struct perf_event_attr *attr,
 				      struct perf_event *group_leader)
@@ -1118,11 +1047,6 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
 }
 
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
-{
-}
-
 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	return 0;
@@ -1142,6 +1066,10 @@ static inline void
 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
 {
 }
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
 #endif
 
 /*
@@ -3661,7 +3589,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 * cgroup event are system-wide mode only
 	 */
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_out(task, next);
+		perf_cgroup_switch(next);
 }
 
 /*
@@ -3975,16 +3903,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	struct perf_event_context *ctx;
 	int ctxn;
 
-	/*
-	 * If cgroup events exist on this CPU, then we need to check if we have
-	 * to switch in PMU state; cgroup event are system-wide mode only.
-	 *
-	 * Since cgroup events are CPU events, we must schedule these in before
-	 * we schedule in the task events.
-	 */
-	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_in(prev, task);
-
 	for_each_task_context_nr(ctxn) {
 		ctx = task->perf_event_ctxp[ctxn];
 		if (likely(!ctx))
@@ -13556,7 +13474,7 @@ static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
 	rcu_read_lock();
-	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	perf_cgroup_switch(task);
 	rcu_read_unlock();
 	return 0;
 }
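As noted in the commit message, the new skip check reads cpuctx->cgrp
without taking the ctx lock, which is safe only because cpuctx->cgrp is
written exclusively on the local CPU with interrupts disabled. A
minimal user-space analogue of that single-writer rule, with
hypothetical names and READ_ONCE()/WRITE_ONCE() modeled as relaxed
atomics:

	/* single_writer.c - toy analogue of the lockless cpuctx->cgrp read.
	 * Hypothetical names; build: cc -std=c11 -pthread single_writer.c
	 */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define NR_FAKE_CPUS 2

	struct fake_cpuctx {
		/* only the owning thread ever stores here, the way
		 * cpuctx->cgrp is only written on the local CPU */
		_Atomic(int *) cgrp;
	};

	static struct fake_cpuctx fake_cpuctx[NR_FAKE_CPUS];
	static int groups[NR_FAKE_CPUS];	/* stand-in cgroup objects */

	static void *cpu_thread(void *arg)
	{
		struct fake_cpuctx *ctx = arg;
		long cpu = ctx - fake_cpuctx;

		/* WRITE_ONCE()-style store by the single writer ... */
		atomic_store_explicit(&ctx->cgrp, &groups[cpu],
				      memory_order_relaxed);

		/* ... so the owner's READ_ONCE()-style load needs no lock:
		 * no other thread can have stored to this slot in between. */
		if (atomic_load_explicit(&ctx->cgrp,
					 memory_order_relaxed) == &groups[cpu])
			printf("fake CPU %ld: consistent lockless read\n", cpu);
		return NULL;
	}

	int main(void)
	{
		pthread_t t[NR_FAKE_CPUS];

		for (long i = 0; i < NR_FAKE_CPUS; i++)
			pthread_create(&t[i], NULL, cpu_thread, &fake_cpuctx[i]);
		for (long i = 0; i < NR_FAKE_CPUS; i++)
			pthread_join(t[i], NULL);
		return 0;
	}

In the kernel the "owner" is the CPU itself: with interrupts disabled,
nothing else on that CPU can interleave a write, so the comparison
needs no stronger ordering than READ_ONCE().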