perf/core: Optimize perf_pmu_sched_task()

This change “perf/core: Optimize perf_pmu_sched_task()” in Linux kernel is authored by Peter Zijlstra <peterz [at] infradead.org> on Wed Jul 6 09:18:30 2016 +0200.

perf/core: Optimize perf_pmu_sched_task()

For perf record -b, which requires the pmu::sched_task callback the
current code is rather expensive:

     7.68%  sched-pipe  [kernel.vmlinux]    [k] perf_pmu_sched_task
     5.95%  sched-pipe  [kernel.vmlinux]    [k] __switch_to
     5.20%  sched-pipe  [kernel.vmlinux]    [k] __intel_pmu_disable_all
     3.95%  sched-pipe  perf                [.] worker_thread

The problem is that it will iterate all registered PMUs, most of which
will not have anything to do. Avoid this by keeping an explicit list
of PMUs that have requested the callback.

The perf_sched_cb_{inc,dec}() functions already takes the required pmu
argument, and now that these functions are no longer called from NMI
context we can use them to manage a list.

With this patch applied the function doesn't show up in the top 4
anymore (it dropped to 18th place).

     6.67%  sched-pipe  [kernel.vmlinux]    [k] __switch_to
     6.18%  sched-pipe  [kernel.vmlinux]    [k] __intel_pmu_disable_all
     3.92%  sched-pipe  [kernel.vmlinux]    [k] switch_mm_irqs_off
     3.71%  sched-pipe  perf                [.] worker_thread

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

This Linux change may have been applied to various maintained Linux releases and you can find Linux releases including commit e48c178.

There are 46 lines of Linux source code added/deleted in this change. Code changes to Linux kernel are as follows.

 include/linux/perf_event.h |  3 +++
 kernel/events/core.c       | 43 ++++++++++++++++++++++++-------------------
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2b6b43c..529c41f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -774,6 +774,9 @@ struct perf_cpu_context {
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
 #endif
+
+	struct list_head		sched_cb_entry;
+	int				sched_cb_usage;
 };
 
 struct perf_output_handle {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57aff71..803481c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2805,13 +2805,26 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	}
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
 	this_cpu_dec(perf_sched_cb_usages);
+
+	if (!--cpuctx->sched_cb_usage)
+		list_del(&cpuctx->sched_cb_entry);
 }
 
+
 void perf_sched_cb_inc(struct pmu *pmu)
 {
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+	if (!cpuctx->sched_cb_usage++)
+		list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
 	this_cpu_inc(perf_sched_cb_usages);
 }
 
@@ -2829,34 +2842,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 {
 	struct perf_cpu_context *cpuctx;
 	struct pmu *pmu;
-	unsigned long flags;
 
 	if (prev == next)
 		return;
 
-	local_irq_save(flags);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		if (pmu->sched_task) {
-			cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+	list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+		pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
 
-			perf_pmu_disable(pmu);
+		if (WARN_ON_ONCE(!pmu->sched_task))
+			continue;
 
-			pmu->sched_task(cpuctx->task_ctx, sched_in);
+		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+		perf_pmu_disable(pmu);
 
-			perf_pmu_enable(pmu);
+		pmu->sched_task(cpuctx->task_ctx, sched_in);
 
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		}
+		perf_pmu_enable(pmu);
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
-
-	rcu_read_unlock();
-
-	local_irq_restore(flags);
 }
 
 static void perf_event_switch(struct task_struct *task,
@@ -10393,6 +10396,8 @@ static void __init perf_event_init_all_cpus(void)
 
 		INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
 		raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+		INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
 	}
 }
 

The commit for this change in Linux stable tree is e48c178 (patch).

Leave a Reply

Your email address will not be published. Required fields are marked *