aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Chris Wilson <[email protected]> 2016-01-08 09:55:33 +0000
committer Thomas Gleixner <[email protected]> 2016-01-08 19:27:39 +0100
commit 1f1a89ac05f6e88aa341e86e57435fdbb1177c0c (patch)
tree f2ccbc5afe94040c6ba411038e91c63d1ea90101
parent 2039e6acaf94d83ec6b6d9f3d0bce7ea1f099918 (diff)
x86/mm: Micro-optimise clflush_cache_range()
Whilst inspecting the asm for clflush_cache_range() and some perf profiles that required extensive flushing of single cachelines (from part of the intel-gpu-tools GPU benchmarks), we noticed that gcc was reloading boot_cpu_data.x86_clflush_size on every iteration of the loop. We can manually hoist that read, which perf regarded as taking ~25% of the function time for a single cacheline flush.

Signed-off-by: Chris Wilson <[email protected]>
Reviewed-by: Ross Zwisler <[email protected]>
Acked-by: "H. Peter Anvin" <[email protected]>
Cc: Toshi Kani <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Luis R. Rodriguez <[email protected]>
Cc: Stephen Rothwell <[email protected]>
Cc: Sai Praneeth <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Thomas Gleixner <[email protected]>
-rw-r--r-- arch/x86/mm/pageattr.c 10
1 files changed, 6 insertions, 4 deletions
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a3137a4feed1..6000ad7f560c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -129,14 +129,16 @@ within(unsigned long addr, unsigned long start, unsigned long end)
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
- unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
+ void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
void *vend = vaddr + size;
- void *p;
+
+ if (p >= vend)
+ return;
mb();
- for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
- p < vend; p += boot_cpu_data.x86_clflush_size)
+ for (; p < vend; p += clflush_size)
clflushopt(p);
mb();