Add copy_to_user patches from Marvell

svn path=/dists/trunk/linux-2.6/; revision=13721
Author: Martin Michlmayr
Date: 2009-06-03 20:58:24 +00:00
parent 52a7c27516
commit 063ca30d03
4 changed files with 214 additions and 0 deletions

debian/changelog

@@ -45,6 +45,9 @@ linux-2.6 (2.6.30~rc8-1~experimental.1) UNRELEASED; urgency=low
     these days, so disable IDE and build in ATA, SCSI and BLK_DEV_SD.
   * [mips/sb1-bcm91250a, mips/sb1a-bcm91480b] Compile in SB1250_MAC and
     BROADCOM_PHY.
+  * Add patches from git.marvell.com:
+    - alternative copy_to_user: more precise fallback threshold
+    - lower overhead with alternative copy_to_user for small copies
 
   [ Aurelien Jarno ]
   * [mips(el)/sb1-bcm91250a] Set CONFIG_SCSI_AIC7XXX=y, it is needed

debian/patches/features/arm/copy_to_user-better_threshold.patch

@@ -0,0 +1,121 @@
From: Nicolas Pitre <nico@cam.org>
Date: Sat, 30 May 2009 01:55:50 +0000 (-0400)
Subject: [ARM] alternative copy_to_user: more precise fallback threshold
X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=c626e3f5ca1d95ad2204d3128c26e7678714eb55
[ARM] alternative copy_to_user: more precise fallback threshold
Previous size thresholds were guessed from various user-space benchmarks
run on kernels with and without the alternative uaccess option. This is,
however, not as precise as a kernel-based test that measures the real
speed of each method.
This adds a simple test bench to show the time needed for each method.
With this, the optimal size threshold for the alternative implementation
can be determined with more confidence. It appears that the optimal
threshold for both copy_to_user and clear_user is around 64 bytes. This
is no surprise given that the memcpy and memset implementations need at
least 64 bytes to achieve maximum throughput.
One might suggest that such a test be used to determine the optimal
threshold at run time instead, but results are close enough to 64 on
the tested targets concerned by this alternative copy_to_user
implementation that the overhead of a variable threshold is probably
not worth it for now.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 92838e7..6b967ff 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -106,7 +106,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
* With frame pointer disabled, tail call optimization kicks in
* as well making this test almost invisible.
*/
- if (n < 1024)
+ if (n < 64)
return __copy_to_user_std(to, from, n);
return __copy_to_user_memcpy(to, from, n);
}
@@ -151,7 +151,78 @@ out:
unsigned long __clear_user(void __user *addr, unsigned long n)
{
/* See rationale for this in __copy_to_user() above. */
- if (n < 256)
+ if (n < 64)
return __clear_user_std(addr, n);
return __clear_user_memset(addr, n);
}
+
+#if 0
+
+/*
+ * This code is disabled by default, but kept around in case the chosen
+ * thresholds need to be revalidated. Some overhead (small but still)
+ * would be implied by a runtime determined variable threshold, and
+ * so far the measurement on concerned targets didn't show a worthwhile
+ * variation.
+ *
+ * Note that a fairly precise sched_clock() implementation is needed
+ * for results to make some sense.
+ */
+
+#include <linux/vmalloc.h>
+
+static int __init test_size_treshold(void)
+{
+ struct page *src_page, *dst_page;
+ void *user_ptr, *kernel_ptr;
+ unsigned long long t0, t1, t2;
+ int size, ret;
+
+ ret = -ENOMEM;
+ src_page = alloc_page(GFP_KERNEL);
+ if (!src_page)
+ goto no_src;
+ dst_page = alloc_page(GFP_KERNEL);
+ if (!dst_page)
+ goto no_dst;
+ kernel_ptr = page_address(src_page);
+ user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010));
+ if (!user_ptr)
+ goto no_vmap;
+
+ /* warm up the src page dcache */
+ ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE);
+
+ for (size = PAGE_SIZE; size >= 4; size /= 2) {
+ t0 = sched_clock();
+ ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size);
+ t1 = sched_clock();
+ ret |= __copy_to_user_std(user_ptr, kernel_ptr, size);
+ t2 = sched_clock();
+ printk("copy_to_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
+ }
+
+ for (size = PAGE_SIZE; size >= 4; size /= 2) {
+ t0 = sched_clock();
+ ret |= __clear_user_memset(user_ptr, size);
+ t1 = sched_clock();
+ ret |= __clear_user_std(user_ptr, size);
+ t2 = sched_clock();
+ printk("clear_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
+ }
+
+ if (ret)
+ ret = -EFAULT;
+
+ vunmap(user_ptr);
+no_vmap:
+ put_page(dst_page);
+no_dst:
+ put_page(src_page);
+no_src:
+ return ret;
+}
+
+subsys_initcall(test_size_treshold);
+
+#endif
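
The bench above illustrates a general pattern: time each strategy across
halving sizes and look for the crossover point. The same idea can be tried
in user space; the sketch below is illustrative only — the byte-wise loop
stands in for __copy_to_user_std, plain memcpy for the memcpy-based path,
and clock_gettime replaces sched_clock; none of these names come from the
patch itself.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

#define BUF_SIZE 4096

static unsigned char src[BUF_SIZE], dst[BUF_SIZE];

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* stand-in for the "std" path: a simple byte-wise copy */
static void copy_std(void *to, const void *from, size_t n)
{
	unsigned char *d = to;
	const unsigned char *s = from;

	while (n--)
		*d++ = *s++;
}

int main(void)
{
	uint64_t t0, t1, t2;
	int size;

	memcpy(dst, src, BUF_SIZE);	/* warm up the caches */

	for (size = BUF_SIZE; size >= 4; size /= 2) {
		t0 = now_ns();
		memcpy(dst, src, size);
		t1 = now_ns();
		copy_std(dst, src, size);
		t2 = now_ns();
		printf("copy: %d %llu %llu\n", size,
		       (unsigned long long)(t1 - t0),
		       (unsigned long long)(t2 - t1));
	}
	return 0;
}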

debian/patches/features/arm/lower_overhead_with_alternative.patch

@@ -0,0 +1,88 @@
From: Nicolas Pitre <nico@cam.org>
Date: Fri, 22 May 2009 02:17:17 +0000 (-0400)
Subject: [ARM] lower overhead with alternative copy_to_user for small copies
X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=cb9dc92c0a1b76165c8c334402e27191084b2047
[ARM] lower overhead with alternative copy_to_user for small copies
Because the alternative copy_to_user implementation has a higher setup
cost than the standard implementation, the size of the memory area to
copy is tested and the standard implementation invoked instead when that
size is too small. Still, that test is made after the processor has
preserved a bunch of registers on the stack, which then have to be
reloaded right away, needlessly in that case, causing a measurable
performance regression compared to plain use of the standard
implementation alone.
To make the size-test overhead negligible, let's factor it out of the
alternative copy_to_user function, where it is clear to the compiler
that no stack frame is needed. Thanks to CONFIG_ARM_UNWIND allowing
frame pointers to be disabled and tail-call optimization to kick in,
the overhead in the small-copy case comes down to only 3 assembly
instructions. A similar trick is applied to clear_user as well.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index bf987b4..92838e7 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -49,14 +49,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
return 1;
}
-unsigned long
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+static unsigned long noinline
+__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
{
int atomic;
- if (n < 1024)
- return __copy_to_user_std(to, from, n);
-
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
memcpy((void *)to, from, n);
return 0;
@@ -99,11 +96,24 @@ out:
return n;
}
-unsigned long __clear_user(void __user *addr, unsigned long n)
+unsigned long
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ /*
+ * This test is stubbed out of the main function above to keep
+ * the overhead for small copies low by avoiding a large
+ * register dump on the stack just to reload them right away.
+ * With frame pointer disabled, tail call optimization kicks in
+ * as well making this test almost invisible.
+ */
+ if (n < 1024)
+ return __copy_to_user_std(to, from, n);
+ return __copy_to_user_memcpy(to, from, n);
+}
+
+static unsigned long noinline
+__clear_user_memset(void __user *addr, unsigned long n)
{
- if (n < 256)
- return __clear_user_std(addr, n);
-
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
memset((void *)addr, 0, n);
return 0;
@@ -137,3 +147,11 @@ unsigned long __clear_user(void __user *addr, unsigned long n)
out:
return n;
}
+
+unsigned long __clear_user(void __user *addr, unsigned long n)
+{
+ /* See rationale for this in __copy_to_user() above. */
+ if (n < 256)
+ return __clear_user_std(addr, n);
+ return __clear_user_memset(addr, n);
+}
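
The resulting shape is easiest to see outside the kernel. Below is a
minimal standalone sketch of the thin-wrapper pattern (illustrative
names, not the kernel functions): the wrapper only compares n and
tail-calls one of two noinline workers, so the small-copy path never
pays for the large worker's register and stack setup.

#include <stddef.h>
#include <string.h>

#define noinline __attribute__((noinline))

/* cheap path: no setup cost, fine for short runs */
static noinline size_t copy_small(void *to, const void *from, size_t n)
{
	unsigned char *d = to;
	const unsigned char *s = from;

	while (n--)
		*d++ = *s++;
	return 0;
}

/* expensive path: higher setup cost, wins on long runs */
static noinline size_t copy_large(void *to, const void *from, size_t n)
{
	memcpy(to, from, n);
	return 0;
}

/*
 * With frame pointers disabled, both calls below compile to tail
 * calls: the wrapper body is just a compare, a branch and a jump,
 * so no registers are saved and reloaded for the small case.
 */
size_t copy_dispatch(void *to, const void *from, size_t n)
{
	if (n < 64)
		return copy_small(to, from, n);
	return copy_large(to, from, n);
}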

debian/patches/series/1~experimental.1

@@ -27,6 +27,8 @@
 #+ features/sparc/video-sunxvr500-intergraph.patch
 + features/arm/allow-alternative-copy-user.patch
 + features/arm/alternative-copy-user.patch
++ features/arm/lower_overhead_with_alternative.patch
++ features/arm/copy_to_user-better_threshold.patch
 + bugfix/all/mvsdio-platform.patch
 + bugfix/all/mvsdio-ignore-high-speed.patch
 + bugfix/all/mvsdio-config-failure.patch