Add copy_to_user patches from Marvell

svn path=/dists/trunk/linux-2.6/; revision=13721
Author: Martin Michlmayr
Date: 2009-06-03 20:58:24 +00:00
parent 52a7c27516
commit 063ca30d03
4 changed files with 214 additions and 0 deletions

debian/changelog

@@ -45,6 +45,9 @@ linux-2.6 (2.6.30~rc8-1~experimental.1) UNRELEASED; urgency=low
     these days, so disable IDE and build in ATA, SCSI and BLK_DEV_SD.
   * [mips/sb1-bcm91250a, mips/sb1a-bcm91480b] Compile in SB1250_MAC and
     BROADCOM_PHY.
+  * Add patches from git.marvell.com:
+    - alternative copy_to_user: more precise fallback threshold
+    - lower overhead with alternative copy_to_user for small copies
 
   [ Aurelien Jarno ]
   * [mips(el)/sb1-bcm91250a] Set CONFIG_SCSI_AIC7XXX=y, it is needed

debian/patches/features/arm/copy_to_user-better_threshold.patch

@@ -0,0 +1,121 @@
From: Nicolas Pitre <nico@cam.org>
Date: Sat, 30 May 2009 01:55:50 +0000 (-0400)
Subject: [ARM] alternative copy_to_user: more precise fallback threshold
X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=c626e3f5ca1d95ad2204d3128c26e7678714eb55
[ARM] alternative copy_to_user: more precise fallback threshold
Previous size thresholds were guessed from various user-space benchmarks
run on kernels with and without the alternative uaccess option. This is,
however, not as precise as a kernel-based test that measures the real
speed of each method.
This adds a simple test bench to show the time needed for each method.
With this, the optimal size threshold for the alternative implementation
can be determined with more confidence. It appears that the optimal
threshold for both copy_to_user and clear_user is around 64 bytes. This
is no surprise given that the memcpy and memset implementations need at
least 64 bytes to achieve maximum throughput.
One might suggest that such a test be used to determine the optimal
threshold at run time instead, but results are close enough to 64 on
the tested targets concerned by this alternative copy_to_user
implementation that the overhead of a variable threshold is probably
not worth it for now.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 92838e7..6b967ff 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -106,7 +106,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
* With frame pointer disabled, tail call optimization kicks in
* as well making this test almost invisible.
*/
- if (n < 1024)
+ if (n < 64)
return __copy_to_user_std(to, from, n);
return __copy_to_user_memcpy(to, from, n);
}
@@ -151,7 +151,78 @@ out:
unsigned long __clear_user(void __user *addr, unsigned long n)
{
/* See rationale for this in __copy_to_user() above. */
- if (n < 256)
+ if (n < 64)
return __clear_user_std(addr, n);
return __clear_user_memset(addr, n);
}
+
+#if 0
+
+/*
+ * This code is disabled by default, but kept around in case the chosen
+ * thresholds need to be revalidated. Some overhead (small but still)
+ * would be implied by a runtime determined variable threshold, and
+ * so far the measurement on concerned targets didn't show a worthwhile
+ * variation.
+ *
+ * Note that a fairly precise sched_clock() implementation is needed
+ * for results to make some sense.
+ */
+
+#include <linux/vmalloc.h>
+
+static int __init test_size_treshold(void)
+{
+ struct page *src_page, *dst_page;
+ void *user_ptr, *kernel_ptr;
+ unsigned long long t0, t1, t2;
+ int size, ret;
+
+ ret = -ENOMEM;
+ src_page = alloc_page(GFP_KERNEL);
+ if (!src_page)
+ goto no_src;
+ dst_page = alloc_page(GFP_KERNEL);
+ if (!dst_page)
+ goto no_dst;
+ kernel_ptr = page_address(src_page);
+ user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010));
+ if (!user_ptr)
+ goto no_vmap;
+
+ /* warm up the src page dcache */
+ ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE);
+
+ for (size = PAGE_SIZE; size >= 4; size /= 2) {
+ t0 = sched_clock();
+ ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size);
+ t1 = sched_clock();
+ ret |= __copy_to_user_std(user_ptr, kernel_ptr, size);
+ t2 = sched_clock();
+ printk("copy_to_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
+ }
+
+ for (size = PAGE_SIZE; size >= 4; size /= 2) {
+ t0 = sched_clock();
+ ret |= __clear_user_memset(user_ptr, size);
+ t1 = sched_clock();
+ ret |= __clear_user_std(user_ptr, size);
+ t2 = sched_clock();
+ printk("clear_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
+ }
+
+ if (ret)
+ ret = -EFAULT;
+
+ vunmap(user_ptr);
+no_vmap:
+ put_page(dst_page);
+no_dst:
+ put_page(src_page);
+no_src:
+ return ret;
+}
+
+subsys_initcall(test_size_treshold);
+
+#endif
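
The bench above illustrates a general pattern: time each strategy across
halving sizes and look for the crossover point. The same idea can be tried
in user space; the sketch below is illustrative only — the byte-wise loop
stands in for __copy_to_user_std, plain memcpy for the memcpy-based path,
and clock_gettime replaces sched_clock; none of these names come from the
patch itself.

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <time.h>

#define BUF_SIZE 4096

static unsigned char src[BUF_SIZE], dst[BUF_SIZE];

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* stand-in for the "std" path: a simple byte-wise copy */
static void copy_std(void *to, const void *from, size_t n)
{
	unsigned char *d = to;
	const unsigned char *s = from;

	while (n--)
		*d++ = *s++;
}

int main(void)
{
	uint64_t t0, t1, t2;
	int size;

	memcpy(dst, src, BUF_SIZE);	/* warm up the caches */

	for (size = BUF_SIZE; size >= 4; size /= 2) {
		t0 = now_ns();
		memcpy(dst, src, size);
		t1 = now_ns();
		copy_std(dst, src, size);
		t2 = now_ns();
		printf("copy: %d %llu %llu\n", size,
		       (unsigned long long)(t1 - t0),
		       (unsigned long long)(t2 - t1));
	}
	return 0;
}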

debian/patches/features/arm/lower_overhead_with_alternative.patch

@@ -0,0 +1,88 @@
From: Nicolas Pitre <nico@cam.org>
Date: Fri, 22 May 2009 02:17:17 +0000 (-0400)
Subject: [ARM] lower overhead with alternative copy_to_user for small copies
X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=cb9dc92c0a1b76165c8c334402e27191084b2047
[ARM] lower overhead with alternative copy_to_user for small copies
Because the alternative copy_to_user implementation has a higher setup
cost than the standard implementation, the size of the memory area to
copy is tested and the standard implementation invoked instead when that
size is too small. Still, that test is made after the processor has
preserved a bunch of registers on the stack, which then have to be
reloaded right away, needlessly in that case, causing a measurable
performance regression compared to plain use of the standard
implementation alone.
To make the size-test overhead negligible, let's factor it out of the
alternative copy_to_user function, where it is clear to the compiler
that no stack frame is needed. Thanks to CONFIG_ARM_UNWIND allowing
frame pointers to be disabled and tail-call optimization to kick in,
the overhead in the small-copy case comes down to only 3 assembly
instructions. A similar trick is applied to clear_user as well.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index bf987b4..92838e7 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -49,14 +49,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
return 1;
}
-unsigned long
-__copy_to_user(void __user *to, const void *from, unsigned long n)
+static unsigned long noinline
+__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
{
int atomic;
- if (n < 1024)
- return __copy_to_user_std(to, from, n);
-
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
memcpy((void *)to, from, n);
return 0;
@@ -99,11 +96,24 @@ out:
return n;
}
-unsigned long __clear_user(void __user *addr, unsigned long n)
+unsigned long
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ /*
+ * This test is stubbed out of the main function above to keep
+ * the overhead for small copies low by avoiding a large
+ * register dump on the stack just to reload them right away.
+ * With frame pointer disabled, tail call optimization kicks in
+ * as well making this test almost invisible.
+ */
+ if (n < 1024)
+ return __copy_to_user_std(to, from, n);
+ return __copy_to_user_memcpy(to, from, n);
+}
+
+static unsigned long noinline
+__clear_user_memset(void __user *addr, unsigned long n)
{
- if (n < 256)
- return __clear_user_std(addr, n);
-
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
memset((void *)addr, 0, n);
return 0;
@@ -137,3 +147,11 @@ unsigned long __clear_user(void __user *addr, unsigned long n)
out:
return n;
}
+
+unsigned long __clear_user(void __user *addr, unsigned long n)
+{
+ /* See rationale for this in __copy_to_user() above. */
+ if (n < 256)
+ return __clear_user_std(addr, n);
+ return __clear_user_memset(addr, n);
+}
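
The resulting shape is easiest to see outside the kernel. Below is a
minimal standalone sketch of the thin-wrapper pattern (illustrative
names, not the kernel functions): the wrapper only compares n and
tail-calls one of two noinline workers, so the small-copy path never
pays for the large worker's register and stack setup.

#include <stddef.h>
#include <string.h>

#define noinline __attribute__((noinline))

/* cheap path: no setup cost, fine for short runs */
static noinline size_t copy_small(void *to, const void *from, size_t n)
{
	unsigned char *d = to;
	const unsigned char *s = from;

	while (n--)
		*d++ = *s++;
	return 0;
}

/* expensive path: higher setup cost, wins on long runs */
static noinline size_t copy_large(void *to, const void *from, size_t n)
{
	memcpy(to, from, n);
	return 0;
}

/*
 * With frame pointers disabled, both calls below compile to tail
 * calls: the wrapper body is just a compare, a branch and a jump,
 * so no registers are saved and reloaded for the small case.
 */
size_t copy_dispatch(void *to, const void *from, size_t n)
{
	if (n < 64)
		return copy_small(to, from, n);
	return copy_large(to, from, n);
}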

debian/patches/series/1~experimental.1

@@ -27,6 +27,8 @@
 #+ features/sparc/video-sunxvr500-intergraph.patch
 + features/arm/allow-alternative-copy-user.patch
 + features/arm/alternative-copy-user.patch
++ features/arm/lower_overhead_with_alternative.patch
++ features/arm/copy_to_user-better_threshold.patch
 + bugfix/all/mvsdio-platform.patch
 + bugfix/all/mvsdio-ignore-high-speed.patch
 + bugfix/all/mvsdio-config-failure.patch