From 063ca30d03ffaae96995cfa479b91c54f6e2c7b4 Mon Sep 17 00:00:00 2001
From: Martin Michlmayr <tbm@cyrius.com>
Date: Wed, 3 Jun 2009 20:58:24 +0000
Subject: [PATCH] add copy user patches from Marvell

svn path=/dists/trunk/linux-2.6/; revision=13721
---
 debian/changelog                              |   3 +
 .../arm/copy_to_user-better_threshold.patch   | 121 ++++++++++++++++++
 .../arm/lower_overhead_with_alternative.patch |  88 +++++++++++++
 debian/patches/series/base                    |   2 +
 4 files changed, 214 insertions(+)
 create mode 100644 debian/patches/features/arm/copy_to_user-better_threshold.patch
 create mode 100644 debian/patches/features/arm/lower_overhead_with_alternative.patch

diff --git a/debian/changelog b/debian/changelog
index 89f196a8a..d2c764889 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -45,6 +45,9 @@ linux-2.6 (2.6.30~rc8-1~experimental.1) UNRELEASED; urgency=low
     these days, so disable IDE and build in ATA, SCSI and BLK_DEV_SD.
   * [mips/sb1-bcm91250a, mips/sb1a-bcm91480b] Compile in SB1250_MAC and
     BROADCOM_PHY.
+  * Add patches from git.marvell.com:
+    - alternative copy_to_user: more precise fallback threshold
+    - lower overhead with alternative copy_to_user for small copies
 
   [ Aurelien Jarno ]
   * [mips(el)/sb1-bcm91250a] Set CONFIG_SCSI_AIC7XXX=y, it is needed
diff --git a/debian/patches/features/arm/copy_to_user-better_threshold.patch b/debian/patches/features/arm/copy_to_user-better_threshold.patch
new file mode 100644
index 000000000..3bae79241
--- /dev/null
+++ b/debian/patches/features/arm/copy_to_user-better_threshold.patch
@@ -0,0 +1,121 @@
+From: Nicolas Pitre <nico@cam.org>
+Date: Sat, 30 May 2009 01:55:50 +0000 (-0400)
+Subject: [ARM] alternative copy_to_user: more precise fallback threshold
+X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=c626e3f5ca1d95ad2204d3128c26e7678714eb55
+
+[ARM] alternative copy_to_user: more precise fallback threshold
+
+Previous size thresholds were guessed from various user space benchmarks
+using a kernel with and without the alternative uaccess option.  This
+is however not as precise as a kernel based test to measure the real
+speed of each method.
+
+This adds a simple test bench to show the time needed for each method.
+With this, the optimal size threshold for the alternative implementation
+can be determined with more confidence.  It appears that the optimal
+threshold for both copy_to_user and clear_user is around 64 bytes. This
+is not a surprise knowing that the memcpy and memset implementations
+need at least 64 bytes to achieve maximum throughput.
+
+One might suggest that such a test be used to determine the optimal
+threshold at run time instead, but results are near enough to 64 on
+tested targets concerned by this alternative copy_to_user implementation,
+so adding some overhead associated with a variable threshold is probably
+not worth it for now.
+
+Signed-off-by: Nicolas Pitre <nico@marvell.com>
+---
+
+diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
+index 92838e7..6b967ff 100644
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -106,7 +106,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
+ 	 * With frame pointer disabled, tail call optimization kicks in
+ 	 * as well making this test almost invisible.
+ 	 */
+-	if (n < 1024)
++	if (n < 64)
+ 		return __copy_to_user_std(to, from, n);
+ 	return __copy_to_user_memcpy(to, from, n);
+ }
+@@ -151,7 +151,78 @@ out:
+ unsigned long __clear_user(void __user *addr, unsigned long n)
+ {
+ 	/* See rational for this in __copy_to_user() above. */
+-	if (n < 256)
++	if (n < 64)
+ 		return __clear_user_std(addr, n);
+ 	return __clear_user_memset(addr, n);
+ }
++
++#if 0
++
++/*
++ * This code is disabled by default, but kept around in case the chosen
++ * thresholds need to be revalidated.  Some overhead (small but still)
++ * would be implied by a runtime determined variable threshold, and
++ * so far the measurement on concerned targets didn't show a worthwhile
++ * variation.
++ *
++ * Note that a fairly precise sched_clock() implementation is needed
++ * for results to make some sense.
++ */
++
++#include <linux/vmalloc.h>
++
++static int __init test_size_treshold(void)
++{
++	struct page *src_page, *dst_page;
++	void *user_ptr, *kernel_ptr;
++	unsigned long long t0, t1, t2;
++	int size, ret;
++
++	ret = -ENOMEM;
++	src_page = alloc_page(GFP_KERNEL);
++	if (!src_page)
++		goto no_src;
++	dst_page = alloc_page(GFP_KERNEL);
++	if (!dst_page)
++		goto no_dst;
++	kernel_ptr = page_address(src_page);
++	user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010));
++	if (!user_ptr)
++		goto no_vmap;
++
++	/* warm up the src page dcache */
++	ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE);
++
++	for (size = PAGE_SIZE; size >= 4; size /= 2) {
++		t0 = sched_clock();
++		ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size);
++		t1 = sched_clock();
++		ret |= __copy_to_user_std(user_ptr, kernel_ptr, size);
++		t2 = sched_clock();
++		printk("copy_to_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
++	}
++
++	for (size = PAGE_SIZE; size >= 4; size /= 2) {
++		t0 = sched_clock();
++		ret |= __clear_user_memset(user_ptr, size);
++		t1 = sched_clock();
++		ret |= __clear_user_std(user_ptr, size);
++		t2 = sched_clock();
++		printk("clear_user: %d %llu %llu\n", size, t1 - t0, t2 - t1);
++	}
++
++	if (ret)
++		ret = -EFAULT;
++
++	vunmap(user_ptr);
++no_vmap:
++	put_page(dst_page);
++no_dst:
++	put_page(src_page);
++no_src:
++	return ret;
++}
++
++subsys_initcall(test_size_treshold);
++
++#endif
diff --git a/debian/patches/features/arm/lower_overhead_with_alternative.patch b/debian/patches/features/arm/lower_overhead_with_alternative.patch
new file mode 100644
index 000000000..64deebb3e
--- /dev/null
+++ b/debian/patches/features/arm/lower_overhead_with_alternative.patch
@@ -0,0 +1,88 @@
+From: Nicolas Pitre <nico@cam.org>
+Date: Fri, 22 May 2009 02:17:17 +0000 (-0400)
+Subject: [ARM] lower overhead with alternative copy_to_user for small copies
+X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=cb9dc92c0a1b76165c8c334402e27191084b2047
+
+[ARM] lower overhead with alternative copy_to_user for small copies
+
+Because the alternate copy_to_user implementation has a higher setup cost
+than the standard implementation, the size of the memory area to copy
+is tested and the standard implementation invoked instead when that size
+is too small.  Still, that test is made after the processor has preserved
+a bunch of registers on the stack which have to be reloaded right away
+needlessly in that case, causing a measurable performance regression
+compared to plain usage of the standard implementation only.
+
+To make the size test overhead negligible, let's factorize it out of
+the alternate copy_to_user function where it is clear to the compiler
+that no stack frame is needed.  Thanks to CONFIG_ARM_UNWIND allowing
+for frame pointers to be disabled and tail call optimization to kick in,
+the overhead in the small copy case becomes only 3 assembly instructions.
+
+A similar trick is applied to clear_user as well.
+
+Signed-off-by: Nicolas Pitre <nico@marvell.com>
+---
+
+diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
+index bf987b4..92838e7 100644
+--- a/arch/arm/lib/uaccess_with_memcpy.c
++++ b/arch/arm/lib/uaccess_with_memcpy.c
+@@ -49,14 +49,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+ 	return 1;
+ }
+ 
+-unsigned long
+-__copy_to_user(void __user *to, const void *from, unsigned long n)
++static unsigned long noinline
++__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+ {
+ 	int atomic;
+ 
+-	if (n < 1024)
+-		return __copy_to_user_std(to, from, n);
+-
+ 	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+ 		memcpy((void *)to, from, n);
+ 		return 0;
+@@ -99,11 +96,24 @@ out:
+ 	return n;
+ }
+ 
+-unsigned long __clear_user(void __user *addr, unsigned long n)
++unsigned long
++__copy_to_user(void __user *to, const void *from, unsigned long n)
++{
++	/*
++	 * This test is stubbed out of the main function above to keep
++	 * the overhead for small copies low by avoiding a large
++	 * register dump on the stack just to reload them right away.
++	 * With frame pointer disabled, tail call optimization kicks in
++	 * as well making this test almost invisible.
++	 */
++	if (n < 1024)
++		return __copy_to_user_std(to, from, n);
++	return __copy_to_user_memcpy(to, from, n);
++}
++	
++static unsigned long noinline
++__clear_user_memset(void __user *addr, unsigned long n)
+ {
+-	if (n < 256)
+-		return __clear_user_std(addr, n);
+-
+ 	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+ 		memset((void *)addr, 0, n);
+ 		return 0;
+@@ -137,3 +147,11 @@ unsigned long __clear_user(void __user *addr, unsigned long n)
+ out:
+ 	return n;
+ }
++
++unsigned long __clear_user(void __user *addr, unsigned long n)
++{
++	/* See rational for this in __copy_to_user() above. */
++	if (n < 256)
++		return __clear_user_std(addr, n);
++	return __clear_user_memset(addr, n);
++}
diff --git a/debian/patches/series/base b/debian/patches/series/base
index 7e82736ab..4e44051a2 100644
--- a/debian/patches/series/base
+++ b/debian/patches/series/base
@@ -27,6 +27,8 @@
 #+ features/sparc/video-sunxvr500-intergraph.patch
 + features/arm/allow-alternative-copy-user.patch
 + features/arm/alternative-copy-user.patch
++ features/arm/lower_overhead_with_alternative.patch
++ features/arm/copy_to_user-better_threshold.patch
 + bugfix/all/mvsdio-platform.patch
 + bugfix/all/mvsdio-ignore-high-speed.patch
 + bugfix/all/mvsdio-config-failure.patch