From 063ca30d03ffaae96995cfa479b91c54f6e2c7b4 Mon Sep 17 00:00:00 2001 From: Martin Michlmayr Date: Wed, 3 Jun 2009 20:58:24 +0000 Subject: [PATCH] add copy user patches from Marvell svn path=/dists/trunk/linux-2.6/; revision=13721 --- debian/changelog | 3 + .../arm/copy_to_user-better_threshold.patch | 121 ++++++++++++++++++ .../arm/lower_overhead_with_alternative.patch | 88 +++++++++++++ debian/patches/series/base | 2 + 4 files changed, 214 insertions(+) create mode 100644 debian/patches/features/arm/copy_to_user-better_threshold.patch create mode 100644 debian/patches/features/arm/lower_overhead_with_alternative.patch diff --git a/debian/changelog b/debian/changelog index 89f196a8a..d2c764889 100644 --- a/debian/changelog +++ b/debian/changelog @@ -45,6 +45,9 @@ linux-2.6 (2.6.30~rc8-1~experimental.1) UNRELEASED; urgency=low these days, so disable IDE and build in ATA, SCSI and BLK_DEV_SD. * [mips/sb1-bcm91250a, mips/sb1a-bcm91480b] Compile in SB1250_MAC and BROADCOM_PHY. + * Add patches from git.marvell.com: + - alternative copy_to_user: more precise fallback threshold + - lower overhead with alternative copy_to_user for small copies [ Aurelien Jarno ] * [mips(el)/sb1-bcm91250a] Set CONFIG_SCSI_AIC7XXX=y, it is needed diff --git a/debian/patches/features/arm/copy_to_user-better_threshold.patch b/debian/patches/features/arm/copy_to_user-better_threshold.patch new file mode 100644 index 000000000..3bae79241 --- /dev/null +++ b/debian/patches/features/arm/copy_to_user-better_threshold.patch @@ -0,0 +1,121 @@ +From: Nicolas Pitre +Date: Sat, 30 May 2009 01:55:50 +0000 (-0400) +Subject: [ARM] alternative copy_to_user: more precise fallback threshold +X-Git-Url: http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=c626e3f5ca1d95ad2204d3128c26e7678714eb55 + +[ARM] alternative copy_to_user: more precise fallback threshold + +Previous size thresholds were guessed from various user space benchmarks +using a kernel with and without the alternative uaccess option. 
This +is however not as precise as a kernel based test to measure the real +speed of each method. + +This adds a simple test bench to show the time needed for each method. +With this, the optimal size threshold for the alternative implementation +can be determined with more confidence. It appears that the optimal +threshold for both copy_to_user and clear_user is around 64 bytes. This +is not a surprise knowing that the memcpy and memset implementations +need at least 64 bytes to achieve maximum throughput. + +One might suggest that such test be used to determine the optimal +threshold at run time instead, but results are near enough to 64 on +tested targets concerned by this alternative copy_to_user implementation, +so adding some overhead associated with a variable threshold is probably +not worth it for now. + +Signed-off-by: Nicolas Pitre +--- + +diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c +index 92838e7..6b967ff 100644 +--- a/arch/arm/lib/uaccess_with_memcpy.c ++++ b/arch/arm/lib/uaccess_with_memcpy.c +@@ -106,7 +106,7 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) + * With frame pointer disabled, tail call optimization kicks in + * as well making this test almost invisible. + */ +- if (n < 1024) ++ if (n < 64) + return __copy_to_user_std(to, from, n); + return __copy_to_user_memcpy(to, from, n); + } +@@ -151,7 +151,78 @@ out: + unsigned long __clear_user(void __user *addr, unsigned long n) + { + /* See rational for this in __copy_to_user() above. */ +- if (n < 256) ++ if (n < 64) + return __clear_user_std(addr, n); + return __clear_user_memset(addr, n); + } ++ ++#if 0 ++ ++/* ++ * This code is disabled by default, but kept around in case the chosen ++ * thresholds need to be revalidated. Some overhead (small but still) ++ * would be implied by a runtime determined variable threshold, and ++ * so far the measurement on concerned targets didn't show a worthwhile ++ * variation. 
++ * ++ * Note that a fairly precise sched_clock() implementation is needed ++ * for results to make some sense. ++ */ ++ ++#include ++ ++static int __init test_size_treshold(void) ++{ ++ struct page *src_page, *dst_page; ++ void *user_ptr, *kernel_ptr; ++ unsigned long long t0, t1, t2; ++ int size, ret; ++ ++ ret = -ENOMEM; ++ src_page = alloc_page(GFP_KERNEL); ++ if (!src_page) ++ goto no_src; ++ dst_page = alloc_page(GFP_KERNEL); ++ if (!dst_page) ++ goto no_dst; ++ kernel_ptr = page_address(src_page); ++ user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010)); ++ if (!user_ptr) ++ goto no_vmap; ++ ++ /* warm up the src page dcache */ ++ ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE); ++ ++ for (size = PAGE_SIZE; size >= 4; size /= 2) { ++ t0 = sched_clock(); ++ ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size); ++ t1 = sched_clock(); ++ ret |= __copy_to_user_std(user_ptr, kernel_ptr, size); ++ t2 = sched_clock(); ++ printk("copy_to_user: %d %llu %llu\n", size, t1 - t0, t2 - t1); ++ } ++ ++ for (size = PAGE_SIZE; size >= 4; size /= 2) { ++ t0 = sched_clock(); ++ ret |= __clear_user_memset(user_ptr, size); ++ t1 = sched_clock(); ++ ret |= __clear_user_std(user_ptr, size); ++ t2 = sched_clock(); ++ printk("clear_user: %d %llu %llu\n", size, t1 - t0, t2 - t1); ++ } ++ ++ if (ret) ++ ret = -EFAULT; ++ ++ vunmap(user_ptr); ++no_vmap: ++ put_page(dst_page); ++no_dst: ++ put_page(src_page); ++no_src: ++ return ret; ++} ++ ++subsys_initcall(test_size_treshold); ++ ++#endif diff --git a/debian/patches/features/arm/lower_overhead_with_alternative.patch b/debian/patches/features/arm/lower_overhead_with_alternative.patch new file mode 100644 index 000000000..64deebb3e --- /dev/null +++ b/debian/patches/features/arm/lower_overhead_with_alternative.patch @@ -0,0 +1,88 @@ +From: Nicolas Pitre +Date: Fri, 22 May 2009 02:17:17 +0000 (-0400) +Subject: [ARM] lower overhead with alternative copy_to_user for small copies +X-Git-Url: 
http://git.marvell.com/?p=orion.git;a=commitdiff_plain;h=cb9dc92c0a1b76165c8c334402e27191084b2047 + +[ARM] lower overhead with alternative copy_to_user for small copies + +Because the alternate copy_to_user implementation has a higher setup cost +than the standard implementation, the size of the memory area to copy +is tested and the standard implementation invoked instead when that size +is too small. Still, that test is made after the processor has preserved +a bunch of registers on the stack which have to be reloaded right away +needlessly in that case, causing a measurable performance regression +compared to plain usage of the standard implementation only. + +To make the size test overhead negligible, let's factorize it out of +the alternate copy_to_user function where it is clear to the compiler +that no stack frame is needed. Thanks to CONFIG_ARM_UNWIND allowing +for frame pointers to be disabled and tail call optimization to kick in, +the overhead in the small copy case becomes only 3 assembly instructions. + +A similar trick is applied to clear_user as well. 
+ +Signed-off-by: Nicolas Pitre +--- + +diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c +index bf987b4..92838e7 100644 +--- a/arch/arm/lib/uaccess_with_memcpy.c ++++ b/arch/arm/lib/uaccess_with_memcpy.c +@@ -49,14 +49,11 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp) + return 1; + } + +-unsigned long +-__copy_to_user(void __user *to, const void *from, unsigned long n) ++static unsigned long noinline ++__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) + { + int atomic; + +- if (n < 1024) +- return __copy_to_user_std(to, from, n); +- + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy((void *)to, from, n); + return 0; +@@ -99,11 +96,24 @@ out: + return n; + } + +-unsigned long __clear_user(void __user *addr, unsigned long n) ++unsigned long ++__copy_to_user(void __user *to, const void *from, unsigned long n) ++{ ++ /* ++ * This test is stubbed out of the main function above to keep ++ * the overhead for small copies low by avoiding a large ++ * register dump on the stack just to reload them right away. ++ * With frame pointer disabled, tail call optimization kicks in ++ * as well making this test almost invisible. ++ */ ++ if (n < 1024) ++ return __copy_to_user_std(to, from, n); ++ return __copy_to_user_memcpy(to, from, n); ++} ++ ++static unsigned long noinline ++__clear_user_memset(void __user *addr, unsigned long n) + { +- if (n < 256) +- return __clear_user_std(addr, n); +- + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memset((void *)addr, 0, n); + return 0; +@@ -137,3 +147,11 @@ unsigned long __clear_user(void __user *addr, unsigned long n) + out: + return n; + } ++ ++unsigned long __clear_user(void __user *addr, unsigned long n) ++{ ++ /* See rational for this in __copy_to_user() above. 
*/ ++ if (n < 256) ++ return __clear_user_std(addr, n); ++ return __clear_user_memset(addr, n); ++} diff --git a/debian/patches/series/base b/debian/patches/series/base index 7e82736ab..4e44051a2 100644 --- a/debian/patches/series/base +++ b/debian/patches/series/base @@ -27,6 +27,8 @@ #+ features/sparc/video-sunxvr500-intergraph.patch + features/arm/allow-alternative-copy-user.patch + features/arm/alternative-copy-user.patch ++ features/arm/lower_overhead_with_alternative.patch ++ features/arm/copy_to_user-better_threshold.patch + bugfix/all/mvsdio-platform.patch + bugfix/all/mvsdio-ignore-high-speed.patch + bugfix/all/mvsdio-config-failure.patch