some cache align speedups

svn path=/dists/trunk/linux-2.6/; revision=11633
Martin Michlmayr 2008-06-13 10:18:56 +00:00
parent 8687a57c93
commit e4ab0fb6f8
5 changed files with 305 additions and 0 deletions

debian/changelog

@@ -46,6 +46,8 @@ linux-2.6 (2.6.26~rc6-1~experimental.1) UNRELEASED; urgency=low
* [arm/orion5x] Add some patches from Marvell's Orion tree:
- Feroceon: speed up flushing of the entire cache
- support for 5281 D0 stepping
- cache align destination pointer when copying memory for some processors
- cache align memset and memzero
* [arm/orion5x] Enable NETCONSOLE.
* [arm/orion5x] Disable more SCSI drivers.
* [arm/ixp4xx] Disable ATA and more SCSI and network drivers.

debian/patches/features/arm/cache_align1.patch

@@ -0,0 +1,124 @@
From: Nicolas Pitre <nico@cam.org>
The implementation for memory copy functions on ARM had a (disabled)
provision for aligning the source pointer before loading registers with
data. Turns out that aligning the _destination_ pointer is much more
useful, as the read side is already sufficiently helped with the use of
preload.
So this changes the definition of the CALGN() macro to target the
destination pointer instead, and turns it on for Feroceon processors
where the gain is very notable.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
arch/arm/lib/copy_template.S | 12 ++----------
arch/arm/lib/memmove.S | 12 ++----------
include/asm-arm/assembler.h | 15 +++++++++++++++
3 files changed, 19 insertions(+), 20 deletions(-)
Index: linux-2.6.26-rc5/arch/arm/lib/copy_template.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/copy_template.S
+++ linux-2.6.26-rc5/arch/arm/lib/copy_template.S
@@ -13,14 +13,6 @@
*/
/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
-/*
* Theory of operation
* -------------------
*
@@ -82,7 +74,7 @@
stmfd sp!, {r5 - r8}
blt 5f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( rsb r3, ip, #32 )
CALGN( sbcnes r4, r3, r2 ) @ C is always set here
CALGN( bcs 2f )
@@ -168,7 +160,7 @@
subs r2, r2, #28
blt 14f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( rsb ip, ip, #32 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
Index: linux-2.6.26-rc5/arch/arm/lib/memmove.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memmove.S
+++ linux-2.6.26-rc5/arch/arm/lib/memmove.S
@@ -13,14 +13,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
.text
/*
@@ -55,7 +47,7 @@ ENTRY(memmove)
stmfd sp!, {r5 - r8}
blt 5f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( bcs 2f )
CALGN( adr r4, 6f )
@@ -139,7 +131,7 @@ ENTRY(memmove)
subs r2, r2, #28
blt 14f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
CALGN( bcc 15f )
Index: linux-2.6.26-rc5/include/asm-arm/assembler.h
===================================================================
--- linux-2.6.26-rc5.orig/include/asm-arm/assembler.h
+++ linux-2.6.26-rc5/include/asm-arm/assembler.h
@@ -56,6 +56,21 @@
#endif
/*
+ * This can be used to enable code to cacheline align the destination
+ * pointer when bulk writing to memory. Experiments on StrongARM and
+ * XScale didn't show this a worthwhile thing to do when the cache is not
+ * set to write-allocate (this would need further testing on XScale when WA
+ * is used).
+ *
+ * On Feroceon there is much to gain however, regardless of cache mode.
+ */
+#ifdef CONFIG_CPU_FEROCEON
+#define CALGN(code...) code
+#else
+#define CALGN(code...)
+#endif
+
+/*
* Enable and disable interrupts
*/
#if __LINUX_ARM_ARCH__ >= 6
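The patch above is easier to follow with the idea written out in plain C. The following is only an illustrative sketch, not the kernel code: the function name copy_dst_aligned and the CACHE_LINE constant are invented for the example, and the real routine handles the head and tail with conditional stores and computed branches rather than byte loops.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE 32	/* Feroceon has 32-byte cache lines */

void *copy_dst_aligned(void *dst, const void *src, size_t n)
{
	uint8_t *d = dst;
	const uint8_t *s = src;

	/* Head: advance until the *destination* sits on a cache-line
	 * boundary; the source is left alone, since preload already
	 * hides most of the read-side cost. */
	while (n && ((uintptr_t)d & (CACHE_LINE - 1))) {
		*d++ = *s++;
		n--;
	}

	/* Main loop: write one whole cache line per iteration, so a
	 * write-allocate cache fills each line once instead of fetching
	 * it back for a series of misaligned partial stores. */
	while (n >= CACHE_LINE) {
		memcpy(d, s, CACHE_LINE);	/* stands in for the ldm/stm burst */
		d += CACHE_LINE;
		s += CACHE_LINE;
		n -= CACHE_LINE;
	}

	/* Tail: whatever is left over. */
	while (n--)
		*d++ = *s++;

	return dst;
}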

debian/patches/features/arm/cache_align2.patch

@@ -0,0 +1,142 @@
From: Nicolas Pitre <nico@cam.org>
This is a natural extension following the previous patch.
Non Feroceon based targets are unchanged.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
arch/arm/lib/memset.S | 46 ++++++++++++++++++++++++++++++++++++++++++++++
arch/arm/lib/memzero.S | 44 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+), 0 deletions(-)
Index: linux-2.6.26-rc5/arch/arm/lib/memset.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memset.S
+++ linux-2.6.26-rc5/arch/arm/lib/memset.S
@@ -39,6 +39,9 @@ ENTRY(memset)
mov r3, r1
cmp r2, #16
blt 4f
+
+#if ! CALGN(1)+0
+
/*
* We need an extra register for this loop - save the return address and
* use the LR
@@ -64,6 +67,49 @@ ENTRY(memset)
stmneia r0!, {r1, r3, ip, lr}
ldr lr, [sp], #4
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+ stmfd sp!, {r4-r7, lr}
+ mov r4, r1
+ mov r5, r1
+ mov r6, r1
+ mov r7, r1
+ mov ip, r1
+ mov lr, r1
+
+ cmp r2, #96
+ tstgt r0, #31
+ ble 3f
+
+ and ip, r0, #31
+ rsb ip, ip, #32
+ sub r2, r2, ip
+ movs ip, ip, lsl #(32 - 4)
+ stmcsia r0!, {r4, r5, r6, r7}
+ stmmiia r0!, {r4, r5}
+ tst ip, #(1 << 30)
+ mov ip, r1
+ strne r1, [r0], #4
+
+3: subs r2, r2, #64
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ bgt 3b
+ ldmeqfd sp!, {r4-r7, pc}
+
+ tst r2, #32
+ stmneia r0!, {r1, r3-r7, ip, lr}
+ tst r2, #16
+ stmneia r0!, {r4-r7}
+ ldmfd sp!, {r4-r7, lr}
+
+#endif
+
4: tst r2, #8
stmneia r0!, {r1, r3}
tst r2, #4
Index: linux-2.6.26-rc5/arch/arm/lib/memzero.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memzero.S
+++ linux-2.6.26-rc5/arch/arm/lib/memzero.S
@@ -39,6 +39,9 @@ ENTRY(__memzero)
*/
cmp r1, #16 @ 1 we can skip this chunk if we
blt 4f @ 1 have < 16 bytes
+
+#if ! CALGN(1)+0
+
/*
* We need an extra register for this loop - save the return address and
* use the LR
@@ -64,6 +67,47 @@ ENTRY(__memzero)
stmneia r0!, {r2, r3, ip, lr} @ 4
ldr lr, [sp], #4 @ 1
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+ stmfd sp!, {r4-r7, lr}
+ mov r4, r2
+ mov r5, r2
+ mov r6, r2
+ mov r7, r2
+ mov ip, r2
+ mov lr, r2
+
+ cmp r1, #96
+ andgts ip, r0, #31
+ ble 3f
+
+ rsb ip, ip, #32
+ sub r1, r1, ip
+ movs ip, ip, lsl #(32 - 4)
+ stmcsia r0!, {r4, r5, r6, r7}
+ stmmiia r0!, {r4, r5}
+ movs ip, ip, lsl #2
+ strcs r2, [r0], #4
+
+3: subs r1, r1, #64
+ stmgeia r0!, {r2-r7, ip, lr}
+ stmgeia r0!, {r2-r7, ip, lr}
+ bgt 3b
+ ldmeqfd sp!, {r4-r7, pc}
+
+ tst r1, #32
+ stmneia r0!, {r2-r7, ip, lr}
+ tst r1, #16
+ stmneia r0!, {r4-r7}
+ ldmfd sp!, {r4-r7, lr}
+
+#endif
+
4: tst r1, #8 @ 1 8 bytes or more?
stmneia r0!, {r2, r3} @ 2
tst r1, #4 @ 1 4 bytes or more?
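Two details of the patch above are worth spelling out. First, the preprocessor guard "#if ! CALGN(1)+0" selects between the two bodies without a separate config symbol: when CALGN expands to its argument (Feroceon) the test reads "! 1+0", which is 0, so the generic loop is skipped and the #else branch is assembled; when CALGN discards its argument it reads "! +0", which is 1, and the original code is kept. Second, the aligned version is the same destination-alignment idea as before, applied to a fill. Below is a rough C sketch, with an invented function name and CACHE_LINE constant, using byte loops where the assembly works in word-aligned 4/8/16-byte steps and 64-byte stm bursts.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE 32	/* Feroceon has 32-byte cache lines */

void *set_dst_aligned(void *dst, int c, size_t n)
{
	uint8_t *d = dst;

	/* The assembly only takes the alignment head for fills of at
	 * least 96 bytes that do not already start on a line boundary. */
	if (n >= 96) {
		/* Head: reach the next cache-line boundary. */
		while ((uintptr_t)d & (CACHE_LINE - 1)) {
			*d++ = (uint8_t)c;
			n--;
		}
		/* Main loop: fill whole lines, so each one is allocated
		 * and written in full.  (The assembly does two 32-byte
		 * stm bursts, i.e. 64 bytes, per iteration.) */
		while (n >= CACHE_LINE) {
			memset(d, c, CACHE_LINE);
			d += CACHE_LINE;
			n -= CACHE_LINE;
		}
	}

	/* Tail, and (in this simplified sketch) the short-fill case. */
	while (n--)
		*d++ = (uint8_t)c;

	return dst;
}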

debian/patches/features/arm/fix_cache_alignment.patch

@@ -0,0 +1,34 @@
From: Nicolas Pitre <nico@cam.org>
This code is currently disabled, which explains why no one was affected.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
arch/arm/lib/memmove.S | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
Index: linux-2.6.26-rc5/arch/arm/lib/memmove.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memmove.S
+++ linux-2.6.26-rc5/arch/arm/lib/memmove.S
@@ -60,6 +60,7 @@ ENTRY(memmove)
CALGN( bcs 2f )
CALGN( adr r4, 6f )
CALGN( subs r2, r2, ip ) @ C is set here
+ CALGN( rsb ip, ip, #32 )
CALGN( add pc, r4, ip )
PLD( pld [r1, #-4] )
@@ -139,7 +140,6 @@ ENTRY(memmove)
blt 14f
CALGN( ands ip, r1, #31 )
- CALGN( rsb ip, ip, #32 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
CALGN( bcc 15f )
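The fix is a single reversed subtract, "rsb ip, ip, #32", i.e. ip = 32 - ip: the patch adds it just before the computed branch in the first alignment block and drops it from the second. The arithmetic behind it: this part of memmove copies downwards from the end of the buffer, so the head that reaches the previous 32-byte boundary is simply addr & 31, while a forward copy needs 32 - (addr & 31); roughly speaking, using one where the other belongs makes the head handle the wrong byte count and leaves the pointer unaligned for the bursts that follow. A small sketch of the two head calculations (the names are invented for the example):

#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE 32

/* Bytes a forward copy must handle before the pointer reaches the
 * next 32-byte boundary. */
static unsigned head_forward(uintptr_t addr)
{
	return (CACHE_LINE - (addr & (CACHE_LINE - 1))) & (CACHE_LINE - 1);
}

/* Bytes a downward copy (running back from the end of the buffer)
 * must handle before the pointer reaches the previous boundary:
 * just addr & 31, with no "32 -" applied. */
static unsigned head_backward(uintptr_t addr)
{
	return addr & (CACHE_LINE - 1);
}

int main(void)
{
	uintptr_t end = 0x1046;	/* arbitrary example address; 0x1046 & 31 == 6 */

	printf("forward : %u bytes to the next line\n", head_forward(end));
	printf("backward: %u bytes to the previous line\n", head_backward(end));
	return 0;
}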

debian/patches/series/1~experimental.1

@@ -31,6 +31,9 @@
+ bugfix/arm/disable-r6040.patch
+ features/arm/speed_flush_cache.patch
+ features/arm/5281d0.patch
+ features/arm/fix_cache_alignment.patch
+ features/arm/cache_align1.patch
+ features/arm/cache_align2.patch
+ features/arm/led-pca9532-generic.patch
+ features/arm/led-pca9532-fix.patch
+ features/arm/led-pca9532-n2100.patch