some cache align speedups

svn path=/dists/trunk/linux-2.6/; revision=11633
Martin Michlmayr 2008-06-13 10:18:56 +00:00
parent 8687a57c93
commit e4ab0fb6f8
5 changed files with 305 additions and 0 deletions

debian/changelog

@@ -46,6 +46,8 @@ linux-2.6 (2.6.26~rc6-1~experimental.1) UNRELEASED; urgency=low
* [arm/orion5x] Add some patches from Marvell's Orion tree:
- Feroceon: speed up flushing of the entire cache
- support for 5281 D0 stepping
- cache align destination pointer when copying memory for some processors
- cache align memset and memzero
* [arm/orion5x] Enable NETCONSOLE.
* [arm/orion5x] Disable more SCSI drivers.
* [arm/ixp4xx] Disable ATA and more SCSI and network drivers.

debian/patches/features/arm/cache_align1.patch

@@ -0,0 +1,124 @@
From: Nicolas Pitre <nico@cam.org>
The implementation for memory copy functions on ARM had a (disabled)
provision for aligning the source pointer before loading registers with
data. Turns out that aligning the _destination_ pointer is much more
useful, as the read side is already sufficiently helped with the use of
preload.
So this changes the definition of the CALGN() macro to target the
destination pointer instead, and turns it on for Feroceon processors
where the gain is very notable.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
arch/arm/lib/copy_template.S | 12 ++----------
arch/arm/lib/memmove.S | 12 ++----------
include/asm-arm/assembler.h | 15 +++++++++++++++
3 files changed, 19 insertions(+), 20 deletions(-)
Index: linux-2.6.26-rc5/arch/arm/lib/copy_template.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/copy_template.S
+++ linux-2.6.26-rc5/arch/arm/lib/copy_template.S
@@ -13,14 +13,6 @@
*/
/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
-/*
* Theory of operation
* -------------------
*
@@ -82,7 +74,7 @@
stmfd sp!, {r5 - r8}
blt 5f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( rsb r3, ip, #32 )
CALGN( sbcnes r4, r3, r2 ) @ C is always set here
CALGN( bcs 2f )
@@ -168,7 +160,7 @@
subs r2, r2, #28
blt 14f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( rsb ip, ip, #32 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
Index: linux-2.6.26-rc5/arch/arm/lib/memmove.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memmove.S
+++ linux-2.6.26-rc5/arch/arm/lib/memmove.S
@@ -13,14 +13,6 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * This can be used to enable code to cacheline align the source pointer.
- * Experiments on tested architectures (StrongARM and XScale) didn't show
- * this a worthwhile thing to do. That might be different in the future.
- */
-//#define CALGN(code...) code
-#define CALGN(code...)
-
.text
/*
@@ -55,7 +47,7 @@ ENTRY(memmove)
stmfd sp!, {r5 - r8}
blt 5f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( bcs 2f )
CALGN( adr r4, 6f )
@@ -139,7 +131,7 @@ ENTRY(memmove)
subs r2, r2, #28
blt 14f
- CALGN( ands ip, r1, #31 )
+ CALGN( ands ip, r0, #31 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
CALGN( bcc 15f )
Index: linux-2.6.26-rc5/include/asm-arm/assembler.h
===================================================================
--- linux-2.6.26-rc5.orig/include/asm-arm/assembler.h
+++ linux-2.6.26-rc5/include/asm-arm/assembler.h
@@ -56,6 +56,21 @@
#endif
/*
+ * This can be used to enable code to cacheline align the destination
+ * pointer when bulk writing to memory. Experiments on StrongARM and
+ * XScale didn't show this a worthwhile thing to do when the cache is not
+ * set to write-allocate (this would need further testing on XScale when WA
+ * is used).
+ *
+ * On Feroceon there is much to gain however, regardless of cache mode.
+ */
+#ifdef CONFIG_CPU_FEROCEON
+#define CALGN(code...) code
+#else
+#define CALGN(code...)
+#endif
+
+/*
* Enable and disable interrupts
*/
#if __LINUX_ARM_ARCH__ >= 6
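The patch above is easier to follow with the idea written out in plain C. The following is only an illustrative sketch, not the kernel code: the function name copy_dst_aligned and the CACHE_LINE constant are invented for the example, and the real routine handles the head and tail with conditional stores and computed branches rather than byte loops.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE 32	/* Feroceon has 32-byte cache lines */

void *copy_dst_aligned(void *dst, const void *src, size_t n)
{
	uint8_t *d = dst;
	const uint8_t *s = src;

	/* Head: advance until the *destination* sits on a cache-line
	 * boundary; the source is left alone, since preload already
	 * hides most of the read-side cost. */
	while (n && ((uintptr_t)d & (CACHE_LINE - 1))) {
		*d++ = *s++;
		n--;
	}

	/* Main loop: write one whole cache line per iteration, so a
	 * write-allocate cache fills each line once instead of fetching
	 * it back for a series of misaligned partial stores. */
	while (n >= CACHE_LINE) {
		memcpy(d, s, CACHE_LINE);	/* stands in for the ldm/stm burst */
		d += CACHE_LINE;
		s += CACHE_LINE;
		n -= CACHE_LINE;
	}

	/* Tail: whatever is left over. */
	while (n--)
		*d++ = *s++;

	return dst;
}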

debian/patches/features/arm/cache_align2.patch

@@ -0,0 +1,142 @@
From: Nicolas Pitre <nico@cam.org>
This is a natural extension following the previous patch.
Non Feroceon based targets are unchanged.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
arch/arm/lib/memset.S | 46 ++++++++++++++++++++++++++++++++++++++++++++++
arch/arm/lib/memzero.S | 44 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 90 insertions(+), 0 deletions(-)
Index: linux-2.6.26-rc5/arch/arm/lib/memset.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memset.S
+++ linux-2.6.26-rc5/arch/arm/lib/memset.S
@@ -39,6 +39,9 @@ ENTRY(memset)
mov r3, r1
cmp r2, #16
blt 4f
+
+#if ! CALGN(1)+0
+
/*
* We need an extra register for this loop - save the return address and
* use the LR
@@ -64,6 +67,49 @@ ENTRY(memset)
stmneia r0!, {r1, r3, ip, lr}
ldr lr, [sp], #4
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+ stmfd sp!, {r4-r7, lr}
+ mov r4, r1
+ mov r5, r1
+ mov r6, r1
+ mov r7, r1
+ mov ip, r1
+ mov lr, r1
+
+ cmp r2, #96
+ tstgt r0, #31
+ ble 3f
+
+ and ip, r0, #31
+ rsb ip, ip, #32
+ sub r2, r2, ip
+ movs ip, ip, lsl #(32 - 4)
+ stmcsia r0!, {r4, r5, r6, r7}
+ stmmiia r0!, {r4, r5}
+ tst ip, #(1 << 30)
+ mov ip, r1
+ strne r1, [r0], #4
+
+3: subs r2, r2, #64
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ bgt 3b
+ ldmeqfd sp!, {r4-r7, pc}
+
+ tst r2, #32
+ stmneia r0!, {r1, r3-r7, ip, lr}
+ tst r2, #16
+ stmneia r0!, {r4-r7}
+ ldmfd sp!, {r4-r7, lr}
+
+#endif
+
4: tst r2, #8
stmneia r0!, {r1, r3}
tst r2, #4
Index: linux-2.6.26-rc5/arch/arm/lib/memzero.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memzero.S
+++ linux-2.6.26-rc5/arch/arm/lib/memzero.S
@@ -39,6 +39,9 @@ ENTRY(__memzero)
*/
cmp r1, #16 @ 1 we can skip this chunk if we
blt 4f @ 1 have < 16 bytes
+
+#if ! CALGN(1)+0
+
/*
* We need an extra register for this loop - save the return address and
* use the LR
@@ -64,6 +67,47 @@ ENTRY(__memzero)
stmneia r0!, {r2, r3, ip, lr} @ 4
ldr lr, [sp], #4 @ 1
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+ stmfd sp!, {r4-r7, lr}
+ mov r4, r2
+ mov r5, r2
+ mov r6, r2
+ mov r7, r2
+ mov ip, r2
+ mov lr, r2
+
+ cmp r1, #96
+ andgts ip, r0, #31
+ ble 3f
+
+ rsb ip, ip, #32
+ sub r1, r1, ip
+ movs ip, ip, lsl #(32 - 4)
+ stmcsia r0!, {r4, r5, r6, r7}
+ stmmiia r0!, {r4, r5}
+ movs ip, ip, lsl #2
+ strcs r2, [r0], #4
+
+3: subs r1, r1, #64
+ stmgeia r0!, {r2-r7, ip, lr}
+ stmgeia r0!, {r2-r7, ip, lr}
+ bgt 3b
+ ldmeqfd sp!, {r4-r7, pc}
+
+ tst r1, #32
+ stmneia r0!, {r2-r7, ip, lr}
+ tst r1, #16
+ stmneia r0!, {r4-r7}
+ ldmfd sp!, {r4-r7, lr}
+
+#endif
+
4: tst r1, #8 @ 1 8 bytes or more?
stmneia r0!, {r2, r3} @ 2
tst r1, #4 @ 1 4 bytes or more?
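Two details of the patch above are worth spelling out. First, the preprocessor guard "#if ! CALGN(1)+0" selects between the two bodies without a separate config symbol: when CALGN expands to its argument (Feroceon) the test reads "! 1+0", which is 0, so the generic loop is skipped and the #else branch is assembled; when CALGN discards its argument it reads "! +0", which is 1, and the original code is kept. Second, the aligned version is the same destination-alignment idea as before, applied to a fill. Below is a rough C sketch, with an invented function name and CACHE_LINE constant, using byte loops where the assembly works in word-aligned 4/8/16-byte steps and 64-byte stm bursts.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE 32	/* Feroceon has 32-byte cache lines */

void *set_dst_aligned(void *dst, int c, size_t n)
{
	uint8_t *d = dst;

	/* The assembly only takes the alignment head for fills of at
	 * least 96 bytes that do not already start on a line boundary. */
	if (n >= 96) {
		/* Head: reach the next cache-line boundary. */
		while ((uintptr_t)d & (CACHE_LINE - 1)) {
			*d++ = (uint8_t)c;
			n--;
		}
		/* Main loop: fill whole lines, so each one is allocated
		 * and written in full.  (The assembly does two 32-byte
		 * stm bursts, i.e. 64 bytes, per iteration.) */
		while (n >= CACHE_LINE) {
			memset(d, c, CACHE_LINE);
			d += CACHE_LINE;
			n -= CACHE_LINE;
		}
	}

	/* Tail, and (in this simplified sketch) the short-fill case. */
	while (n--)
		*d++ = (uint8_t)c;

	return dst;
}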

debian/patches/features/arm/fix_cache_alignment.patch

@@ -0,0 +1,34 @@
From: Nicolas Pitre <nico@cam.org>
This code is currently disabled, which explains why no one was affected.
Signed-off-by: Nicolas Pitre <nico@marvell.com>
---
arch/arm/lib/memmove.S | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
Index: linux-2.6.26-rc5/arch/arm/lib/memmove.S
===================================================================
--- linux-2.6.26-rc5.orig/arch/arm/lib/memmove.S
+++ linux-2.6.26-rc5/arch/arm/lib/memmove.S
@@ -60,6 +60,7 @@ ENTRY(memmove)
CALGN( bcs 2f )
CALGN( adr r4, 6f )
CALGN( subs r2, r2, ip ) @ C is set here
+ CALGN( rsb ip, ip, #32 )
CALGN( add pc, r4, ip )
PLD( pld [r1, #-4] )
@@ -139,7 +140,6 @@ ENTRY(memmove)
blt 14f
CALGN( ands ip, r1, #31 )
- CALGN( rsb ip, ip, #32 )
CALGN( sbcnes r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
CALGN( bcc 15f )
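The fix is a single reversed subtract, "rsb ip, ip, #32", i.e. ip = 32 - ip: the patch adds it just before the computed branch in the first alignment block and drops it from the second. The arithmetic behind it: this part of memmove copies downwards from the end of the buffer, so the head that reaches the previous 32-byte boundary is simply addr & 31, while a forward copy needs 32 - (addr & 31); roughly speaking, using one where the other belongs makes the head handle the wrong byte count and leaves the pointer unaligned for the bursts that follow. A small sketch of the two head calculations (the names are invented for the example):

#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE 32

/* Bytes a forward copy must handle before the pointer reaches the
 * next 32-byte boundary. */
static unsigned head_forward(uintptr_t addr)
{
	return (CACHE_LINE - (addr & (CACHE_LINE - 1))) & (CACHE_LINE - 1);
}

/* Bytes a downward copy (running back from the end of the buffer)
 * must handle before the pointer reaches the previous boundary:
 * just addr & 31, with no "32 -" applied. */
static unsigned head_backward(uintptr_t addr)
{
	return addr & (CACHE_LINE - 1);
}

int main(void)
{
	uintptr_t end = 0x1046;	/* arbitrary example address; 0x1046 & 31 == 6 */

	printf("forward : %u bytes to the next line\n", head_forward(end));
	printf("backward: %u bytes to the previous line\n", head_backward(end));
	return 0;
}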

debian/patches/series/1~experimental.1

@@ -31,6 +31,9 @@
+ bugfix/arm/disable-r6040.patch
+ features/arm/speed_flush_cache.patch
+ features/arm/5281d0.patch
+ features/arm/fix_cache_alignment.patch
+ features/arm/cache_align1.patch
+ features/arm/cache_align2.patch
+ features/arm/led-pca9532-generic.patch
+ features/arm/led-pca9532-fix.patch
+ features/arm/led-pca9532-n2100.patch