diff --git a/debian/changelog b/debian/changelog
index 4f8304ac0..ee0776eac 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -22,6 +22,11 @@ linux-2.6 (2.6.26~rc1-1~experimental.1) UNRELEASED; urgency=low
   [ Martin Michlmayr ]
   * [arm/orion5x] Update the config to reflect upstream renaming this
     subarch.
+  * [arm/orion5x] Add some patches from Marvell's Orion tree:
+    - cache align destination pointer when copying memory for some processors
+    - cache align memset and memzero
+    - Feroceon: speed up flushing of the entire cache
+    - support for 5281 D0 stepping
 
  -- maximilian attems  Sat, 26 Apr 2008 23:11:17 +0200
 
diff --git a/debian/patches/features/arm/5281d0.patch b/debian/patches/features/arm/5281d0.patch
new file mode 100644
index 000000000..5f64a3791
--- /dev/null
+++ b/debian/patches/features/arm/5281d0.patch
@@ -0,0 +1,52 @@
+From: Lennert Buytenhek
+Date: Mon, 5 May 2008 18:19:55 +0000 (-0400)
+Subject: Orion: support for D0 stepping
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fnico%2Forion.git;a=commitdiff_plain;h=0e33c8a37f7c05bf85944cf10ca499b2d3754c1b
+
+Orion: support for D0 stepping
+
+Signed-off-by: Lennert Buytenhek
+Signed-off-by: Nicolas Pitre
+---
+
+diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c
+index 4f13fd0..3e40e96 100644
+--- a/arch/arm/mach-orion5x/common.c
++++ b/arch/arm/mach-orion5x/common.c
+@@ -338,6 +338,8 @@ static void __init orion5x_id(u32 *dev, u32 *rev, char **dev_name)
+ 			*dev_name = "MV88F5281-D2";
+ 		} else if (*rev == MV88F5281_REV_D1) {
+ 			*dev_name = "MV88F5281-D1";
++		} else if (*rev == MV88F5281_REV_D0) {
++			*dev_name = "MV88F5281-D0";
+ 		} else {
+ 			*dev_name = "MV88F5281-Rev-Unsupported";
+ 		}
+@@ -372,6 +374,15 @@ void __init orion5x_init(void)
+ 	orion5x_setup_cpu_mbus_bridge();
+ 
+ 	/*
++	 * Don't issue "Wait for Interrupt" instruction if we are
++	 * running on D0 5281 silicon.
++	 */
++	if (dev == MV88F5281_DEV_ID && rev == MV88F5281_REV_D0) {
++		printk(KERN_INFO "Orion: Applying 5281 D0 WFI workaround.\n");
++		disable_hlt();
++	}
++
++	/*
+ 	 * Register devices.
+ 	 */
+ 	platform_device_register(&orion5x_uart);
+diff --git a/include/asm-arm/arch-orion5x/orion5x.h b/include/asm-arm/arch-orion5x/orion5x.h
+index 206ddd7..25ec775 100644
+--- a/include/asm-arm/arch-orion5x/orion5x.h
++++ b/include/asm-arm/arch-orion5x/orion5x.h
+@@ -71,6 +71,7 @@
+ #define MV88F5182_REV_A2	2
+ /* Orion-2 (88F5281) */
+ #define MV88F5281_DEV_ID	0x5281
++#define MV88F5281_REV_D0	4
+ #define MV88F5281_REV_D1	5
+ #define MV88F5281_REV_D2	6
+ 
diff --git a/debian/patches/features/arm/cache-align.patch b/debian/patches/features/arm/cache-align.patch
new file mode 100644
index 000000000..74d766819
--- /dev/null
+++ b/debian/patches/features/arm/cache-align.patch
@@ -0,0 +1,120 @@
+From: Nicolas Pitre
+Date: Mon, 31 Mar 2008 16:38:31 +0000 (-0400)
+Subject: [ARM] cache align destination pointer when copying memory for some processors
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fnico%2Forion.git;a=commitdiff_plain;h=f25c9c5b9b3eca2f4a41ac72fec6244c0cbd87cc
+
+[ARM] cache align destination pointer when copying memory for some processors
+
+The implementation for memory copy functions on ARM had a (disabled)
+provision for aligning the source pointer before loading registers with
+data. Turns out that aligning the _destination_ pointer is much more
+useful, as the read side is already sufficiently helped with the use of
+preload.
+
+So this changes the definition of the CALGN() macro to target the
+destination pointer instead, and turns it on for Feroceon processors
+where the gain is very notable.
+
+Signed-off-by: Nicolas Pitre
+---
+
+diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
+index cab355c..139cce6 100644
+--- a/arch/arm/lib/copy_template.S
++++ b/arch/arm/lib/copy_template.S
+@@ -13,14 +13,6 @@
+  */
+ 
+ /*
+- * This can be used to enable code to cacheline align the source pointer.
+- * Experiments on tested architectures (StrongARM and XScale) didn't show
+- * this a worthwhile thing to do. That might be different in the future.
+- */
+-//#define CALGN(code...) code
+-#define CALGN(code...)
+-
+-/*
+  * Theory of operation
+  * -------------------
+  *
+@@ -82,7 +74,7 @@
+ 		stmfd	sp!, {r5 - r8}
+ 		blt	5f
+ 
+-	CALGN(	ands	ip, r1, #31		)
++	CALGN(	ands	ip, r0, #31		)
+ 	CALGN(	rsb	r3, ip, #32		)
+ 	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+ 	CALGN(	bcs	2f			)
+@@ -168,7 +160,7 @@
+ 		subs	r2, r2, #28
+ 		blt	14f
+ 
+-	CALGN(	ands	ip, r1, #31		)
++	CALGN(	ands	ip, r0, #31		)
+ 	CALGN(	rsb	ip, ip, #32		)
+ 	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+ 	CALGN(	subcc	r2, r2, ip		)
+diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
+index ef7fddc..415e3d1 100644
+--- a/arch/arm/lib/memmove.S
++++ b/arch/arm/lib/memmove.S
+@@ -13,14 +13,6 @@
+ #include 
+ #include 
+ 
+-/*
+- * This can be used to enable code to cacheline align the source pointer.
+- * Experiments on tested architectures (StrongARM and XScale) didn't show
+- * this a worthwhile thing to do. That might be different in the future.
+- */
+-//#define CALGN(code...) code
+-#define CALGN(code...)
+-
+ 	.text
+ 
+ /*
+@@ -55,7 +47,7 @@ ENTRY(memmove)
+ 		stmfd	sp!, {r5 - r8}
+ 		blt	5f
+ 
+-	CALGN(	ands	ip, r1, #31		)
++	CALGN(	ands	ip, r0, #31		)
+ 	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+ 	CALGN(	bcs	2f			)
+ 	CALGN(	adr	r4, 6f			)
+@@ -138,7 +130,7 @@ ENTRY(memmove)
+ 		subs	r2, r2, #28
+ 		blt	14f
+ 
+-	CALGN(	ands	ip, r1, #31		)
++	CALGN(	ands	ip, r0, #31		)
+ 	CALGN(	rsb	ip, ip, #32		)
+ 	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+ 	CALGN(	subcc	r2, r2, ip		)
+diff --git a/include/asm-arm/assembler.h b/include/asm-arm/assembler.h
+index fce8328..911393b 100644
+--- a/include/asm-arm/assembler.h
++++ b/include/asm-arm/assembler.h
+@@ -56,6 +56,21 @@
+ #endif
+ 
+ /*
++ * This can be used to enable code to cacheline align the destination
++ * pointer when bulk writing to memory. Experiments on StrongARM and
++ * XScale didn't show this a worthwhile thing to do when the cache is not
++ * set to write-allocate (this would need further testing on XScale when WA
++ * is used).
++ *
++ * On Feroceon there is much to gain however, regardless of cache mode.
++ */
++#ifdef CONFIG_CPU_FEROCEON
++#define CALGN(code...) code
++#else
++#define CALGN(code...)
++#endif
++
++/*
+  * Enable and disable interrupts
+  */
+ #if __LINUX_ARM_ARCH__ >= 6
diff --git a/debian/patches/features/arm/cache-align2.patch b/debian/patches/features/arm/cache-align2.patch
new file mode 100644
index 000000000..2250d594c
--- /dev/null
+++ b/debian/patches/features/arm/cache-align2.patch
@@ -0,0 +1,139 @@
+From: Nicolas Pitre
+Date: Sat, 12 Apr 2008 01:04:28 +0000 (-0400)
+Subject: [ARM] cache align memset and memzero
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fnico%2Forion.git;a=commitdiff_plain;h=74fa6238bc1602038532b548b954020f06b596cc
+
+[ARM] cache align memset and memzero
+
+This is a natural extension following the previous patch.
+Non Feroceon based targets are unchanged.
+
+Signed-off-by: Nicolas Pitre
+---
+
+diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
+index 95b110b..cf75188 100644
+--- a/arch/arm/lib/memset.S
++++ b/arch/arm/lib/memset.S
+@@ -39,6 +39,9 @@ ENTRY(memset)
+ 	mov	r3, r1
+ 	cmp	r2, #16
+ 	blt	4f
++
++#if CALGN(1)-1 != 0
++
+ /*
+  * We need an extra register for this loop - save the return address and
+  * use the LR
+@@ -64,6 +67,49 @@ ENTRY(memset)
+ 	stmneia	r0!, {r1, r3, ip, lr}
+ 	ldr	lr, [sp], #4
+ 
++#else
++
++/*
++ * This version aligns the destination pointer in order to write
++ * whole cache lines at once.
++ */
++
++	stmfd	sp!, {r4-r7, lr}
++	mov	r4, r1
++	mov	r5, r1
++	mov	r6, r1
++	mov	r7, r1
++	mov	ip, r1
++	mov	lr, r1
++
++	cmp	r2, #96
++	tstgt	r0, #31
++	ble	3f
++
++	and	ip, r0, #31
++	rsb	ip, ip, #32
++	sub	r2, r2, ip
++	movs	ip, ip, lsl #(32 - 4)
++	stmcsia	r0!, {r4, r5, r6, r7}
++	stmmiia	r0!, {r4, r5}
++	tst	ip, #(1 << 30)
++	mov	ip, r1
++	strne	r1, [r0], #4
++
++3:	subs	r2, r2, #64
++	stmgeia	r0!, {r1, r3-r7, ip, lr}
++	stmgeia	r0!, {r1, r3-r7, ip, lr}
++	bgt	3b
++	ldmeqfd	sp!, {r4-r7, pc}
++
++	tst	r2, #32
++	stmneia	r0!, {r1, r3-r7, ip, lr}
++	tst	r2, #16
++	stmneia	r0!, {r4-r7}
++	ldmfd	sp!, {r4-r7, lr}
++
++#endif
++
+ 4:	tst	r2, #8
+ 	stmneia	r0!, {r1, r3}
+ 	tst	r2, #4
+diff --git a/arch/arm/lib/memzero.S b/arch/arm/lib/memzero.S
+index abf2508..a9bfef5 100644
+--- a/arch/arm/lib/memzero.S
++++ b/arch/arm/lib/memzero.S
+@@ -39,6 +39,9 @@ ENTRY(__memzero)
+  */
+ 	cmp	r1, #16			@ 1 we can skip this chunk if we
+ 	blt	4f			@ 1 have < 16 bytes
++
++#if CALGN(1)-1 != 0
++
+ /*
+  * We need an extra register for this loop - save the return address and
+  * use the LR
+@@ -64,6 +67,47 @@ ENTRY(__memzero)
+ 	stmneia	r0!, {r2, r3, ip, lr}	@ 4
+ 	ldr	lr, [sp], #4		@ 1
+ 
++#else
++
++/*
++ * This version aligns the destination pointer in order to write
++ * whole cache lines at once.
++ */
++
++	stmfd	sp!, {r4-r7, lr}
++	mov	r4, r2
++	mov	r5, r2
++	mov	r6, r2
++	mov	r7, r2
++	mov	ip, r2
++	mov	lr, r2
++
++	cmp	r1, #96
++	andgts	ip, r0, #31
++	ble	3f
++
++	rsb	ip, ip, #32
++	sub	r1, r1, ip
++	movs	ip, ip, lsl #(32 - 4)
++	stmcsia	r0!, {r4, r5, r6, r7}
++	stmmiia	r0!, {r4, r5}
++	movs	ip, ip, lsl #2
++	strcs	r2, [r0], #4
++
++3:	subs	r1, r1, #64
++	stmgeia	r0!, {r2-r7, ip, lr}
++	stmgeia	r0!, {r2-r7, ip, lr}
++	bgt	3b
++	ldmeqfd	sp!, {r4-r7, pc}
++
++	tst	r1, #32
++	stmneia	r0!, {r2-r7, ip, lr}
++	tst	r1, #16
++	stmneia	r0!, {r4-r7}
++	ldmfd	sp!, {r4-r7, lr}
++
++#endif
++
+ 4:	tst	r1, #8			@ 1 8 bytes or more?
+ 	stmneia	r0!, {r2, r3}		@ 2
+ 	tst	r1, #4			@ 1 4 bytes or more?
diff --git a/debian/patches/features/arm/speed_flush_cache.patch b/debian/patches/features/arm/speed_flush_cache.patch
new file mode 100644
index 000000000..dbd186490
--- /dev/null
+++ b/debian/patches/features/arm/speed_flush_cache.patch
@@ -0,0 +1,124 @@
+From: Lennert Buytenhek
+Date: Thu, 24 Apr 2008 05:31:46 +0000 (-0400)
+Subject: [ARM] Feroceon: speed up flushing of the entire cache
+X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fnico%2Forion.git;a=commitdiff_plain;h=8c38bce5ed3a5f2c8a1cb070ba41a8889cc69257
+
+[ARM] Feroceon: speed up flushing of the entire cache
+
+Flushing the L1 D cache with a test/clean/invalidate loop is very
+easy in software, but it is not the quickest way of doing it, as
+there is a lot of overhead involved in re-scanning the cache from
+the beginning every time we hit a dirty line.
+
+This patch makes proc-feroceon.S use "clean+invalidate by set/way"
+loops according to possible cache configuration of Feroceon CPUs
+(either direct-mapped or 4-way set associative).
+
+[nico: optimized the assembly a bit]
+
+Signed-off-by: Lennert Buytenhek
+Signed-off-by: Nicolas Pitre
+---
+
+diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S
+index a02c171..968d5ad 100644
+--- a/arch/arm/mm/proc-feroceon.S
++++ b/arch/arm/mm/proc-feroceon.S
+@@ -44,11 +44,31 @@
+  */
+ #define CACHE_DLINESIZE	32
+ 
++	.bss
++	.align 3
++__cache_params_loc:
++	.space	8
++
+ 	.text
++__cache_params:
++	.word	__cache_params_loc
++
+ /*
+  * cpu_feroceon_proc_init()
+  */
+ ENTRY(cpu_feroceon_proc_init)
++	mrc	p15, 0, r0, c0, c0, 1		@ read cache type register
++	ldr	r1, __cache_params
++	mov	r2, #(16 << 5)
++	tst	r0, #(1 << 16)			@ get way
++	mov	r0, r0, lsr #18			@ get cache size order
++	movne	r3, #((4 - 1) << 30)		@ 4-way
++	and	r0, r0, #0xf
++	moveq	r3, #0				@ 1-way
++	mov	r2, r2, lsl r0			@ actual cache size
++	movne	r2, r2, lsr #2			@ turned into # of sets
++	sub	r2, r2, #(1 << 5)
++	stmia	r1, {r2, r3}
+ 	mov	pc, lr
+ 
+ /*
+@@ -117,11 +137,19 @@ ENTRY(feroceon_flush_user_cache_all)
+  */
+ ENTRY(feroceon_flush_kern_cache_all)
+ 	mov	r2, #VM_EXEC
+-	mov	ip, #0
++
+ __flush_whole_cache:
+-1:	mrc	p15, 0, r15, c7, c14, 3		@ test,clean,invalidate
+-	bne	1b
++	ldr	r1, __cache_params
++	ldmia	r1, {r1, r3}
++1:	orr	ip, r1, r3
++2:	mcr	p15, 0, ip, c7, c14, 2		@ clean + invalidate D set/way
++	subs	ip, ip, #(1 << 30)		@ next way
++	bcs	2b
++	subs	r1, r1, #(1 << 5)		@ next set
++	bcs	1b
++
+ 	tst	r2, #VM_EXEC
++	mov	ip, #0
+ 	mcrne	p15, 0, ip, c7, c5, 0		@ invalidate I cache
+ 	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
+ 	mov	pc, lr
+@@ -138,7 +166,6 @@ __flush_whole_cache:
+  */
+ 	.align	5
+ ENTRY(feroceon_flush_user_cache_range)
+-	mov	ip, #0
+ 	sub	r3, r1, r0			@ calculate total size
+ 	cmp	r3, #CACHE_DLIMIT
+ 	bgt	__flush_whole_cache
+@@ -152,6 +179,7 @@ ENTRY(feroceon_flush_user_cache_range)
+ 	cmp	r0, r1
+ 	blo	1b
+ 	tst	r2, #VM_EXEC
++	mov	ip, #0
+ 	mcrne	p15, 0, ip, c7, c10, 4		@ drain WB
+ 	mov	pc, lr
+ 
+@@ -306,16 +334,19 @@ ENTRY(cpu_feroceon_dcache_clean_area)
+ 	.align	5
+ ENTRY(cpu_feroceon_switch_mm)
+ #ifdef CONFIG_MMU
+-	mov	ip, #0
+-@ && 'Clean & Invalidate whole DCache'
+-1:	mrc	p15, 0, r15, c7, c14, 3		@ test,clean,invalidate
+-	bne	1b
+-	mcr	p15, 0, ip, c7, c5, 0		@ invalidate I cache
+-	mcr	p15, 0, ip, c7, c10, 4		@ drain WB
++	mov	r2, lr				@ abuse r2 to preserve lr
++	bl	__flush_whole_cache
++	@ if r2 contains the VM_EXEC bit then the next 2 ops are done already
++	tst	r2, #VM_EXEC
++	mcreq	p15, 0, ip, c7, c5, 0		@ invalidate I cache
++	mcreq	p15, 0, ip, c7, c10, 4		@ drain WB
++
+ 	mcr	p15, 0, r0, c2, c0, 0		@ load page table pointer
+ 	mcr	p15, 0, ip, c8, c7, 0		@ invalidate I & D TLBs
+-#endif
++	mov	pc, r2
++#else
+ 	mov	pc, lr
++#endif
+ 
+ /*
+  * cpu_feroceon_set_pte_ext(ptep, pte, ext)
diff --git a/debian/patches/series/1~experimental.1 b/debian/patches/series/1~experimental.1
index 0f0f4ea26..ab7eee489 100644
--- a/debian/patches/series/1~experimental.1
+++ b/debian/patches/series/1~experimental.1
@@ -27,6 +27,10 @@
 + bugfix/arm/disable-scsi_acard.patch
 ##+ bugfix/arm/disable-ath5k.patch
 + bugfix/arm/disable-r6040.patch
++ features/arm/cache-align.patch
++ features/arm/cache-align2.patch
++ features/arm/speed_flush_cache.patch
++ features/arm/5281d0.patch
 + features/all/at76.patch
 + bugfix/fix-hifn_795X-divdi3.patch
 + bugfix/all/mtd-prevent-physmap-from-causing-request_module-runaway-loop-modprobe-net-pf-1.patch