From 2357044444cfdef49886c8195a5ddf6067dc2fb5 Mon Sep 17 00:00:00 2001
From: YunQiang Su
Date: Fri, 22 Feb 2019 09:54:44 +0800
Subject: [PATCH] [mipsel/mips64el] Backport loongson workarounds

MIPS: Loongson: Introduce and use loongson_llsc_mb()
---
 debian/changelog                              |   2 +
 ...n-Introduce-and-use-loongson_llsc_mb.patch | 390 ++++++++++++++++++
 debian/patches/series                         |   1 +
 3 files changed, 393 insertions(+)
 create mode 100644 debian/patches/bugfix/mips/MIPS-Loongson-Introduce-and-use-loongson_llsc_mb.patch

diff --git a/debian/changelog b/debian/changelog
index 2398015f2..b3e9761a2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -607,6 +607,8 @@ linux (4.19.26-1) UNRELEASED; urgency=medium
     Enable CPU_HAS_MSA, HIGHMEM, CRYPTO_CRC32_MIPS, and NR_CPUS to 16.
     Support some boston drivers: IMG_ASCII_LCD, I2C_EG20T, PCH_PHUB, MMC,
     PCIE_XILINX, RTC_DRV_M41T80, SPI_TOPCLIFF_PCH.
+  * [mipsel/mips64el] Backport MIPS: Loongson: Introduce and use
+    loongson_llsc_mb()
 
  -- Ben Hutchings  Tue, 12 Feb 2019 12:49:10 +0000

diff --git a/debian/patches/bugfix/mips/MIPS-Loongson-Introduce-and-use-loongson_llsc_mb.patch b/debian/patches/bugfix/mips/MIPS-Loongson-Introduce-and-use-loongson_llsc_mb.patch
new file mode 100644
index 000000000..2c8e582f8
--- /dev/null
+++ b/debian/patches/bugfix/mips/MIPS-Loongson-Introduce-and-use-loongson_llsc_mb.patch
@@ -0,0 +1,390 @@
+From e02e07e3127d8aec1f4bcdfb2fc52a2d99b4859e Mon Sep 17 00:00:00 2001
+From: Huacai Chen
+Date: Tue, 15 Jan 2019 16:04:54 +0800
+Subject: MIPS: Loongson: Introduce and use loongson_llsc_mb()
+
+On the Loongson-2G/2H/3A/3B there is a hardware flaw: ll/sc and
+lld/scd are very weakly ordered. We should add sync instructions
+"before each ll/lld" and "at the branch-target between ll/sc" to work
+around it. Otherwise, this flaw occasionally causes deadlocks (e.g.
+when running heavy load tests with LTP).
+
+Below is the explanation from the CPU designer:
+
+"For the Loongson 3 family, when a memory access instruction (load,
+store, or prefetch) executes between the execution of LL and SC, the
+success or failure of SC is not predictable. Although a programmer
+would not insert memory access instructions between LL and SC, memory
+instructions before LL in program order may be dynamically executed
+between LL and SC, so a memory fence (SYNC) is needed before LL/LLD
+to avoid this situation.
+
+Since Loongson-3A R2 (3A2000), we have improved our hardware design to
+handle this case. But we later deduced a rare circumstance in which
+memory instructions speculatively executed due to branch misprediction
+between LL/SC still fall into the above case, so a memory fence (SYNC)
+at the branch target (if the target is not between LL/SC) is needed
+for Loongson 3A1000, 3B1500, 3A2000 and 3A3000.
+
+Our processors are continually evolving, and we aim to remove all
+these workaround SYNCs around LL/SC in future processors."
+
+Here is an example:
+
+cpu1 and cpu2 simultaneously run atomic_add by 1 on the same atomic
+variable. This bug can cause the 'sc' on both CPUs (in atomic_add) to
+succeed at the same time ('sc' returns 1), so the variable is
+sometimes only *incremented by 1*, which is wrong and unacceptable (it
+should be incremented by 2).
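+
+For illustration, here is a minimal sketch of the ll/sc retry loop
+behind atomic_add(), with the case-1 workaround applied (condensed
+from arch/mips/include/asm/atomic.h; register names and operands are
+illustrative only):
+
+	sync			# workaround: order older accesses first
+1:	ll	t0, 0(a0)	# load-linked v->counter
+	addu	t0, t0, a1	# add i
+	sc	t0, 0(a0)	# store-conditional, t0 = 1 on success
+	beqz	t0, 1b		# reservation lost, retry
+
+Without the leading sync, a memory access older in program order can
+execute between the ll and the sc, after which both racing 'sc'
+instructions may succeed and one increment is lost.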
+
+Why do we disable fix-loongson3-llsc in the compiler? Because the
+compiler fix would cause problems in the kernel's __ex_table section.
+
+This patch fixes all the cases in the kernel, but:
+
++. the fix at the end of futex_atomic_cmpxchg_inatomic() is for the
+branch target of 'bne'; in the other cases, such as
+atomic_sub_if_positive()/cmpxchg()/xchg(), smp_mb__before_llsc() and
+smp_llsc_mb() happen to fix the ll and the branch target at the same
+time, just as they do here.
+
++. Loongson 3 does not support CONFIG_EDAC_ATOMIC_SCRUB, so there is
+no need to touch edac.h.
+
++. local_ops and cmpxchg_local should not be affected by this bug
+since only the owner can write.
+
++. mips_atomic_set() for syscall.c is deprecated and rarely used, so
+just let it go.
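+
+To make the two fence placements concrete, the pattern applied
+throughout this patch is sketched below (condensed pseudo-C modelled
+on the futex_atomic_cmpxchg_inatomic() hunk; .set directives, the
+user_ll/user_sc accessors and error handling are omitted, and the
+operands are illustrative, not literal kernel code):
+
+	loongson_llsc_mb();	/* case 1: fence before the ll */
+	__asm__ __volatile__(
+	"1:	ll	%0, %2		\n"	/* load-linked *uaddr */
+	"	bne	%0, %z3, 3f	\n"	/* mismatch: leave loop */
+	"	move	$1, %z4		\n"
+	"2:	sc	$1, %2		\n"	/* store-conditional */
+	"	beqz	$1, 1b		\n"	/* reservation lost: retry */
+	"3:				\n"
+	: "=&r" (val), "=m" (*uaddr)
+	: "m" (*uaddr), "Jr" (oldval), "Jr" (newval)
+	: "memory");
+	loongson_llsc_mb();	/* case 2: fence at the 'bne' target */
+
+Call sites that already issue smp_mb__before_llsc() and smp_llsc_mb()
+get equivalent fences for free, which is why only the remaining spots
+need explicit loongson_llsc_mb() calls.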
+
+Signed-off-by: Huacai Chen
+Signed-off-by: Huang Pei
+[paul.burton@mips.com:
+  - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add
+    a comment describing why it's there.
+  - Make loongson_llsc_mb() a no-op when
+    CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory
+    barrier.
+  - Add a comment describing the bug & how loongson_llsc_mb() helps
+    in asm/barrier.h.]
+Signed-off-by: Paul Burton <paul.burton@mips.com>
+Cc: Ralf Baechle
+Cc: ambrosehua@gmail.com
+Cc: Steven J. Hill
+Cc: linux-mips@linux-mips.org
+Cc: Fuxin Zhang
+Cc: Zhangjin Wu
+Cc: Li Xuefeng
+Cc: Xu Chenghua
+---
+ arch/mips/Kconfig               | 15 +++++++++++++++
+ arch/mips/include/asm/atomic.h  |  6 ++++++
+ arch/mips/include/asm/barrier.h | 36 ++++++++++++++++++++++++++++++++++++
+ arch/mips/include/asm/bitops.h  |  5 +++++
+ arch/mips/include/asm/futex.h   |  3 +++
+ arch/mips/include/asm/pgtable.h |  2 ++
+ arch/mips/loongson64/Platform   | 23 +++++++++++++++++++++++
+ arch/mips/mm/tlbex.c            | 10 ++++++++++
+ 8 files changed, 100 insertions(+)
+
+diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
+index 0d14f51..a84c24d 100644
+--- a/arch/mips/Kconfig
++++ b/arch/mips/Kconfig
+@@ -1403,6 +1403,21 @@ config LOONGSON3_ENHANCEMENT
+ 	  please say 'N' here. If you want a high-performance kernel to run on
+ 	  new Loongson 3 machines only, please say 'Y' here.
+ 
++config CPU_LOONGSON3_WORKAROUNDS
++	bool "Old Loongson 3 LLSC Workarounds"
++	default y if SMP
++	depends on CPU_LOONGSON3
++	help
++	  Loongson 3 processors have LL/SC errata that require workarounds.
++	  Without the workarounds the system may hang unexpectedly.
++
++	  Newer Loongson 3 cores fix these issues, so no workarounds are needed.
++	  The workarounds have no significant side effects on such cores, but
++	  they may decrease system performance, so this option should be
++	  disabled unless the kernel is intended to run on old systems.
++
++	  If unsure, please say Y.
++
+ config CPU_LOONGSON2E
+ 	bool "Loongson 2E"
+ 	depends on SYS_HAS_CPU_LOONGSON2E
+diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
+index 43fcd35..9409629 100644
+--- a/arch/mips/include/asm/atomic.h
++++ b/arch/mips/include/asm/atomic.h
+@@ -58,6 +58,7 @@ static __inline__ void atomic_##op(int i, atomic_t * v)	\
+ 	if (kernel_uses_llsc) {						\
+ 		int temp;						\
+ 									\
++		loongson_llsc_mb();					\
+ 		__asm__ __volatile__(					\
+ 		"	.set	push				\n"	\
+ 		"	.set	"MIPS_ISA_LEVEL"		\n"	\
+@@ -85,6 +86,7 @@ static __inline__ int atomic_##op##_return_relaxed(int i, atomic_t * v) \
+ 	if (kernel_uses_llsc) {						\
+ 		int temp;						\
+ 									\
++		loongson_llsc_mb();					\
+ 		__asm__ __volatile__(					\
+ 		"	.set	push				\n"	\
+ 		"	.set	"MIPS_ISA_LEVEL"		\n"	\
+@@ -118,6 +120,7 @@ static __inline__ int atomic_fetch_##op##_relaxed(int i, atomic_t * v) \
+ 	if (kernel_uses_llsc) {						\
+ 		int temp;						\
+ 									\
++		loongson_llsc_mb();					\
+ 		__asm__ __volatile__(					\
+ 		"	.set	push				\n"	\
+ 		"	.set	"MIPS_ISA_LEVEL"		\n"	\
+@@ -256,6 +259,7 @@ static __inline__ void atomic64_##op(long i, atomic64_t * v)	\
+ 	if (kernel_uses_llsc) {						\
+ 		long temp;						\
+ 									\
++		loongson_llsc_mb();					\
+ 		__asm__ __volatile__(					\
+ 		"	.set	push				\n"	\
+ 		"	.set	"MIPS_ISA_LEVEL"		\n"	\
+@@ -283,6 +287,7 @@ static __inline__ long atomic64_##op##_return_relaxed(long i, atomic64_t * v) \
+ 	if (kernel_uses_llsc) {						\
+ 		long temp;						\
+ 									\
++		loongson_llsc_mb();					\
+ 		__asm__ __volatile__(					\
+ 		"	.set	push				\n"	\
+ 		"	.set	"MIPS_ISA_LEVEL"		\n"	\
+@@ -316,6 +321,7 @@ static __inline__ long atomic64_fetch_##op##_relaxed(long i, atomic64_t * v) \
+ 	if (kernel_uses_llsc) {						\
+ 		long temp;						\
+ 									\
++		loongson_llsc_mb();					\
+ 		__asm__ __volatile__(					\
+ 		"	.set	push				\n"	\
+ 		"	.set	"MIPS_ISA_LEVEL"		\n"	\
+diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
+index a5eb1bb..b7f6ac5 100644
+--- a/arch/mips/include/asm/barrier.h
++++ b/arch/mips/include/asm/barrier.h
+@@ -222,6 +222,42 @@
+ #define __smp_mb__before_atomic()	__smp_mb__before_llsc()
+ #define __smp_mb__after_atomic()	smp_llsc_mb()
+ 
++/*
++ * Some Loongson 3 CPUs have a bug wherein execution of a memory access (load,
++ * store or pref) in between an ll & sc can cause the sc instruction to
++ * erroneously succeed, breaking atomicity. Whilst it's unusual to write code
++ * containing such sequences, this bug bites harder than we might otherwise
++ * expect due to reordering & speculation:
++ *
++ * 1) A memory access appearing prior to the ll in program order may actually
++ *    be executed after the ll - this is the reordering case.
++ *
++ *    In order to avoid this we need to place a memory barrier (ie. a sync
++ *    instruction) prior to every ll instruction, in between it & any earlier
++ *    memory access instructions. Many of these cases are already covered by
++ *    smp_mb__before_llsc() but for the remaining cases, typically ones in
++ *    which multiple CPUs may operate on a memory location but ordering is not
++ *    usually guaranteed, we use loongson_llsc_mb() below.
++ *
++ *    This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and later.
++ *
++ * 2) If a conditional branch exists between an ll & sc with a target outside
++ *    of the ll-sc loop, for example an exit upon value mismatch in cmpxchg()
++ *    or similar, then misprediction of the branch may allow speculative
++ *    execution of memory accesses from outside of the ll-sc loop.
++ *
++ *    In order to avoid this we need a memory barrier (ie. a sync instruction)
++ *    at each affected branch target, for which we also use loongson_llsc_mb()
++ *    defined below.
++ *
++ *    This case affects all current Loongson 3 CPUs.
++ */
++#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS /* Loongson-3's LLSC workaround */
++#define loongson_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
++#else
++#define loongson_llsc_mb()	do { } while (0)
++#endif
++
+ #include <asm-generic/barrier.h>
+ 
+ #endif /* __ASM_BARRIER_H */
+diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
+index c467595..830c93a 100644
+--- a/arch/mips/include/asm/bitops.h
++++ b/arch/mips/include/asm/bitops.h
+@@ -69,6 +69,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
+ 		: "ir" (1UL << bit), GCC_OFF_SMALL_ASM() (*m));
+ #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
+ 	} else if (kernel_uses_llsc && __builtin_constant_p(bit)) {
++		loongson_llsc_mb();
+ 		do {
+ 			__asm__ __volatile__(
+ 			"	" __LL "%0, %1		# set_bit	\n"
+@@ -79,6 +80,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
+ 		} while (unlikely(!temp));
+ #endif /* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
+ 	} else if (kernel_uses_llsc) {
++		loongson_llsc_mb();
+ 		do {
+ 			__asm__ __volatile__(
+ 			"	.set	push				\n"
+@@ -123,6 +125,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
+ 		: "ir" (~(1UL << bit)));
+ #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
+ 	} else if (kernel_uses_llsc && __builtin_constant_p(bit)) {
++		loongson_llsc_mb();
+ 		do {
+ 			__asm__ __volatile__(
+ 			"	" __LL "%0, %1		# clear_bit	\n"
+@@ -133,6 +136,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
+ 		} while (unlikely(!temp));
+ #endif /* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
+ 	} else if (kernel_uses_llsc) {
++		loongson_llsc_mb();
+ 		do {
+ 			__asm__ __volatile__(
+ 			"	.set	push				\n"
+@@ -193,6 +197,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
+ 		unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
+ 		unsigned long temp;
+ 
++		loongson_llsc_mb();
+ 		do {
+ 			__asm__ __volatile__(
+ 			"	.set	push				\n"
+diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
+index c14d798..b83b039 100644
+--- a/arch/mips/include/asm/futex.h
++++ b/arch/mips/include/asm/futex.h
+@@ -50,6 +50,7 @@
+ 		  "i" (-EFAULT)						\
+ 		: "memory");						\
+ 	} else if (cpu_has_llsc) {					\
++		loongson_llsc_mb();					\
+ 		__asm__ __volatile__(					\
+ 		"	.set	push				\n"	\
+ 		"	.set	noat				\n"	\
+@@ -163,6 +164,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ 		  "i" (-EFAULT)
+ 		: "memory");
+ 	} else if (cpu_has_llsc) {
++		loongson_llsc_mb();
+ 		__asm__ __volatile__(
+ 		"# futex_atomic_cmpxchg_inatomic		\n"
+ 		"	.set	push				\n"
+@@ -192,6 +194,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ 		: GCC_OFF_SMALL_ASM() (*uaddr), "Jr" (oldval), "Jr" (newval),
+ 		  "i" (-EFAULT)
+ 		: "memory");
++		loongson_llsc_mb();
+ 	} else
+ 		return -ENOSYS;
+ 
+diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
+index 57933fc..910851c 100644
+--- a/arch/mips/include/asm/pgtable.h
++++ b/arch/mips/include/asm/pgtable.h
+@@ -228,6 +228,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
+ 			: [buddy] "+m" (buddy->pte), [tmp] "=&r" (tmp)
+ 			: [global] "r" (page_global));
+ 	} else if (kernel_uses_llsc) {
++		loongson_llsc_mb();
+ 		__asm__ __volatile__ (
+ 		"	.set	push				\n"
+ 		"	.set	"MIPS_ISA_ARCH_LEVEL"		\n"
+@@ -242,6 +243,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
+ 		"	.set	pop				\n"
+ 		: [buddy] "+m" (buddy->pte), [tmp] "=&r" (tmp)
+ 		: [global] "r" (page_global));
++		loongson_llsc_mb();
+ 	}
+ 
+ #else /* !CONFIG_SMP */
+ 	if (pte_none(*buddy))
+diff --git a/arch/mips/loongson64/Platform b/arch/mips/loongson64/Platform
+index 0fce460..c1a4d4d 100644
+--- a/arch/mips/loongson64/Platform
++++ b/arch/mips/loongson64/Platform
+@@ -23,6 +23,29 @@ ifdef CONFIG_CPU_LOONGSON2F_WORKAROUNDS
+ endif
+ 
+ cflags-$(CONFIG_CPU_LOONGSON3)	+= -Wa,--trap
++
++#
++# Some versions of binutils, not currently mainline as of 2019/02/04, support
++# an -mfix-loongson3-llsc flag which emits a sync prior to each ll instruction
++# to work around a CPU bug (see loongson_llsc_mb() in asm/barrier.h for a
++# description).
++#
++# We disable this in order to prevent the assembler meddling with the
++# instruction that labels refer to, ie. if we label an ll instruction:
++#
++# 1: ll v0, 0(a0)
++#
++# ...then with the assembler fix applied the label may actually point at a sync
++# instruction inserted by the assembler, and if we were using the label in an
++# exception table the table would no longer contain the address of the ll
++# instruction.
++#
++# Avoid this by explicitly disabling that assembler behaviour. If upstream
++# binutils does not merge support for the flag then we can revisit & remove
++# this later - for now it ensures vendor toolchains don't cause problems.
++#
++cflags-$(CONFIG_CPU_LOONGSON3)	+= $(call as-option,-Wa$(comma)-mno-fix-loongson3-llsc,)
++
+ #
+ # binutils from v2.25 on and gcc starting from v4.9.0 treat -march=loongson3a
+ # as MIPS64 R2; older versions as just R1. This leaves the possibility open
+diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
+index 37b1cb2..65b6e85 100644
+--- a/arch/mips/mm/tlbex.c
++++ b/arch/mips/mm/tlbex.c
+@@ -932,6 +932,8 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
+ 		 * to mimic that here by taking a load/istream page
+ 		 * fault.
+ */ ++ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS)) ++ uasm_i_sync(p, 0); + UASM_i_LA(p, ptr, (unsigned long)tlb_do_page_fault_0); + uasm_i_jr(p, ptr); + +@@ -1646,6 +1648,8 @@ static void + iPTE_LW(u32 **p, unsigned int pte, unsigned int ptr) + { + #ifdef CONFIG_SMP ++ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS)) ++ uasm_i_sync(p, 0); + # ifdef CONFIG_PHYS_ADDR_T_64BIT + if (cpu_has_64bits) + uasm_i_lld(p, pte, 0, ptr); +@@ -2259,6 +2263,8 @@ static void build_r4000_tlb_load_handler(void) + #endif + + uasm_l_nopage_tlbl(&l, p); ++ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS)) ++ uasm_i_sync(&p, 0); + build_restore_work_registers(&p); + #ifdef CONFIG_CPU_MICROMIPS + if ((unsigned long)tlb_do_page_fault_0 & 1) { +@@ -2313,6 +2319,8 @@ static void build_r4000_tlb_store_handler(void) + #endif + + uasm_l_nopage_tlbs(&l, p); ++ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS)) ++ uasm_i_sync(&p, 0); + build_restore_work_registers(&p); + #ifdef CONFIG_CPU_MICROMIPS + if ((unsigned long)tlb_do_page_fault_1 & 1) { +@@ -2368,6 +2376,8 @@ static void build_r4000_tlb_modify_handler(void) + #endif + + uasm_l_nopage_tlbm(&l, p); ++ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS)) ++ uasm_i_sync(&p, 0); + build_restore_work_registers(&p); + #ifdef CONFIG_CPU_MICROMIPS + if ((unsigned long)tlb_do_page_fault_1 & 1) { +-- +cgit v1.1 + diff --git a/debian/patches/series b/debian/patches/series index c166088eb..f7355b9f7 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -82,6 +82,7 @@ bugfix/m68k/m68k-build-with-ffreestanding.patch bugfix/x86/x86-kvmclock-set-offset-for-kvm-unstable-clock.patch bugfix/arm/ARM-dts-sun8i-h3-add-sy8106a-to-orange-pi-plus.patch bugfix/arm64/arm64-dts-allwinner-a64-Enable-A64-timer-workaround.patch +bugfix/mips/MIPS-Loongson-Introduce-and-use-loongson_llsc_mb.patch # Arch features features/mips/MIPS-increase-MAX-PHYSMEM-BITS-on-Loongson-3-only.patch