From 2c58080407554e1bac8fd50d23cb02420524caed Mon Sep 17 00:00:00 2001
From: Felix Fietkau
Date: Mon, 12 Aug 2013 12:50:22 +0200
Subject: [PATCH] MIPS: partially inline dma ops

Several DMA ops are no-op on many platforms, and the indirection through
the mips_dma_map_ops function table is causing the compiler to emit
unnecessary code.

Inlining visibly improves network performance in my tests (on a 24Kc
based system), and also slightly reduces code size of a few drivers.

Signed-off-by: Felix Fietkau
---
 arch/mips/Kconfig                   |   4 +
 arch/mips/include/asm/dma-mapping.h | 360 +++++++++++++++++++++++++++++++++++-
 arch/mips/mm/dma-default.c          | 163 ++--------------
 3 files changed, 373 insertions(+), 154 deletions(-)

--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1431,6 +1431,7 @@ config CPU_CAVIUM_OCTEON
 	select LIBFDT
 	select USE_OF
 	select USB_EHCI_BIG_ENDIAN_MMIO
+	select SYS_HAS_DMA_OPS
 	help
 	  The Cavium Octeon processor is a highly integrated chip containing
 	  many ethernet hardware widgets for networking tasks. The processor
@@ -1651,6 +1652,9 @@ config SYS_HAS_CPU_XLR
 config SYS_HAS_CPU_XLP
 	bool
 
+config SYS_HAS_DMA_OPS
+	bool
+
 #
 # CPU may reorder R->R, R->W, W->R, W->W
 # Reordering beyond LL and SC is handled in WEAK_REORDERING_BEYOND_LLSC
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -1,6 +1,12 @@
 #ifndef _ASM_DMA_MAPPING_H
 #define _ASM_DMA_MAPPING_H
 
+#include <linux/kmemcheck.h>
+#include <linux/bug.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-debug.h>
+#include <linux/dma-attrs.h>
+
 #include <asm/scatterlist.h>
 #include <asm/dma-coherence.h>
 #include <asm/cache.h>
@@ -12,12 +18,47 @@
 
 extern struct dma_map_ops *mips_dma_map_ops;
 
+void __dma_sync(struct page *page, unsigned long offset, size_t size,
+		enum dma_data_direction direction);
+void *mips_dma_alloc_coherent(struct device *dev, size_t size,
+			      dma_addr_t *dma_handle, gfp_t gfp,
+			      struct dma_attrs *attrs);
+void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
+			    dma_addr_t dma_handle, struct dma_attrs *attrs);
+
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 {
+#ifdef CONFIG_SYS_HAS_DMA_OPS
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
 	else
 		return mips_dma_map_ops;
+#else
+	return NULL;
+#endif
+}
+
+/*
+ * Warning on the terminology - Linux calls an uncached area coherent;
+ * MIPS terminology calls memory areas with hardware maintained coherency
+ * coherent.
+ */ + +static inline int cpu_is_noncoherent_r10000(struct device *dev) +{ +#ifndef CONFIG_SYS_HAS_CPU_R10000 + return 0; +#endif + return !plat_device_is_coherent(dev) && + (current_cpu_type() == CPU_R10000 || + current_cpu_type() == CPU_R12000); +} + +static inline struct page *dma_addr_to_page(struct device *dev, + dma_addr_t dma_addr) +{ + return pfn_to_page( + plat_dma_addr_to_phys(dev, dma_addr) >> PAGE_SHIFT); } static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) @@ -30,12 +71,309 @@ static inline bool dma_capable(struct de static inline void dma_mark_clean(void *addr, size_t size) {} -#include +static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + unsigned long offset = (unsigned long)ptr & ~PAGE_MASK; + struct page *page = virt_to_page(ptr); + dma_addr_t addr; + + kmemcheck_mark_initialized(ptr, size); + BUG_ON(!valid_dma_direction(dir)); + if (ops) { + addr = ops->map_page(dev, page, offset, size, dir, attrs); + } else { + if (!plat_device_is_coherent(dev)) + __dma_sync(page, offset, size, dir); + + addr = plat_map_dma_mem_page(dev, page) + offset; + } + debug_dma_map_page(dev, page, offset, size, dir, addr, true); + return addr; +} + +static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, + size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + + BUG_ON(!valid_dma_direction(dir)); + if (ops) { + ops->unmap_page(dev, addr, size, dir, attrs); + } else { + if (cpu_is_noncoherent_r10000(dev)) + __dma_sync(dma_addr_to_page(dev, addr), + addr & ~PAGE_MASK, size, dir); + + plat_unmap_dma_mem(dev, addr, size, dir); + } + debug_dma_unmap_page(dev, addr, size, dir, true); +} + +static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + int i, ents; + struct scatterlist *s; + + for_each_sg(sg, s, nents, i) + kmemcheck_mark_initialized(sg_virt(s), s->length); + BUG_ON(!valid_dma_direction(dir)); + if (ops) { + ents = ops->map_sg(dev, sg, nents, dir, attrs); + } else { + for_each_sg(sg, s, nents, i) { + struct page *page = sg_page(s); + + if (!plat_device_is_coherent(dev)) + __dma_sync(page, s->offset, s->length, dir); + s->dma_address = + plat_map_dma_mem_page(dev, page) + s->offset; + } + ents = nents; + } + debug_dma_map_sg(dev, sg, nents, ents, dir); + + return ents; +} + +static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + struct scatterlist *s; + int i; + + BUG_ON(!valid_dma_direction(dir)); + debug_dma_unmap_sg(dev, sg, nents, dir); + if (ops) { + ops->unmap_sg(dev, sg, nents, dir, attrs); + return; + } + + for_each_sg(sg, s, nents, i) { + if (!plat_device_is_coherent(dev) && dir != DMA_TO_DEVICE) + __dma_sync(sg_page(s), s->offset, s->length, dir); + plat_unmap_dma_mem(dev, s->dma_address, s->length, dir); + } +} + +static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + dma_addr_t addr; + + kmemcheck_mark_initialized(page_address(page) + offset, size); + BUG_ON(!valid_dma_direction(dir)); + if (ops) { + 
+		addr = ops->map_page(dev, page, offset, size, dir, NULL);
+	} else {
+		if (!plat_device_is_coherent(dev))
+			__dma_sync(page, offset, size, dir);
+
+		addr = plat_map_dma_mem_page(dev, page) + offset;
+	}
+	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
+
+	return addr;
+}
+
+static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
+				  size_t size, enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops) {
+		ops->unmap_page(dev, addr, size, dir, NULL);
+	} else {
+		if (cpu_is_noncoherent_r10000(dev))
+			__dma_sync(dma_addr_to_page(dev, addr),
+				   addr & ~PAGE_MASK, size, dir);
+
+		plat_unmap_dma_mem(dev, addr, size, dir);
+	}
+	debug_dma_unmap_page(dev, addr, size, dir, false);
+}
+
+static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+					   size_t size,
+					   enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops)
+		ops->sync_single_for_cpu(dev, addr, size, dir);
+	else if (cpu_is_noncoherent_r10000(dev))
+		__dma_sync(dma_addr_to_page(dev, addr),
+			   addr & ~PAGE_MASK, size, dir);
+	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void dma_sync_single_for_device(struct device *dev,
+					      dma_addr_t addr, size_t size,
+					      enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops)
+		ops->sync_single_for_device(dev, addr, size, dir);
+	else if (!plat_device_is_coherent(dev))
+		__dma_sync(dma_addr_to_page(dev, addr),
+			   addr & ~PAGE_MASK, size, dir);
+	debug_dma_sync_single_for_device(dev, addr, size, dir);
+}
+
+static inline void dma_sync_single_range_for_cpu(struct device *dev,
+						 dma_addr_t addr,
+						 unsigned long offset,
+						 size_t size,
+						 enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops)
+		ops->sync_single_for_cpu(dev, addr + offset, size, dir);
+	else if (cpu_is_noncoherent_r10000(dev))
+		__dma_sync(dma_addr_to_page(dev, addr + offset),
+			   (addr + offset) & ~PAGE_MASK, size, dir);
+	debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
+}
+
+static inline void dma_sync_single_range_for_device(struct device *dev,
+						    dma_addr_t addr,
+						    unsigned long offset,
+						    size_t size,
+						    enum dma_data_direction dir)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops)
+		ops->sync_single_for_device(dev, addr + offset, size, dir);
+	else if (!plat_device_is_coherent(dev))
+		__dma_sync(dma_addr_to_page(dev, addr + offset),
+			   (addr + offset) & ~PAGE_MASK, size, dir);
+	debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir);
+}
+
+static inline void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+		    int nelems, enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	struct scatterlist *s;
+	int i;
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops)
+		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
+	else if (cpu_is_noncoherent_r10000(dev)) {
+		for_each_sg(sg, s, nelems, i)
+			__dma_sync(sg_page(s), s->offset, s->length, dir);
+	}
+	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+		       int nelems, enum dma_data_direction dir)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	struct scatterlist *s;
+	int i;
+
+	BUG_ON(!valid_dma_direction(dir));
+	if (ops)
+		ops->sync_sg_for_device(dev, sg, nelems, dir);
+	else if (!plat_device_is_coherent(dev)) {
+		for_each_sg(sg, s, nelems, i)
+			__dma_sync(sg_page(s), s->offset, s->length, dir);
+	}
+	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
+
+}
+
+#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL)
+#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, NULL)
+#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, NULL)
+#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, NULL)
+
+extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
+			   void *cpu_addr, dma_addr_t dma_addr, size_t size);
+
+/**
+ * dma_mmap_attrs - map a coherent DMA allocation into user space
+ * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices
+ * @vma: vm_area_struct describing requested user mapping
+ * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs
+ * @handle: device-view address returned from dma_alloc_attrs
+ * @size: size of memory originally requested in dma_alloc_attrs
+ * @attrs: attributes of mapping properties requested in dma_alloc_attrs
+ *
+ * Map a coherent DMA buffer previously allocated by dma_alloc_attrs
+ * into user space. The coherent DMA buffer must not be freed by the
+ * driver until the user space mapping has been released.
+ */
+static inline int
+dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr,
+	       dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	BUG_ON(!ops);
+	if (ops && ops->mmap)
+		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+	return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size);
+}
+
+#define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, NULL)
+
+static inline int dma_mmap_writecombine(struct device *dev, struct vm_area_struct *vma,
+					void *cpu_addr, dma_addr_t dma_addr, size_t size)
+{
+	DEFINE_DMA_ATTRS(attrs);
+	dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs);
+	return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs);
+}
+
+int
+dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
+		       void *cpu_addr, dma_addr_t dma_addr, size_t size);
+
+static inline int
+dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
+		      dma_addr_t dma_addr, size_t size, struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	BUG_ON(!ops);
+	if (ops && ops->get_sgtable)
+		return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
+					attrs);
+	return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size);
+}
+
+#define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, NULL)
+
 static inline int dma_supported(struct device *dev, u64 mask)
 {
 	struct dma_map_ops *ops = get_dma_ops(dev);
-	return ops->dma_supported(dev, mask);
+	if (ops)
+		return ops->dma_supported(dev, mask);
+	return plat_dma_supported(dev, mask);
 }
 
 static inline int dma_mapping_error(struct device *dev, u64 mask)
@@ -43,7 +381,9 @@ static inline int dma_mapping_error(stru
 	struct dma_map_ops *ops = get_dma_ops(dev);
 
 	debug_dma_mapping_error(dev, mask);
-	return ops->mapping_error(dev, mask);
+	if (ops)
+		return ops->mapping_error(dev, mask);
+	return 0;
 }
 
 static inline int
@@ -69,7 +409,11 @@ static inline void *dma_alloc_attrs(stru
 	void *ret;
 	struct dma_map_ops *ops = get_dma_ops(dev);
 
-	ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
+	if (ops)
+		ret = ops->alloc(dev, size, dma_handle, gfp, attrs);
+	else
+		ret = mips_dma_alloc_coherent(dev, size, dma_handle, gfp,
+					      attrs);
 
 	debug_dma_alloc_coherent(dev, size, *dma_handle, ret);
 
@@ -84,7 +428,10 @@ static inline void dma_free_attrs(struct
 {
 	struct dma_map_ops *ops = get_dma_ops(dev);
 
-	ops->free(dev, size, vaddr, dma_handle, attrs);
+	if (ops)
+		ops->free(dev, size, vaddr, dma_handle, attrs);
+	else
+		mips_dma_free_coherent(dev, size, vaddr, dma_handle, attrs);
 
 	debug_dma_free_coherent(dev, size, vaddr, dma_handle);
 }
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -24,7 +24,7 @@
 
 #ifdef CONFIG_DMA_MAYBE_COHERENT
 int coherentio = 0;	/* User defined DMA coherency from command line. */
-EXPORT_SYMBOL_GPL(coherentio);
+EXPORT_SYMBOL(coherentio);
 int hw_coherentio = 0;	/* Actual hardware supported DMA coherency setting. */
 
 static int __init setcoherentio(char *str)
@@ -44,13 +44,6 @@ static int __init setnocoherentio(char *
 early_param("nocoherentio", setnocoherentio);
 #endif
 
-static inline struct page *dma_addr_to_page(struct device *dev,
-	dma_addr_t dma_addr)
-{
-	return pfn_to_page(
-		plat_dma_addr_to_phys(dev, dma_addr) >> PAGE_SHIFT);
-}
-
 /*
  * The affected CPUs below in 'cpu_needs_post_dma_flush()' can
  * speculatively fill random cachelines with stale data at any time,
@@ -123,8 +116,9 @@ void *dma_alloc_noncoherent(struct devic
 }
 EXPORT_SYMBOL(dma_alloc_noncoherent);
 
-static void *mips_dma_alloc_coherent(struct device *dev, size_t size,
-	dma_addr_t * dma_handle, gfp_t gfp, struct dma_attrs *attrs)
+void *mips_dma_alloc_coherent(struct device *dev, size_t size,
+			      dma_addr_t *dma_handle, gfp_t gfp,
+			      struct dma_attrs *attrs)
 {
 	void *ret;
 
@@ -148,6 +142,7 @@ static void *mips_dma_alloc_coherent(str
 
 	return ret;
 }
+EXPORT_SYMBOL(mips_dma_alloc_coherent);
 
 void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr,
@@ -158,8 +153,8 @@ void dma_free_noncoherent(struct device
 }
 EXPORT_SYMBOL(dma_free_noncoherent);
 
-static void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
-	dma_addr_t dma_handle, struct dma_attrs *attrs)
+void mips_dma_free_coherent(struct device *dev, size_t size, void *vaddr,
+			    dma_addr_t dma_handle, struct dma_attrs *attrs)
 {
 	unsigned long addr = (unsigned long) vaddr;
 	int order = get_order(size);
@@ -174,6 +169,7 @@ static void mips_dma_free_coherent(struc
 
 	free_pages(addr, get_order(size));
 }
+EXPORT_SYMBOL(mips_dma_free_coherent);
 
 static inline void __dma_sync_virtual(void *addr, size_t size,
 	enum dma_data_direction direction)
@@ -202,8 +198,8 @@ static inline void __dma_sync_virtual(vo
  * If highmem is not configured then the bulk of this loop gets
  * optimized out.
  */
-static inline void __dma_sync(struct page *page,
-	unsigned long offset, size_t size, enum dma_data_direction direction)
+void __dma_sync(struct page *page, unsigned long offset, size_t size,
+	enum dma_data_direction direction)
 {
 	size_t left = size;
 
@@ -233,108 +229,7 @@ static inline void __dma_sync(struct pag
 	} while (left);
 }
 
-static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
-	size_t size, enum dma_data_direction direction, struct dma_attrs *attrs)
-{
-	if (cpu_needs_post_dma_flush(dev))
-		__dma_sync(dma_addr_to_page(dev, dma_addr),
-			   dma_addr & ~PAGE_MASK, size, direction);
-
-	plat_unmap_dma_mem(dev, dma_addr, size, direction);
-}
-
-static int mips_dma_map_sg(struct device *dev, struct scatterlist *sg,
-	int nents, enum dma_data_direction direction, struct dma_attrs *attrs)
-{
-	int i;
-
-	for (i = 0; i < nents; i++, sg++) {
-		if (!plat_device_is_coherent(dev))
-			__dma_sync(sg_page(sg), sg->offset, sg->length,
-				   direction);
-		sg->dma_address = plat_map_dma_mem_page(dev, sg_page(sg)) +
-				  sg->offset;
-	}
-
-	return nents;
-}
-
-static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page,
-	unsigned long offset, size_t size, enum dma_data_direction direction,
-	struct dma_attrs *attrs)
-{
-	if (!plat_device_is_coherent(dev))
-		__dma_sync(page, offset, size, direction);
-
-	return plat_map_dma_mem_page(dev, page) + offset;
-}
-
-static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-	int nhwentries, enum dma_data_direction direction,
-	struct dma_attrs *attrs)
-{
-	int i;
-
-	for (i = 0; i < nhwentries; i++, sg++) {
-		if (!plat_device_is_coherent(dev) &&
-		    direction != DMA_TO_DEVICE)
-			__dma_sync(sg_page(sg), sg->offset, sg->length,
-				   direction);
-		plat_unmap_dma_mem(dev, sg->dma_address, sg->length, direction);
-	}
-}
-
-static void mips_dma_sync_single_for_cpu(struct device *dev,
-	dma_addr_t dma_handle, size_t size, enum dma_data_direction direction)
-{
-	if (cpu_needs_post_dma_flush(dev))
-		__dma_sync(dma_addr_to_page(dev, dma_handle),
-			   dma_handle & ~PAGE_MASK, size, direction);
-}
-
-static void mips_dma_sync_single_for_device(struct device *dev,
-	dma_addr_t dma_handle, size_t size, enum dma_data_direction direction)
-{
-	if (!plat_device_is_coherent(dev))
-		__dma_sync(dma_addr_to_page(dev, dma_handle),
-			   dma_handle & ~PAGE_MASK, size, direction);
-}
-
-static void mips_dma_sync_sg_for_cpu(struct device *dev,
-	struct scatterlist *sg, int nelems, enum dma_data_direction direction)
-{
-	int i;
-
-	/* Make sure that gcc doesn't leave the empty loop body.  */
-	for (i = 0; i < nelems; i++, sg++) {
-		if (cpu_needs_post_dma_flush(dev))
-			__dma_sync(sg_page(sg), sg->offset, sg->length,
-				   direction);
-	}
-}
-
-static void mips_dma_sync_sg_for_device(struct device *dev,
-	struct scatterlist *sg, int nelems, enum dma_data_direction direction)
-{
-	int i;
-
-	/* Make sure that gcc doesn't leave the empty loop body. */
-	for (i = 0; i < nelems; i++, sg++) {
-		if (!plat_device_is_coherent(dev))
-			__dma_sync(sg_page(sg), sg->offset, sg->length,
-				   direction);
-	}
-}
-
-int mips_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-	return 0;
-}
-
-int mips_dma_supported(struct device *dev, u64 mask)
-{
-	return plat_dma_supported(dev, mask);
-}
+EXPORT_SYMBOL(__dma_sync);
 
 void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
 	enum dma_data_direction direction)
@@ -347,23 +242,10 @@ void dma_cache_sync(struct device *dev,
 
 EXPORT_SYMBOL(dma_cache_sync);
 
-static struct dma_map_ops mips_default_dma_map_ops = {
-	.alloc = mips_dma_alloc_coherent,
-	.free = mips_dma_free_coherent,
-	.map_page = mips_dma_map_page,
-	.unmap_page = mips_dma_unmap_page,
-	.map_sg = mips_dma_map_sg,
-	.unmap_sg = mips_dma_unmap_sg,
-	.sync_single_for_cpu = mips_dma_sync_single_for_cpu,
-	.sync_single_for_device = mips_dma_sync_single_for_device,
-	.sync_sg_for_cpu = mips_dma_sync_sg_for_cpu,
-	.sync_sg_for_device = mips_dma_sync_sg_for_device,
-	.mapping_error = mips_dma_mapping_error,
-	.dma_supported = mips_dma_supported
-};
-
-struct dma_map_ops *mips_dma_map_ops = &mips_default_dma_map_ops;
+#ifdef CONFIG_SYS_HAS_DMA_OPS
+struct dma_map_ops *mips_dma_map_ops = NULL;
 EXPORT_SYMBOL(mips_dma_map_ops);
+#endif
 
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
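-- 

Note (not part of the patch; git-am drops everything after the "--"
marker): the saving comes from get_dma_ops() collapsing to a
compile-time NULL when SYS_HAS_DMA_OPS is unset, which lets the
compiler delete the indirect call through the ops table and inline the
platform fast path. The sketch below is a minimal userspace model of
that effect, not kernel code; dummy_ops, fake_map_page() and the
address arithmetic are invented purely for illustration:

	#include <stdio.h>

	struct dma_map_ops {
		unsigned long (*map_page)(void *page, unsigned long offset);
	};

	/* Stand-in for an Octeon-style ops table; it only exists when
	 * the model's SYS_HAS_DMA_OPS is defined at compile time. */
	#ifdef SYS_HAS_DMA_OPS
	static unsigned long fake_map_page(void *page, unsigned long offset)
	{
		return (unsigned long)page + offset + 0x1000;
	}
	static const struct dma_map_ops dummy_ops = {
		.map_page = fake_map_page,
	};
	static const struct dma_map_ops *mips_dma_map_ops = &dummy_ops;
	#endif

	static inline const struct dma_map_ops *get_dma_ops(void)
	{
	#ifdef SYS_HAS_DMA_OPS
		return mips_dma_map_ops;
	#else
		return NULL;	/* constant NULL: the ops branch below is dead code */
	#endif
	}

	static inline unsigned long dma_map_single(void *ptr)
	{
		const struct dma_map_ops *ops = get_dma_ops();

		if (ops)	/* indirect call, ops-table builds only */
			return ops->map_page(ptr, 0);
		return (unsigned long)ptr;	/* inlined fast path */
	}

	int main(void)
	{
		char buf[64];

		printf("mapped at 0x%lx\n", dma_map_single(buf));
		return 0;
	}

Built with -O2 and SYS_HAS_DMA_OPS undefined, dma_map_single() reduces
to a single address computation; with -DSYS_HAS_DMA_OPS the indirect
call through ->map_page reappears. That is the same per-platform trade
the patch makes with CONFIG_SYS_HAS_DMA_OPS.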