diff --git a/debian/changelog b/debian/changelog
index 167cdc4d5..c166f42ab 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -13,6 +13,9 @@ linux-2.6 (2.6.32-10) UNRELEASED; urgency=low
   * agpgart: Reprobe VGA devices when a new GART device is added
     (Closes: #570229)
 
+  [ Bastian Blank ]
+  * Add support for Xen dom0 into its featureset.
+
  -- maximilian attems  Thu, 25 Feb 2010 13:07:47 +0100
 
 linux-2.6 (2.6.32-9) unstable; urgency=high
diff --git a/debian/config/defines b/debian/config/defines
index 8b45540cf..710168ade 100644
--- a/debian/config/defines
+++ b/debian/config/defines
@@ -26,7 +26,7 @@ featuresets:
 enabled: true
 
 [featureset-xen_base]
-enabled: false
+enabled: true
 
 [description]
 part-long-xen: This kernel also runs on a Xen hypervisor.
diff --git a/debian/patches/features/all/xen/pvops-updates.patch b/debian/patches/features/all/xen/pvops-updates.patch
new file mode 100644
index 000000000..5c3f255a4
--- /dev/null
+++ b/debian/patches/features/all/xen/pvops-updates.patch
@@ -0,0 +1,270 @@
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index 4953f9b..863e1c2 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -397,6 +397,9 @@ static inline unsigned long pages_to_mb(unsigned long npg)
+ #define io_remap_pfn_range(vma, vaddr, pfn, size, prot)	\
+ 	remap_pfn_range(vma, vaddr, pfn, size, prot)
+ 
++#define arch_vm_get_page_prot arch_vm_get_page_prot
++extern pgprot_t arch_vm_get_page_prot(unsigned vm_flags);
++
+ #if PAGETABLE_LEVELS > 2
+ static inline int pud_none(pud_t pud)
+ {
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
+index c57a301..4e46931 100644
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -160,7 +160,7 @@ extern void cleanup_highmap(void);
+ #define pgtable_cache_init()   do { } while (0)
+ #define check_pgt_cache()      do { } while (0)
+ 
+-#define PAGE_AGP    PAGE_KERNEL_NOCACHE
++#define PAGE_AGP    PAGE_KERNEL_IO_NOCACHE
+ #define HAVE_PAGE_AGP 1
+ 
+ /* fs/proc/kcore.c */
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 25fc1df..103e324 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -17,6 +17,16 @@
+ 
+ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+ 
++pgprot_t arch_vm_get_page_prot(unsigned vm_flags)
++{
++	pgprot_t ret = __pgprot(0);
++
++	if (vm_flags & VM_IO)
++		ret = __pgprot(_PAGE_IOMAP);
++
++	return ret;
++}
++
+ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+ {
+ 	return (pte_t *)__get_free_page(PGALLOC_GFP);
+diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
+index 9bca04e..399a017 100644
+--- a/drivers/char/agp/intel-agp.c
++++ b/drivers/char/agp/intel-agp.c
+@@ -398,15 +398,19 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode)
+ /* Exists to support ARGB cursors */
+ static struct page *i8xx_alloc_pages(void)
+ {
++	void *addr;
++	dma_addr_t _d;
+ 	struct page *page;
+ 
+-	page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
+-	if (page == NULL)
++	addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL);
++	if (addr == NULL)
+ 		return NULL;
+ 
++	page = virt_to_page(addr);
++
+ 	if (set_pages_uc(page, 4) < 0) {
+ 		set_pages_wb(page, 4);
+-		__free_pages(page, 2);
++		dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d);
+ 		return NULL;
+ 	}
+ 	get_page(page);
+@@ -416,12 +420,17 @@ static struct page *i8xx_alloc_pages(void)
+ 
+ static void i8xx_destroy_pages(struct page *page)
+ {
++	void *addr;
++
+ 	if (page
== NULL) + return; + + set_pages_wb(page, 4); + put_page(page); +- __free_pages(page, 2); ++ ++ addr = page_address(page); ++ ++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr)); + atomic_dec(&agp_bridge->current_memory_agp); + } + +diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c +index a75ca63..bdc26b9 100644 +--- a/drivers/gpu/drm/drm_drv.c ++++ b/drivers/gpu/drm/drm_drv.c +@@ -201,7 +201,7 @@ int drm_lastclose(struct drm_device * dev) + } + if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg && + !drm_core_check_feature(dev, DRIVER_MODESET)) { +- drm_sg_cleanup(dev->sg); ++ drm_sg_cleanup(dev, dev->sg); + dev->sg = NULL; + } + +diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c +index 8bf3770..dde5f66 100644 +--- a/drivers/gpu/drm/drm_gem.c ++++ b/drivers/gpu/drm/drm_gem.c +@@ -539,7 +539,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) + vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND; + vma->vm_ops = obj->dev->driver->gem_vm_ops; + vma->vm_private_data = map->handle; +- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); ++ vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); + + /* Take a ref for this mapping of the object, so that the fault + * handler can dereference the mmap offset's pointer to the object. +diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c +index c7823c8..95ffb8a 100644 +--- a/drivers/gpu/drm/drm_scatter.c ++++ b/drivers/gpu/drm/drm_scatter.c +@@ -32,20 +32,73 @@ + */ + + #include ++#include + #include "drmP.h" + + #define DEBUG_SCATTER 0 + +-static inline void *drm_vmalloc_dma(unsigned long size) ++static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size) + { + #if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE) + return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE); + #else +- return vmalloc_32(size); ++ struct device *dev = &drmdev->pdev->dev; ++ struct page **pages; ++ void *addr; ++ const int npages = PFN_UP(size); ++ int i; ++ ++ pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL); ++ if (!pages) ++ goto fail; ++ ++ for (i = 0; i < npages; i++) { ++ dma_addr_t phys; ++ void *addr; ++ addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL); ++ if (addr == NULL) ++ goto out_free_pages; ++ ++ pages[i] = virt_to_page(addr); ++ } ++ ++ addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL); ++ ++ kfree(pages); ++ ++ return addr; ++ ++out_free_pages: ++ while (i > 0) { ++ void *addr = page_address(pages[--i]); ++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr)); ++ } ++ ++ kfree(pages); ++ ++fail: ++ return NULL; ++#endif ++} ++ ++static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages, ++ struct page **pages) ++{ ++#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE) ++ vfree(addr); ++#else ++ struct device *dev = &drmdev->pdev->dev; ++ int i; ++ ++ for (i = 0; i < npages; i++) { ++ void *addr = page_address(pages[i]); ++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr)); ++ } ++ vunmap(addr); + #endif + } + +-void drm_sg_cleanup(struct drm_sg_mem * entry) ++void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry) + { + struct page *page; + int i; +@@ -56,7 +109,7 @@ void drm_sg_cleanup(struct drm_sg_mem * entry) + ClearPageReserved(page); + } + +- vfree(entry->virtual); ++ drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist); + + kfree(entry->busaddr); + 
kfree(entry->pagelist); +@@ -107,7 +160,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request) + } + memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr)); + +- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT); ++ entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT); + if (!entry->virtual) { + kfree(entry->busaddr); + kfree(entry->pagelist); +@@ -180,7 +233,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request) + return 0; + + failed: +- drm_sg_cleanup(entry); ++ drm_sg_cleanup(dev, entry); + return -ENOMEM; + } + EXPORT_SYMBOL(drm_sg_alloc); +@@ -212,7 +265,7 @@ int drm_sg_free(struct drm_device *dev, void *data, + + DRM_DEBUG("virtual = %p\n", entry->virtual); + +- drm_sg_cleanup(entry); ++ drm_sg_cleanup(dev, entry); + + return 0; + } +diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c +index 1c040d0..3dc8d6b 100644 +--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c ++++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c +@@ -272,6 +272,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma, + + vma->vm_private_data = bo; + vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; ++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + return 0; + out_unref: + ttm_bo_unref(&bo); +@@ -287,6 +288,7 @@ int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo) + vma->vm_ops = &ttm_bo_vm_ops; + vma->vm_private_data = ttm_bo_reference(bo); + vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND; ++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + return 0; + } + EXPORT_SYMBOL(ttm_fbdev_mmap); +diff --git a/include/drm/drmP.h b/include/drm/drmP.h +index 7ad3faa..cf9ddce 100644 +--- a/include/drm/drmP.h ++++ b/include/drm/drmP.h +@@ -1388,7 +1388,7 @@ extern int drm_vma_info(struct seq_file *m, void *data); + #endif + + /* Scatter Gather Support (drm_scatter.h) */ +-extern void drm_sg_cleanup(struct drm_sg_mem * entry); ++extern void drm_sg_cleanup(struct drm_device *dev, struct drm_sg_mem * entry); + extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data, + struct drm_file *file_priv); + extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request); diff --git a/debian/patches/features/all/xen/pvops.patch b/debian/patches/features/all/xen/pvops.patch new file mode 100644 index 000000000..368546237 --- /dev/null +++ b/debian/patches/features/all/xen/pvops.patch @@ -0,0 +1,20510 @@ +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 5bc4eaa..345c399 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -2668,6 +2668,13 @@ and is between 256 and 4096 characters. It is defined in the file + medium is write-protected). + Example: quirks=0419:aaf5:rl,0421:0433:rc + ++ userpte= ++ [X86] Flags controlling user PTE allocations. ++ ++ nohigh = do not allocate PTE pages in ++ HIGHMEM regardless of setting ++ of CONFIG_HIGHPTE. 
++ + vdso= [X86,SH] + vdso=2: enable compat VDSO (default with COMPAT_VDSO) + vdso=1: enable VDSO (default) +diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt +index 29a6ff8..81f9b94 100644 +--- a/Documentation/x86/x86_64/boot-options.txt ++++ b/Documentation/x86/x86_64/boot-options.txt +@@ -267,10 +267,14 @@ IOMMU (input/output memory management unit) + + iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU + implementation: +- swiotlb=[,force] ++ swiotlb=[npages=] ++ swiotlb=[force] ++ swiotlb=[overflow=] ++ + Prereserve that many 128K pages for the software IO + bounce buffering. + force Force all IO through the software TLB. ++ Size in bytes of the overflow buffer. + + Settings for the IBM Calgary hardware IOMMU currently found in IBM + pSeries and xSeries machines: +diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h +index 8d3c79c..7d09a09 100644 +--- a/arch/ia64/include/asm/dma-mapping.h ++++ b/arch/ia64/include/asm/dma-mapping.h +@@ -73,7 +73,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h +index dcbaea7..f0acde6 100644 +--- a/arch/ia64/include/asm/swiotlb.h ++++ b/arch/ia64/include/asm/swiotlb.h +@@ -4,8 +4,6 @@ + #include + #include + +-extern int swiotlb_force; +- + #ifdef CONFIG_SWIOTLB + extern int swiotlb; + extern void pci_swiotlb_init(void); +diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h +index b8370c8..baa74c8 100644 +--- a/arch/ia64/include/asm/xen/events.h ++++ b/arch/ia64/include/asm/xen/events.h +@@ -36,10 +36,6 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) + return !(ia64_psr(regs)->i); + } + +-static inline void handle_irq(int irq, struct pt_regs *regs) +-{ +- __do_IRQ(irq); +-} + #define irq_ctx_init(cpu) do { } while (0) + + #endif /* _ASM_IA64_XEN_EVENTS_H */ +diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c +index 285aae8..53292ab 100644 +--- a/arch/ia64/kernel/pci-swiotlb.c ++++ b/arch/ia64/kernel/pci-swiotlb.c +@@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = { + void __init swiotlb_dma_init(void) + { + dma_ops = &swiotlb_dma_ops; +- swiotlb_init(); ++ swiotlb_init(1); + } + + void __init pci_swiotlb_init(void) +@@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void) + swiotlb = 1; + printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); + machvec_init("dig"); +- swiotlb_init(); ++ swiotlb_init(1); + dma_ops = &swiotlb_dma_ops; + #else + panic("Unable to find Intel IOMMU"); +diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h +index e281dae..80a973b 100644 +--- a/arch/powerpc/include/asm/dma-mapping.h ++++ b/arch/powerpc/include/asm/dma-mapping.h +@@ -197,7 +197,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c +index 53bcf3d..b152de3 100644 +--- a/arch/powerpc/kernel/setup_32.c ++++ 
b/arch/powerpc/kernel/setup_32.c +@@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p) + + #ifdef CONFIG_SWIOTLB + if (ppc_swiotlb_enable) +- swiotlb_init(); ++ swiotlb_init(1); + #endif + + paging_init(); +diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c +index 04f638d..df2c9e9 100644 +--- a/arch/powerpc/kernel/setup_64.c ++++ b/arch/powerpc/kernel/setup_64.c +@@ -550,7 +550,7 @@ void __init setup_arch(char **cmdline_p) + + #ifdef CONFIG_SWIOTLB + if (ppc_swiotlb_enable) +- swiotlb_init(); ++ swiotlb_init(1); + #endif + + paging_init(); +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 4fdb669..fd612c0 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1875,6 +1875,10 @@ config PCI_OLPC + def_bool y + depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY) + ++config PCI_XEN ++ bool ++ select SWIOTLB ++ + config PCI_DOMAINS + def_bool y + depends on PCI +diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h +index 18aa3f8..4413ba4 100644 +--- a/arch/x86/include/asm/amd_iommu.h ++++ b/arch/x86/include/asm/amd_iommu.h +@@ -23,20 +23,16 @@ + #include + + #ifdef CONFIG_AMD_IOMMU +-extern int amd_iommu_init(void); + extern int amd_iommu_init_dma_ops(void); + extern int amd_iommu_init_passthrough(void); + extern void amd_iommu_detect(void); + extern irqreturn_t amd_iommu_int_handler(int irq, void *data); + extern void amd_iommu_flush_all_domains(void); + extern void amd_iommu_flush_all_devices(void); +-extern void amd_iommu_shutdown(void); + extern void amd_iommu_apply_erratum_63(u16 devid); + extern void amd_iommu_init_api(void); + #else +-static inline int amd_iommu_init(void) { return -ENODEV; } + static inline void amd_iommu_detect(void) { } +-static inline void amd_iommu_shutdown(void) { } + #endif + + #endif /* _ASM_X86_AMD_IOMMU_H */ +diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h +index b03bedb..0918654 100644 +--- a/arch/x86/include/asm/calgary.h ++++ b/arch/x86/include/asm/calgary.h +@@ -62,10 +62,8 @@ struct cal_chipset_ops { + extern int use_calgary; + + #ifdef CONFIG_CALGARY_IOMMU +-extern int calgary_iommu_init(void); + extern void detect_calgary(void); + #else +-static inline int calgary_iommu_init(void) { return 1; } + static inline void detect_calgary(void) { return; } + #endif + +diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h +index 6a25d5d..ac91eed 100644 +--- a/arch/x86/include/asm/dma-mapping.h ++++ b/arch/x86/include/asm/dma-mapping.h +@@ -20,7 +20,8 @@ + # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) + #endif + +-extern dma_addr_t bad_dma_address; ++#define DMA_ERROR_CODE 0 ++ + extern int iommu_merge; + extern struct device x86_dma_fallback_dev; + extern int panic_on_overflow; +@@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); + +- return (dma_addr == bad_dma_address); ++ return (dma_addr == DMA_ERROR_CODE); + } + + #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) +@@ -66,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) + if (!dev->dma_mask) + return 0; + +- return addr + size <= *dev->dma_mask; ++ return addr + size - 1 <= *dev->dma_mask; + } + + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h +index 6cfdafa..4ac5b0f 100644 +--- 
a/arch/x86/include/asm/gart.h ++++ b/arch/x86/include/asm/gart.h +@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed; + extern int gart_iommu_aperture_disabled; + + extern void early_gart_iommu_check(void); +-extern void gart_iommu_init(void); +-extern void gart_iommu_shutdown(void); ++extern int gart_iommu_init(void); + extern void __init gart_parse_options(char *); + extern void gart_iommu_hole_init(void); + +@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void); + static inline void early_gart_iommu_check(void) + { + } +-static inline void gart_iommu_init(void) +-{ +-} +-static inline void gart_iommu_shutdown(void) +-{ +-} + static inline void gart_parse_options(char *options) + { + } +diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h +index 3251e23..fa152cb 100644 +--- a/arch/x86/include/asm/hpet.h ++++ b/arch/x86/include/asm/hpet.h +@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address; + extern int hpet_force_user; + extern u8 hpet_msi_disable; + extern int is_hpet_enabled(void); ++extern int disable_hpet(char *); + extern int hpet_enable(void); + extern void hpet_disable(void); + extern unsigned long hpet_readl(unsigned long a); +@@ -108,6 +109,7 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler); + #else /* CONFIG_HPET_TIMER */ + + static inline int hpet_enable(void) { return 0; } ++static inline int disable_hpet(char *s) { return 0; } + static inline int is_hpet_enabled(void) { return 0; } + #define hpet_readl(a) 0 + +diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h +index 7373932..49ee1a9 100644 +--- a/arch/x86/include/asm/io.h ++++ b/arch/x86/include/asm/io.h +@@ -7,6 +7,10 @@ + #include + #include + ++#include ++ ++extern int isapnp_disable; ++ + #define build_mmio_read(name, size, type, reg, barrier) \ + static inline type name(const volatile void __iomem *addr) \ + { type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +@@ -199,6 +203,17 @@ extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); + extern void early_iounmap(void __iomem *addr, unsigned long size); + ++#ifdef CONFIG_XEN ++struct bio_vec; ++ ++extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, ++ const struct bio_vec *vec2); ++ ++#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ ++ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ ++ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) ++#endif /* CONFIG_XEN */ ++ + #define IO_SPACE_LIMIT 0xffff + + #endif /* _ASM_X86_IO_H */ +diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h +index 7c7c16c..2fc09d3 100644 +--- a/arch/x86/include/asm/io_apic.h ++++ b/arch/x86/include/asm/io_apic.h +@@ -171,6 +171,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); + extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); + + extern void probe_nr_irqs_gsi(void); ++extern int get_nr_irqs_gsi(void); + + extern int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, +@@ -200,4 +201,6 @@ static inline void probe_nr_irqs_gsi(void) { } + + #endif + ++void xen_io_apic_init(void); ++ + #endif /* _ASM_X86_IO_APIC_H */ +diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h +index fd6d21b..345c99c 100644 +--- a/arch/x86/include/asm/iommu.h ++++ b/arch/x86/include/asm/iommu.h +@@ -1,8 +1,6 @@ + #ifndef _ASM_X86_IOMMU_H + #define _ASM_X86_IOMMU_H + +-extern void pci_iommu_shutdown(void); +-extern void no_iommu_init(void); + extern struct dma_map_ops nommu_dma_ops; + 
extern int force_iommu, no_iommu; + extern int iommu_detected; +diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h +index ef51b50..e15fca1 100644 +--- a/arch/x86/include/asm/microcode.h ++++ b/arch/x86/include/asm/microcode.h +@@ -55,4 +55,13 @@ static inline struct microcode_ops * __init init_amd_microcode(void) + } + #endif + ++#ifdef CONFIG_MICROCODE_XEN ++extern struct microcode_ops * __init init_xen_microcode(void); ++#else ++static inline struct microcode_ops * __init init_xen_microcode(void) ++{ ++ return NULL; ++} ++#endif ++ + #endif /* _ASM_X86_MICROCODE_H */ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 80a1dee..67eaa91 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -13,6 +13,9 @@ typedef struct { + int size; + struct mutex lock; + void *vdso; ++#ifdef CONFIG_XEN ++ int has_foreign_mappings; ++#endif + } mm_context_t; + + #ifdef CONFIG_SMP +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index efb3899..63a55bc 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -330,11 +330,18 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) + { + PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g); + } ++ + static inline void set_iopl_mask(unsigned mask) + { + PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask); + } + ++static inline void set_io_bitmap(struct thread_struct *thread, ++ unsigned long bytes_updated) ++{ ++ PVOP_VCALL2(pv_cpu_ops.set_io_bitmap, thread, bytes_updated); ++} ++ + /* The paravirtualized I/O functions */ + static inline void slow_down_io(void) + { +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 9357473..3202dcc 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -135,6 +135,8 @@ struct pv_cpu_ops { + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); ++ void (*set_io_bitmap)(struct thread_struct *thread, ++ unsigned long bytes_updated); + + void (*wbinvd)(void); + void (*io_delay)(void); +diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h +index ada8c20..2a34c12 100644 +--- a/arch/x86/include/asm/pci.h ++++ b/arch/x86/include/asm/pci.h +@@ -21,6 +21,7 @@ struct pci_sysdata { + extern int pci_routeirq; + extern int noioapicquirk; + extern int noioapicreroute; ++extern int pci_scan_all_fns; + + /* scan a bus after allocating a pci_sysdata for it */ + extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, +diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h +index b399988..30cbf49 100644 +--- a/arch/x86/include/asm/pci_x86.h ++++ b/arch/x86/include/asm/pci_x86.h +@@ -45,6 +45,7 @@ enum pci_bf_sort_state { + extern unsigned int pcibios_max_latency; + + void pcibios_resource_survey(void); ++void pcibios_set_cache_line_size(void); + + /* pci-pc.c */ + +@@ -106,6 +107,7 @@ extern int pci_direct_probe(void); + extern void pci_direct_init(int type); + extern void pci_pcbios_init(void); + extern int pci_olpc_init(void); ++extern int pci_xen_init(void); + extern void __init dmi_check_pciprobe(void); + extern void __init dmi_check_skip_isa_align(void); + +diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h +index 0e8c2a0..271de94 100644 +--- a/arch/x86/include/asm/pgalloc.h ++++ b/arch/x86/include/asm/pgalloc.h +@@ -23,6 +23,11 @@ static inline void 
paravirt_release_pud(unsigned long pfn) {} + #endif + + /* ++ * Flags to use when allocating a user page table page. ++ */ ++extern gfp_t __userpte_alloc_gfp; ++ ++/* + * Allocate and free page tables. + */ + extern pgd_t *pgd_alloc(struct mm_struct *); +diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h +index af6fd36..4953f9b 100644 +--- a/arch/x86/include/asm/pgtable.h ++++ b/arch/x86/include/asm/pgtable.h +@@ -616,6 +616,9 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) + memcpy(dst, src, count * sizeof(pgd_t)); + } + ++int create_lookup_pte_addr(struct mm_struct *mm, ++ unsigned long address, ++ uint64_t *ptep); + + #include + #endif /* __ASSEMBLY__ */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 13b1885..0aac25a 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -551,6 +551,9 @@ static inline void native_set_iopl_mask(unsigned mask) + #endif + } + ++extern void native_set_io_bitmap(struct thread_struct *thread, ++ unsigned long updated_bytes); ++ + static inline void + native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) + { +@@ -592,6 +595,7 @@ static inline void load_sp0(struct tss_struct *tss, + } + + #define set_iopl_mask native_set_iopl_mask ++#define set_io_bitmap native_set_io_bitmap + #endif /* CONFIG_PARAVIRT */ + + /* +diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h +index b9e4e20..8085277 100644 +--- a/arch/x86/include/asm/swiotlb.h ++++ b/arch/x86/include/asm/swiotlb.h +@@ -3,15 +3,16 @@ + + #include + +-/* SWIOTLB interface */ +- +-extern int swiotlb_force; +- + #ifdef CONFIG_SWIOTLB + extern int swiotlb; +-extern void pci_swiotlb_init(void); ++extern int __init pci_swiotlb_detect(void); ++extern void __init pci_swiotlb_init(void); + #else + #define swiotlb 0 ++static inline int pci_swiotlb_detect(void) ++{ ++ return 0; ++} + static inline void pci_swiotlb_init(void) + { + } +diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h +index 1bb6e39..ef0fa4d 100644 +--- a/arch/x86/include/asm/syscalls.h ++++ b/arch/x86/include/asm/syscalls.h +@@ -33,11 +33,11 @@ long sys_rt_sigreturn(struct pt_regs *); + asmlinkage int sys_set_thread_area(struct user_desc __user *); + asmlinkage int sys_get_thread_area(struct user_desc __user *); + +-/* X86_32 only */ +-#ifdef CONFIG_X86_32 + /* kernel/ioport.c */ +-long sys_iopl(struct pt_regs *); ++asmlinkage long sys_iopl(unsigned int); + ++/* X86_32 only */ ++#ifdef CONFIG_X86_32 + /* kernel/process_32.c */ + int sys_clone(struct pt_regs *); + int sys_execve(struct pt_regs *); +@@ -68,8 +68,6 @@ int sys_vm86(struct pt_regs *); + #else /* CONFIG_X86_32 */ + + /* X86_64 only */ +-/* kernel/ioport.c */ +-asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + + /* kernel/process_64.c */ + asmlinkage long sys_clone(unsigned long, unsigned long, +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 7f3eba0..e4fc8ea 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -89,6 +89,10 @@ static inline void __flush_tlb_one(unsigned long addr) + + #ifndef CONFIG_SMP + ++static inline void __init init_smp_flush(void) ++{ ++} ++ + #define flush_tlb() __flush_tlb() + #define flush_tlb_all() __flush_tlb_all() + #define local_flush_tlb() __flush_tlb() +@@ -129,6 +133,8 @@ static inline void reset_lazy_tlbstate(void) + + #define local_flush_tlb() __flush_tlb() + ++extern void 
init_smp_flush(void); ++ + extern void flush_tlb_all(void); + extern void flush_tlb_current_task(void); + extern void flush_tlb_mm(struct mm_struct *); +diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h +index 2c756fd..d8e7145 100644 +--- a/arch/x86/include/asm/x86_init.h ++++ b/arch/x86/include/asm/x86_init.h +@@ -91,6 +91,14 @@ struct x86_init_timers { + }; + + /** ++ * struct x86_init_iommu - platform specific iommu setup ++ * @iommu_init: platform specific iommu setup ++ */ ++struct x86_init_iommu { ++ int (*iommu_init)(void); ++}; ++ ++/** + * struct x86_init_ops - functions for platform specific setup + * + */ +@@ -101,6 +109,7 @@ struct x86_init_ops { + struct x86_init_oem oem; + struct x86_init_paging paging; + struct x86_init_timers timers; ++ struct x86_init_iommu iommu; + }; + + /** +@@ -121,6 +130,7 @@ struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long nowtime); ++ void (*iommu_shutdown)(void); + }; + + extern struct x86_init_ops x86_init; +diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h +index 9c371e4..3da450b 100644 +--- a/arch/x86/include/asm/xen/hypercall.h ++++ b/arch/x86/include/asm/xen/hypercall.h +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + /* + * The hypercall asms have to meet several constraints: +@@ -200,6 +201,23 @@ extern struct { char _entry[32]; } hypercall_page[]; + (type)__res; \ + }) + ++static inline long ++privcmd_call(unsigned call, ++ unsigned long a1, unsigned long a2, ++ unsigned long a3, unsigned long a4, ++ unsigned long a5) ++{ ++ __HYPERCALL_DECLS; ++ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); ++ ++ asm volatile("call *%[call]" ++ : __HYPERCALL_5PARAM ++ : [call] "a" (&hypercall_page[call]) ++ : __HYPERCALL_CLOBBER5); ++ ++ return (long)__res; ++} ++ + static inline int + HYPERVISOR_set_trap_table(struct trap_info *table) + { +@@ -282,6 +300,13 @@ HYPERVISOR_set_timer_op(u64 timeout) + } + + static inline int ++HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) ++{ ++ platform_op->interface_version = XENPF_INTERFACE_VERSION; ++ return _hypercall1(int, dom0_op, platform_op); ++} ++ ++static inline int + HYPERVISOR_set_debugreg(int reg, unsigned long value) + { + return _hypercall2(int, set_debugreg, reg, value); +@@ -424,6 +449,14 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) + mcl->args[0] = set; + } + ++#if defined(CONFIG_X86_64) ++#define MULTI_UVMFLAGS_INDEX 2 ++#define MULTI_UVMDOMID_INDEX 3 ++#else ++#define MULTI_UVMFLAGS_INDEX 3 ++#define MULTI_UVMDOMID_INDEX 4 ++#endif ++ + static inline void + MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +@@ -432,12 +465,11 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + mcl->args[0] = va; + if (sizeof(new_val) == sizeof(long)) { + mcl->args[1] = new_val.pte; +- mcl->args[2] = flags; + } else { + mcl->args[1] = new_val.pte; + mcl->args[2] = new_val.pte >> 32; +- mcl->args[3] = flags; + } ++ mcl->args[MULTI_UVMFLAGS_INDEX] = flags; + } + + static inline void +diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h +index d5b7e90..396ff4c 100644 +--- a/arch/x86/include/asm/xen/hypervisor.h ++++ b/arch/x86/include/asm/xen/hypervisor.h +@@ -37,31 +37,4 @@ + extern struct shared_info *HYPERVISOR_shared_info; + extern struct start_info *xen_start_info; + +-enum xen_domain_type { +- 
XEN_NATIVE, /* running on bare hardware */ +- XEN_PV_DOMAIN, /* running in a PV domain */ +- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ +-}; +- +-#ifdef CONFIG_XEN +-extern enum xen_domain_type xen_domain_type; +-#else +-#define xen_domain_type XEN_NATIVE +-#endif +- +-#define xen_domain() (xen_domain_type != XEN_NATIVE) +-#define xen_pv_domain() (xen_domain() && \ +- xen_domain_type == XEN_PV_DOMAIN) +-#define xen_hvm_domain() (xen_domain() && \ +- xen_domain_type == XEN_HVM_DOMAIN) +- +-#ifdef CONFIG_XEN_DOM0 +-#include +- +-#define xen_initial_domain() (xen_pv_domain() && \ +- xen_start_info->flags & SIF_INITDOMAIN) +-#else /* !CONFIG_XEN_DOM0 */ +-#define xen_initial_domain() (0) +-#endif /* CONFIG_XEN_DOM0 */ +- + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ +diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h +index e8506c1..9539998 100644 +--- a/arch/x86/include/asm/xen/interface.h ++++ b/arch/x86/include/asm/xen/interface.h +@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void); + #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) + #endif + +-#ifndef machine_to_phys_mapping +-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +-#endif ++#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) ++#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) ++#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT) + + /* Maximum number of virtual CPUs in multi-processor guests. */ + #define MAX_VIRT_CPUS 32 +@@ -97,6 +97,8 @@ DEFINE_GUEST_HANDLE(void); + #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) + + #ifndef __ASSEMBLY__ ++#include ++ + struct trap_info { + uint8_t vector; /* exception vector */ + uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ +diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h +index 42a7e00..8413688 100644 +--- a/arch/x86/include/asm/xen/interface_32.h ++++ b/arch/x86/include/asm/xen/interface_32.h +@@ -32,6 +32,11 @@ + /* And the trap vector is... */ + #define TRAP_INSTR "int $0x82" + ++#define __MACH2PHYS_VIRT_START 0xF5800000 ++#define __MACH2PHYS_VIRT_END 0xF6800000 ++ ++#define __MACH2PHYS_SHIFT 2 ++ + /* + * Virtual addresses beyond this are not modifiable by guest OSes. The + * machine->physical mapping table starts at this address, read-only. 
+diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h +index 100d266..839a481 100644 +--- a/arch/x86/include/asm/xen/interface_64.h ++++ b/arch/x86/include/asm/xen/interface_64.h +@@ -39,18 +39,7 @@ + #define __HYPERVISOR_VIRT_END 0xFFFF880000000000 + #define __MACH2PHYS_VIRT_START 0xFFFF800000000000 + #define __MACH2PHYS_VIRT_END 0xFFFF804000000000 +- +-#ifndef HYPERVISOR_VIRT_START +-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) +-#endif +- +-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) +-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) +-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) +-#ifndef machine_to_phys_mapping +-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +-#endif ++#define __MACH2PHYS_SHIFT 3 + + /* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) +diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h +new file mode 100644 +index 0000000..75df312 +--- /dev/null ++++ b/arch/x86/include/asm/xen/iommu.h +@@ -0,0 +1,12 @@ ++#ifndef ASM_X86__XEN_IOMMU_H ++ ++#ifdef CONFIG_PCI_XEN ++extern void xen_iommu_init(void); ++#else ++static inline void xen_iommu_init(void) ++{ ++} ++#endif ++ ++#endif ++ +diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h +index 018a0a4..f334014 100644 +--- a/arch/x86/include/asm/xen/page.h ++++ b/arch/x86/include/asm/xen/page.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -35,6 +36,8 @@ typedef struct xpaddr { + #define MAX_DOMAIN_PAGES \ + ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) + ++extern unsigned long *machine_to_phys_mapping; ++extern unsigned int machine_to_phys_order; + + extern unsigned long get_phys_to_machine(unsigned long pfn); + extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); +@@ -62,10 +65,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + +-#if 0 + if (unlikely((mfn >> machine_to_phys_order) != 0)) +- return max_mapnr; +-#endif ++ return ~0; + + pfn = 0; + /* +@@ -112,13 +113,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) + */ + static inline unsigned long mfn_to_local_pfn(unsigned long mfn) + { +- extern unsigned long max_mapnr; + unsigned long pfn = mfn_to_pfn(mfn); +- if ((pfn < max_mapnr) +- && !xen_feature(XENFEAT_auto_translated_physmap) +- && (get_phys_to_machine(pfn) != mfn)) +- return max_mapnr; /* force !pfn_valid() */ +- /* XXX fixme; not true with sparsemem */ ++ if (get_phys_to_machine(pfn) != mfn) ++ return -1; /* force !pfn_valid() */ + return pfn; + } + +@@ -163,6 +160,7 @@ static inline pte_t __pte_ma(pteval_t x) + + #define pgd_val_ma(x) ((x).pgd) + ++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); + + xmaddr_t arbitrary_virt_to_machine(void *address); + unsigned long arbitrary_virt_to_mfn(void *vaddr); +diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h +new file mode 100644 +index 0000000..cb84abe +--- /dev/null ++++ b/arch/x86/include/asm/xen/pci.h +@@ -0,0 +1,37 @@ ++#ifndef _ASM_X86_XEN_PCI_H ++#define _ASM_X86_XEN_PCI_H ++ ++#ifdef CONFIG_XEN_DOM0_PCI ++int xen_register_gsi(u32 gsi, int triggering, int polarity); ++int xen_create_msi_irq(struct pci_dev *dev, 
++ struct msi_desc *msidesc, ++ int type); ++int xen_destroy_irq(int irq); ++#else ++static inline int xen_register_gsi(u32 gsi, int triggering, int polarity) ++{ ++ return -1; ++} ++ ++static inline int xen_create_msi_irq(struct pci_dev *dev, ++ struct msi_desc *msidesc, ++ int type) ++{ ++ return -1; ++} ++static inline int xen_destroy_irq(int irq) ++{ ++ return -1; ++} ++#endif ++ ++#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI) ++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); ++#else ++static inline int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ return -1; ++} ++#endif ++ ++#endif /* _ASM_X86_XEN_PCI_H */ +diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h +new file mode 100644 +index 0000000..e4fe299 +--- /dev/null ++++ b/arch/x86/include/asm/xen/swiotlb-xen.h +@@ -0,0 +1,14 @@ ++#ifndef _ASM_X86_SWIOTLB_XEN_H ++#define _ASM_X86_SWIOTLB_XEN_H ++ ++#ifdef CONFIG_PCI_XEN ++extern int xen_swiotlb; ++extern int __init pci_xen_swiotlb_detect(void); ++extern void __init pci_xen_swiotlb_init(void); ++#else ++#define xen_swiotlb 0 ++static inline int __init pci_xen_swiotlb_detect(void) { return 0; } ++static inline void __init pci_xen_swiotlb_init(void) { } ++#endif ++ ++#endif /* _ASM_X86_SWIOTLB_XEN_H */ +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index d8e5d0c..6e80af9 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -111,6 +111,7 @@ obj-$(CONFIG_X86_MRST) += mrst.o + microcode-y := microcode_core.o + microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o + microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o ++microcode-$(CONFIG_MICROCODE_XEN) += microcode_xen.o + obj-$(CONFIG_MICROCODE) += microcode.o + + obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o +diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c +index 67e929b..21fc029 100644 +--- a/arch/x86/kernel/acpi/boot.c ++++ b/arch/x86/kernel/acpi/boot.c +@@ -42,6 +42,10 @@ + #include + #include + ++#include ++ ++#include ++ + static int __initdata acpi_force = 0; + u32 acpi_rsdt_forced; + int acpi_disabled; +@@ -149,6 +153,10 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) + { + unsigned int ver = 0; + ++ /* We don't want to register lapics when in Xen dom0 */ ++ if (xen_initial_domain()) ++ return; ++ + if (!enabled) { + ++disabled_cpus; + return; +@@ -455,9 +463,13 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) + */ + int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) + { +- unsigned int irq; ++ int irq; + unsigned int plat_gsi = gsi; + ++ irq = xen_register_gsi(gsi, trigger, polarity); ++ if (irq >= 0) ++ return irq; ++ + #ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. +@@ -733,6 +745,10 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) + + static void __init acpi_register_lapic_address(unsigned long address) + { ++ /* Xen dom0 doesn't have usable lapics */ ++ if (xen_initial_domain()) ++ return; ++ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); +@@ -853,6 +869,9 @@ int __init acpi_probe_gsi(void) + max_gsi = gsi; + } + ++ if (xen_initial_domain()) ++ max_gsi += 255; /* Plus maximum entries of an ioapic. 
*/ ++ + return max_gsi + 1; + } + +diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c +index 23fc9fe..40497d3 100644 +--- a/arch/x86/kernel/amd_iommu.c ++++ b/arch/x86/kernel/amd_iommu.c +@@ -928,7 +928,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, + } + + if (unlikely(address == -1)) +- address = bad_dma_address; ++ address = DMA_ERROR_CODE; + + WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); + +@@ -1545,7 +1545,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, + + pte = dma_ops_get_pte(dom, address); + if (!pte) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; + +@@ -1626,7 +1626,7 @@ static dma_addr_t __map_single(struct device *dev, + retry: + address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, + dma_mask); +- if (unlikely(address == bad_dma_address)) { ++ if (unlikely(address == DMA_ERROR_CODE)) { + /* + * setting next_address here will let the address + * allocator only scan the new allocated range in the +@@ -1647,7 +1647,7 @@ retry: + start = address; + for (i = 0; i < pages; ++i) { + ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); +- if (ret == bad_dma_address) ++ if (ret == DMA_ERROR_CODE) + goto out_unmap; + + paddr += PAGE_SIZE; +@@ -1675,7 +1675,7 @@ out_unmap: + + dma_ops_free_addresses(dma_dom, address, pages); + +- return bad_dma_address; ++ return DMA_ERROR_CODE; + } + + /* +@@ -1691,7 +1691,7 @@ static void __unmap_single(struct amd_iommu *iommu, + dma_addr_t i, start; + unsigned int pages; + +- if ((dma_addr == bad_dma_address) || ++ if ((dma_addr == DMA_ERROR_CODE) || + (dma_addr + size > dma_dom->aperture_size)) + return; + +@@ -1733,7 +1733,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, + INC_STATS_COUNTER(cnt_map_single); + + if (!check_device(dev)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + dma_mask = *dev->dma_mask; + +@@ -1744,12 +1744,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page, + return (dma_addr_t)paddr; + + if (!dma_ops_domain(domain)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + + spin_lock_irqsave(&domain->lock, flags); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + dma_mask); +- if (addr == bad_dma_address) ++ if (addr == DMA_ERROR_CODE) + goto out; + + iommu_completion_wait(iommu); +@@ -1958,7 +1958,7 @@ static void *alloc_coherent(struct device *dev, size_t size, + *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + size, DMA_BIDIRECTIONAL, true, dma_mask); + +- if (*dma_addr == bad_dma_address) { ++ if (*dma_addr == DMA_ERROR_CODE) { + spin_unlock_irqrestore(&domain->lock, flags); + goto out_free; + } +@@ -2120,8 +2120,7 @@ int __init amd_iommu_init_dma_ops(void) + prealloc_protection_domains(); + + iommu_detected = 1; +- force_iommu = 1; +- bad_dma_address = 0; ++ swiotlb = 0; + #ifdef CONFIG_GART_IOMMU + gart_iommu_aperture_disabled = 1; + gart_iommu_aperture = 0; +diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c +index 362ab88..2ff5b5d 100644 +--- a/arch/x86/kernel/amd_iommu_init.c ++++ b/arch/x86/kernel/amd_iommu_init.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + /* + * definitions for the ACPI scanning code +@@ -1183,19 +1184,10 @@ static struct sys_device device_amd_iommu = { + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. 
+ */ +-int __init amd_iommu_init(void) ++static int __init amd_iommu_init(void) + { + int i, ret = 0; + +- +- if (no_iommu) { +- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); +- return 0; +- } +- +- if (!amd_iommu_detected) +- return -ENODEV; +- + /* + * First parse ACPI tables to find the largest Bus/Dev/Func + * we need to handle. Upon this information the shared data +@@ -1310,6 +1302,7 @@ int __init amd_iommu_init(void) + else + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + ++ x86_platform.iommu_shutdown = disable_iommus; + out: + return ret; + +@@ -1336,11 +1329,6 @@ free: + goto out; + } + +-void amd_iommu_shutdown(void) +-{ +- disable_iommus(); +-} +- + /**************************************************************************** + * + * Early detect code. This code runs at IOMMU detection time in the DMA +@@ -1355,16 +1343,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) + + void __init amd_iommu_detect(void) + { +- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) ++ if (no_iommu || (iommu_detected && !gart_iommu_aperture)) + return; + + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { + iommu_detected = 1; + amd_iommu_detected = 1; +-#ifdef CONFIG_GART_IOMMU +- gart_iommu_aperture_disabled = 1; +- gart_iommu_aperture = 0; +-#endif ++ x86_init.iommu.iommu_init = amd_iommu_init; + } + } + +diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c +index 128111d..e0dfb68 100644 +--- a/arch/x86/kernel/aperture_64.c ++++ b/arch/x86/kernel/aperture_64.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + int gart_iommu_aperture; + int gart_iommu_aperture_disabled __initdata; +@@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) + + iommu_detected = 1; + gart_iommu_aperture = 1; ++ x86_init.iommu.iommu_init = gart_iommu_init; + + aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; +@@ -456,7 +458,7 @@ out: + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ +- } else if (swiotlb && !valid_agp) { ++ } else if (!valid_agp) { + /* Do nothing */ + } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || + force_iommu || +diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c +index c107e83..db1af79 100644 +--- a/arch/x86/kernel/apic/io_apic.c ++++ b/arch/x86/kernel/apic/io_apic.c +@@ -63,8 +63,11 @@ + #include + #include + ++#include + #include + ++#include ++ + #define __apicdebuginit(type) static type __init + #define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) +@@ -390,14 +393,18 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) + + static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); + } + + static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) + { +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); + } +@@ -410,7 +417,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i + */ + static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) + 
{ +- struct io_apic __iomem *io_apic = io_apic_base(apic); ++ struct io_apic __iomem *io_apic; ++ ++ io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); +@@ -3447,6 +3456,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + ++ if (xen_domain()) ++ return xen_setup_msi_irqs(dev, nvec, type); ++ + node = dev_to_node(&dev->dev); + irq_want = nr_irqs_gsi; + sub_handle = 0; +@@ -3496,7 +3508,10 @@ error: + + void arch_teardown_msi_irq(unsigned int irq) + { +- destroy_irq(irq); ++ if (xen_domain()) ++ xen_destroy_irq(irq); ++ else ++ destroy_irq(irq); + } + + #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) +@@ -3812,6 +3827,11 @@ void __init probe_nr_irqs_gsi(void) + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); + } + ++int get_nr_irqs_gsi(void) ++{ ++ return nr_irqs_gsi; ++} ++ + #ifdef CONFIG_SPARSE_IRQ + int __init arch_probe_nr_irqs(void) + { +diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile +index f4361b5..404e458 100644 +--- a/arch/x86/kernel/cpu/mtrr/Makefile ++++ b/arch/x86/kernel/cpu/mtrr/Makefile +@@ -1,3 +1,4 @@ + obj-y := main.o if.o generic.o state.o cleanup.o + obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o ++obj-$(CONFIG_XEN_DOM0) += xen.o + +diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c +index 33af141..378f8dc 100644 +--- a/arch/x86/kernel/cpu/mtrr/amd.c ++++ b/arch/x86/kernel/cpu/mtrr/amd.c +@@ -108,6 +108,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) + return 0; + } + ++static int amd_num_var_ranges(void) ++{ ++ return 2; ++} ++ + static struct mtrr_ops amd_mtrr_ops = { + .vendor = X86_VENDOR_AMD, + .set = amd_set_mtrr, +@@ -115,6 +120,7 @@ static struct mtrr_ops amd_mtrr_ops = { + .get_free_region = generic_get_free_region, + .validate_add_page = amd_validate_add_page, + .have_wrcomb = positive_have_wrcomb, ++ .num_var_ranges = amd_num_var_ranges, + }; + + int __init amd_init_mtrr(void) +diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c +index de89f14..7c686a0 100644 +--- a/arch/x86/kernel/cpu/mtrr/centaur.c ++++ b/arch/x86/kernel/cpu/mtrr/centaur.c +@@ -110,6 +110,11 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t + return 0; + } + ++static int centaur_num_var_ranges(void) ++{ ++ return 8; ++} ++ + static struct mtrr_ops centaur_mtrr_ops = { + .vendor = X86_VENDOR_CENTAUR, + .set = centaur_set_mcr, +@@ -117,6 +122,7 @@ static struct mtrr_ops centaur_mtrr_ops = { + .get_free_region = centaur_get_free_region, + .validate_add_page = centaur_validate_add_page, + .have_wrcomb = positive_have_wrcomb, ++ .num_var_ranges = centaur_num_var_ranges, + }; + + int __init centaur_init_mtrr(void) +diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c +index 228d982..fd6edcc 100644 +--- a/arch/x86/kernel/cpu/mtrr/cyrix.c ++++ b/arch/x86/kernel/cpu/mtrr/cyrix.c +@@ -265,6 +265,11 @@ static void cyrix_set_all(void) + post_set(); + } + ++static int cyrix_num_var_ranges(void) ++{ ++ return 8; ++} ++ + static struct mtrr_ops cyrix_mtrr_ops = { + .vendor = X86_VENDOR_CYRIX, + .set_all = cyrix_set_all, +@@ -273,6 +278,7 @@ static struct mtrr_ops cyrix_mtrr_ops = { + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, ++ .num_var_ranges = cyrix_num_var_ranges, + }; + + int __init cyrix_init_mtrr(void) 
+diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c +index 55da0c5..42f30cd 100644 +--- a/arch/x86/kernel/cpu/mtrr/generic.c ++++ b/arch/x86/kernel/cpu/mtrr/generic.c +@@ -749,8 +749,16 @@ int positive_have_wrcomb(void) + return 1; + } + +-/* +- * Generic structure... ++static int generic_num_var_ranges(void) ++{ ++ unsigned long config = 0, dummy; ++ ++ rdmsr(MSR_MTRRcap, config, dummy); ++ ++ return config & 0xff; ++} ++ ++/* generic structure... + */ + struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, +@@ -760,4 +768,5 @@ struct mtrr_ops generic_mtrr_ops = { + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, ++ .num_var_ranges = generic_num_var_ranges, + }; +diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c +index 84e83de..c8cb9ed 100644 +--- a/arch/x86/kernel/cpu/mtrr/main.c ++++ b/arch/x86/kernel/cpu/mtrr/main.c +@@ -110,21 +110,6 @@ static int have_wrcomb(void) + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; + } + +-/* This function returns the number of variable MTRRs */ +-static void __init set_num_var_ranges(void) +-{ +- unsigned long config = 0, dummy; +- +- if (use_intel()) +- rdmsr(MSR_MTRRcap, config, dummy); +- else if (is_cpu(AMD)) +- config = 2; +- else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) +- config = 8; +- +- num_var_ranges = config & 0xff; +-} +- + static void __init init_table(void) + { + int i, max; +@@ -711,8 +696,11 @@ void __init mtrr_bp_init(void) + } + } + ++ /* Let Xen code override the above if it wants */ ++ xen_init_mtrr(); ++ + if (mtrr_if) { +- set_num_var_ranges(); ++ num_var_ranges = mtrr_if->num_var_ranges(); + init_table(); + if (use_intel()) { + get_mtrr_state(); +diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h +index a501dee..98569c3 100644 +--- a/arch/x86/kernel/cpu/mtrr/mtrr.h ++++ b/arch/x86/kernel/cpu/mtrr/mtrr.h +@@ -5,6 +5,8 @@ + #include + #include + ++#include ++ + #define MTRR_CHANGE_MASK_FIXED 0x01 + #define MTRR_CHANGE_MASK_VARIABLE 0x02 + #define MTRR_CHANGE_MASK_DEFTYPE 0x04 +@@ -25,6 +27,8 @@ struct mtrr_ops { + int (*validate_add_page)(unsigned long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); ++ ++ int (*num_var_ranges)(void); + }; + + extern int generic_get_free_region(unsigned long base, unsigned long size, +@@ -73,6 +77,13 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned); + int amd_init_mtrr(void); + int cyrix_init_mtrr(void); + int centaur_init_mtrr(void); ++#ifdef CONFIG_XEN_DOM0 ++void xen_init_mtrr(void); ++#else ++static inline void xen_init_mtrr(void) ++{ ++} ++#endif + + extern int changed_by_mtrr_cleanup; + extern int mtrr_cleanup(unsigned address_bits); +diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c +new file mode 100644 +index 0000000..54ced4b +--- /dev/null ++++ b/arch/x86/kernel/cpu/mtrr/xen.c +@@ -0,0 +1,105 @@ ++#include ++#include ++# ++#include "mtrr.h" ++ ++#include ++#include ++#include ++#include ++ ++static void xen_set_mtrr(unsigned int reg, unsigned long base, ++ unsigned long size, mtrr_type type) ++{ ++ struct xen_platform_op op; ++ int error; ++ ++ /* mtrr_ops->set() is called once per CPU, ++ * but Xen's ops apply to all CPUs. 
++ */ ++ if (smp_processor_id()) ++ return; ++ ++ if (size == 0) { ++ op.cmd = XENPF_del_memtype; ++ op.u.del_memtype.handle = 0; ++ op.u.del_memtype.reg = reg; ++ } else { ++ op.cmd = XENPF_add_memtype; ++ op.u.add_memtype.mfn = base; ++ op.u.add_memtype.nr_mfns = size; ++ op.u.add_memtype.type = type; ++ } ++ ++ error = HYPERVISOR_dom0_op(&op); ++ BUG_ON(error != 0); ++} ++ ++static void xen_get_mtrr(unsigned int reg, unsigned long *base, ++ unsigned long *size, mtrr_type *type) ++{ ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = reg; ++ if (HYPERVISOR_dom0_op(&op) != 0) { ++ *base = 0; ++ *size = 0; ++ *type = 0; ++ return; ++ } ++ ++ *size = op.u.read_memtype.nr_mfns; ++ *base = op.u.read_memtype.mfn; ++ *type = op.u.read_memtype.type; ++} ++ ++static int __init xen_num_var_ranges(void) ++{ ++ int ranges; ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ ++ for (ranges = 0; ; ranges++) { ++ op.u.read_memtype.reg = ranges; ++ if (HYPERVISOR_dom0_op(&op) != 0) ++ break; ++ } ++ return ranges; ++} ++ ++/* ++ * DOM0 TODO: Need to fill in the remaining mtrr methods to have full ++ * working userland mtrr support. ++ */ ++static struct mtrr_ops xen_mtrr_ops = { ++ .vendor = X86_VENDOR_UNKNOWN, ++ .get_free_region = generic_get_free_region, ++ .set = xen_set_mtrr, ++ .get = xen_get_mtrr, ++ .have_wrcomb = positive_have_wrcomb, ++ .validate_add_page = generic_validate_add_page, ++ .use_intel_if = 0, ++ .num_var_ranges = xen_num_var_ranges, ++}; ++ ++void __init xen_init_mtrr(void) ++{ ++ /* ++ * Check that we're running under Xen, and privileged enough ++ * to play with MTRRs. ++ */ ++ if (!xen_initial_domain()) ++ return; ++ ++ /* ++ * Check that the CPU has an MTRR implementation we can ++ * support. ++ */ ++ if (cpu_has_mtrr || ++ cpu_has_k6_mtrr || ++ cpu_has_cyrix_arr || ++ cpu_has_centaur_mcr) ++ mtrr_if = &xen_mtrr_ops; ++} +diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c +index 5e409dc..a4849c1 100644 +--- a/arch/x86/kernel/crash.c ++++ b/arch/x86/kernel/crash.c +@@ -27,8 +27,7 @@ + #include + #include + #include +-#include +- ++#include + + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + +@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) + #endif + + #ifdef CONFIG_X86_64 +- pci_iommu_shutdown(); ++ x86_platform.iommu_shutdown(); + #endif + + crash_save_cpu(regs, safe_smp_processor_id()); +diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c +index 5877873..1aab4be 100644 +--- a/arch/x86/kernel/hpet.c ++++ b/arch/x86/kernel/hpet.c +@@ -98,7 +98,7 @@ static int __init hpet_setup(char *str) + } + __setup("hpet=", hpet_setup); + +-static int __init disable_hpet(char *str) ++int __init disable_hpet(char *str) + { + boot_hpet_disable = 1; + return 1; +diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c +index 99c4d30..919c1a8 100644 +--- a/arch/x86/kernel/ioport.c ++++ b/arch/x86/kernel/ioport.c +@@ -30,13 +30,29 @@ static void set_bitmap(unsigned long *bitmap, unsigned int base, + } + } + ++void native_set_io_bitmap(struct thread_struct *t, ++ unsigned long bytes_updated) ++{ ++ struct tss_struct *tss; ++ ++ if (!bytes_updated) ++ return; ++ ++ tss = &__get_cpu_var(init_tss); ++ ++ /* Update the TSS: */ ++ if (t->io_bitmap_ptr) ++ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); ++ else ++ memset(tss->io_bitmap, 0xff, bytes_updated); ++} ++ + /* + * this changes the io permissions bitmap in the current task. 
+ */ + asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + { + struct thread_struct *t = ¤t->thread; +- struct tss_struct *tss; + unsigned int i, max_long, bytes, bytes_updated; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) +@@ -61,13 +77,13 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + } + + /* +- * do it in the per-thread copy and in the TSS ... ++ * do it in the per-thread copy + * +- * Disable preemption via get_cpu() - we must not switch away ++ * Disable preemption - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ +- tss = &per_cpu(init_tss, get_cpu()); ++ preempt_disable(); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + +@@ -85,10 +101,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) + + t->io_bitmap_max = bytes; + +- /* Update the TSS: */ +- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); ++ set_io_bitmap(t, bytes_updated); + +- put_cpu(); ++ preempt_enable(); + + return 0; + } +@@ -119,11 +134,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) + return 0; + } + +-#ifdef CONFIG_X86_32 +-long sys_iopl(struct pt_regs *regs) ++asmlinkage long sys_iopl(unsigned int level) + { +- unsigned int level = regs->bx; + struct thread_struct *t = ¤t->thread; ++ struct pt_regs *regs = task_pt_regs(current); + int rc; + + rc = do_iopl(level, regs); +@@ -135,9 +149,3 @@ long sys_iopl(struct pt_regs *regs) + out: + return rc; + } +-#else +-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +-{ +- return do_iopl(level, regs); +-} +-#endif +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index ec6ef60..fa5b061 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -109,6 +109,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) + + mutex_init(&mm->context.lock); + mm->context.size = 0; ++#ifdef CONFIG_XEN ++ mm->context.has_foreign_mappings = 0; ++#endif + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); +diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c +index 378e9a8..86ca771 100644 +--- a/arch/x86/kernel/microcode_core.c ++++ b/arch/x86/kernel/microcode_core.c +@@ -81,6 +81,8 @@ + #include + #include + ++#include ++#include + #include + #include + +@@ -503,7 +505,9 @@ static int __init microcode_init(void) + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + +- if (c->x86_vendor == X86_VENDOR_INTEL) ++ if (xen_pv_domain()) ++ microcode_ops = init_xen_microcode(); ++ else if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); +diff --git a/arch/x86/kernel/microcode_xen.c b/arch/x86/kernel/microcode_xen.c +new file mode 100644 +index 0000000..16c742e +--- /dev/null ++++ b/arch/x86/kernel/microcode_xen.c +@@ -0,0 +1,201 @@ ++/* ++ * Xen microcode update driver ++ * ++ * Xen does most of the work here. We just pass the whole blob into ++ * Xen, and it will apply it to all CPUs as appropriate. Xen will ++ * worry about how different CPU models are actually updated. 
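The sys_ioperm() rework above keeps the per-thread bitmap update but routes the TSS (or, under Xen, the hypervisor) update through the new set_io_bitmap() hook. The userspace entry point is unchanged; a minimal caller, with port 0x80 chosen purely as an example (x86 glibc only, needs CAP_SYS_RAWIO):

/* Request I/O permission for port 0x80 and write a byte to it. */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
	/* Enable access to one port starting at 0x80. */
	if (ioperm(0x80, 1, 1) < 0) {
		perror("ioperm");
		return 1;
	}
	outb(0x55, 0x80);		/* harmless POST-code port write */
	ioperm(0x80, 1, 0);		/* drop the permission again */
	return 0;
}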
++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++MODULE_DESCRIPTION("Xen microcode update driver"); ++MODULE_LICENSE("GPL"); ++ ++struct xen_microcode { ++ size_t len; ++ char data[0]; ++}; ++ ++static int xen_microcode_update(int cpu) ++{ ++ int err; ++ struct xen_platform_op op; ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ struct xen_microcode *uc = uci->mc; ++ ++ if (uc == NULL || uc->len == 0) { ++ /* ++ * We do all cpus at once, so we don't need to do ++ * other cpus explicitly (besides, these vcpu numbers ++ * have no relationship to underlying physical cpus). ++ */ ++ return 0; ++ } ++ ++ op.cmd = XENPF_microcode_update; ++ set_xen_guest_handle(op.u.microcode.data, uc->data); ++ op.u.microcode.length = uc->len; ++ ++ err = HYPERVISOR_dom0_op(&op); ++ ++ if (err != 0) ++ printk(KERN_WARNING "microcode_xen: microcode update failed: %d\n", err); ++ ++ return err; ++} ++ ++static enum ucode_state xen_request_microcode_fw(int cpu, struct device *device) ++{ ++ char name[30]; ++ struct cpuinfo_x86 *c = &cpu_data(cpu); ++ const struct firmware *firmware; ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ enum ucode_state ret; ++ struct xen_microcode *uc; ++ size_t size; ++ int err; ++ ++ switch (c->x86_vendor) { ++ case X86_VENDOR_INTEL: ++ snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x", ++ c->x86, c->x86_model, c->x86_mask); ++ break; ++ ++ case X86_VENDOR_AMD: ++ snprintf(name, sizeof(name), "amd-ucode/microcode_amd.bin"); ++ break; ++ ++ default: ++ return UCODE_NFOUND; ++ } ++ ++ err = request_firmware(&firmware, name, device); ++ if (err) { ++ pr_debug("microcode: data file %s load failed\n", name); ++ return UCODE_NFOUND; ++ } ++ ++ /* ++ * Only bother getting real firmware for cpu 0; the others get ++ * dummy placeholders. ++ */ ++ if (cpu == 0) ++ size = firmware->size; ++ else ++ size = 0; ++ ++ if (uci->mc != NULL) { ++ vfree(uci->mc); ++ uci->mc = NULL; ++ } ++ ++ ret = UCODE_ERROR; ++ uc = vmalloc(sizeof(*uc) + size); ++ if (uc == NULL) ++ goto out; ++ ++ ret = UCODE_OK; ++ uc->len = size; ++ memcpy(uc->data, firmware->data, uc->len); ++ ++ uci->mc = uc; ++ ++out: ++ release_firmware(firmware); ++ ++ return ret; ++} ++ ++static enum ucode_state xen_request_microcode_user(int cpu, ++ const void __user *buf, size_t size) ++{ ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ struct xen_microcode *uc; ++ enum ucode_state ret; ++ size_t unread; ++ ++ if (cpu != 0) { ++ /* No real firmware for non-zero cpus; just store a ++ placeholder */ ++ size = 0; ++ } ++ ++ if (uci->mc != NULL) { ++ vfree(uci->mc); ++ uci->mc = NULL; ++ } ++ ++ ret = UCODE_ERROR; ++ uc = vmalloc(sizeof(*uc) + size); ++ if (uc == NULL) ++ goto out; ++ ++ uc->len = size; ++ ++ ret = UCODE_NFOUND; ++ ++ /* XXX This sporadically returns uncopied bytes, so we return ++ EFAULT. As far as I can see, the usermode code ++ (microcode_ctl) isn't doing anything wrong... 
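The driver above carries the firmware blob in struct xen_microcode, whose data[0] member is a flexible array, so the length header and the payload come from a single vmalloc(). The same layout in plain C, with malloc standing in for vmalloc and all names invented for illustration:

/* One allocation holds the length header plus the payload it describes. */
#include <stdlib.h>
#include <string.h>

struct blob {
	size_t len;
	char data[];		/* C99 flexible array member (the patch uses data[0]) */
};

static struct blob *blob_dup(const void *src, size_t len)
{
	struct blob *b = malloc(sizeof(*b) + len);

	if (!b)
		return NULL;
	b->len = len;
	memcpy(b->data, src, len);
	return b;
}

int main(void)
{
	struct blob *b = blob_dup("firmware bytes", 14);

	free(b);	/* a single free releases header and payload together */
	return 0;
}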
*/ ++ unread = copy_from_user(uc->data, buf, size); ++ ++ if (unread != 0) { ++ printk(KERN_WARNING "failed to read %zd of %zd bytes at %p -> %p\n", ++ unread, size, buf, uc->data); ++ goto out; ++ } ++ ++ ret = UCODE_OK; ++ ++out: ++ if (ret == 0) ++ uci->mc = uc; ++ else ++ vfree(uc); ++ ++ return ret; ++} ++ ++static void xen_microcode_fini_cpu(int cpu) ++{ ++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu; ++ ++ vfree(uci->mc); ++ uci->mc = NULL; ++} ++ ++static int xen_collect_cpu_info(int cpu, struct cpu_signature *sig) ++{ ++ sig->sig = 0; ++ sig->pf = 0; ++ sig->rev = 0; ++ ++ return 0; ++} ++ ++static struct microcode_ops microcode_xen_ops = { ++ .request_microcode_user = xen_request_microcode_user, ++ .request_microcode_fw = xen_request_microcode_fw, ++ .collect_cpu_info = xen_collect_cpu_info, ++ .apply_microcode = xen_microcode_update, ++ .microcode_fini_cpu = xen_microcode_fini_cpu, ++}; ++ ++struct microcode_ops * __init init_xen_microcode(void) ++{ ++ if (!xen_initial_domain()) ++ return NULL; ++ return µcode_xen_ops; ++} +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index 1b1739d..f7e115c 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -376,6 +376,7 @@ struct pv_cpu_ops pv_cpu_ops = { + .swapgs = native_swapgs, + + .set_iopl_mask = native_set_iopl_mask, ++ .set_io_bitmap = native_set_io_bitmap, + .io_delay = native_io_delay, + + .start_context_switch = paravirt_nop, +diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c +index e6ec8a2..c7ae5ca 100644 +--- a/arch/x86/kernel/pci-calgary_64.c ++++ b/arch/x86/kernel/pci-calgary_64.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT + int use_calgary __read_mostly = 1; +@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev, + if (panic_on_overflow) + panic("Calgary: fix the allocator.\n"); + else +- return bad_dma_address; ++ return DMA_ERROR_CODE; + } + } + +@@ -260,11 +261,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + void *vaddr, unsigned int npages, int direction) + { + unsigned long entry; +- dma_addr_t ret = bad_dma_address; ++ dma_addr_t ret = DMA_ERROR_CODE; + + entry = iommu_range_alloc(dev, tbl, npages); + +- if (unlikely(entry == bad_dma_address)) ++ if (unlikely(entry == DMA_ERROR_CODE)) + goto error; + + /* set the return dma address */ +@@ -279,7 +280,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + error: + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); +- return bad_dma_address; ++ return DMA_ERROR_CODE; + } + + static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, +@@ -290,8 +291,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned long flags; + + /* were we called with bad_dma_address? 
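Hooking native_set_io_bitmap into pv_cpu_ops (a few hunks back) follows the usual paravirt pattern: callers only ever go through the ops table, the native routine is the default, and xen_start_kernel() swaps in the Xen version. A stripped-down userspace model of that indirection; the struct and function names here are made up:

/* Default-plus-override ops table, as used for set_io_bitmap above. */
#include <stdio.h>

struct cpu_ops {
	void (*set_io_bitmap)(const char *who);
};

static void native_impl(const char *who)
{
	printf("%s: writing the TSS io bitmap directly\n", who);
}

static void xen_impl(const char *who)
{
	printf("%s: issuing PHYSDEVOP_set_iobitmap instead\n", who);
}

static struct cpu_ops cpu_ops = { .set_io_bitmap = native_impl };

int main(void)
{
	cpu_ops.set_io_bitmap("bare metal");	/* default path */
	cpu_ops.set_io_bitmap = xen_impl;	/* what the Xen boot path effectively does */
	cpu_ops.set_io_bitmap("xen guest");
	return 0;
}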
*/ +- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); +- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { ++ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); ++ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + return; +@@ -375,7 +376,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); + + entry = iommu_range_alloc(dev, tbl, npages); +- if (entry == bad_dma_address) { ++ if (entry == DMA_ERROR_CODE) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; +@@ -393,7 +394,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + error: + calgary_unmap_sg(dev, sg, nelems, dir, NULL); + for_each_sg(sg, s, nelems, i) { +- sg->dma_address = bad_dma_address; ++ sg->dma_address = DMA_ERROR_CODE; + sg->dma_length = 0; + } + return 0; +@@ -448,7 +449,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, + + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); +- if (mapping == bad_dma_address) ++ if (mapping == DMA_ERROR_CODE) + goto free; + *dma_handle = mapping; + return ret; +@@ -729,7 +730,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) + struct iommu_table *tbl = pci_iommu(dev->bus); + + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ +- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); ++ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + /* for CalIOC2 - avoid the entire first MB */ +@@ -1346,6 +1347,23 @@ static void __init get_tce_space_from_tar(void) + return; + } + ++static int __init calgary_iommu_init(void) ++{ ++ int ret; ++ ++ /* ok, we're trying to use Calgary - let's roll */ ++ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); ++ ++ ret = calgary_init(); ++ if (ret) { ++ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " ++ "falling back to no_iommu\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ + void __init detect_calgary(void) + { + int bus; +@@ -1359,7 +1377,7 @@ void __init detect_calgary(void) + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. + */ +- if (swiotlb || no_iommu || iommu_detected) ++ if (no_iommu || iommu_detected) + return; + + if (!use_calgary) +@@ -1444,9 +1462,7 @@ void __init detect_calgary(void) + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); + +- /* swiotlb for devices that aren't behind the Calgary. 
*/ +- if (max_pfn > MAX_DMA32_PFN) +- swiotlb = 1; ++ x86_init.iommu.iommu_init = calgary_iommu_init; + } + return; + +@@ -1459,35 +1475,6 @@ cleanup: + } + } + +-int __init calgary_iommu_init(void) +-{ +- int ret; +- +- if (no_iommu || (swiotlb && !calgary_detected)) +- return -ENODEV; +- +- if (!calgary_detected) +- return -ENODEV; +- +- /* ok, we're trying to use Calgary - let's roll */ +- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); +- +- ret = calgary_init(); +- if (ret) { +- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " +- "falling back to no_iommu\n", ret); +- return ret; +- } +- +- force_iommu = 1; +- bad_dma_address = 0x0; +- /* dma_ops is set to swiotlb or nommu */ +- if (!dma_ops) +- dma_ops = &nommu_dma_ops; +- +- return 0; +-} +- + static int __init calgary_parse_options(char *p) + { + unsigned int bridge; +diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c +index 6ac3931..3e57c58 100644 +--- a/arch/x86/kernel/pci-dma.c ++++ b/arch/x86/kernel/pci-dma.c +@@ -11,10 +11,12 @@ + #include + #include + #include ++#include ++#include + + static int forbid_dac __read_mostly; + +-struct dma_map_ops *dma_ops; ++struct dma_map_ops *dma_ops = &nommu_dma_ops; + EXPORT_SYMBOL(dma_ops); + + static int iommu_sac_force __read_mostly; +@@ -42,9 +44,6 @@ int iommu_detected __read_mostly = 0; + */ + int iommu_pass_through __read_mostly; + +-dma_addr_t bad_dma_address __read_mostly = 0; +-EXPORT_SYMBOL(bad_dma_address); +- + /* Dummy device used for NULL arguments (normally ISA). */ + struct device x86_dma_fallback_dev = { + .init_name = "fallback device", +@@ -126,18 +125,19 @@ void __init pci_iommu_alloc(void) + /* free the range so iommu could get some range less than 4G */ + dma32_free_bootmem(); + #endif ++ if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) ++ goto out; + +- /* +- * The order of these functions is important for +- * fall-back/fail-over reasons +- */ + gart_iommu_hole_init(); + + detect_calgary(); + + detect_intel_iommu(); + ++ /* needs to be called after gart_iommu_hole_init */ + amd_iommu_detect(); ++out: ++ pci_xen_swiotlb_init(); + + pci_swiotlb_init(); + } +@@ -289,25 +289,17 @@ static int __init pci_iommu_init(void) + #ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); + #endif ++ x86_init.iommu.iommu_init(); + +- calgary_iommu_init(); +- +- intel_iommu_init(); +- +- amd_iommu_init(); ++ if (swiotlb || xen_swiotlb) { ++ printk(KERN_INFO "PCI-DMA: " ++ "Using software bounce buffering for IO (SWIOTLB)\n"); ++ swiotlb_print_info(); ++ } else ++ swiotlb_free(); + +- gart_iommu_init(); +- +- no_iommu_init(); + return 0; + } +- +-void pci_iommu_shutdown(void) +-{ +- gart_iommu_shutdown(); +- +- amd_iommu_shutdown(); +-} + /* Must execute after PCI subsystem */ + rootfs_initcall(pci_iommu_init); + +diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c +index fcc0b5c..61c4d1e 100644 +--- a/arch/x86/kernel/pci-gart_64.c ++++ b/arch/x86/kernel/pci-gart_64.c +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + static unsigned long iommu_bus_base; /* GART remapping area (physical) */ + static unsigned long iommu_size; /* size of remapping area bytes */ +@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ + + static u32 *iommu_gatt_base; /* Remapping table */ + ++static dma_addr_t bad_dma_addr; ++ + /* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. 
With it true the GART is +@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); +- return bad_dma_address; ++ return bad_dma_addr; + } + + for (i = 0; i < npages; i++) { +@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); +- if (addr == bad_dma_address) { ++ if (addr == bad_dma_addr) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir, NULL); + nents = 0; +@@ -455,7 +458,7 @@ error: + + iommu_full(dev, pages << PAGE_SHIFT, dir); + for_each_sg(sg, s, nents, i) +- s->dma_address = bad_dma_address; ++ s->dma_address = bad_dma_addr; + return 0; + } + +@@ -479,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); +- if (paddr != bad_dma_address) { ++ if (paddr != bad_dma_addr) { + *dma_addr = paddr; + return page_address(page); + } +@@ -499,6 +502,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, + free_pages((unsigned long)vaddr, get_order(size)); + } + ++static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) ++{ ++ return (dma_addr == bad_dma_addr); ++} ++ + static int no_agp; + + static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +@@ -686,14 +694,15 @@ static struct dma_map_ops gart_dma_ops = { + .unmap_page = gart_unmap_page, + .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, ++ .mapping_error = gart_mapping_error, + }; + +-void gart_iommu_shutdown(void) ++static void gart_iommu_shutdown(void) + { + struct pci_dev *dev; + int i; + +- if (no_agp && (dma_ops != &gart_dma_ops)) ++ if (no_agp) + return; + + for (i = 0; i < num_k8_northbridges; i++) { +@@ -708,7 +717,7 @@ void gart_iommu_shutdown(void) + } + } + +-void __init gart_iommu_init(void) ++int __init gart_iommu_init(void) + { + struct agp_kern_info info; + unsigned long iommu_start; +@@ -718,7 +727,7 @@ void __init gart_iommu_init(void) + long i; + + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) +- return; ++ return 0; + + #ifndef CONFIG_AGP_AMD64 + no_agp = 1; +@@ -730,13 +739,6 @@ void __init gart_iommu_init(void) + (agp_copy_info(agp_bridge, &info) < 0); + #endif + +- if (swiotlb) +- return; +- +- /* Did we detect a different HW IOMMU? 
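With the global bad_dma_address gone, GART signals failed mappings through the .mapping_error hook added above, and drivers are expected to test handles with dma_mapping_error() rather than comparing against a magic constant. A sketch of the consuming side, using only the standard DMA API (not code from this patch; dev and buf are placeholders):

/* Driver-side check: never compare against a magic address yourself. */
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int send_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, handle))
		return -ENOMEM;	/* GART/Calgary/nommu each report via their own hook */

	/* ... hand "handle" to the hardware, then unmap ... */
	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	return 0;
}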
*/ +- if (iommu_detected && !gart_iommu_aperture) +- return; +- + if (no_iommu || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || + !gart_iommu_aperture || +@@ -746,7 +748,7 @@ void __init gart_iommu_init(void) + "but GART IOMMU not available.\n"); + printk(KERN_WARNING "falling back to iommu=soft.\n"); + } +- return; ++ return 0; + } + + /* need to map that range */ +@@ -791,7 +793,7 @@ void __init gart_iommu_init(void) + + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; +- bad_dma_address = iommu_bus_base; ++ bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* +@@ -838,6 +840,10 @@ void __init gart_iommu_init(void) + + flush_gart(); + dma_ops = &gart_dma_ops; ++ x86_platform.iommu_shutdown = gart_iommu_shutdown; ++ swiotlb = 0; ++ ++ return 0; + } + + void __init gart_parse_options(char *p) +diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c +index a3933d4..22be12b 100644 +--- a/arch/x86/kernel/pci-nommu.c ++++ b/arch/x86/kernel/pci-nommu.c +@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, + dma_addr_t bus = page_to_phys(page) + offset; + WARN_ON(size == 0); + if (!check_addr("map_single", dev, bus, size)) +- return bad_dma_address; ++ return DMA_ERROR_CODE; + flush_write_buffers(); + return bus; + } +@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, + }; +- +-void __init no_iommu_init(void) +-{ +- if (dma_ops) +- return; +- +- force_iommu = 0; /* no HW IOMMU */ +- dma_ops = &nommu_dma_ops; +-} +diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c +index aaa6b78..7d2829d 100644 +--- a/arch/x86/kernel/pci-swiotlb.c ++++ b/arch/x86/kernel/pci-swiotlb.c +@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = { + .dma_supported = NULL, + }; + +-void __init pci_swiotlb_init(void) ++/* ++ * pci_swiotlb_detect - set swiotlb to 1 if necessary ++ * ++ * This returns non-zero if we are forced to use swiotlb (by the boot ++ * option). 
++ */ ++int __init pci_swiotlb_detect(void) + { ++ int use_swiotlb = swiotlb | swiotlb_force; ++ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + #ifdef CONFIG_X86_64 +- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) ++ if (!no_iommu && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; + #endif + if (swiotlb_force) + swiotlb = 1; ++ ++ return use_swiotlb; ++} ++ ++void __init pci_swiotlb_init(void) ++{ + if (swiotlb) { +- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); +- swiotlb_init(); ++ swiotlb_init(0); + dma_ops = &swiotlb_dma_ops; + } + } +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index f010ab4..6b39f09 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -73,16 +73,12 @@ void exit_thread(void) + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { +- struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); +- ++ preempt_disable(); + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); +- /* +- * Careful, clear this in the TSS too: +- */ +- memset(tss->io_bitmap, 0xff, t->io_bitmap_max); ++ set_io_bitmap(t, t->io_bitmap_max); + t->io_bitmap_max = 0; +- put_cpu(); ++ preempt_enable(); + kfree(bp); + } + } +@@ -199,19 +195,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + hard_enable_TSC(); + } + +- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { +- /* +- * Copy the relevant range of the IO bitmap. +- * Normally this is 128 bytes or less: +- */ +- memcpy(tss->io_bitmap, next->io_bitmap_ptr, +- max(prev->io_bitmap_max, next->io_bitmap_max)); +- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { +- /* +- * Clear any possible leftover bits: +- */ +- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); +- } ++ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP) || ++ test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) ++ set_io_bitmap(next, ++ max(prev->io_bitmap_max, next->io_bitmap_max)); + } + + int sys_fork(struct pt_regs *regs) +diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c +index bff34d6..704bddc 100644 +--- a/arch/x86/kernel/reboot.c ++++ b/arch/x86/kernel/reboot.c +@@ -23,7 +23,7 @@ + # include + # include + #else +-# include ++# include + #endif + + /* +@@ -639,7 +639,7 @@ void native_machine_shutdown(void) + #endif + + #ifdef CONFIG_X86_64 +- pci_iommu_shutdown(); ++ x86_platform.iommu_shutdown(); + #endif + } + +diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c +index 8425f7e..9f1d581 100644 +--- a/arch/x86/kernel/setup.c ++++ b/arch/x86/kernel/setup.c +@@ -89,6 +89,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -955,6 +956,9 @@ void __init setup_arch(char **cmdline_p) + + initmem_init(0, max_pfn); + ++ /* Initialize cross-cpu tlb flushes */ ++ init_smp_flush(); ++ + #ifdef CONFIG_ACPI_SLEEP + /* + * Reserve low memory region for sleep support. 
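pci_swiotlb_detect(), shown a few hunks back, turns on bounce buffering on 64-bit whenever RAM reaches past the 4 GiB line that 32-bit DMA masks can address; MAX_DMA32_PFN is that boundary expressed in 4 KiB pages. The arithmetic spelled out as a standalone sketch (page size assumed to be 4 KiB):

/* Would this much RAM trip the swiotlb check in pci_swiotlb_detect()? */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_DMA32_PFN	((4ULL << 30) >> PAGE_SHIFT)	/* first PFN above 4 GiB */

int main(void)
{
	uint64_t ram_bytes = 6ULL << 30;		/* example: 6 GiB machine */
	uint64_t max_pfn = ram_bytes >> PAGE_SHIFT;

	printf("max_pfn=%llu, MAX_DMA32_PFN=%llu -> swiotlb %s\n",
	       (unsigned long long)max_pfn,
	       (unsigned long long)MAX_DMA32_PFN,
	       max_pfn > MAX_DMA32_PFN ? "needed" : "not needed");
	return 0;
}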
+diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c +index 4449a4a..d11c5ff 100644 +--- a/arch/x86/kernel/x86_init.c ++++ b/arch/x86/kernel/x86_init.c +@@ -14,10 +14,13 @@ + #include + #include + #include ++#include + + void __cpuinit x86_init_noop(void) { } + void __init x86_init_uint_noop(unsigned int unused) { } + void __init x86_init_pgd_noop(pgd_t *unused) { } ++int __init iommu_init_noop(void) { return 0; } ++void iommu_shutdown_noop(void) { } + + /* + * The platform setup functions are preset with the default functions +@@ -62,6 +65,10 @@ struct x86_init_ops x86_init __initdata = { + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, + }, ++ ++ .iommu = { ++ .iommu_init = iommu_init_noop, ++ }, + }; + + struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { +@@ -72,4 +79,5 @@ struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, ++ .iommu_shutdown = iommu_shutdown_noop, + }; +diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile +index 06630d2..ad895ae 100644 +--- a/arch/x86/mm/Makefile ++++ b/arch/x86/mm/Makefile +@@ -6,6 +6,11 @@ nostackp := $(call cc-option, -fno-stack-protector) + CFLAGS_physaddr.o := $(nostackp) + CFLAGS_setup_nx.o := $(nostackp) + ++# Make sure __phys_addr has no stackprotector ++nostackp := $(call cc-option, -fno-stack-protector) ++CFLAGS_ioremap.o := $(nostackp) ++CFLAGS_init.o := $(nostackp) ++ + obj-$(CONFIG_SMP) += tlb.o + + obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o +diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c +index 71da1bc..892b8eb 100644 +--- a/arch/x86/mm/gup.c ++++ b/arch/x86/mm/gup.c +@@ -313,6 +313,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, + goto slow_irqon; + #endif + ++#ifdef CONFIG_XEN ++ if (unlikely(mm->context.has_foreign_mappings)) ++ goto slow_irqon; ++#endif ++ + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by +diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c +index e78cd0e..fb91994 100644 +--- a/arch/x86/mm/pat.c ++++ b/arch/x86/mm/pat.c +@@ -666,7 +666,7 @@ void io_free_memtype(resource_size_t start, resource_size_t end) + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) + { +- return vma_prot; ++ return __pgprot(pgprot_val(vma_prot) | _PAGE_IOMAP); + } + + #ifdef CONFIG_STRICT_DEVMEM +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index ed34f5e..25fc1df 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -4,8 +4,19 @@ + #include + #include + ++#include ++#include ++ + #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + ++#ifdef CONFIG_HIGHPTE ++#define PGALLOC_USER_GFP __GFP_HIGHMEM ++#else ++#define PGALLOC_USER_GFP 0 ++#endif ++ ++gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; ++ + pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) + { + return (pte_t *)__get_free_page(PGALLOC_GFP); +@@ -15,16 +26,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) + { + struct page *pte; + +-#ifdef CONFIG_HIGHPTE +- pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); +-#else +- pte = alloc_pages(PGALLOC_GFP, 0); +-#endif ++ pte = alloc_pages(__userpte_alloc_gfp, 0); + if (pte) + pgtable_page_ctor(pte); + return pte; + } + ++static int __init setup_userpte(char *arg) ++{ ++ if (!arg) ++ return -EINVAL; ++ ++ /* ++ * 
"userpte=nohigh" disables allocation of user pagetables in ++ * high memory. ++ */ ++ if (strcmp(arg, "nohigh") == 0) ++ __userpte_alloc_gfp &= ~__GFP_HIGHMEM; ++ else ++ return -EINVAL; ++ return 0; ++} ++early_param("userpte", setup_userpte); ++ + void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) + { + pgtable_page_dtor(pte); +@@ -267,6 +291,12 @@ out: + + void pgd_free(struct mm_struct *mm, pgd_t *pgd) + { ++#ifdef CONFIG_XEN ++ /* EEW */ ++ extern void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd); ++ if (xen_pv_domain()) ++ xen_late_unpin_pgd(mm, pgd); ++#endif + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + paravirt_pgd_free(mm, pgd); +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 36fe08e..7317947 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -148,13 +148,25 @@ void smp_invalidate_interrupt(struct pt_regs *regs) + * BUG(); + */ + +- if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { +- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { ++ if (f->flush_mm == NULL || ++ f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { ++ int tlbstate = percpu_read(cpu_tlbstate.state); ++ ++ /* ++ * flush_mm == NULL means flush everything, including ++ * global tlbs, which will only happen when flushing ++ * kernel mappings. ++ */ ++ if (f->flush_mm == NULL) ++ __flush_tlb_all(); ++ else if (tlbstate == TLBSTATE_OK) { + if (f->flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); +- } else ++ } ++ ++ if (tlbstate == TLBSTATE_LAZY) + leave_mm(cpu); + } + out: +@@ -217,16 +229,13 @@ void native_flush_tlb_others(const struct cpumask *cpumask, + flush_tlb_others_ipi(cpumask, mm, va); + } + +-static int __cpuinit init_smp_flush(void) ++void __init init_smp_flush(void) + { + int i; + + for (i = 0; i < ARRAY_SIZE(flush_state); i++) + spin_lock_init(&flush_state[i].tlbstate_lock); +- +- return 0; + } +-core_initcall(init_smp_flush); + + void flush_tlb_current_task(void) + { +@@ -274,17 +283,19 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) + + preempt_enable(); + } ++EXPORT_SYMBOL_GPL(flush_tlb_page); + +-static void do_flush_tlb_all(void *info) ++void flush_tlb_all(void) + { +- unsigned long cpu = smp_processor_id(); ++ /* flush_tlb_others expects preempt to be disabled */ ++ int cpu = get_cpu(); ++ ++ flush_tlb_others(cpu_online_mask, NULL, TLB_FLUSH_ALL); + + __flush_tlb_all(); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(cpu); +-} + +-void flush_tlb_all(void) +-{ +- on_each_cpu(do_flush_tlb_all, NULL, 1); ++ put_cpu(); + } ++EXPORT_SYMBOL_GPL(flush_tlb_all); +diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile +index d49202e..64182c5 100644 +--- a/arch/x86/pci/Makefile ++++ b/arch/x86/pci/Makefile +@@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o + obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o + obj-$(CONFIG_PCI_DIRECT) += direct.o + obj-$(CONFIG_PCI_OLPC) += olpc.o ++obj-$(CONFIG_PCI_XEN) += xen.o + + obj-y += fixup.o + obj-$(CONFIG_ACPI) += acpi.o +diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c +index 1331fcf..30a9808 100644 +--- a/arch/x86/pci/common.c ++++ b/arch/x86/pci/common.c +@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | + unsigned int pci_early_dump_regs; + static int pci_bf_sort; + int pci_routeirq; ++int pci_scan_all_fns; + int noioapicquirk; + #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS + int noioapicreroute = 0; +@@ -412,26 +413,31 @@ struct pci_bus * 
__devinit pcibios_scan_root(int busnum) + + extern u8 pci_cache_line_size; + +-int __init pcibios_init(void) ++void __init pcibios_set_cache_line_size(void) + { + struct cpuinfo_x86 *c = &boot_cpu_data; + +- if (!raw_pci_ops) { +- printk(KERN_WARNING "PCI: System does not support PCI\n"); +- return 0; +- } +- + /* + * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 + * and P4. It's also good for 386/486s (which actually have 16) + * as quite a few PCI devices do not support smaller values. + */ ++ + pci_cache_line_size = 32 >> 2; + if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) + pci_cache_line_size = 64 >> 2; /* K7 & K8 */ + else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) + pci_cache_line_size = 128 >> 2; /* P4 */ ++} ++ ++int __init pcibios_init(void) ++{ ++ if (!raw_pci_ops) { ++ printk(KERN_WARNING "PCI: System does not support PCI\n"); ++ return 0; ++ } + ++ pcibios_set_cache_line_size(); + pcibios_resource_survey(); + + if (pci_bf_sort >= pci_force_bf) +diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c +index a672f12..91d040e 100644 +--- a/arch/x86/pci/i386.c ++++ b/arch/x86/pci/i386.c +@@ -283,6 +283,8 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + + prot = pgprot_val(vma->vm_page_prot); + ++ prot |= _PAGE_IOMAP; /* creating a mapping for IO */ ++ + /* + * Return error if pat is not enabled and write_combine is requested. + * Caller can followup with UC MINUS request and add a WC mtrr if there +diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c +index 25a1f8e..4e2f90a 100644 +--- a/arch/x86/pci/init.c ++++ b/arch/x86/pci/init.c +@@ -15,10 +15,16 @@ static __init int pci_arch_init(void) + if (!(pci_probe & PCI_PROBE_NOEARLY)) + pci_mmcfg_early_init(); + ++#ifdef CONFIG_PCI_XEN ++ if (!pci_xen_init()) ++ return 0; ++#endif ++ + #ifdef CONFIG_PCI_OLPC + if (!pci_olpc_init()) + return 0; /* skip additional checks if it's an XO */ + #endif ++ + #ifdef CONFIG_PCI_BIOS + pci_pcbios_init(); + #endif +diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c +new file mode 100644 +index 0000000..1b922aa +--- /dev/null ++++ b/arch/x86/pci/xen.c +@@ -0,0 +1,51 @@ ++/* ++ * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux ++ * x86 PCI core to support the Xen PCI Frontend ++ * ++ * Author: Ryan Wilson ++ */ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++static int xen_pcifront_enable_irq(struct pci_dev *dev) ++{ ++ return 0; ++} ++ ++int __init pci_xen_init(void) ++{ ++ if (!xen_pv_domain() || xen_initial_domain()) ++ return -ENODEV; ++ ++ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n"); ++ ++ pcibios_set_cache_line_size(); ++ ++ pcibios_enable_irq = xen_pcifront_enable_irq; ++ pcibios_disable_irq = NULL; ++ ++#ifdef CONFIG_ACPI ++ /* Keep ACPI out of the picture */ ++ acpi_noirq = 1; ++#endif ++ ++#ifdef CONFIG_ISAPNP ++ /* Stop isapnp from probing */ ++ isapnp_disable = 1; ++#endif ++ ++ /* Ensure a device still gets scanned even if it's fn number ++ * is non-zero. ++ */ ++ pci_scan_all_fns = 1; ++ ++ return 0; ++} ++ +diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig +index b83e119..3da23c7 100644 +--- a/arch/x86/xen/Kconfig ++++ b/arch/x86/xen/Kconfig +@@ -36,3 +36,39 @@ config XEN_DEBUG_FS + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. 
++ ++config SWIOTLB_XEN ++ def_bool y ++ depends on XEN && SWIOTLB ++ ++config MICROCODE_XEN ++ def_bool y ++ depends on XEN_DOM0 && MICROCODE ++ ++config XEN_DOM0 ++ bool "Enable Xen privileged domain support" ++ depends on XEN && X86_IO_APIC && ACPI ++ help ++ The Xen hypervisor requires a privileged domain ("dom0") to ++ actually manage the machine, provide devices drivers, etc. ++ This option enables dom0 support. A dom0 kernel can also ++ run as an unprivileged domU kernel, or a kernel running ++ native on bare hardware. ++ ++# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST ++# name in tools. ++config XEN_PRIVILEGED_GUEST ++ def_bool XEN_DOM0 ++ ++config XEN_DOM0_PCI ++ def_bool y ++ depends on XEN_DOM0 && PCI ++ select PCI_XEN ++ ++config XEN_PCI_PASSTHROUGH ++ bool #"Enable support for Xen PCI passthrough devices" ++ depends on XEN && PCI ++ select PCI_XEN ++ help ++ Enable support for passing PCI devices through to ++ unprivileged domains. (COMPLETELY UNTESTED) +\ No newline at end of file +diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile +index 3bb4fc2..08ac224 100644 +--- a/arch/x86/xen/Makefile ++++ b/arch/x86/xen/Makefile +@@ -17,4 +17,7 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o + obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +- ++obj-$(CONFIG_XEN_DOM0) += vga.o ++obj-$(CONFIG_XEN_DOM0) += apic.o ++obj-$(CONFIG_SWIOTLB) += pci-swiotlb-xen.o ++obj-$(CONFIG_XEN_DOM0_PCI) += pci.o +\ No newline at end of file +diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c +new file mode 100644 +index 0000000..21a3089 +--- /dev/null ++++ b/arch/x86/xen/apic.c +@@ -0,0 +1,33 @@ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++ ++void __init xen_io_apic_init(void) ++{ ++ enable_IO_APIC(); ++} ++ ++void xen_init_apic(void) ++{ ++ if (!xen_initial_domain()) ++ return; ++ ++#ifdef CONFIG_ACPI ++ /* ++ * Pretend ACPI found our lapic even though we've disabled it, ++ * to prevent MP tables from setting up lapics. ++ */ ++ acpi_lapic = 1; ++#endif ++} +diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c +index 79f9738..765f714 100644 +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -28,6 +28,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -48,6 +49,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,6 +67,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); + enum xen_domain_type xen_domain_type = XEN_NATIVE; + EXPORT_SYMBOL_GPL(xen_domain_type); + ++unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; ++EXPORT_SYMBOL(machine_to_phys_mapping); ++unsigned int machine_to_phys_order; ++EXPORT_SYMBOL(machine_to_phys_order); ++ + struct start_info *xen_start_info; + EXPORT_SYMBOL_GPL(xen_start_info); + +@@ -166,13 +173,16 @@ static void __init xen_banner(void) + + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + pv_info.name); +- printk(KERN_INFO "Xen version: %d.%d%s%s\n", ++ printk(KERN_INFO "Xen version: %d.%d%s%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, +- xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); ++ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? ++ " (preserve-AD)" : "", ++ xen_initial_domain() ? 
" (dom0)" : ""); + } + + static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; + static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; ++static __read_mostly unsigned int cpuid_leaf81_edx_mask = ~0; + + static void xen_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) +@@ -186,7 +196,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, + * unsupported kernel subsystems as possible. + */ + switch (*ax) { +- case 1: ++ case 0x1: + maskecx = cpuid_leaf1_ecx_mask; + maskedx = cpuid_leaf1_edx_mask; + break; +@@ -195,6 +205,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, + /* Suppress extended topology stuff */ + maskebx = 0; + break; ++ ++ case 0x80000001: ++ maskedx = cpuid_leaf81_edx_mask; ++ break; + } + + asm(XEN_EMULATE_PREFIX "cpuid" +@@ -216,8 +230,11 @@ static __init void xen_init_cpuid_mask(void) + cpuid_leaf1_edx_mask = + ~((1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ ++ (1 << X86_FEATURE_PAT) | /* disable PAT */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + ++ cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32)); ++ + if (!xen_initial_domain()) + cpuid_leaf1_edx_mask &= + ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ +@@ -405,7 +422,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) + + pte = pfn_pte(pfn, PAGE_KERNEL_RO); + +- if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) ++ if (HYPERVISOR_update_va_mapping(va, pte, 0)) + BUG(); + + frames[f] = mfn; +@@ -518,11 +535,10 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, + } else if (addr == (unsigned long)machine_check) { + return 0; + #endif +- } else { +- /* Some other trap using IST? */ +- if (WARN_ON(val->ist != 0)) +- return 0; +- } ++ } else if (WARN(val->ist != 0, ++ "Unknown IST-using trap: vector %d, %pF, val->ist=%d\n", ++ vector, (void *)addr, val->ist)) ++ return 0; + #endif /* CONFIG_X86_64 */ + info->address = addr; + +@@ -678,6 +694,18 @@ static void xen_set_iopl_mask(unsigned mask) + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + } + ++static void xen_set_io_bitmap(struct thread_struct *thread, ++ unsigned long bytes_updated) ++{ ++ struct physdev_set_iobitmap set_iobitmap; ++ ++ set_xen_guest_handle(set_iobitmap.bitmap, ++ (char *)thread->io_bitmap_ptr); ++ set_iobitmap.nr_ports = thread->io_bitmap_ptr ? 
IO_BITMAP_BITS : 0; ++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, ++ &set_iobitmap)); ++} ++ + static void xen_io_delay(void) + { + } +@@ -715,7 +743,7 @@ static u32 xen_safe_apic_wait_icr_idle(void) + return 0; + } + +-static void set_xen_basic_apic_ops(void) ++static __init void set_xen_basic_apic_ops(void) + { + apic->read = xen_apic_read; + apic->write = xen_apic_write; +@@ -977,6 +1005,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { + .load_sp0 = xen_load_sp0, + + .set_iopl_mask = xen_set_iopl_mask, ++ .set_io_bitmap = xen_set_io_bitmap, + .io_delay = xen_io_delay, + + /* Xen takes care of %gs when switching to usermode for us */ +@@ -1019,6 +1048,14 @@ static void xen_machine_halt(void) + xen_reboot(SHUTDOWN_poweroff); + } + ++static void xen_machine_power_off(void) ++{ ++ if (pm_power_off) ++ pm_power_off(); ++ else ++ xen_reboot(SHUTDOWN_poweroff); ++} ++ + static void xen_crash_shutdown(struct pt_regs *regs) + { + xen_reboot(SHUTDOWN_crash); +@@ -1027,7 +1064,7 @@ static void xen_crash_shutdown(struct pt_regs *regs) + static const struct machine_ops __initdata xen_machine_ops = { + .restart = xen_restart, + .halt = xen_machine_halt, +- .power_off = xen_machine_halt, ++ .power_off = xen_machine_power_off, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = xen_emergency_restart, +@@ -1060,6 +1097,8 @@ asmlinkage void __init xen_start_kernel(void) + + xen_domain_type = XEN_PV_DOMAIN; + ++ xen_setup_machphys_mapping(); ++ + /* Install Xen paravirt ops */ + pv_info = xen_info; + pv_init_ops = xen_init_ops; +@@ -1085,6 +1124,12 @@ asmlinkage void __init xen_start_kernel(void) + + xen_init_mmu_ops(); + ++ /* ++ * Prevent page tables from being allocated in highmem, even ++ * if CONFIG_HIGHPTE is enabled. ++ */ ++ __userpte_alloc_gfp &= ~__GFP_HIGHMEM; ++ + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!xen_initial_domain()) +@@ -1137,6 +1182,8 @@ asmlinkage void __init xen_start_kernel(void) + + pgd = (pgd_t *)xen_start_info->pt_base; + ++ __supported_pte_mask |= _PAGE_IOMAP; ++ + /* Don't do the full vcpu_info placement stuff until we have a + possible map and a non-dummy shared_info. 
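xen_init_cpuid_mask() above hides MCE, MCA, the newly added PAT bit and thermal monitoring (ACC) from CPUID leaf 1 so guests never try to drive hardware the hypervisor owns. These are ordinary EDX feature bits; a userspace peek at the unmasked values via GCC's <cpuid.h> (bit positions per the SDM: MCE=7, MCA=14, PAT=16, TM=29):

/* Show the CPUID.1:EDX bits that xen_init_cpuid_mask() clears for guests. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("MCE: %u\n", (edx >> 7) & 1);	/* machine check exception */
	printf("MCA: %u\n", (edx >> 14) & 1);	/* machine check architecture */
	printf("PAT: %u\n", (edx >> 16) & 1);	/* page attribute table */
	printf("TM : %u\n", (edx >> 29) & 1);	/* thermal monitor (X86_FEATURE_ACC) */
	return 0;
}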
*/ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +@@ -1146,6 +1193,7 @@ asmlinkage void __init xen_start_kernel(void) + + xen_raw_console_write("mapping kernel into physical memory\n"); + pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); ++ xen_ident_map_ISA(); + + init_mm.pgd = pgd; + +@@ -1155,6 +1203,14 @@ asmlinkage void __init xen_start_kernel(void) + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + pv_info.kernel_rpl = 0; + ++ if (xen_initial_domain()) { ++ struct physdev_set_iopl set_iopl; ++ set_iopl.iopl = 1; ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl) == -1) ++ BUG(); ++ xen_init_apic(); ++ } ++ + /* set the limit of our address space */ + xen_reserve_top(); + +@@ -1177,6 +1233,16 @@ asmlinkage void __init xen_start_kernel(void) + add_preferred_console("xenboot", 0, NULL); + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); ++ ++ boot_params.screen_info.orig_video_isVGA = 0; ++ } else { ++ const struct dom0_vga_console_info *info = ++ (void *)((char *)xen_start_info + ++ xen_start_info->console.dom0.info_off); ++ ++ xen_init_vga(info, xen_start_info->console.dom0.info_size); ++ xen_start_info->console.domU.mfn = 0; ++ xen_start_info->console.domU.evtchn = 0; + } + + xen_raw_console_write("about to get started...\n"); +diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c +index bf4cd6b..3e6b558 100644 +--- a/arch/x86/xen/mmu.c ++++ b/arch/x86/xen/mmu.c +@@ -50,7 +50,9 @@ + #include + #include + #include ++#include + #include ++#include + + #include + #include +@@ -58,6 +60,7 @@ + #include + #include + #include ++#include + #include + + #include "multicalls.h" +@@ -66,6 +69,13 @@ + + #define MMU_UPDATE_HISTO 30 + ++/* ++ * Protects atomic reservation decrease/increase against concurrent increases. ++ * Also protects non-atomic updates of current_pages and driver_pages, and ++ * balloon lists. 
++ */ ++DEFINE_SPINLOCK(xen_reservation_lock); ++ + #ifdef CONFIG_XEN_DEBUG_FS + + static struct { +@@ -184,6 +194,26 @@ static inline unsigned p2m_index(unsigned long pfn) + return pfn % P2M_ENTRIES_PER_PAGE; + } + ++static int lookup_pte_fn( ++ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) ++{ ++ uint64_t *ptep = (uint64_t *)data; ++ if (ptep) ++ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << ++ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); ++ return 0; ++} ++ ++int create_lookup_pte_addr(struct mm_struct *mm, ++ unsigned long address, ++ uint64_t *ptep) ++{ ++ return apply_to_page_range(mm, address, PAGE_SIZE, ++ lookup_pte_fn, ptep); ++} ++ ++EXPORT_SYMBOL(create_lookup_pte_addr); ++ + /* Build the parallel p2m_top_mfn structures */ + void xen_build_mfn_list_list(void) + { +@@ -315,6 +345,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr) + + return PFN_DOWN(maddr.maddr); + } ++EXPORT_SYMBOL_GPL(set_phys_to_machine); + + xmaddr_t arbitrary_virt_to_machine(void *vaddr) + { +@@ -376,6 +407,34 @@ static bool xen_page_pinned(void *ptr) + return PagePinned(page); + } + ++static bool xen_iomap_pte(pte_t pte) ++{ ++ return pte_flags(pte) & _PAGE_IOMAP; ++} ++ ++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) ++{ ++ struct multicall_space mcs; ++ struct mmu_update *u; ++ ++ mcs = xen_mc_entry(sizeof(*u)); ++ u = mcs.args; ++ ++ /* ptep might be kmapped when using 32-bit HIGHPTE */ ++ u->ptr = arbitrary_virt_to_machine(ptep).maddr; ++ u->val = pte_val_ma(pteval); ++ ++ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); ++ ++ xen_mc_issue(PARAVIRT_LAZY_MMU); ++} ++EXPORT_SYMBOL_GPL(xen_set_domain_pte); ++ ++static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) ++{ ++ xen_set_domain_pte(ptep, pteval, DOMID_IO); ++} ++ + static void xen_extend_mmu_update(const struct mmu_update *update) + { + struct multicall_space mcs; +@@ -452,6 +511,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) + void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) + { ++ if (xen_iomap_pte(pteval)) { ++ xen_set_iomap_pte(ptep, pteval); ++ goto out; ++ } ++ + ADD_STATS(set_pte_at, 1); + // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); + ADD_STATS(set_pte_at_current, mm == current->mm); +@@ -522,8 +586,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) + return val; + } + ++static pteval_t iomap_pte(pteval_t val) ++{ ++ if (val & _PAGE_PRESENT) { ++ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; ++ pteval_t flags = val & PTE_FLAGS_MASK; ++ ++ /* We assume the pte frame number is a MFN, so ++ just use it as-is. */ ++ val = ((pteval_t)pfn << PAGE_SHIFT) | flags; ++ } ++ ++ return val; ++} ++ + pteval_t xen_pte_val(pte_t pte) + { ++ if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) ++ return pte.pte; ++ + return pte_mfn_to_pfn(pte.pte); + } + PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); +@@ -536,7 +617,22 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); + + pte_t xen_make_pte(pteval_t pte) + { +- pte = pte_pfn_to_mfn(pte); ++ phys_addr_t addr = (pte & PTE_PFN_MASK); ++ ++ /* ++ * Unprivileged domains are allowed to do IOMAPpings for ++ * PCI passthrough, but not map ISA space. The ISA ++ * mappings are just dummy local mappings to keep other ++ * parts of the kernel happy. 
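The _PAGE_IOMAP handling in xen_pte_val()/iomap_pte() above means a flagged pte already holds a machine frame (device MMIO), so the usual pfn-to-mfn translation is skipped for it. A toy model of that branch; the flag value, the p2m array and the sample frames are all invented for illustration:

/* Toy model: IOMAP ptes keep their frame as-is, others go through p2m. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define FLAG_PRESENT	(1u << 0)
#define FLAG_IOMAP	(1u << 10)	/* stand-in for _PAGE_IOMAP */

/* pretend p2m: pseudo-physical frame -> machine frame */
static const uint64_t p2m[4] = { 0x1000, 0x2000, 0x3000, 0x4000 };

static uint64_t make_pte(uint64_t frame, uint32_t flags)
{
	if (!(flags & FLAG_IOMAP))
		frame = p2m[frame];	/* normal RAM: translate pfn to mfn */
	/* IOMAP: frame is already a machine frame (e.g. a PCI BAR) */
	return (frame << PAGE_SHIFT) | flags;
}

int main(void)
{
	printf("ram  pte: %#llx\n",
	       (unsigned long long)make_pte(2, FLAG_PRESENT));
	printf("mmio pte: %#llx\n",
	       (unsigned long long)make_pte(0xfebf0, FLAG_PRESENT | FLAG_IOMAP));
	return 0;
}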
++ */ ++ if (unlikely(pte & _PAGE_IOMAP) && ++ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { ++ pte = iomap_pte(pte); ++ } else { ++ pte &= ~_PAGE_IOMAP; ++ pte = pte_pfn_to_mfn(pte); ++ } ++ + return native_make_pte(pte); + } + PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); +@@ -592,6 +688,11 @@ void xen_set_pud(pud_t *ptr, pud_t val) + + void xen_set_pte(pte_t *ptep, pte_t pte) + { ++ if (xen_iomap_pte(pte)) { ++ xen_set_iomap_pte(ptep, pte); ++ return; ++ } ++ + ADD_STATS(pte_update, 1); + // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); + ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); +@@ -608,6 +709,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte) + #ifdef CONFIG_X86_PAE + void xen_set_pte_atomic(pte_t *ptep, pte_t pte) + { ++ if (xen_iomap_pte(pte)) { ++ xen_set_iomap_pte(ptep, pte); ++ return; ++ } ++ + set_64bit((u64 *)ptep, native_pte_val(pte)); + } + +@@ -1219,7 +1325,7 @@ void xen_exit_mmap(struct mm_struct *mm) + spin_lock(&mm->page_table_lock); + + /* pgd may not be pinned in the error exit path of execve */ +- if (xen_page_pinned(mm->pgd)) ++ if (xen_page_pinned(mm->pgd) && !mm->context.has_foreign_mappings) + xen_pgd_unpin(mm); + + spin_unlock(&mm->page_table_lock); +@@ -1288,12 +1394,19 @@ static void xen_flush_tlb_single(unsigned long addr) + preempt_enable(); + } + ++/* ++ * Flush tlb on other cpus. Xen can do this via a single hypercall ++ * rather than explicit IPIs, which has the nice property of avoiding ++ * any cpus which don't actually have dirty tlbs. Unfortunately it ++ * doesn't give us an opportunity to kick out cpus which are in lazy ++ * tlb state, so we may end up reflushing some cpus unnecessarily. ++ */ + static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long va) + { + struct { + struct mmuext_op op; +- DECLARE_BITMAP(mask, NR_CPUS); ++ DECLARE_BITMAP(mask, num_processors); + } *args; + struct multicall_space mcs; + +@@ -1417,6 +1530,13 @@ static int xen_pgd_alloc(struct mm_struct *mm) + return ret; + } + ++void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd) ++{ ++ if (xen_page_pinned(pgd)) ++ __xen_pgd_unpin(mm, pgd); ++ ++} ++ + static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) + { + #ifdef CONFIG_X86_64 +@@ -1432,14 +1552,15 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) + { + pgprot_t prot = PAGE_KERNEL; + ++ /* ++ * We disable highmem allocations for page tables so we should never ++ * see any calls to kmap_atomic_pte on a highmem page. ++ */ ++ BUG_ON(PageHighMem(page)); ++ + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + +- if (0 && PageHighMem(page)) +- printk("mapping highpte %lx type %d prot %s\n", +- page_to_pfn(page), type, +- (unsigned long)pgprot_val(prot) & _PAGE_RW ? 
"WRITE" : "READ"); +- + return kmap_atomic_prot(page, type, prot); + } + #endif +@@ -1447,10 +1568,17 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) + #ifdef CONFIG_X86_32 + static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) + { +- /* If there's an existing pte, then don't allow _PAGE_RW to be set */ +- if (pte_val_ma(*ptep) & _PAGE_PRESENT) ++ pte_t oldpte = *ptep; ++ ++ if (pte_flags(oldpte) & _PAGE_PRESENT) { ++ /* Don't allow existing IO mappings to be overridden */ ++ if (pte_flags(oldpte) & _PAGE_IOMAP) ++ pte = oldpte; ++ ++ /* Don't allow _PAGE_RW to be set on existing pte */ + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); ++ } + + return pte; + } +@@ -1619,6 +1747,7 @@ static void *m2v(phys_addr_t maddr) + return __ka(m2p(maddr)); + } + ++/* Set the page permissions on an identity-mapped pages */ + static void set_page_prot(void *addr, pgprot_t prot) + { + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; +@@ -1674,6 +1803,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) + set_page_prot(pmd, PAGE_KERNEL_RO); + } + ++void __init xen_setup_machphys_mapping(void) ++{ ++ struct xen_machphys_mapping mapping; ++ unsigned long machine_to_phys_nr_ents; ++ ++ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { ++ machine_to_phys_mapping = (unsigned long *)mapping.v_start; ++ machine_to_phys_nr_ents = mapping.max_mfn + 1; ++ } else { ++ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; ++ } ++ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); ++} ++ + #ifdef CONFIG_X86_64 + static void convert_pfn_mfn(void *v) + { +@@ -1765,6 +1908,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) + { + pmd_t *kernel_pmd; ++ int i; + + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + + xen_start_info->nr_pt_frames * PAGE_SIZE + +@@ -1776,6 +1920,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); ++ ++ /* ++ * When running a 32 bit domain 0 on a 64 bit hypervisor a ++ * pinned L3 (such as the initial pgd here) contains bits ++ * which are reserved in the PAE layout but not in the 64 bit ++ * layout. Unfortunately some versions of the hypervisor ++ * (incorrectly) validate compat mode guests against the PAE ++ * layout and hence will not allow such a pagetable to be ++ * pinned by the guest. Therefore we mask off only the PFN and ++ * Present bits of the supplied L3. ++ */ ++ for (i = 0; i < PTRS_PER_PGD; i++) ++ swapper_pg_dir[i].pgd &= (PTE_PFN_MASK | _PAGE_PRESENT); ++ + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + +@@ -1798,6 +1956,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + } + #endif /* CONFIG_X86_64 */ + ++static unsigned char dummy_ioapic_mapping[PAGE_SIZE] __page_aligned_bss; ++ + static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + { + pte_t pte; +@@ -1827,9 +1987,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + pte = pfn_pte(phys, prot); + break; + +- default: ++#ifdef CONFIG_X86_IO_APIC ++ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: ++ /* ++ * We just don't map the IO APIC - all access is via ++ * hypercalls. Keep the address in the pte for reference. 
++ */ ++ pte = __pte(__pa(dummy_ioapic_mapping) | __PAGE_KERNEL); ++ break; ++#endif ++ ++ case FIX_PARAVIRT_BOOTMAP: ++ /* This is an MFN, but it isn't an IO mapping from the ++ IO domain */ + pte = mfn_pte(phys, prot); + break; ++ ++ default: ++ /* By default, set_fixmap is used for hardware mappings */ ++ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); ++ break; + } + + __native_set_fixmap(idx, pte); +@@ -1844,6 +2021,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) + #endif + } + ++__init void xen_ident_map_ISA(void) ++{ ++ unsigned long pa; ++ ++ /* ++ * If we're dom0, then linear map the ISA machine addresses into ++ * the kernel's address space. ++ */ ++ if (!xen_initial_domain()) ++ return; ++ ++ xen_raw_printk("Xen: setup ISA identity maps\n"); ++ ++ for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { ++ pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); ++ ++ if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) ++ BUG(); ++ } ++ ++ xen_flush_tlb(); ++} ++ + static __init void xen_post_allocator_init(void) + { + pv_mmu_ops.set_pte = xen_set_pte; +@@ -1961,6 +2161,271 @@ void __init xen_init_mmu_ops(void) + pv_mmu_ops = xen_mmu_ops; + } + ++/* Protected by xen_reservation_lock. */ ++#define MAX_CONTIG_ORDER 9 /* 2MB */ ++static unsigned long discontig_frames[1< MAX_CONTIG_ORDER)) ++ return -ENOMEM; ++ ++ memset((void *) vstart, 0, PAGE_SIZE << order); ++ ++ vm_unmap_aliases(); ++ ++ spin_lock_irqsave(&xen_reservation_lock, flags); ++ ++ /* 1. Zap current PTEs, remembering MFNs. */ ++ xen_zap_pfn_range(vstart, order, in_frames, NULL); ++ ++ /* 2. Get a new contiguous memory extent. */ ++ out_frame = virt_to_pfn(vstart); ++ success = xen_exchange_memory(1UL << order, 0, in_frames, ++ 1, order, &out_frame, ++ address_bits); ++ ++ /* 3. Map the new extent in place of old pages. */ ++ if (success) ++ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); ++ else ++ xen_remap_exchanged_ptes(vstart, order, in_frames, 0); ++ ++ spin_unlock_irqrestore(&xen_reservation_lock, flags); ++ ++ return success ? 0 : -ENOMEM; ++} ++EXPORT_SYMBOL_GPL(xen_create_contiguous_region); ++ ++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) ++{ ++ unsigned long *out_frames = discontig_frames, in_frame; ++ unsigned long flags; ++ int success; ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ return; ++ ++ if (unlikely(order > MAX_CONTIG_ORDER)) ++ return; ++ ++ memset((void *) vstart, 0, PAGE_SIZE << order); ++ ++ vm_unmap_aliases(); ++ ++ spin_lock_irqsave(&xen_reservation_lock, flags); ++ ++ /* 1. Find start MFN of contiguous extent. */ ++ in_frame = virt_to_mfn(vstart); ++ ++ /* 2. Zap current PTEs. */ ++ xen_zap_pfn_range(vstart, order, NULL, out_frames); ++ ++ /* 3. Do the exchange for non-contiguous MFNs. */ ++ success = xen_exchange_memory(1, order, &in_frame, 1UL << order, ++ 0, out_frames, 0); ++ ++ /* 4. Map new pages in place of old pages. 
*/ ++ if (success) ++ xen_remap_exchanged_ptes(vstart, order, out_frames, 0); ++ else ++ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); ++ ++ spin_unlock_irqrestore(&xen_reservation_lock, flags); ++} ++EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); ++ ++#define REMAP_BATCH_SIZE 16 ++ ++struct remap_data { ++ unsigned long mfn; ++ pgprot_t prot; ++ struct mmu_update *mmu_update; ++}; ++ ++static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, ++ unsigned long addr, void *data) ++{ ++ struct remap_data *rmd = data; ++ pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); ++ ++ rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; ++ rmd->mmu_update->val = pte_val_ma(pte); ++ rmd->mmu_update++; ++ ++ return 0; ++} ++ ++int xen_remap_domain_mfn_range(struct vm_area_struct *vma, ++ unsigned long addr, ++ unsigned long mfn, int nr, ++ pgprot_t prot, unsigned domid) ++{ ++ struct remap_data rmd; ++ struct mmu_update mmu_update[REMAP_BATCH_SIZE]; ++ int batch; ++ unsigned long range; ++ int err = 0; ++ ++ prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); ++ ++ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; ++ ++ rmd.mfn = mfn; ++ rmd.prot = prot; ++ ++ while (nr) { ++ batch = min(REMAP_BATCH_SIZE, nr); ++ range = (unsigned long)batch << PAGE_SHIFT; ++ ++ rmd.mmu_update = mmu_update; ++ err = apply_to_page_range(vma->vm_mm, addr, range, ++ remap_area_mfn_pte_fn, &rmd); ++ if (err) ++ goto out; ++ ++ err = -EFAULT; ++ if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) ++ goto out; ++ ++ nr -= batch; ++ addr += range; ++ } ++ ++ err = 0; ++out: ++ ++ flush_tlb_all(); ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); ++ + #ifdef CONFIG_XEN_DEBUG_FS + + static struct dentry *d_mmu_debug; +diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c +new file mode 100644 +index 0000000..4d55524 +--- /dev/null ++++ b/arch/x86/xen/pci-swiotlb-xen.c +@@ -0,0 +1,52 @@ ++/* Glue code to lib/swiotlb-xen.c */ ++ ++#include ++#include ++ ++#include ++ ++int xen_swiotlb __read_mostly; ++ ++static struct dma_map_ops xen_swiotlb_dma_ops = { ++ .mapping_error = xen_swiotlb_dma_mapping_error, ++ .alloc_coherent = xen_swiotlb_alloc_coherent, ++ .free_coherent = xen_swiotlb_free_coherent, ++ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, ++ .sync_single_for_device = xen_swiotlb_sync_single_for_device, ++ .sync_single_range_for_cpu = xen_swiotlb_sync_single_range_for_cpu, ++ .sync_single_range_for_device = xen_swiotlb_sync_single_range_for_device, ++ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, ++ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, ++ .map_sg = xen_swiotlb_map_sg_attrs, ++ .unmap_sg = xen_swiotlb_unmap_sg_attrs, ++ .map_page = xen_swiotlb_map_page, ++ .unmap_page = xen_swiotlb_unmap_page, ++ .dma_supported = xen_swiotlb_dma_supported, ++}; ++ ++/* ++ * pci_swiotlb_detect - set swiotlb to 1 if necessary ++ * ++ * This returns non-zero if we are forced to use swiotlb (by the boot ++ * option). 
++ */ ++int __init pci_xen_swiotlb_detect(void) ++{ ++ ++ if (xen_pv_domain() && (xen_initial_domain() || swiotlb)) ++ xen_swiotlb = 1; ++ ++ /* If we are running under Xen, we MUST disable the native SWIOTLB */ ++ if (xen_pv_domain()) ++ swiotlb = 0; ++ ++ return xen_swiotlb; ++} ++ ++void __init pci_xen_swiotlb_init(void) ++{ ++ if (xen_swiotlb) { ++ xen_swiotlb_init(1); ++ dma_ops = &xen_swiotlb_dma_ops; ++ } ++} +diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c +new file mode 100644 +index 0000000..f999ad8 +--- /dev/null ++++ b/arch/x86/xen/pci.c +@@ -0,0 +1,117 @@ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++#include "xen-ops.h" ++ ++int xen_register_gsi(u32 gsi, int triggering, int polarity) ++{ ++ int rc, irq; ++ struct physdev_setup_gsi setup_gsi; ++ struct physdev_map_pirq map_irq; ++ int shareable = 0; ++ char *name; ++ ++ if (!xen_domain()) ++ return -1; ++ ++ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", ++ gsi, triggering, polarity); ++ ++ if (triggering == ACPI_EDGE_SENSITIVE) { ++ shareable = 0; ++ name = "ioapic-edge"; ++ } else { ++ shareable = 1; ++ name = "ioapic-level"; ++ } ++ ++ irq = xen_allocate_pirq(gsi, shareable, name); ++ ++ printk(KERN_DEBUG "xen: --> irq=%d\n", irq); ++ ++ if (irq >= 0) { ++ setup_gsi.gsi = gsi; ++ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? ++ 0 : 1); ++ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); ++ ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); ++ if (rc == -EEXIST) ++ printk(KERN_INFO "Already setup the GSI :%d\n", gsi); ++ else if (rc) { ++ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", ++ gsi, rc); ++ BUG(); ++ } ++ ++ map_irq.domid = DOMID_SELF; ++ map_irq.type = MAP_PIRQ_TYPE_GSI; ++ map_irq.index = gsi; ++ map_irq.pirq = irq; ++ ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); ++ if (rc) { ++ printk(KERN_WARNING "xen map irq failed %d\n", rc); ++ irq = -1; ++ } ++ } ++ return irq; ++} ++ ++void __init xen_setup_pirqs(void) ++{ ++ int irq; ++ ++ if (0 == nr_ioapics) { ++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) ++ xen_allocate_pirq(irq, 0, "xt-pic"); ++ return; ++ } ++ ++ /* Pre-allocate legacy irqs */ ++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { ++ int trigger, polarity; ++ ++ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) ++ continue; ++ ++ xen_register_gsi(irq, ++ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE, ++ polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH); ++ } ++} ++ ++#ifdef CONFIG_PCI_MSI ++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ++{ ++ int irq, ret; ++ struct msi_desc *msidesc; ++ ++ list_for_each_entry(msidesc, &dev->msi_list, list) { ++ irq = xen_create_msi_irq(dev, msidesc, type); ++ if (irq < 0) ++ return -1; ++ ++ ret = set_irq_msi(irq, msidesc); ++ if (ret) ++ goto error; ++ } ++ return 0; ++ ++error: ++ xen_destroy_irq(irq); ++ return ret; ++} ++#endif +diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c +index ad0047f..266c86a 100644 +--- a/arch/x86/xen/setup.c ++++ b/arch/x86/xen/setup.c +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include + #include + #include +@@ -19,6 +20,7 @@ + + #include + #include ++#include + #include + #include + +@@ -36,21 +38,60 @@ extern void xen_syscall32_target(void); + /** + * machine_specific_memory_setup - Hook for machine specific memory setup. 
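++ *
++ * Under Xen the map is obtained from the hypervisor: dom0 requests the
++ * host E820 via XENMEM_machine_memory_map, other domains use
++ * XENMEM_memory_map, and if neither is implemented (-ENOSYS) a single
++ * RAM region covering the domain's initial allocation, plus 8MB of
++ * slack, is assumed.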
+ **/ +- + char * __init xen_memory_setup(void) + { ++ static __initdata struct e820entry map[E820MAX]; ++ + unsigned long max_pfn = xen_start_info->nr_pages; ++ struct xen_memory_map memmap; ++ unsigned long long mem_end; ++ int op; ++ int rc; ++ int i; + + max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); ++ mem_end = PFN_PHYS((u64)max_pfn); ++ ++ memmap.nr_entries = E820MAX; ++ set_xen_guest_handle(memmap.buffer, map); ++ ++ op = xen_initial_domain() ? ++ XENMEM_machine_memory_map : ++ XENMEM_memory_map; ++ rc = HYPERVISOR_memory_op(op, &memmap); ++ if (rc == -ENOSYS) { ++ memmap.nr_entries = 1; ++ map[0].addr = 0ULL; ++ map[0].size = mem_end; ++ /* 8MB slack (to balance backend allocations). */ ++ map[0].size += 8ULL << 20; ++ map[0].type = E820_RAM; ++ rc = 0; ++ } ++ BUG_ON(rc); + + e820.nr_map = 0; +- +- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); ++ for (i = 0; i < memmap.nr_entries; i++) { ++ unsigned long long end = map[i].addr + map[i].size; ++ if (map[i].type == E820_RAM) { ++ if (map[i].addr > mem_end) ++ continue; ++ if (end > mem_end) { ++ /* Truncate region to max_mem. */ ++ map[i].size -= end - mem_end; ++ } ++ } ++ if (map[i].size > 0) ++ e820_add_region(map[i].addr, map[i].size, map[i].type); ++ } + + /* + * Even though this is normal, usable memory under Xen, reserve + * ISA memory anyway because too many things think they can poke + * about in there. ++ * ++ * In a dom0 kernel, this region is identity mapped with the ++ * hardware ISA area, so it really is out of bounds. + */ + e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, + E820_RESERVED); +@@ -182,13 +223,17 @@ void __init xen_arch_setup(void) + } + #endif + ++ /* ++ * Xen hypervisor uses HPET to wakeup cpu from deep c-states, ++ * so the HPET usage in dom0 must be forbidden. ++ */ ++ disable_hpet(NULL); ++ + memcpy(boot_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? + COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle = xen_idle; + +- paravirt_disable_iospace(); +- + fiddle_vdso(); + } +diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c +index 360f8d8..632ea35 100644 +--- a/arch/x86/xen/smp.c ++++ b/arch/x86/xen/smp.c +@@ -178,11 +178,18 @@ static void __init xen_smp_prepare_boot_cpu(void) + static void __init xen_smp_prepare_cpus(unsigned int max_cpus) + { + unsigned cpu; ++ unsigned int i; + + xen_init_lock_cpu(0); + + smp_store_cpu_info(0); + cpu_data(0).x86_max_cores = 1; ++ ++ for_each_possible_cpu(i) { ++ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); ++ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); ++ zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); ++ } + set_cpu_sibling_map(0); + + if (xen_smp_intr_init(0)) +@@ -299,6 +306,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) + xen_setup_timer(cpu); + xen_init_lock_cpu(cpu); + ++ cpumask_set_cpu(cpu, cpu_callout_mask); ++ + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + + /* make sure interrupts start blocked */ +diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c +new file mode 100644 +index 0000000..1cd7f4d +--- /dev/null ++++ b/arch/x86/xen/vga.c +@@ -0,0 +1,67 @@ ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++#include "xen-ops.h" ++ ++void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) ++{ ++ struct screen_info *screen_info = &boot_params.screen_info; ++ ++ /* This is drawn from a dump from vgacon:startup in ++ * standard Linux. 
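++	 * These defaults describe a plain 80x25 VGA text console; they are
++	 * overridden below from the dom0_vga_console_info supplied by the
++	 * hypervisor (text mode 3 or a VESA linear framebuffer).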
*/ ++ screen_info->orig_video_mode = 3; ++ screen_info->orig_video_isVGA = 1; ++ screen_info->orig_video_lines = 25; ++ screen_info->orig_video_cols = 80; ++ screen_info->orig_video_ega_bx = 3; ++ screen_info->orig_video_points = 16; ++ screen_info->orig_y = screen_info->orig_video_lines - 1; ++ ++ switch (info->video_type) { ++ case XEN_VGATYPE_TEXT_MODE_3: ++ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3) ++ + sizeof(info->u.text_mode_3)) ++ break; ++ screen_info->orig_video_lines = info->u.text_mode_3.rows; ++ screen_info->orig_video_cols = info->u.text_mode_3.columns; ++ screen_info->orig_x = info->u.text_mode_3.cursor_x; ++ screen_info->orig_y = info->u.text_mode_3.cursor_y; ++ screen_info->orig_video_points = ++ info->u.text_mode_3.font_height; ++ break; ++ ++ case XEN_VGATYPE_VESA_LFB: ++ if (size < offsetof(struct dom0_vga_console_info, ++ u.vesa_lfb.gbl_caps)) ++ break; ++ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB; ++ screen_info->lfb_width = info->u.vesa_lfb.width; ++ screen_info->lfb_height = info->u.vesa_lfb.height; ++ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel; ++ screen_info->lfb_base = info->u.vesa_lfb.lfb_base; ++ screen_info->lfb_size = info->u.vesa_lfb.lfb_size; ++ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line; ++ screen_info->red_size = info->u.vesa_lfb.red_size; ++ screen_info->red_pos = info->u.vesa_lfb.red_pos; ++ screen_info->green_size = info->u.vesa_lfb.green_size; ++ screen_info->green_pos = info->u.vesa_lfb.green_pos; ++ screen_info->blue_size = info->u.vesa_lfb.blue_size; ++ screen_info->blue_pos = info->u.vesa_lfb.blue_pos; ++ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size; ++ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos; ++ if (size >= offsetof(struct dom0_vga_console_info, ++ u.vesa_lfb.gbl_caps) ++ + sizeof(info->u.vesa_lfb.gbl_caps)) ++ screen_info->capabilities = info->u.vesa_lfb.gbl_caps; ++ if (size >= offsetof(struct dom0_vga_console_info, ++ u.vesa_lfb.mode_attrs) ++ + sizeof(info->u.vesa_lfb.mode_attrs)) ++ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; ++ break; ++ } ++} +diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h +index f9153a3..5afc1fe 100644 +--- a/arch/x86/xen/xen-ops.h ++++ b/arch/x86/xen/xen-ops.h +@@ -30,6 +30,7 @@ void xen_setup_machphys_mapping(void); + pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); + void xen_ident_map_ISA(void); + void xen_reserve_top(void); ++void xen_ident_map_ISA(void); + + char * __init xen_memory_setup(void); + void __init xen_arch_setup(void); +@@ -82,6 +83,23 @@ static inline void xen_uninit_lock_cpu(int cpu) + } + #endif + ++struct dom0_vga_console_info; ++ ++#ifdef CONFIG_XEN_DOM0 ++void xen_init_vga(const struct dom0_vga_console_info *, size_t size); ++#else ++static inline void xen_init_vga(const struct dom0_vga_console_info *info, ++ size_t size) ++{ ++} ++#endif ++ ++#ifdef CONFIG_XEN_DOM0 ++void xen_init_apic(void); ++#else ++static inline void xen_init_apic(void) {} ++#endif ++ + /* Declare an asm function, along with symbols needed to make it + inlineable */ + #define DECL_ASM(ret, name, ...) 
\ +diff --git a/block/blk-core.c b/block/blk-core.c +index 71da511..32d305c 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -439,6 +439,7 @@ void blk_put_queue(struct request_queue *q) + { + kobject_put(&q->kobj); + } ++EXPORT_SYMBOL_GPL(blk_put_queue); + + void blk_cleanup_queue(struct request_queue *q) + { +@@ -612,6 +613,7 @@ int blk_get_queue(struct request_queue *q) + + return 1; + } ++EXPORT_SYMBOL_GPL(blk_get_queue); + + static inline void blk_free_request(struct request_queue *q, struct request *rq) + { +diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig +index 1d886e0..f4a2b10 100644 +--- a/drivers/block/Kconfig ++++ b/drivers/block/Kconfig +@@ -462,6 +462,7 @@ config XEN_BLKDEV_FRONTEND + tristate "Xen virtual block device support" + depends on XEN + default y ++ select XEN_XENBUS_FRONTEND + help + This driver implements the front-end of the Xen virtual + block device driver. It communicates with a back-end driver +diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c +index b8578bb..feec425 100644 +--- a/drivers/block/xen-blkfront.c ++++ b/drivers/block/xen-blkfront.c +@@ -42,6 +42,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -102,6 +103,10 @@ struct blkfront_info + + static DEFINE_SPINLOCK(blkif_io_lock); + ++static unsigned int nr_minors; ++static unsigned long *minors; ++static DEFINE_SPINLOCK(minor_lock); ++ + #define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) + #define GRANT_INVALID_REF 0 +@@ -136,6 +141,55 @@ static void add_id_to_freelist(struct blkfront_info *info, + info->shadow_free = id; + } + ++static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) ++{ ++ unsigned int end = minor + nr; ++ int rc; ++ ++ if (end > nr_minors) { ++ unsigned long *bitmap, *old; ++ ++ bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap), ++ GFP_KERNEL); ++ if (bitmap == NULL) ++ return -ENOMEM; ++ ++ spin_lock(&minor_lock); ++ if (end > nr_minors) { ++ old = minors; ++ memcpy(bitmap, minors, ++ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap)); ++ minors = bitmap; ++ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG; ++ } else ++ old = bitmap; ++ spin_unlock(&minor_lock); ++ kfree(old); ++ } ++ ++ spin_lock(&minor_lock); ++ if (find_next_bit(minors, end, minor) >= end) { ++ for (; minor < end; ++minor) ++ __set_bit(minor, minors); ++ rc = 0; ++ } else ++ rc = -EBUSY; ++ spin_unlock(&minor_lock); ++ ++ return rc; ++} ++ ++static void xlbd_release_minors(unsigned int minor, unsigned int nr) ++{ ++ unsigned int end = minor + nr; ++ ++ BUG_ON(end > nr_minors); ++ spin_lock(&minor_lock); ++ for (; minor < end; ++minor) ++ __clear_bit(minor, minors); ++ spin_unlock(&minor_lock); ++} ++ + static void blkif_restart_queue_callback(void *arg) + { + struct blkfront_info *info = (struct blkfront_info *)arg; +@@ -416,9 +470,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + if ((minor % nr_parts) == 0) + nr_minors = nr_parts; + ++ err = xlbd_reserve_minors(minor, nr_minors); ++ if (err) ++ goto out; ++ err = -ENODEV; ++ + gd = alloc_disk(nr_minors); + if (gd == NULL) +- goto out; ++ goto release; + + offset = minor / nr_parts; + +@@ -449,7 +508,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + + if (xlvbd_init_blk_queue(gd, sector_size)) { + del_gendisk(gd); +- goto out; ++ goto release; + } + + info->rq = gd->queue; +@@ -469,6 +528,8 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + + return 0; + ++ release: ++ xlbd_release_minors(minor, nr_minors); + 
out: + return err; + } +@@ -650,7 +711,7 @@ fail: + + + /* Common code used when first setting up, and when resuming. */ +-static int talk_to_backend(struct xenbus_device *dev, ++static int talk_to_blkback(struct xenbus_device *dev, + struct blkfront_info *info) + { + const char *message = NULL; +@@ -755,7 +816,7 @@ static int blkfront_probe(struct xenbus_device *dev, + info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); + dev_set_drvdata(&dev->dev, info); + +- err = talk_to_backend(dev, info); ++ err = talk_to_blkback(dev, info); + if (err) { + kfree(info); + dev_set_drvdata(&dev->dev, NULL); +@@ -850,7 +911,7 @@ static int blkfront_resume(struct xenbus_device *dev) + + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + +- err = talk_to_backend(dev, info); ++ err = talk_to_blkback(dev, info); + if (info->connected == BLKIF_STATE_SUSPENDED && !err) + err = blkif_recover(info); + +@@ -923,6 +984,7 @@ static void blkfront_connect(struct blkfront_info *info) + static void blkfront_closing(struct xenbus_device *dev) + { + struct blkfront_info *info = dev_get_drvdata(&dev->dev); ++ unsigned int minor, nr_minors; + unsigned long flags; + + dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename); +@@ -945,7 +1007,10 @@ static void blkfront_closing(struct xenbus_device *dev) + blk_cleanup_queue(info->rq); + info->rq = NULL; + ++ minor = info->gd->first_minor; ++ nr_minors = info->gd->minors; + del_gendisk(info->gd); ++ xlbd_release_minors(minor, nr_minors); + + out: + xenbus_frontend_closed(dev); +@@ -954,13 +1019,13 @@ static void blkfront_closing(struct xenbus_device *dev) + /** + * Callback received when the backend's state changes. + */ +-static void backend_changed(struct xenbus_device *dev, ++static void blkback_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) + { + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + struct block_device *bd; + +- dev_dbg(&dev->dev, "blkfront:backend_changed.\n"); ++ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); + + switch (backend_state) { + case XenbusStateInitialising: +@@ -1003,7 +1068,10 @@ static int blkfront_remove(struct xenbus_device *dev) + + blkif_free(info, 0); + +- kfree(info); ++ if(info->users == 0) ++ kfree(info); ++ else ++ info->is_ready = -1; + + return 0; + } +@@ -1012,12 +1080,15 @@ static int blkfront_is_ready(struct xenbus_device *dev) + { + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + +- return info->is_ready; ++ return info->is_ready > 0; + } + + static int blkif_open(struct block_device *bdev, fmode_t mode) + { + struct blkfront_info *info = bdev->bd_disk->private_data; ++ ++ if(info->is_ready < 0) ++ return -ENODEV; + info->users++; + return 0; + } +@@ -1033,7 +1104,10 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) + struct xenbus_device *dev = info->xbdev; + enum xenbus_state state = xenbus_read_driver_state(dev->otherend); + +- if (state == XenbusStateClosing && info->is_ready) ++ if(info->is_ready < 0) { ++ blkfront_closing(dev); ++ kfree(info); ++ } else if (state == XenbusStateClosing && info->is_ready) + blkfront_closing(dev); + } + return 0; +@@ -1061,7 +1135,7 @@ static struct xenbus_driver blkfront = { + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, +- .otherend_changed = backend_changed, ++ .otherend_changed = blkback_changed, + .is_ready = blkfront_is_ready, + }; + +diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c +index 
4dcfef0..9bca04e 100644 +--- a/drivers/char/agp/intel-agp.c ++++ b/drivers/char/agp/intel-agp.c +@@ -15,8 +15,12 @@ + * an Intel IOMMU. So make the correct use of the PCI DMA API contingent + * on the Intel IOMMU support (CONFIG_DMAR). + * Only newer chipsets need to bother with this, of course. ++ * ++ * Xen guests accessing graphics hardware also need proper translation ++ * between pseudo-physical addresses and real machine addresses, which ++ * is also achieved by using the DMA API. + */ +-#ifdef CONFIG_DMAR ++#if defined(CONFIG_DMAR) || defined(CONFIG_XEN) + #define USE_PCI_DMA_API 1 + #endif + +diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c +index a6ee32b..5be0dd3 100644 +--- a/drivers/char/hvc_xen.c ++++ b/drivers/char/hvc_xen.c +@@ -25,6 +25,8 @@ + #include + + #include ++ ++#include + #include + #include + #include +@@ -76,7 +78,7 @@ static int __write_console(const char *data, int len) + return sent; + } + +-static int write_console(uint32_t vtermno, const char *data, int len) ++static int domU_write_console(uint32_t vtermno, const char *data, int len) + { + int ret = len; + +@@ -99,7 +101,7 @@ static int write_console(uint32_t vtermno, const char *data, int len) + return ret; + } + +-static int read_console(uint32_t vtermno, char *buf, int len) ++static int domU_read_console(uint32_t vtermno, char *buf, int len) + { + struct xencons_interface *intf = xencons_interface(); + XENCONS_RING_IDX cons, prod; +@@ -120,28 +122,63 @@ static int read_console(uint32_t vtermno, char *buf, int len) + return recv; + } + +-static struct hv_ops hvc_ops = { +- .get_chars = read_console, +- .put_chars = write_console, ++static struct hv_ops domU_hvc_ops = { ++ .get_chars = domU_read_console, ++ .put_chars = domU_write_console, ++ .notifier_add = notifier_add_irq, ++ .notifier_del = notifier_del_irq, ++ .notifier_hangup = notifier_hangup_irq, ++}; ++ ++static int dom0_read_console(uint32_t vtermno, char *buf, int len) ++{ ++ return HYPERVISOR_console_io(CONSOLEIO_read, len, buf); ++} ++ ++/* ++ * Either for a dom0 to write to the system console, or a domU with a ++ * debug version of Xen ++ */ ++static int dom0_write_console(uint32_t vtermno, const char *str, int len) ++{ ++ int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); ++ if (rc < 0) ++ return 0; ++ ++ return len; ++} ++ ++static struct hv_ops dom0_hvc_ops = { ++ .get_chars = dom0_read_console, ++ .put_chars = dom0_write_console, + .notifier_add = notifier_add_irq, + .notifier_del = notifier_del_irq, + .notifier_hangup = notifier_hangup_irq, + }; + +-static int __init xen_init(void) ++static int __init xen_hvc_init(void) + { + struct hvc_struct *hp; ++ struct hv_ops *ops; + +- if (!xen_pv_domain() || +- xen_initial_domain() || +- !xen_start_info->console.domU.evtchn) ++ if (!xen_pv_domain()) + return -ENODEV; + +- xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn); ++ if (xen_initial_domain()) { ++ ops = &dom0_hvc_ops; ++ xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0); ++ } else { ++ if (!xen_start_info->console.domU.evtchn) ++ return -ENODEV; ++ ++ ops = &domU_hvc_ops; ++ xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn); ++ } ++ + if (xencons_irq < 0) + xencons_irq = 0; /* NO_IRQ */ + +- hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256); ++ hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256); + if (IS_ERR(hp)) + return PTR_ERR(hp); + +@@ -158,7 +195,7 @@ void xen_console_resume(void) + rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq); + } + +-static 
void __exit xen_fini(void) ++static void __exit xen_hvc_fini(void) + { + if (hvc) + hvc_remove(hvc); +@@ -166,29 +203,24 @@ static void __exit xen_fini(void) + + static int xen_cons_init(void) + { ++ struct hv_ops *ops; ++ + if (!xen_pv_domain()) + return 0; + +- hvc_instantiate(HVC_COOKIE, 0, &hvc_ops); ++ ops = &domU_hvc_ops; ++ if (xen_initial_domain()) ++ ops = &dom0_hvc_ops; ++ ++ hvc_instantiate(HVC_COOKIE, 0, ops); ++ + return 0; + } + +-module_init(xen_init); +-module_exit(xen_fini); ++module_init(xen_hvc_init); ++module_exit(xen_hvc_fini); + console_initcall(xen_cons_init); + +-static void raw_console_write(const char *str, int len) +-{ +- while(len > 0) { +- int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str); +- if (rc <= 0) +- break; +- +- str += rc; +- len -= rc; +- } +-} +- + #ifdef CONFIG_EARLY_PRINTK + static void xenboot_write_console(struct console *console, const char *string, + unsigned len) +@@ -196,19 +228,22 @@ static void xenboot_write_console(struct console *console, const char *string, + unsigned int linelen, off = 0; + const char *pos; + +- raw_console_write(string, len); ++ dom0_write_console(0, string, len); ++ ++ if (xen_initial_domain()) ++ return; + +- write_console(0, "(early) ", 8); ++ domU_write_console(0, "(early) ", 8); + while (off < len && NULL != (pos = strchr(string+off, '\n'))) { + linelen = pos-string+off; + if (off + linelen > len) + break; +- write_console(0, string+off, linelen); +- write_console(0, "\r\n", 2); ++ domU_write_console(0, string+off, linelen); ++ domU_write_console(0, "\r\n", 2); + off += linelen + 1; + } + if (off < len) +- write_console(0, string+off, len-off); ++ domU_write_console(0, string+off, len-off); + } + + struct console xenboot_console = { +@@ -220,7 +255,7 @@ struct console xenboot_console = { + + void xen_raw_console_write(const char *str) + { +- raw_console_write(str, strlen(str)); ++ dom0_write_console(0, str, strlen(str)); + } + + void xen_raw_printk(const char *fmt, ...) +diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c +index b115726..c721c0a 100644 +--- a/drivers/input/xen-kbdfront.c ++++ b/drivers/input/xen-kbdfront.c +@@ -21,7 +21,10 @@ + #include + #include + #include ++ + #include ++ ++#include + #include + #include + #include +diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig +index b2f71f7..b7feb84 100644 +--- a/drivers/net/Kconfig ++++ b/drivers/net/Kconfig +@@ -2787,6 +2787,7 @@ source "drivers/s390/net/Kconfig" + config XEN_NETDEV_FRONTEND + tristate "Xen network device frontend driver" + depends on XEN ++ select XEN_XENBUS_FRONTEND + default y + help + The network device frontend driver allows the kernel to +diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c +index baa051d..87d7121 100644 +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -42,6 +42,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -1393,7 +1394,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info) + } + + /* Common code used when first setting up, and when resuming. 
*/ +-static int talk_to_backend(struct xenbus_device *dev, ++static int talk_to_netback(struct xenbus_device *dev, + struct netfront_info *info) + { + const char *message; +@@ -1543,7 +1544,7 @@ static int xennet_connect(struct net_device *dev) + return -ENODEV; + } + +- err = talk_to_backend(np->xbdev, np); ++ err = talk_to_netback(np->xbdev, np); + if (err) + return err; + +@@ -1597,7 +1598,7 @@ static int xennet_connect(struct net_device *dev) + /** + * Callback received when the backend's state changes. + */ +-static void backend_changed(struct xenbus_device *dev, ++static void netback_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) + { + struct netfront_info *np = dev_get_drvdata(&dev->dev); +@@ -1798,7 +1799,7 @@ static struct xenbus_driver netfront_driver = { + .probe = netfront_probe, + .remove = __devexit_p(xennet_remove), + .resume = netfront_resume, +- .otherend_changed = backend_changed, ++ .otherend_changed = netback_changed, + }; + + static int __init netif_init(void) +diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile +index 4a7f11d..ae3e98f 100644 +--- a/drivers/pci/Makefile ++++ b/drivers/pci/Makefile +@@ -31,6 +31,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o + # Build Intel IOMMU support + obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o + ++# Build Xen IOMMU support ++obj-$(CONFIG_PCI_XEN) += xen-iommu.o + obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o + + obj-$(CONFIG_PCI_IOV) += iov.o +diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c +index 5753036..8e6e6d1 100644 +--- a/drivers/pci/dmar.c ++++ b/drivers/pci/dmar.c +@@ -673,10 +673,13 @@ void __init detect_intel_iommu(void) + "x2apic and Intr-remapping.\n"); + #endif + #ifdef CONFIG_DMAR +- if (ret && !no_iommu && !iommu_detected && !swiotlb && +- !dmar_disabled) ++ if (ret && !no_iommu && !iommu_detected && !dmar_disabled) + iommu_detected = 1; + #endif ++#ifdef CONFIG_X86 ++ if (ret) ++ x86_init.iommu.iommu_init = intel_iommu_init; ++#endif + } + early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size); + dmar_tbl = NULL; +diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c +index 2498602..fd89530 100644 +--- a/drivers/pci/intel-iommu.c ++++ b/drivers/pci/intel-iommu.c +@@ -3282,7 +3282,7 @@ int __init intel_iommu_init(void) + * Check the need for DMA-remapping initialization now. + * Above initialization will also be used by Interrupt-remapping. 
+ */ +- if (no_iommu || swiotlb || dmar_disabled) ++ if (no_iommu || dmar_disabled) + return -ENODEV; + + iommu_init_mempool(); +@@ -3303,7 +3303,9 @@ int __init intel_iommu_init(void) + "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n"); + + init_timer(&unmap_timer); +- force_iommu = 1; ++#ifdef CONFIG_SWIOTLB ++ swiotlb = 0; ++#endif + dma_ops = &intel_dma_ops; + + init_iommu_sysfs(); +diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c +index f9cf317..80b9756 100644 +--- a/drivers/pci/msi.c ++++ b/drivers/pci/msi.c +@@ -19,6 +19,8 @@ + #include + #include + ++#include ++ + #include "pci.h" + #include "msi.h" + +@@ -268,7 +270,8 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg) + { + struct irq_desc *desc = irq_to_desc(irq); + +- write_msi_msg_desc(desc, msg); ++ if (!xen_initial_domain()) ++ write_msi_msg_desc(desc, msg); + } + + static void free_msi_irqs(struct pci_dev *dev) +diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c +new file mode 100644 +index 0000000..ac6bcdb +--- /dev/null ++++ b/drivers/pci/xen-iommu.c +@@ -0,0 +1,271 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#define IOMMU_BUG_ON(test) \ ++do { \ ++ if (unlikely(test)) { \ ++ printk(KERN_ALERT "Fatal DMA error! " \ ++ "Please use 'swiotlb=force'\n"); \ ++ BUG(); \ ++ } \ ++} while (0) ++ ++/* Print address range with message */ ++#define PAR(msg, addr, size) \ ++do { \ ++ printk(msg "[%#llx - %#llx]\n", \ ++ (unsigned long long)addr, \ ++ (unsigned long long)addr + size); \ ++} while (0) ++ ++static inline int address_needs_mapping(struct device *hwdev, ++ dma_addr_t addr) ++{ ++ dma_addr_t mask = DMA_BIT_MASK(32); ++ int ret; ++ ++ /* If the device has a mask, use it, otherwise default to 32 bits */ ++ if (hwdev) ++ mask = *hwdev->dma_mask; ++ ++ ret = (addr & ~mask) != 0; ++ ++ if (ret) { ++ printk(KERN_ERR "dma address needs mapping\n"); ++ printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr); ++ } ++ return ret; ++} ++ ++static int check_pages_physically_contiguous(unsigned long pfn, ++ unsigned int offset, ++ size_t length) ++{ ++ unsigned long next_mfn; ++ int i; ++ int nr_pages; ++ ++ next_mfn = pfn_to_mfn(pfn); ++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; ++ ++ for (i = 1; i < nr_pages; i++) { ++ if (pfn_to_mfn(++pfn) != ++next_mfn) ++ return 0; ++ } ++ return 1; ++} ++ ++static int range_straddles_page_boundary(phys_addr_t p, size_t size) ++{ ++ unsigned long pfn = PFN_DOWN(p); ++ unsigned int offset = p & ~PAGE_MASK; ++ ++ if (offset + size <= PAGE_SIZE) ++ return 0; ++ if (check_pages_physically_contiguous(pfn, offset, size)) ++ return 0; ++ return 1; ++} ++ ++static inline void xen_dma_unmap_page(struct page *page) ++{ ++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here ++ * to deal with foreign pages. We'll need similar logic here at ++ * some point. ++ */ ++} ++ ++/* Gets dma address of a page */ ++static inline dma_addr_t xen_dma_map_page(struct page *page) ++{ ++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal ++ * with foreign pages. We'll need similar logic here at some ++ * point. 
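++	 *
++	 * For a local page the bus address is simply the page's machine
++	 * frame number shifted by PAGE_SHIFT, as returned below.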
++ */ ++ return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT; ++} ++ ++static int xen_map_sg(struct device *hwdev, struct scatterlist *sg, ++ int nents, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ struct scatterlist *s; ++ struct page *page; ++ int i, rc; ++ ++ BUG_ON(direction == DMA_NONE); ++ WARN_ON(nents == 0 || sg[0].length == 0); ++ ++ for_each_sg(sg, s, nents, i) { ++ BUG_ON(!sg_page(s)); ++ page = sg_page(s); ++ s->dma_address = xen_dma_map_page(page) + s->offset; ++ s->dma_length = s->length; ++ IOMMU_BUG_ON(range_straddles_page_boundary( ++ page_to_phys(page), s->length)); ++ } ++ ++ rc = nents; ++ ++ flush_write_buffers(); ++ return rc; ++} ++ ++static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg, ++ int nents, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ struct scatterlist *s; ++ struct page *page; ++ int i; ++ ++ for_each_sg(sg, s, nents, i) { ++ page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address))); ++ xen_dma_unmap_page(page); ++ } ++} ++ ++static void *xen_alloc_coherent(struct device *dev, size_t size, ++ dma_addr_t *dma_handle, gfp_t gfp) ++{ ++ void *ret; ++ unsigned int order = get_order(size); ++ unsigned long vstart; ++ u64 mask; ++ ++ /* ignore region specifiers */ ++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); ++ ++ if (dma_alloc_from_coherent(dev, size, dma_handle, &ret)) ++ return ret; ++ ++ if (dev == NULL || (dev->coherent_dma_mask < DMA_BIT_MASK(32))) ++ gfp |= GFP_DMA; ++ ++ vstart = __get_free_pages(gfp, order); ++ ret = (void *)vstart; ++ ++ if (dev != NULL && dev->coherent_dma_mask) ++ mask = dev->coherent_dma_mask; ++ else ++ mask = DMA_BIT_MASK(32); ++ ++ if (ret != NULL) { ++ if (xen_create_contiguous_region(vstart, order, ++ fls64(mask)) != 0) { ++ free_pages(vstart, order); ++ return NULL; ++ } ++ memset(ret, 0, size); ++ *dma_handle = virt_to_machine(ret).maddr; ++ } ++ return ret; ++} ++ ++static void xen_free_coherent(struct device *dev, size_t size, ++ void *vaddr, dma_addr_t dma_addr) ++{ ++ int order = get_order(size); ++ ++ if (dma_release_from_coherent(dev, order, vaddr)) ++ return; ++ ++ xen_destroy_contiguous_region((unsigned long)vaddr, order); ++ free_pages((unsigned long)vaddr, order); ++} ++ ++static dma_addr_t xen_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ dma_addr_t dma; ++ ++ BUG_ON(direction == DMA_NONE); ++ ++ WARN_ON(size == 0); ++ ++ dma = xen_dma_map_page(page) + offset; ++ ++ IOMMU_BUG_ON(address_needs_mapping(dev, dma)); ++ flush_write_buffers(); ++ return dma; ++} ++ ++static void xen_unmap_page(struct device *dev, dma_addr_t dma_addr, ++ size_t size, ++ enum dma_data_direction direction, ++ struct dma_attrs *attrs) ++{ ++ BUG_ON(direction == DMA_NONE); ++ xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr)))); ++} ++ ++static struct dma_map_ops xen_dma_ops = { ++ .dma_supported = NULL, ++ ++ .alloc_coherent = xen_alloc_coherent, ++ .free_coherent = xen_free_coherent, ++ ++ .map_page = xen_map_page, ++ .unmap_page = xen_unmap_page, ++ ++ .map_sg = xen_map_sg, ++ .unmap_sg = xen_unmap_sg, ++ ++ .mapping_error = NULL, ++ ++ .is_phys = 0, ++}; ++ ++static struct dma_map_ops xen_swiotlb_dma_ops = { ++ .dma_supported = swiotlb_dma_supported, ++ ++ .alloc_coherent = xen_alloc_coherent, ++ .free_coherent = xen_free_coherent, ++ ++ .map_page = swiotlb_map_page, ++ .unmap_page = swiotlb_unmap_page, ++ ++ .map_sg = 
swiotlb_map_sg_attrs, ++ .unmap_sg = swiotlb_unmap_sg_attrs, ++ ++ .mapping_error = swiotlb_dma_mapping_error, ++ ++ .is_phys = 0, ++}; ++ ++void __init xen_iommu_init(void) ++{ ++ if (!xen_pv_domain()) ++ return; ++ ++ printk(KERN_INFO "Xen: Initializing Xen DMA ops\n"); ++ ++ force_iommu = 0; ++ dma_ops = &xen_dma_ops; ++ ++ if (swiotlb) { ++ printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n"); ++ dma_ops = &xen_swiotlb_dma_ops; ++ } ++} ++ +diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig +index 188e1ba..efac9e3 100644 +--- a/drivers/video/Kconfig ++++ b/drivers/video/Kconfig +@@ -2063,6 +2063,7 @@ config XEN_FBDEV_FRONTEND + select FB_SYS_IMAGEBLIT + select FB_SYS_FOPS + select FB_DEFERRED_IO ++ select XEN_XENBUS_FRONTEND + default y + help + This driver implements the front-end of the Xen virtual +diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c +index 54cd916..966b226 100644 +--- a/drivers/video/xen-fbfront.c ++++ b/drivers/video/xen-fbfront.c +@@ -25,7 +25,10 @@ + #include + #include + #include ++ + #include ++ ++#include + #include + #include + #include +diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig +index cab100a..edeb9b2 100644 +--- a/drivers/xen/Kconfig ++++ b/drivers/xen/Kconfig +@@ -28,6 +28,46 @@ config XEN_DEV_EVTCHN + firing. + If in doubt, say yes. + ++config XEN_BACKEND ++ bool "Backend driver support" ++ depends on XEN_DOM0 ++ default y ++ help ++ Support for backend device drivers that provide I/O services ++ to other virtual machines. ++ ++config XEN_NETDEV_BACKEND ++ tristate "Xen backend network device" ++ depends on XEN_BACKEND && NET ++ help ++ Implement the network backend driver, which passes packets ++ from the guest domain's frontend drivers to the network. ++ ++config XEN_BLKDEV_BACKEND ++ tristate "Block-device backend driver" ++ depends on XEN_BACKEND && BLOCK ++ help ++ The block-device backend driver allows the kernel to export its ++ block devices to other guests via a high-performance shared-memory ++ interface. ++ ++ ++config XEN_BLKDEV_TAP ++ tristate "Block-device tap backend driver" ++ depends on XEN_BACKEND && BLOCK ++ help ++ The block tap driver is an alternative to the block back driver ++ and allows VM block requests to be redirected to userspace through ++ a device interface. The tap allows user-space development of ++ high-performance block backends, where disk images may be implemented ++ as files, in memory, or on other hosts across the network. This ++ driver can safely coexist with the existing blockback driver. ++ ++config XEN_BLKBACK_PAGEMAP ++ tristate ++ depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n ++ default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP ++ + config XENFS + tristate "Xen filesystem" + depends on XEN +@@ -60,4 +100,14 @@ config XEN_SYS_HYPERVISOR + Create entries under /sys/hypervisor describing the Xen + hypervisor environment. When running native or in another + virtual environment, /sys/hypervisor will still be present, +- but will have no xen contents. +\ No newline at end of file ++ but will have no xen contents. ++ ++config XEN_XENBUS_FRONTEND ++ tristate ++ ++config XEN_GNTDEV ++ tristate "userspace grant access device driver" ++ depends on XEN ++ select MMU_NOTIFIER ++ help ++ Allows userspace processes use grants. 
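As an aside (not part of the patch): a minimal, hypothetical driver fragment illustrating what the xen-iommu DMA ops above provide. Once xen_iommu_init() has installed xen_dma_ops, a coherent allocation is routed through xen_alloc_coherent(), which exchanges the pages with the hypervisor (xen_create_contiguous_region()) so the buffer is contiguous in machine memory and the returned bus address can be handed straight to hardware. The device and function names below are illustrative only.

#include <linux/pci.h>
#include <linux/dma-mapping.h>

/* Hypothetical driver fragment, for illustration only. */
static int example_setup_ring(struct pci_dev *pdev)
{
	dma_addr_t bus;		/* machine (bus) address for the device */
	void *ring;

	/* Under xen_dma_ops this ends up in xen_alloc_coherent(), so the
	 * four pages are machine-contiguous, not just pseudo-physically
	 * contiguous. */
	ring = dma_alloc_coherent(&pdev->dev, 4 * PAGE_SIZE, &bus, GFP_KERNEL);
	if (!ring)
		return -ENOMEM;

	/* ... program 'bus' into the hardware, use 'ring' from the CPU ... */

	dma_free_coherent(&pdev->dev, 4 * PAGE_SIZE, ring, bus);
	return 0;
}

When the kernel is booted with swiotlb forced, xen_iommu_init() installs xen_swiotlb_dma_ops instead, and streaming mappings go through the swiotlb paths rather than the direct pfn-to-mfn translation of xen_map_page()/xen_map_sg().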
+diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile +index 7c28434..ab2e672 100644 +--- a/drivers/xen/Makefile ++++ b/drivers/xen/Makefile +@@ -1,12 +1,20 @@ +-obj-y += grant-table.o features.o events.o manage.o ++obj-y += grant-table.o features.o events.o manage.o biomerge.o + obj-y += xenbus/ + + nostackp := $(call cc-option, -fno-stack-protector) + CFLAGS_features.o := $(nostackp) + +-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o +-obj-$(CONFIG_XEN_BALLOON) += balloon.o +-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o +-obj-$(CONFIG_XENFS) += xenfs/ +-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o +\ No newline at end of file ++obj-$(CONFIG_PCI) += pci.o ++obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o ++obj-$(CONFIG_XEN_XENCOMM) += xencomm.o ++obj-$(CONFIG_XEN_BALLOON) += balloon.o ++obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o ++obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o ++obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ ++obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ ++obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/ ++obj-$(CONFIG_XENFS) += xenfs/ ++obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o ++ ++xen-evtchn-y := evtchn.o ++xen-gntdev-y := gntdev.o +diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c +index 4204336..d7c0eae 100644 +--- a/drivers/xen/balloon.c ++++ b/drivers/xen/balloon.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -52,13 +53,15 @@ + + #include + #include ++ ++#include + #include + #include + #include + #include + #include + +-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) ++#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10)) + + #define BALLOON_CLASS_NAME "xen_memory" + +@@ -82,14 +85,15 @@ static struct sys_device balloon_sysdev; + + static int register_balloon(struct sys_device *sysdev); + ++static struct balloon_stats balloon_stats; ++ + /* +- * Protects atomic reservation decrease/increase against concurrent increases. +- * Also protects non-atomic updates of current_pages and driver_pages, and +- * balloon lists. ++ * Work in pages of this order. Can be either 0 for normal pages ++ * or 9 for hugepages. + */ +-static DEFINE_SPINLOCK(balloon_lock); +- +-static struct balloon_stats balloon_stats; ++static int balloon_order; ++static unsigned long balloon_npages; ++static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)]; + + /* We increase/decrease in batches which fit in a page */ + static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; +@@ -118,10 +122,41 @@ static struct timer_list balloon_timer; + static void scrub_page(struct page *page) + { + #ifdef CONFIG_XEN_SCRUB_PAGES +- clear_highpage(page); ++ int i; ++ ++ for (i = 0; i < balloon_npages; i++) ++ clear_highpage(page++); + #endif + } + ++static void free_discontig_frame(void) ++{ ++ int rc; ++ struct xen_memory_reservation reservation = { ++ .address_bits = 0, ++ .domid = DOMID_SELF, ++ .nr_extents = balloon_npages, ++ .extent_order = 0 ++ }; ++ ++ set_xen_guest_handle(reservation.extent_start, discontig_frame_list); ++ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); ++ BUG_ON(rc != balloon_npages); ++} ++ ++static unsigned long shrink_frame(unsigned long nr_pages) ++{ ++ unsigned long i, j; ++ ++ for (i = 0, j = 0; i < nr_pages; i++, j++) { ++ if (frame_list[i] == 0) ++ j++; ++ if (i != j) ++ frame_list[i] = frame_list[j]; ++ } ++ return i; ++} ++ + /* balloon_append: add the given page to the balloon. 
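++ * Each list entry is the head page of an order-balloon_order extent
++ * (order 9, i.e. 2MB, when booted with "balloon_hugepages").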
*/ + static void balloon_append(struct page *page) + { +@@ -195,19 +230,18 @@ static unsigned long current_target(void) + + static int increase_reservation(unsigned long nr_pages) + { +- unsigned long pfn, i, flags; ++ unsigned long pfn, mfn, i, j, flags; + struct page *page; + long rc; + struct xen_memory_reservation reservation = { + .address_bits = 0, +- .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + +- spin_lock_irqsave(&balloon_lock, flags); ++ spin_lock_irqsave(&xen_reservation_lock, flags); + + page = balloon_first_page(); + for (i = 0; i < nr_pages; i++) { +@@ -218,6 +252,8 @@ static int increase_reservation(unsigned long nr_pages) + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; ++ reservation.extent_order = balloon_order; ++ + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc < 0) + goto out; +@@ -227,19 +263,22 @@ static int increase_reservation(unsigned long nr_pages) + BUG_ON(page == NULL); + + pfn = page_to_pfn(page); ++ mfn = frame_list[i]; + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && + phys_to_machine_mapping_valid(pfn)); + +- set_phys_to_machine(pfn, frame_list[i]); +- +- /* Link back into the page tables if not highmem. */ +- if (pfn < max_low_pfn) { +- int ret; +- ret = HYPERVISOR_update_va_mapping( +- (unsigned long)__va(pfn << PAGE_SHIFT), +- mfn_pte(frame_list[i], PAGE_KERNEL), +- 0); +- BUG_ON(ret); ++ for (j = 0; j < balloon_npages; j++, pfn++, mfn++) { ++ set_phys_to_machine(pfn, mfn); ++ ++ /* Link back into the page tables if not highmem. */ ++ if (pfn < max_low_pfn) { ++ int ret; ++ ret = HYPERVISOR_update_va_mapping( ++ (unsigned long)__va(pfn << PAGE_SHIFT), ++ mfn_pte(mfn, PAGE_KERNEL), ++ 0); ++ BUG_ON(ret); ++ } + } + + /* Relinquish the page back to the allocator. */ +@@ -251,20 +290,20 @@ static int increase_reservation(unsigned long nr_pages) + balloon_stats.current_pages += rc; + + out: +- spin_unlock_irqrestore(&balloon_lock, flags); ++ spin_unlock_irqrestore(&xen_reservation_lock, flags); + + return rc < 0 ? rc : rc != nr_pages; + } + + static int decrease_reservation(unsigned long nr_pages) + { +- unsigned long pfn, i, flags; ++ unsigned long pfn, lpfn, mfn, i, j, flags; + struct page *page; + int need_sleep = 0; +- int ret; ++ int discontig, discontig_free; ++ int ret; + struct xen_memory_reservation reservation = { + .address_bits = 0, +- .extent_order = 0, + .domid = DOMID_SELF + }; + +@@ -272,7 +311,7 @@ static int decrease_reservation(unsigned long nr_pages) + nr_pages = ARRAY_SIZE(frame_list); + + for (i = 0; i < nr_pages; i++) { +- if ((page = alloc_page(GFP_BALLOON)) == NULL) { ++ if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) { + nr_pages = i; + need_sleep = 1; + break; +@@ -282,37 +321,50 @@ static int decrease_reservation(unsigned long nr_pages) + frame_list[i] = pfn_to_mfn(pfn); + + scrub_page(page); +- +- if (!PageHighMem(page)) { +- ret = HYPERVISOR_update_va_mapping( +- (unsigned long)__va(pfn << PAGE_SHIFT), +- __pte_ma(0), 0); +- BUG_ON(ret); +- } +- + } + + /* Ensure that ballooned highmem pages don't have kmaps. */ + kmap_flush_unused(); + flush_tlb_all(); + +- spin_lock_irqsave(&balloon_lock, flags); ++ spin_lock_irqsave(&xen_reservation_lock, flags); + + /* No more mappings: invalidate P2M and add to balloon. 
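++	 * With balloon_order > 0 every subframe of each extent is checked:
++	 * if any machine frame is no longer contiguous, the frames are
++	 * returned individually via free_discontig_frame() and the entry is
++	 * zeroed in frame_list (shrink_frame() then compacts the list).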
*/ + for (i = 0; i < nr_pages; i++) { +- pfn = mfn_to_pfn(frame_list[i]); +- set_phys_to_machine(pfn, INVALID_P2M_ENTRY); ++ mfn = frame_list[i]; ++ lpfn = pfn = mfn_to_pfn(mfn); + balloon_append(pfn_to_page(pfn)); ++ discontig_free = 0; ++ for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) { ++ if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn) ++ discontig_free = 1; ++ ++ set_phys_to_machine(lpfn, INVALID_P2M_ENTRY); ++ if (!PageHighMem(page)) { ++ ret = HYPERVISOR_update_va_mapping( ++ (unsigned long)__va(lpfn << PAGE_SHIFT), ++ __pte_ma(0), 0); ++ BUG_ON(ret); ++ } ++ } ++ if (discontig_free) { ++ free_discontig_frame(); ++ frame_list[i] = 0; ++ discontig = 1; ++ } + } ++ balloon_stats.current_pages -= nr_pages; ++ ++ if (discontig) ++ nr_pages = shrink_frame(nr_pages); + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; ++ reservation.extent_order = balloon_order; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + BUG_ON(ret != nr_pages); + +- balloon_stats.current_pages -= nr_pages; +- +- spin_unlock_irqrestore(&balloon_lock, flags); ++ spin_unlock_irqrestore(&xen_reservation_lock, flags); + + return need_sleep; + } +@@ -379,7 +431,7 @@ static void watch_target(struct xenbus_watch *watch, + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. + */ +- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); ++ balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order)); + } + + static int balloon_init_watcher(struct notifier_block *notifier, +@@ -405,9 +457,12 @@ static int __init balloon_init(void) + if (!xen_pv_domain()) + return -ENODEV; + +- pr_info("xen_balloon: Initialising balloon driver.\n"); ++ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n", ++ balloon_order); ++ ++ balloon_npages = 1 << balloon_order; + +- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); ++ balloon_stats.current_pages = (min(xen_start_info->nr_pages, max_pfn)) >> balloon_order; + balloon_stats.target_pages = balloon_stats.current_pages; + balloon_stats.balloon_low = 0; + balloon_stats.balloon_high = 0; +@@ -420,7 +475,7 @@ static int __init balloon_init(void) + register_balloon(&balloon_sysdev); + + /* Initialise the balloon with excess memory space. 
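++	 * (stepping by balloon_npages so that only the head page of each
++	 *  extent is queued on the balloon lists)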
*/ +- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { ++ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn += balloon_npages) { + page = pfn_to_page(pfn); + if (!PageReserved(page)) + balloon_append(page); +@@ -444,6 +499,121 @@ static void balloon_exit(void) + + module_exit(balloon_exit); + ++static int __init balloon_parse_huge(char *s) ++{ ++ balloon_order = 9; ++ return 1; ++} ++ ++__setup("balloon_hugepages", balloon_parse_huge); ++ ++static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page, ++ unsigned long addr, void *data) ++{ ++ unsigned long mfn = pte_mfn(*pte); ++ int ret; ++ struct xen_memory_reservation reservation = { ++ .nr_extents = 1, ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ ++ set_xen_guest_handle(reservation.extent_start, &mfn); ++ set_pte_at(&init_mm, addr, pte, __pte_ma(0)); ++ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); ++ ++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); ++ BUG_ON(ret != 1); ++ ++ return 0; ++} ++ ++struct page **alloc_empty_pages_and_pagevec(int nr_pages) ++{ ++ struct page *page, **pagevec; ++ int npages; ++ int i, j, ret; ++ ++ /* Round up to next number of balloon_order pages */ ++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order; ++ ++ pagevec = kmalloc(sizeof(page) * nr_pages << balloon_order, GFP_KERNEL); ++ if (pagevec == NULL) ++ return NULL; ++ ++ for (i = 0; i < nr_pages; i++) { ++ void *v; ++ ++ page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order); ++ if (page == NULL) ++ goto err; ++ ++ scrub_page(page); ++ ++ mutex_lock(&balloon_mutex); ++ ++ v = page_address(page); ++ ++ ret = apply_to_page_range(&init_mm, (unsigned long)v, ++ PAGE_SIZE << balloon_order, ++ dealloc_pte_fn, NULL); ++ ++ if (ret != 0) { ++ mutex_unlock(&balloon_mutex); ++ //balloon_free_page(page); /* tries to use free_cold_page */ ++ __free_page(page); ++ goto err; ++ } ++ for (j = 0; j < balloon_npages; j++) ++ pagevec[(i<= 0) ++ balloon_append(pagevec[i << balloon_order]); ++ mutex_unlock(&balloon_mutex); ++ kfree(pagevec); ++ pagevec = NULL; ++ goto out; ++} ++EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec); ++ ++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) ++{ ++ struct page *page; ++ int i; ++ int npages; ++ ++ if (pagevec == NULL) ++ return; ++ ++ /* Round up to next number of balloon_order pages */ ++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order; ++ ++ mutex_lock(&balloon_mutex); ++ for (i = 0; i < nr_pages; i++) { ++ page = pagevec[i << balloon_order]; ++ BUG_ON(page_count(page) != 1); ++ balloon_append(page); ++ } ++ mutex_unlock(&balloon_mutex); ++ ++ kfree(pagevec); ++ ++ schedule_work(&balloon_worker); ++} ++EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec); ++ + #define BALLOON_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ +@@ -477,7 +647,7 @@ static ssize_t store_target_kb(struct sys_device *dev, + + target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; + +- balloon_set_new_target(target_bytes >> PAGE_SHIFT); ++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order)); + + return count; + } +@@ -491,7 +661,7 @@ static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr + { + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages +- << PAGE_SHIFT); ++ << (PAGE_SHIFT + balloon_order)); + } + + static ssize_t store_target(struct sys_device *dev, +@@ -507,7 +677,7 @@ static ssize_t store_target(struct sys_device *dev, + + target_bytes = memparse(buf, &endchar); + +- balloon_set_new_target(target_bytes >> PAGE_SHIFT); ++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order)); + + return count; + } +diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c +new file mode 100644 +index 0000000..d40f534 +--- /dev/null ++++ b/drivers/xen/biomerge.c +@@ -0,0 +1,14 @@ ++#include ++#include ++#include ++ ++bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, ++ const struct bio_vec *vec2) ++{ ++ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page)); ++ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page)); ++ ++ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && ++ ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); ++} ++ +diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile +new file mode 100644 +index 0000000..dee55ba +--- /dev/null ++++ b/drivers/xen/blkback/Makefile +@@ -0,0 +1,4 @@ ++obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o ++obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o ++ ++xen-blkback-y := blkback.o xenbus.o interface.o vbd.o +diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c +new file mode 100644 +index 0000000..45f6eb2 +--- /dev/null ++++ b/drivers/xen/blkback/blkback-pagemap.c +@@ -0,0 +1,109 @@ ++#include ++#include "blkback-pagemap.h" ++ ++static int blkback_pagemap_size; ++static struct blkback_pagemap *blkback_pagemap; ++ ++static inline int ++blkback_pagemap_entry_clear(struct blkback_pagemap *map) ++{ ++ static struct blkback_pagemap zero; ++ return !memcmp(map, &zero, sizeof(zero)); ++} ++ ++int ++blkback_pagemap_init(int pages) ++{ ++ blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap), ++ GFP_KERNEL); ++ if (!blkback_pagemap) ++ return -ENOMEM; ++ ++ blkback_pagemap_size = pages; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(blkback_pagemap_init); ++ ++void ++blkback_pagemap_set(int idx, struct page *page, ++ domid_t domid, busid_t busid, grant_ref_t gref) ++{ ++ struct blkback_pagemap *entry; ++ ++ BUG_ON(!blkback_pagemap); ++ BUG_ON(idx >= blkback_pagemap_size); ++ ++ set_page_private(page, idx); ++ ++ entry = blkback_pagemap + idx; ++ if (!blkback_pagemap_entry_clear(entry)) { ++ printk("overwriting pagemap %d: d %u b %u g %u\n", ++ idx, entry->domid, entry->busid, entry->gref); ++ BUG(); ++ } ++ ++ entry->page = page; ++ entry->domid = domid; ++ entry->busid = busid; ++ entry->gref = gref; ++} ++EXPORT_SYMBOL_GPL(blkback_pagemap_set); ++ ++void ++blkback_pagemap_clear(struct page *page) ++{ ++ int idx; ++ struct blkback_pagemap *entry; ++ ++ idx = (int)page_private(page); ++ ++ BUG_ON(!blkback_pagemap); ++ BUG_ON(idx >= blkback_pagemap_size); ++ ++ entry = blkback_pagemap + idx; ++ if (blkback_pagemap_entry_clear(entry)) { ++ printk("clearing 
empty pagemap %d\n", idx); ++ BUG(); ++ } ++ ++ memset(entry, 0, sizeof(*entry)); ++} ++EXPORT_SYMBOL_GPL(blkback_pagemap_clear); ++ ++struct blkback_pagemap ++blkback_pagemap_read(struct page *page) ++{ ++ int idx; ++ struct blkback_pagemap *entry; ++ ++ idx = (int)page_private(page); ++ ++ BUG_ON(!blkback_pagemap); ++ BUG_ON(idx >= blkback_pagemap_size); ++ ++ entry = blkback_pagemap + idx; ++ if (blkback_pagemap_entry_clear(entry)) { ++ printk("reading empty pagemap %d\n", idx); ++ BUG(); ++ } ++ ++ return *entry; ++} ++EXPORT_SYMBOL(blkback_pagemap_read); ++ ++MODULE_LICENSE("Dual BSD/GPL"); ++ ++int ++blkback_pagemap_contains_page(struct page *page) ++{ ++ struct blkback_pagemap *entry; ++ int idx = (int)page_private(page); ++ ++ if (idx < 0 || idx >= blkback_pagemap_size) ++ return 0; ++ ++ entry = blkback_pagemap + idx; ++ ++ return (entry->page == page); ++} ++EXPORT_SYMBOL(blkback_pagemap_contains_page); +diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h +new file mode 100644 +index 0000000..7f97d15 +--- /dev/null ++++ b/drivers/xen/blkback/blkback-pagemap.h +@@ -0,0 +1,36 @@ ++#ifndef _BLKBACK_PAGEMAP_H_ ++#define _BLKBACK_PAGEMAP_H_ ++ ++#include ++#include ++#include ++ ++typedef unsigned int busid_t; ++ ++struct blkback_pagemap { ++ struct page *page; ++ domid_t domid; ++ busid_t busid; ++ grant_ref_t gref; ++}; ++ ++#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE) ++ ++int blkback_pagemap_init(int); ++void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t); ++void blkback_pagemap_clear(struct page *); ++struct blkback_pagemap blkback_pagemap_read(struct page *); ++int blkback_pagemap_contains_page(struct page *page); ++ ++#else /* CONFIG_XEN_BLKBACK_PAGEMAP */ ++ ++static inline int blkback_pagemap_init(int pages) { return 0; } ++static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom, ++ busid_t bus, grant_ref_t gnt) {} ++static inline void blkback_pagemap_clear(struct page *page) {} ++#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; }) ++static inline int blkback_pagemap_contains_page(struct page *page) { return 0; } ++ ++#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */ ++ ++#endif +diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c +new file mode 100644 +index 0000000..e644dd5 +--- /dev/null ++++ b/drivers/xen/blkback/blkback.c +@@ -0,0 +1,672 @@ ++/****************************************************************************** ++ * arch/xen/drivers/blkif/backend/main.c ++ * ++ * Back-end of the driver for virtual block devices. This portion of the ++ * driver exports a 'unified' block-device interface that can be accessed ++ * by any operating system that implements a compatible front end. 
A ++ * reference front-end implementation can be found in: ++ * arch/xen/drivers/blkif/frontend ++ * ++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand ++ * Copyright (c) 2005, Christopher Clark ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include "common.h" ++ ++/* ++ * These are rather arbitrary. They are fairly large because adjacent requests ++ * pulled from a communication ring are quite likely to end up being part of ++ * the same scatter/gather request at the disc. ++ * ++ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** ++ * ++ * This will increase the chances of being able to write whole tracks. ++ * 64 should be enough to keep us competitive with Linux. ++ */ ++static int blkif_reqs = 64; ++module_param_named(reqs, blkif_reqs, int, 0); ++MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); ++ ++/* Run-time switchable: /sys/module/blkback/parameters/ */ ++static unsigned int log_stats = 0; ++static unsigned int debug_lvl = 0; ++module_param(log_stats, int, 0644); ++module_param(debug_lvl, int, 0644); ++ ++/* ++ * Each outstanding request that we've passed to the lower device layers has a ++ * 'pending_req' allocated to it. Each buffer_head that completes decrements ++ * the pendcnt towards zero. When it hits zero, the specified domain has a ++ * response queued for it, with the saved 'id' passed back. 
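++ *
++ * Completion runs from end_block_io_op(): the final bio of a request
++ * unmaps its granted pages (fast_flush_area) and queues the response
++ * on the ring via make_response().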
++ */ ++typedef struct { ++ blkif_t *blkif; ++ u64 id; ++ int nr_pages; ++ atomic_t pendcnt; ++ unsigned short operation; ++ int status; ++ struct list_head free_list; ++} pending_req_t; ++ ++static pending_req_t *pending_reqs; ++static struct list_head pending_free; ++static DEFINE_SPINLOCK(pending_free_lock); ++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); ++ ++#define BLKBACK_INVALID_HANDLE (~0) ++ ++static struct page **pending_pages; ++static grant_handle_t *pending_grant_handles; ++ ++static inline int vaddr_pagenr(pending_req_t *req, int seg) ++{ ++ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; ++} ++ ++#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] ++ ++static inline unsigned long vaddr(pending_req_t *req, int seg) ++{ ++ unsigned long pfn = page_to_pfn(pending_page(req, seg)); ++ return (unsigned long)pfn_to_kaddr(pfn); ++} ++ ++#define pending_handle(_req, _seg) \ ++ (pending_grant_handles[vaddr_pagenr(_req, _seg)]) ++ ++ ++static int do_block_io_op(blkif_t *blkif); ++static void dispatch_rw_block_io(blkif_t *blkif, ++ struct blkif_request *req, ++ pending_req_t *pending_req); ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st); ++ ++/****************************************************************** ++ * misc small helpers ++ */ ++static pending_req_t* alloc_req(void) ++{ ++ pending_req_t *req = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ if (!list_empty(&pending_free)) { ++ req = list_entry(pending_free.next, pending_req_t, free_list); ++ list_del(&req->free_list); ++ } ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ return req; ++} ++ ++static void free_req(pending_req_t *req) ++{ ++ unsigned long flags; ++ int was_empty; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ was_empty = list_empty(&pending_free); ++ list_add(&req->free_list, &pending_free); ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ if (was_empty) ++ wake_up(&pending_free_wq); ++} ++ ++static void unplug_queue(blkif_t *blkif) ++{ ++ if (blkif->plug == NULL) ++ return; ++ if (blkif->plug->unplug_fn) ++ blkif->plug->unplug_fn(blkif->plug); ++ blk_put_queue(blkif->plug); ++ blkif->plug = NULL; ++} ++ ++static void plug_queue(blkif_t *blkif, struct block_device *bdev) ++{ ++ struct request_queue *q = bdev_get_queue(bdev); ++ ++ if (q == blkif->plug) ++ return; ++ unplug_queue(blkif); ++ blk_get_queue(q); ++ blkif->plug = q; ++} ++ ++static void fast_flush_area(pending_req_t *req) ++{ ++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ unsigned int i, invcount = 0; ++ grant_handle_t handle; ++ int ret; ++ ++ for (i = 0; i < req->nr_pages; i++) { ++ handle = pending_handle(req, i); ++ if (handle == BLKBACK_INVALID_HANDLE) ++ continue; ++ blkback_pagemap_clear(pending_page(req, i)); ++ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), ++ GNTMAP_host_map, handle); ++ pending_handle(req, i) = BLKBACK_INVALID_HANDLE; ++ invcount++; ++ } ++ ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, unmap, invcount); ++ BUG_ON(ret); ++} ++ ++/****************************************************************** ++ * SCHEDULER FUNCTIONS ++ */ ++ ++static void print_stats(blkif_t *blkif) ++{ ++ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", ++ current->comm, blkif->st_oo_req, ++ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); ++ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); ++ blkif->st_rd_req = 0; ++ blkif->st_wr_req = 0; 
++ blkif->st_oo_req = 0; ++} ++ ++int blkif_schedule(void *arg) ++{ ++ blkif_t *blkif = arg; ++ ++ blkif_get(blkif); ++ ++ if (debug_lvl) ++ printk(KERN_DEBUG "%s: started\n", current->comm); ++ ++ while (!kthread_should_stop()) { ++ if (try_to_freeze()) ++ continue; ++ ++ wait_event_interruptible( ++ blkif->wq, ++ blkif->waiting_reqs || kthread_should_stop()); ++ wait_event_interruptible( ++ pending_free_wq, ++ !list_empty(&pending_free) || kthread_should_stop()); ++ ++ blkif->waiting_reqs = 0; ++ smp_mb(); /* clear flag *before* checking for work */ ++ ++ if (do_block_io_op(blkif)) ++ blkif->waiting_reqs = 1; ++ unplug_queue(blkif); ++ ++ if (log_stats && time_after(jiffies, blkif->st_print)) ++ print_stats(blkif); ++ } ++ ++ if (log_stats) ++ print_stats(blkif); ++ if (debug_lvl) ++ printk(KERN_DEBUG "%s: exiting\n", current->comm); ++ ++ blkif->xenblkd = NULL; ++ blkif_put(blkif); ++ ++ return 0; ++} ++ ++/****************************************************************** ++ * COMPLETION CALLBACK -- Called as bh->b_end_io() ++ */ ++ ++static void __end_block_io_op(pending_req_t *pending_req, int error) ++{ ++ /* An error fails the entire request. */ ++ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && ++ (error == -EOPNOTSUPP)) { ++ DPRINTK("blkback: write barrier op failed, not supported\n"); ++ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); ++ pending_req->status = BLKIF_RSP_EOPNOTSUPP; ++ } else if (error) { ++ DPRINTK("Buffer not up-to-date at end of operation, " ++ "error=%d\n", error); ++ pending_req->status = BLKIF_RSP_ERROR; ++ } ++ ++ if (atomic_dec_and_test(&pending_req->pendcnt)) { ++ fast_flush_area(pending_req); ++ make_response(pending_req->blkif, pending_req->id, ++ pending_req->operation, pending_req->status); ++ blkif_put(pending_req->blkif); ++ free_req(pending_req); ++ } ++} ++ ++static void end_block_io_op(struct bio *bio, int error) ++{ ++ __end_block_io_op(bio->bi_private, error); ++ bio_put(bio); ++} ++ ++ ++/****************************************************************************** ++ * NOTIFICATION FROM GUEST OS. ++ */ ++ ++static void blkif_notify_work(blkif_t *blkif) ++{ ++ blkif->waiting_reqs = 1; ++ wake_up(&blkif->wq); ++} ++ ++irqreturn_t blkif_be_int(int irq, void *dev_id) ++{ ++ blkif_notify_work(dev_id); ++ return IRQ_HANDLED; ++} ++ ++ ++ ++/****************************************************************** ++ * DOWNWARD CALLS -- These interface with the block-device layer proper. ++ */ ++ ++static int do_block_io_op(blkif_t *blkif) ++{ ++ union blkif_back_rings *blk_rings = &blkif->blk_rings; ++ struct blkif_request req; ++ pending_req_t *pending_req; ++ RING_IDX rc, rp; ++ int more_to_do = 0; ++ ++ rc = blk_rings->common.req_cons; ++ rp = blk_rings->common.sring->req_prod; ++ rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ ++ ++ while (rc != rp) { ++ ++ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) ++ break; ++ ++ if (kthread_should_stop()) { ++ more_to_do = 1; ++ break; ++ } ++ ++ pending_req = alloc_req(); ++ if (NULL == pending_req) { ++ blkif->st_oo_req++; ++ more_to_do = 1; ++ break; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.req_cons = ++rc; /* before make_response() */ ++ ++ /* Apply all sanity checks to /private copy/ of request. */ ++ barrier(); ++ ++ switch (req.operation) { ++ case BLKIF_OP_READ: ++ blkif->st_rd_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ case BLKIF_OP_WRITE_BARRIER: ++ blkif->st_br_req++; ++ /* fall through */ ++ case BLKIF_OP_WRITE: ++ blkif->st_wr_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ default: ++ /* A good sign something is wrong: sleep for a while to ++ * avoid excessive CPU consumption by a bad guest. */ ++ msleep(1); ++ DPRINTK("error: unknown block io operation [%d]\n", ++ req.operation); ++ make_response(blkif, req.id, req.operation, ++ BLKIF_RSP_ERROR); ++ free_req(pending_req); ++ break; ++ } ++ ++ /* Yield point for this unbounded loop. */ ++ cond_resched(); ++ } ++ ++ return more_to_do; ++} ++ ++static void dispatch_rw_block_io(blkif_t *blkif, ++ struct blkif_request *req, ++ pending_req_t *pending_req) ++{ ++ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ struct phys_req preq; ++ struct { ++ unsigned long buf; unsigned int nsec; ++ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ unsigned int nseg; ++ struct bio *bio = NULL; ++ int ret, i; ++ int operation; ++ ++ switch (req->operation) { ++ case BLKIF_OP_READ: ++ operation = READ; ++ break; ++ case BLKIF_OP_WRITE: ++ operation = WRITE; ++ break; ++ case BLKIF_OP_WRITE_BARRIER: ++ operation = WRITE_BARRIER; ++ break; ++ default: ++ operation = 0; /* make gcc happy */ ++ BUG(); ++ } ++ ++ /* Check that number of segments is sane. 
*/ ++ nseg = req->nr_segments; ++ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) || ++ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { ++ DPRINTK("Bad number of segments in request (%d)\n", nseg); ++ goto fail_response; ++ } ++ ++ preq.dev = req->handle; ++ preq.sector_number = req->sector_number; ++ preq.nr_sects = 0; ++ ++ pending_req->blkif = blkif; ++ pending_req->id = req->id; ++ pending_req->operation = req->operation; ++ pending_req->status = BLKIF_RSP_OKAY; ++ pending_req->nr_pages = nseg; ++ ++ for (i = 0; i < nseg; i++) { ++ uint32_t flags; ++ ++ seg[i].nsec = req->seg[i].last_sect - ++ req->seg[i].first_sect + 1; ++ ++ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || ++ (req->seg[i].last_sect < req->seg[i].first_sect)) ++ goto fail_response; ++ preq.nr_sects += seg[i].nsec; ++ ++ flags = GNTMAP_host_map; ++ if (operation != READ) ++ flags |= GNTMAP_readonly; ++ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, ++ req->seg[i].gref, blkif->domid); ++ } ++ ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); ++ BUG_ON(ret); ++ ++ for (i = 0; i < nseg; i++) { ++ if (unlikely(map[i].status != 0)) { ++ DPRINTK("invalid buffer -- could not remap it\n"); ++ map[i].handle = BLKBACK_INVALID_HANDLE; ++ ret |= 1; ++ continue; ++ } ++ ++ set_phys_to_machine( ++ page_to_pfn(pending_page(pending_req, i)), ++ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); ++ seg[i].buf = map[i].dev_bus_addr | ++ (req->seg[i].first_sect << 9); ++ blkback_pagemap_set(vaddr_pagenr(pending_req, i), ++ pending_page(pending_req, i), ++ blkif->domid, req->handle, ++ req->seg[i].gref); ++ pending_handle(pending_req, i) = map[i].handle; ++ } ++ ++ if (ret) ++ goto fail_flush; ++ ++ if (vbd_translate(&preq, blkif, operation) != 0) { ++ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", ++ operation == READ ? 
"read" : "write", ++ preq.sector_number, ++ preq.sector_number + preq.nr_sects, preq.dev); ++ goto fail_flush; ++ } ++ ++ plug_queue(blkif, preq.bdev); ++ atomic_set(&pending_req->pendcnt, 1); ++ blkif_get(blkif); ++ ++ for (i = 0; i < nseg; i++) { ++ if (((int)preq.sector_number|(int)seg[i].nsec) & ++ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { ++ DPRINTK("Misaligned I/O request from domain %d", ++ blkif->domid); ++ goto fail_put_bio; ++ } ++ ++ while ((bio == NULL) || ++ (bio_add_page(bio, ++ pending_page(pending_req, i), ++ seg[i].nsec << 9, ++ seg[i].buf & ~PAGE_MASK) == 0)) { ++ if (bio) { ++ atomic_inc(&pending_req->pendcnt); ++ submit_bio(operation, bio); ++ } ++ ++ bio = bio_alloc(GFP_KERNEL, nseg-i); ++ if (unlikely(bio == NULL)) ++ goto fail_put_bio; ++ ++ bio->bi_bdev = preq.bdev; ++ bio->bi_private = pending_req; ++ bio->bi_end_io = end_block_io_op; ++ bio->bi_sector = preq.sector_number; ++ } ++ ++ preq.sector_number += seg[i].nsec; ++ } ++ ++ if (!bio) { ++ BUG_ON(operation != WRITE_BARRIER); ++ bio = bio_alloc(GFP_KERNEL, 0); ++ if (unlikely(bio == NULL)) ++ goto fail_put_bio; ++ ++ bio->bi_bdev = preq.bdev; ++ bio->bi_private = pending_req; ++ bio->bi_end_io = end_block_io_op; ++ bio->bi_sector = -1; ++ } ++ ++ submit_bio(operation, bio); ++ ++ if (operation == READ) ++ blkif->st_rd_sect += preq.nr_sects; ++ else if (operation == WRITE || operation == WRITE_BARRIER) ++ blkif->st_wr_sect += preq.nr_sects; ++ ++ return; ++ ++ fail_flush: ++ fast_flush_area(pending_req); ++ fail_response: ++ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); ++ free_req(pending_req); ++ msleep(1); /* back off a bit */ ++ return; ++ ++ fail_put_bio: ++ __end_block_io_op(pending_req, -EINVAL); ++ if (bio) ++ bio_put(bio); ++ unplug_queue(blkif); ++ msleep(1); /* back off a bit */ ++ return; ++} ++ ++ ++ ++/****************************************************************** ++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING ++ */ ++ ++ ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st) ++{ ++ struct blkif_response resp; ++ unsigned long flags; ++ union blkif_back_rings *blk_rings = &blkif->blk_rings; ++ int more_to_do = 0; ++ int notify; ++ ++ resp.id = id; ++ resp.operation = op; ++ resp.status = st; ++ ++ spin_lock_irqsave(&blkif->blk_ring_lock, flags); ++ /* Place on the response ring for the relevant domain. */ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.rsp_prod_pvt++; ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); ++ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { ++ /* ++ * Tail check for pending requests. Allows frontend to avoid ++ * notifications if requests are already in flight (lower ++ * overheads and promotes batching). 
++ */ ++ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); ++ ++ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { ++ more_to_do = 1; ++ } ++ ++ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); ++ ++ if (more_to_do) ++ blkif_notify_work(blkif); ++ if (notify) ++ notify_remote_via_irq(blkif->irq); ++} ++ ++static int __init blkif_init(void) ++{ ++ int i, mmap_pages; ++ int rc = 0; ++ ++ if (!xen_pv_domain()) ++ return -ENODEV; ++ ++ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ ++ pending_reqs = kmalloc(sizeof(pending_reqs[0]) * ++ blkif_reqs, GFP_KERNEL); ++ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * ++ mmap_pages, GFP_KERNEL); ++ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); ++ ++ if (blkback_pagemap_init(mmap_pages)) ++ goto out_of_memory; ++ ++ if (!pending_reqs || !pending_grant_handles || !pending_pages) { ++ rc = -ENOMEM; ++ goto out_of_memory; ++ } ++ ++ for (i = 0; i < mmap_pages; i++) ++ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; ++ ++ rc = blkif_interface_init(); ++ if (rc) ++ goto failed_init; ++ ++ memset(pending_reqs, 0, sizeof(pending_reqs)); ++ INIT_LIST_HEAD(&pending_free); ++ ++ for (i = 0; i < blkif_reqs; i++) ++ list_add_tail(&pending_reqs[i].free_list, &pending_free); ++ ++ rc = blkif_xenbus_init(); ++ if (rc) ++ goto failed_init; ++ ++ return 0; ++ ++ out_of_memory: ++ printk(KERN_ERR "%s: out of memory\n", __func__); ++ failed_init: ++ kfree(pending_reqs); ++ kfree(pending_grant_handles); ++ free_empty_pages_and_pagevec(pending_pages, mmap_pages); ++ return rc; ++} ++ ++module_init(blkif_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h +new file mode 100644 +index 0000000..af43d63 +--- /dev/null ++++ b/drivers/xen/blkback/common.h +@@ -0,0 +1,139 @@ ++/* ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#ifndef __BLKIF__BACKEND__COMMON_H__ ++#define __BLKIF__BACKEND__COMMON_H__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blkback-pagemap.h" ++ ++ ++#define DPRINTK(_f, _a...) \ ++ pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++ ++struct vbd { ++ blkif_vdev_t handle; /* what the domain refers to this vbd as */ ++ unsigned char readonly; /* Non-zero -> read-only */ ++ unsigned char type; /* VDISK_xxx */ ++ u32 pdevice; /* phys device that this vbd maps to */ ++ struct block_device *bdev; ++}; ++ ++struct backend_info; ++ ++typedef struct blkif_st { ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ unsigned int handle; ++ /* Physical parameters of the comms window. */ ++ unsigned int irq; ++ /* Comms information. */ ++ enum blkif_protocol blk_protocol; ++ union blkif_back_rings blk_rings; ++ struct vm_struct *blk_ring_area; ++ /* The VBD attached to this interface. */ ++ struct vbd vbd; ++ /* Back pointer to the backend_info. */ ++ struct backend_info *be; ++ /* Private fields. */ ++ spinlock_t blk_ring_lock; ++ atomic_t refcnt; ++ ++ wait_queue_head_t wq; ++ struct task_struct *xenblkd; ++ unsigned int waiting_reqs; ++ struct request_queue *plug; ++ ++ /* statistics */ ++ unsigned long st_print; ++ int st_rd_req; ++ int st_wr_req; ++ int st_oo_req; ++ int st_br_req; ++ int st_rd_sect; ++ int st_wr_sect; ++ ++ wait_queue_head_t waiting_to_free; ++ ++ grant_handle_t shmem_handle; ++ grant_ref_t shmem_ref; ++} blkif_t; ++ ++blkif_t *blkif_alloc(domid_t domid); ++void blkif_disconnect(blkif_t *blkif); ++void blkif_free(blkif_t *blkif); ++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); ++ ++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) ++#define blkif_put(_b) \ ++ do { \ ++ if (atomic_dec_and_test(&(_b)->refcnt)) \ ++ wake_up(&(_b)->waiting_to_free);\ ++ } while (0) ++ ++/* Create a vbd. */ ++int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, ++ unsigned minor, int readonly, int cdrom); ++void vbd_free(struct vbd *vbd); ++ ++unsigned long long vbd_size(struct vbd *vbd); ++unsigned int vbd_info(struct vbd *vbd); ++unsigned long vbd_secsize(struct vbd *vbd); ++ ++struct phys_req { ++ unsigned short dev; ++ unsigned short nr_sects; ++ struct block_device *bdev; ++ blkif_sector_t sector_number; ++}; ++ ++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); ++ ++int blkif_interface_init(void); ++ ++int blkif_xenbus_init(void); ++ ++irqreturn_t blkif_be_int(int irq, void *dev_id); ++int blkif_schedule(void *arg); ++ ++int blkback_barrier(struct xenbus_transaction xbt, ++ struct backend_info *be, int state); ++ ++#endif /* __BLKIF__BACKEND__COMMON_H__ */ +diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c +new file mode 100644 +index 0000000..e397a41 +--- /dev/null ++++ b/drivers/xen/blkback/interface.c +@@ -0,0 +1,186 @@ ++/****************************************************************************** ++ * arch/xen/drivers/blkif/backend/interface.c ++ * ++ * Block-device interface management. 
++ * ++ * Copyright (c) 2004, Keir Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++#include ++#include ++#include ++ ++static struct kmem_cache *blkif_cachep; ++ ++blkif_t *blkif_alloc(domid_t domid) ++{ ++ blkif_t *blkif; ++ ++ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); ++ if (!blkif) ++ return ERR_PTR(-ENOMEM); ++ ++ memset(blkif, 0, sizeof(*blkif)); ++ blkif->domid = domid; ++ spin_lock_init(&blkif->blk_ring_lock); ++ atomic_set(&blkif->refcnt, 1); ++ init_waitqueue_head(&blkif->wq); ++ blkif->st_print = jiffies; ++ init_waitqueue_head(&blkif->waiting_to_free); ++ ++ return blkif; ++} ++ ++static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, shared_page, blkif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Grant table operation failure !\n"); ++ return op.status; ++ } ++ ++ blkif->shmem_ref = shared_page; ++ blkif->shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_page(blkif_t *blkif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, blkif->shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) ++{ ++ int err; ++ ++ /* Already connected through? 
*/ ++ if (blkif->irq) ++ return 0; ++ ++ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) ++ return -ENOMEM; ++ ++ err = map_frontend_page(blkif, shared_page); ++ if (err) { ++ free_vm_area(blkif->blk_ring_area); ++ return err; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ { ++ struct blkif_sring *sring; ++ sring = (struct blkif_sring *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_32: ++ { ++ struct blkif_x86_32_sring *sring_x86_32; ++ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_64: ++ { ++ struct blkif_x86_64_sring *sring_x86_64; ++ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); ++ if (err < 0) ++ { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ return err; ++ } ++ blkif->irq = err; ++ ++ return 0; ++} ++ ++void blkif_disconnect(blkif_t *blkif) ++{ ++ if (blkif->xenblkd) { ++ kthread_stop(blkif->xenblkd); ++ blkif->xenblkd = NULL; ++ } ++ ++ atomic_dec(&blkif->refcnt); ++ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); ++ atomic_inc(&blkif->refcnt); ++ ++ if (blkif->irq) { ++ unbind_from_irqhandler(blkif->irq, blkif); ++ blkif->irq = 0; ++ } ++ ++ if (blkif->blk_rings.common.sring) { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ } ++} ++ ++void blkif_free(blkif_t *blkif) ++{ ++ if (!atomic_dec_and_test(&blkif->refcnt)) ++ BUG(); ++ kmem_cache_free(blkif_cachep, blkif); ++} ++ ++int __init blkif_interface_init(void) ++{ ++ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), ++ 0, 0, NULL); ++ if (!blkif_cachep) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c +new file mode 100644 +index 0000000..410c2ea +--- /dev/null ++++ b/drivers/xen/blkback/vbd.c +@@ -0,0 +1,118 @@ ++/****************************************************************************** ++ * blkback/vbd.c ++ * ++ * Routines for managing virtual block devices (VBDs). ++ * ++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++ ++#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ ++ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk)) ++ ++unsigned long long vbd_size(struct vbd *vbd) ++{ ++ return vbd_sz(vbd); ++} ++ ++unsigned int vbd_info(struct vbd *vbd) ++{ ++ return vbd->type | (vbd->readonly?VDISK_READONLY:0); ++} ++ ++unsigned long vbd_secsize(struct vbd *vbd) ++{ ++ return bdev_logical_block_size(vbd->bdev); ++} ++ ++int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, ++ unsigned minor, int readonly, int cdrom) ++{ ++ struct vbd *vbd; ++ struct block_device *bdev; ++ ++ vbd = &blkif->vbd; ++ vbd->handle = handle; ++ vbd->readonly = readonly; ++ vbd->type = 0; ++ ++ vbd->pdevice = MKDEV(major, minor); ++ ++ bdev = open_by_devnum(vbd->pdevice, ++ vbd->readonly ? FMODE_READ : FMODE_WRITE); ++ ++ if (IS_ERR(bdev)) { ++ DPRINTK("vbd_creat: device %08x could not be opened.\n", ++ vbd->pdevice); ++ return -ENOENT; ++ } ++ ++ vbd->bdev = bdev; ++ ++ if (vbd->bdev->bd_disk == NULL) { ++ DPRINTK("vbd_creat: device %08x doesn't exist.\n", ++ vbd->pdevice); ++ vbd_free(vbd); ++ return -ENOENT; ++ } ++ ++ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) ++ vbd->type |= VDISK_CDROM; ++ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) ++ vbd->type |= VDISK_REMOVABLE; ++ ++ DPRINTK("Successful creation of handle=%04x (dom=%u)\n", ++ handle, blkif->domid); ++ return 0; ++} ++ ++void vbd_free(struct vbd *vbd) ++{ ++ if (vbd->bdev) ++ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE); ++ vbd->bdev = NULL; ++} ++ ++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) ++{ ++ struct vbd *vbd = &blkif->vbd; ++ int rc = -EACCES; ++ ++ if ((operation != READ) && vbd->readonly) ++ goto out; ++ ++ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) ++ goto out; ++ ++ req->dev = vbd->pdevice; ++ req->bdev = vbd->bdev; ++ rc = 0; ++ ++ out: ++ return rc; ++} +diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c +new file mode 100644 +index 0000000..34f8e40 +--- /dev/null ++++ b/drivers/xen/blkback/xenbus.c +@@ -0,0 +1,541 @@ ++/* Xenbus code for blkif backend ++ Copyright (C) 2005 Rusty Russell ++ Copyright (C) 2005 XenSource Ltd ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. 
++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++*/ ++ ++#include ++#include ++#include ++#include "common.h" ++ ++#undef DPRINTK ++#define DPRINTK(fmt, args...) \ ++ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \ ++ __FUNCTION__, __LINE__, ##args) ++ ++struct backend_info ++{ ++ struct xenbus_device *dev; ++ blkif_t *blkif; ++ struct xenbus_watch backend_watch; ++ unsigned major; ++ unsigned minor; ++ char *mode; ++}; ++ ++static void connect(struct backend_info *); ++static int connect_ring(struct backend_info *); ++static void backend_changed(struct xenbus_watch *, const char **, ++ unsigned int); ++ ++static int blkback_name(blkif_t *blkif, char *buf) ++{ ++ char *devpath, *devname; ++ struct xenbus_device *dev = blkif->be->dev; ++ ++ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); ++ if (IS_ERR(devpath)) ++ return PTR_ERR(devpath); ++ ++ if ((devname = strstr(devpath, "/dev/")) != NULL) ++ devname += strlen("/dev/"); ++ else ++ devname = devpath; ++ ++ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname); ++ kfree(devpath); ++ ++ return 0; ++} ++ ++static void update_blkif_status(blkif_t *blkif) ++{ ++ int err; ++ char name[TASK_COMM_LEN]; ++ ++ /* Not ready to connect? */ ++ if (!blkif->irq || !blkif->vbd.bdev) ++ return; ++ ++ /* Already connected? */ ++ if (blkif->be->dev->state == XenbusStateConnected) ++ return; ++ ++ /* Attempt to connect: exit if we fail to. */ ++ connect(blkif->be); ++ if (blkif->be->dev->state != XenbusStateConnected) ++ return; ++ ++ err = blkback_name(blkif, name); ++ if (err) { ++ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name"); ++ return; ++ } ++ ++ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name); ++ if (IS_ERR(blkif->xenblkd)) { ++ err = PTR_ERR(blkif->xenblkd); ++ blkif->xenblkd = NULL; ++ xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); ++ } ++} ++ ++ ++/**************************************************************** ++ * sysfs interface for VBD I/O requests ++ */ ++ ++#define VBD_SHOW(name, format, args...) 
\ ++ static ssize_t show_##name(struct device *_dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct xenbus_device *dev = to_xenbus_device(_dev); \ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); \ ++ \ ++ return sprintf(buf, format, ##args); \ ++ } \ ++ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) ++ ++VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); ++VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); ++VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); ++VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); ++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); ++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); ++ ++static struct attribute *vbdstat_attrs[] = { ++ &dev_attr_oo_req.attr, ++ &dev_attr_rd_req.attr, ++ &dev_attr_wr_req.attr, ++ &dev_attr_br_req.attr, ++ &dev_attr_rd_sect.attr, ++ &dev_attr_wr_sect.attr, ++ NULL ++}; ++ ++static struct attribute_group vbdstat_group = { ++ .name = "statistics", ++ .attrs = vbdstat_attrs, ++}; ++ ++VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); ++VBD_SHOW(mode, "%s\n", be->mode); ++ ++int xenvbd_sysfs_addif(struct xenbus_device *dev) ++{ ++ int error; ++ ++ error = device_create_file(&dev->dev, &dev_attr_physical_device); ++ if (error) ++ goto fail1; ++ ++ error = device_create_file(&dev->dev, &dev_attr_mode); ++ if (error) ++ goto fail2; ++ ++ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group); ++ if (error) ++ goto fail3; ++ ++ return 0; ++ ++fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); ++fail2: device_remove_file(&dev->dev, &dev_attr_mode); ++fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); ++ return error; ++} ++ ++void xenvbd_sysfs_delif(struct xenbus_device *dev) ++{ ++ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); ++ device_remove_file(&dev->dev, &dev_attr_mode); ++ device_remove_file(&dev->dev, &dev_attr_physical_device); ++} ++ ++static int blkback_remove(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ DPRINTK(""); ++ ++ if (be->major || be->minor) ++ xenvbd_sysfs_delif(dev); ++ ++ if (be->backend_watch.node) { ++ unregister_xenbus_watch(&be->backend_watch); ++ kfree(be->backend_watch.node); ++ be->backend_watch.node = NULL; ++ } ++ ++ if (be->blkif) { ++ blkif_disconnect(be->blkif); ++ vbd_free(&be->blkif->vbd); ++ blkif_free(be->blkif); ++ be->blkif = NULL; ++ } ++ ++ kfree(be); ++ dev_set_drvdata(&dev->dev, NULL); ++ return 0; ++} ++ ++int blkback_barrier(struct xenbus_transaction xbt, ++ struct backend_info *be, int state) ++{ ++ struct xenbus_device *dev = be->dev; ++ int err; ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-barrier", ++ "%d", state); ++ if (err) ++ xenbus_dev_fatal(dev, err, "writing feature-barrier"); ++ ++ return err; ++} ++ ++/** ++ * Entry point to this code when a new device is created. Allocate the basic ++ * structures, and watch the store waiting for the hotplug scripts to tell us ++ * the device's physical major and minor numbers. Switch to InitWait. 
++ */ ++static int blkback_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err; ++ struct backend_info *be = kzalloc(sizeof(struct backend_info), ++ GFP_KERNEL); ++ if (!be) { ++ xenbus_dev_fatal(dev, -ENOMEM, ++ "allocating backend structure"); ++ return -ENOMEM; ++ } ++ be->dev = dev; ++ dev_set_drvdata(&dev->dev, be); ++ ++ be->blkif = blkif_alloc(dev->otherend_id); ++ if (IS_ERR(be->blkif)) { ++ err = PTR_ERR(be->blkif); ++ be->blkif = NULL; ++ xenbus_dev_fatal(dev, err, "creating block interface"); ++ goto fail; ++ } ++ ++ /* setup back pointer */ ++ be->blkif->be = be; ++ ++ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed, ++ "%s/%s", dev->nodename, "physical-device"); ++ if (err) ++ goto fail; ++ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ DPRINTK("failed"); ++ blkback_remove(dev); ++ return err; ++} ++ ++ ++/** ++ * Callback received when the hotplug scripts have placed the physical-device ++ * node. Read it and the mode node, and create a vbd. If the frontend is ++ * ready, connect. ++ */ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ int err; ++ unsigned major; ++ unsigned minor; ++ struct backend_info *be ++ = container_of(watch, struct backend_info, backend_watch); ++ struct xenbus_device *dev = be->dev; ++ int cdrom = 0; ++ char *device_type; ++ ++ DPRINTK(""); ++ ++ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", ++ &major, &minor); ++ if (XENBUS_EXIST_ERR(err)) { ++ /* Since this watch will fire once immediately after it is ++ registered, we expect this. Ignore it, and wait for the ++ hotplug scripts. */ ++ return; ++ } ++ if (err != 2) { ++ xenbus_dev_fatal(dev, err, "reading physical-device"); ++ return; ++ } ++ ++ if ((be->major || be->minor) && ++ ((be->major != major) || (be->minor != minor))) { ++ printk(KERN_WARNING ++ "blkback: changing physical device (from %x:%x to " ++ "%x:%x) not supported.\n", be->major, be->minor, ++ major, minor); ++ return; ++ } ++ ++ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); ++ if (IS_ERR(be->mode)) { ++ err = PTR_ERR(be->mode); ++ be->mode = NULL; ++ xenbus_dev_fatal(dev, err, "reading mode"); ++ return; ++ } ++ ++ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL); ++ if (!IS_ERR(device_type)) { ++ cdrom = strcmp(device_type, "cdrom") == 0; ++ kfree(device_type); ++ } ++ ++ if (be->major == 0 && be->minor == 0) { ++ /* Front end dir is a number, which is used as the handle. */ ++ ++ char *p = strrchr(dev->otherend, '/') + 1; ++ long handle = simple_strtoul(p, NULL, 0); ++ ++ be->major = major; ++ be->minor = minor; ++ ++ err = vbd_create(be->blkif, handle, major, minor, ++ (NULL == strchr(be->mode, 'w')), cdrom); ++ if (err) { ++ be->major = be->minor = 0; ++ xenbus_dev_fatal(dev, err, "creating vbd structure"); ++ return; ++ } ++ ++ err = xenvbd_sysfs_addif(dev); ++ if (err) { ++ vbd_free(&be->blkif->vbd); ++ be->major = be->minor = 0; ++ xenbus_dev_fatal(dev, err, "creating sysfs entries"); ++ return; ++ } ++ ++ /* We're potentially connected now */ ++ update_blkif_status(be->blkif); ++ } ++} ++ ++ ++/** ++ * Callback received when the frontend's state changes. 
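Roughly, the handshake this callback drives looks as follows (a sketch of the switch statement below; the exact ordering can vary between toolstacks):

	frontend state          backend (this driver) reaction
	--------------          ------------------------------
	Initialising            re-enter InitWait if currently Closed
	Initialised/Connected   connect_ring(), then update_blkif_status()
	                        switches to Connected once the vbd is ready
	Closing                 blkif_disconnect(), switch to Closing
	Closed                  switch to Closed; unregister the device
	                        unless it is still marked online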
++ */ ++static void frontend_changed(struct xenbus_device *dev, ++ enum xenbus_state frontend_state) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ int err; ++ ++ DPRINTK("%s", xenbus_strstate(frontend_state)); ++ ++ switch (frontend_state) { ++ case XenbusStateInitialising: ++ if (dev->state == XenbusStateClosed) { ++ printk(KERN_INFO "%s: %s: prepare for reconnect\n", ++ __FUNCTION__, dev->nodename); ++ xenbus_switch_state(dev, XenbusStateInitWait); ++ } ++ break; ++ ++ case XenbusStateInitialised: ++ case XenbusStateConnected: ++ /* Ensure we connect even when two watches fire in ++ close successsion and we miss the intermediate value ++ of frontend_state. */ ++ if (dev->state == XenbusStateConnected) ++ break; ++ ++ err = connect_ring(be); ++ if (err) ++ break; ++ update_blkif_status(be->blkif); ++ break; ++ ++ case XenbusStateClosing: ++ blkif_disconnect(be->blkif); ++ xenbus_switch_state(dev, XenbusStateClosing); ++ break; ++ ++ case XenbusStateClosed: ++ xenbus_switch_state(dev, XenbusStateClosed); ++ if (xenbus_dev_is_online(dev)) ++ break; ++ /* fall through if not online */ ++ case XenbusStateUnknown: ++ device_unregister(&dev->dev); ++ break; ++ ++ default: ++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", ++ frontend_state); ++ break; ++ } ++} ++ ++ ++/* ** Connection ** */ ++ ++ ++/** ++ * Write the physical details regarding the block device to the store, and ++ * switch to Connected state. ++ */ ++static void connect(struct backend_info *be) ++{ ++ struct xenbus_transaction xbt; ++ int err; ++ struct xenbus_device *dev = be->dev; ++ ++ DPRINTK("%s", dev->otherend); ++ ++ /* Supply the information about the device the frontend needs */ ++again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "starting transaction"); ++ return; ++ } ++ ++ err = blkback_barrier(xbt, be, 1); ++ if (err) ++ goto abort; ++ ++ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", ++ vbd_size(&be->blkif->vbd)); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "writing %s/sectors", ++ dev->nodename); ++ goto abort; ++ } ++ ++ /* FIXME: use a typename instead */ ++ err = xenbus_printf(xbt, dev->nodename, "info", "%u", ++ vbd_info(&be->blkif->vbd)); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "writing %s/info", ++ dev->nodename); ++ goto abort; ++ } ++ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", ++ vbd_secsize(&be->blkif->vbd)); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "writing %s/sector-size", ++ dev->nodename); ++ goto abort; ++ } ++ ++ err = xenbus_transaction_end(xbt, 0); ++ if (err == -EAGAIN) ++ goto again; ++ if (err) ++ xenbus_dev_fatal(dev, err, "ending transaction"); ++ ++ err = xenbus_switch_state(dev, XenbusStateConnected); ++ if (err) ++ xenbus_dev_fatal(dev, err, "switching to Connected state", ++ dev->nodename); ++ ++ return; ++ abort: ++ xenbus_transaction_end(xbt, 1); ++} ++ ++ ++static int connect_ring(struct backend_info *be) ++{ ++ struct xenbus_device *dev = be->dev; ++ unsigned long ring_ref; ++ unsigned int evtchn; ++ char protocol[64] = ""; ++ int err; ++ ++ DPRINTK("%s", dev->otherend); ++ ++ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref, ++ "event-channel", "%u", &evtchn, NULL); ++ if (err) { ++ xenbus_dev_fatal(dev, err, ++ "reading %s/ring-ref and event-channel", ++ dev->otherend); ++ return err; ++ } ++ ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; ++ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", ++ "%63s", protocol, NULL); ++ if (err) ++ 
strcpy(protocol, "unspecified, assuming native"); ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; ++ else { ++ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); ++ return -1; ++ } ++ printk(KERN_INFO ++ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", ++ ring_ref, evtchn, be->blkif->blk_protocol, protocol); ++ ++ /* Map the shared frame, irq etc. */ ++ err = blkif_map(be->blkif, ring_ref, evtchn); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", ++ ring_ref, evtchn); ++ return err; ++ } ++ ++ return 0; ++} ++ ++ ++/* ** Driver Registration ** */ ++ ++ ++static const struct xenbus_device_id blkback_ids[] = { ++ { "vbd" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver blkback = { ++ .name = "vbd", ++ .owner = THIS_MODULE, ++ .ids = blkback_ids, ++ .probe = blkback_probe, ++ .remove = blkback_remove, ++ .otherend_changed = frontend_changed ++}; ++ ++ ++int blkif_xenbus_init(void) ++{ ++ return xenbus_register_backend(&blkback); ++} +diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile +new file mode 100644 +index 0000000..99ff53c +--- /dev/null ++++ b/drivers/xen/blktap/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o ++ ++blktap-objs := control.o ring.o wait_queue.o device.o request.o sysfs.o +diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h +new file mode 100644 +index 0000000..db4cf02 +--- /dev/null ++++ b/drivers/xen/blktap/blktap.h +@@ -0,0 +1,253 @@ ++#ifndef _BLKTAP_H_ ++#define _BLKTAP_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++//#define ENABLE_PASSTHROUGH ++ ++extern int blktap_debug_level; ++ ++#define BTPRINTK(level, tag, force, _f, _a...) \ ++ do { \ ++ if (blktap_debug_level > level && \ ++ (force || printk_ratelimit())) \ ++ printk(tag "%s: " _f, __func__, ##_a); \ ++ } while (0) ++ ++#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a) ++#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a) ++#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a) ++#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a) ++ ++#define MAX_BLKTAP_DEVICE 256 ++ ++#define BLKTAP_CONTROL 1 ++#define BLKTAP_RING_FD 2 ++#define BLKTAP_RING_VMA 3 ++#define BLKTAP_DEVICE 4 ++#define BLKTAP_PAUSE_REQUESTED 6 ++#define BLKTAP_PAUSED 7 ++#define BLKTAP_SHUTDOWN_REQUESTED 8 ++#define BLKTAP_PASSTHROUGH 9 ++#define BLKTAP_DEFERRED 10 ++ ++/* blktap IOCTLs: */ ++#define BLKTAP2_IOCTL_KICK_FE 1 ++#define BLKTAP2_IOCTL_ALLOC_TAP 200 ++#define BLKTAP2_IOCTL_FREE_TAP 201 ++#define BLKTAP2_IOCTL_CREATE_DEVICE 202 ++#define BLKTAP2_IOCTL_SET_PARAMS 203 ++#define BLKTAP2_IOCTL_PAUSE 204 ++#define BLKTAP2_IOCTL_REOPEN 205 ++#define BLKTAP2_IOCTL_RESUME 206 ++ ++#define BLKTAP2_MAX_MESSAGE_LEN 256 ++ ++#define BLKTAP2_RING_MESSAGE_PAUSE 1 ++#define BLKTAP2_RING_MESSAGE_RESUME 2 ++#define BLKTAP2_RING_MESSAGE_CLOSE 3 ++ ++#define BLKTAP_REQUEST_FREE 0 ++#define BLKTAP_REQUEST_PENDING 1 ++ ++/* ++ * The maximum number of requests that can be outstanding at any time ++ * is determined by ++ * ++ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] ++ * ++ * where mmap_alloc < MAX_DYNAMIC_MEM. 
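As a rough sense of scale for the formula above (assuming 4 KiB pages, where the native blkif shared ring holds about 32 slots and BLKIF_MAX_SEGMENTS_PER_REQUEST is 11):

	MAX_PENDING_REQS = BLK_RING_SIZE          ~ 32
	MMAP_PAGES       = 32 * 11                = 352 pages per ring
	with mmap_alloc  = 2 (the default noted in the TODO below),
	roughly 2 * 352  = 704 granted pages can be outstanding at once.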
++ * ++ * TODO: ++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via ++ * sysfs. ++ */ ++#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) ++#define MAX_DYNAMIC_MEM BLK_RING_SIZE ++#define MAX_PENDING_REQS BLK_RING_SIZE ++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) ++#define MMAP_VADDR(_start, _req, _seg) \ ++ (_start + \ ++ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ ++ ((_seg) * PAGE_SIZE)) ++ ++#define blktap_get(_b) (atomic_inc(&(_b)->refcnt)) ++#define blktap_put(_b) \ ++ do { \ ++ if (atomic_dec_and_test(&(_b)->refcnt)) \ ++ wake_up(&(_b)->wq); \ ++ } while (0) ++ ++struct blktap; ++ ++struct grant_handle_pair { ++ grant_handle_t kernel; ++ grant_handle_t user; ++}; ++#define INVALID_GRANT_HANDLE 0xFFFF ++ ++struct blktap_handle { ++ unsigned int ring; ++ unsigned int device; ++ unsigned int minor; ++}; ++ ++struct blktap_params { ++ char name[BLKTAP2_MAX_MESSAGE_LEN]; ++ unsigned long long capacity; ++ unsigned long sector_size; ++}; ++ ++struct blktap_device { ++ int users; ++ spinlock_t lock; ++ struct gendisk *gd; ++ ++#ifdef ENABLE_PASSTHROUGH ++ struct block_device *bdev; ++#endif ++}; ++ ++struct blktap_ring { ++ struct vm_area_struct *vma; ++ struct blkif_front_ring ring; ++ struct vm_foreign_map foreign_map; ++ unsigned long ring_vstart; ++ unsigned long user_vstart; ++ ++ int response; ++ ++ wait_queue_head_t poll_wait; ++ ++ dev_t devno; ++ struct device *dev; ++ atomic_t sysfs_refcnt; ++ struct mutex sysfs_mutex; ++}; ++ ++struct blktap_statistics { ++ unsigned long st_print; ++ int st_rd_req; ++ int st_wr_req; ++ int st_oo_req; ++ int st_rd_sect; ++ int st_wr_sect; ++ s64 st_rd_cnt; ++ s64 st_rd_sum_usecs; ++ s64 st_rd_max_usecs; ++ s64 st_wr_cnt; ++ s64 st_wr_sum_usecs; ++ s64 st_wr_max_usecs; ++}; ++ ++struct blktap_request { ++ uint64_t id; ++ uint16_t usr_idx; ++ ++ uint8_t status; ++ atomic_t pendcnt; ++ uint8_t nr_pages; ++ unsigned short operation; ++ ++ struct timeval time; ++ struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ struct list_head free_list; ++}; ++ ++struct blktap { ++ int minor; ++ pid_t pid; ++ atomic_t refcnt; ++ unsigned long dev_inuse; ++ ++ struct blktap_params params; ++ ++ struct rw_semaphore tap_sem; ++ ++ struct blktap_ring ring; ++ struct blktap_device device; ++ ++ int pending_cnt; ++ struct blktap_request *pending_requests[MAX_PENDING_REQS]; ++ struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ ++ wait_queue_head_t wq; ++ struct list_head deferred_queue; ++ ++ struct blktap_statistics stats; ++}; ++ ++extern struct blktap *blktaps[MAX_BLKTAP_DEVICE]; ++ ++static inline int ++blktap_active(struct blktap *tap) ++{ ++ return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse); ++} ++ ++static inline int ++blktap_validate_params(struct blktap *tap, struct blktap_params *params) ++{ ++ /* TODO: sanity check */ ++ params->name[sizeof(params->name) - 1] = '\0'; ++ BTINFO("%s: capacity: %llu, sector-size: %lu\n", ++ params->name, params->capacity, params->sector_size); ++ return 0; ++} ++ ++int blktap_control_destroy_device(struct blktap *); ++ ++int blktap_ring_init(int *); ++int blktap_ring_free(void); ++int blktap_ring_create(struct blktap *); ++int blktap_ring_destroy(struct blktap *); ++int blktap_ring_pause(struct blktap *); ++int blktap_ring_resume(struct blktap *); ++void blktap_ring_kick_user(struct blktap *); ++ ++int blktap_sysfs_init(void); ++void blktap_sysfs_free(void); ++int blktap_sysfs_create(struct blktap *); ++int 
blktap_sysfs_destroy(struct blktap *); ++ ++int blktap_device_init(int *); ++void blktap_device_free(void); ++int blktap_device_create(struct blktap *); ++int blktap_device_destroy(struct blktap *); ++int blktap_device_pause(struct blktap *); ++int blktap_device_resume(struct blktap *); ++void blktap_device_restart(struct blktap *); ++void blktap_device_finish_request(struct blktap *, ++ struct blkif_response *, ++ struct blktap_request *); ++void blktap_device_fail_pending_requests(struct blktap *); ++#ifdef ENABLE_PASSTHROUGH ++int blktap_device_enable_passthrough(struct blktap *, ++ unsigned, unsigned); ++#endif ++ ++void blktap_defer(struct blktap *); ++void blktap_run_deferred(void); ++ ++int blktap_request_pool_init(void); ++void blktap_request_pool_free(void); ++int blktap_request_pool_grow(void); ++int blktap_request_pool_shrink(void); ++struct blktap_request *blktap_request_allocate(struct blktap *); ++void blktap_request_free(struct blktap *, struct blktap_request *); ++struct page *request_to_page(struct blktap_request *, int); ++ ++static inline unsigned long ++request_to_kaddr(struct blktap_request *req, int seg) ++{ ++ unsigned long pfn = page_to_pfn(request_to_page(req, seg)); ++ return (unsigned long)pfn_to_kaddr(pfn); ++} ++ ++#endif +diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c +new file mode 100644 +index 0000000..a4852f7 +--- /dev/null ++++ b/drivers/xen/blktap/control.c +@@ -0,0 +1,284 @@ ++#include ++#include ++#include ++ ++#include ++ ++#include "blktap.h" ++ ++static DEFINE_SPINLOCK(blktap_control_lock); ++struct blktap *blktaps[MAX_BLKTAP_DEVICE]; ++ ++static int ring_major; ++static int device_major; ++static int blktap_control_registered; ++ ++static void ++blktap_control_initialize_tap(struct blktap *tap) ++{ ++ int minor = tap->minor; ++ ++ memset(tap, 0, sizeof(*tap)); ++ set_bit(BLKTAP_CONTROL, &tap->dev_inuse); ++ init_rwsem(&tap->tap_sem); ++ init_waitqueue_head(&tap->wq); ++ atomic_set(&tap->refcnt, 0); ++ sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ ++ tap->minor = minor; ++} ++ ++static struct blktap * ++blktap_control_create_tap(void) ++{ ++ int minor; ++ struct blktap *tap; ++ ++ tap = kmalloc(sizeof(*tap), GFP_KERNEL); ++ if (unlikely(!tap)) ++ return NULL; ++ ++ blktap_control_initialize_tap(tap); ++ ++ spin_lock_irq(&blktap_control_lock); ++ for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) ++ if (!blktaps[minor]) ++ break; ++ ++ if (minor == MAX_BLKTAP_DEVICE) { ++ kfree(tap); ++ tap = NULL; ++ goto out; ++ } ++ ++ tap->minor = minor; ++ blktaps[minor] = tap; ++ ++out: ++ spin_unlock_irq(&blktap_control_lock); ++ return tap; ++} ++ ++static struct blktap * ++blktap_control_allocate_tap(void) ++{ ++ int err, minor; ++ struct blktap *tap; ++ ++ /* ++ * This is called only from the ioctl, which ++ * means we should always have interrupts enabled. 
++ */ ++ BUG_ON(irqs_disabled()); ++ ++ spin_lock_irq(&blktap_control_lock); ++ ++ for (minor = 0; minor < MAX_BLKTAP_DEVICE; minor++) { ++ tap = blktaps[minor]; ++ if (!tap) ++ goto found; ++ ++ if (!tap->dev_inuse) { ++ blktap_control_initialize_tap(tap); ++ goto found; ++ } ++ } ++ ++ tap = NULL; ++ ++found: ++ spin_unlock_irq(&blktap_control_lock); ++ ++ if (!tap) { ++ tap = blktap_control_create_tap(); ++ if (!tap) ++ return NULL; ++ } ++ ++ err = blktap_ring_create(tap); ++ if (err) { ++ BTERR("ring creation failed: %d\n", err); ++ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse); ++ return NULL; ++ } ++ ++ BTINFO("allocated tap %p\n", tap); ++ return tap; ++} ++ ++static int ++blktap_control_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ unsigned long dev; ++ struct blktap *tap; ++ ++ switch (cmd) { ++ case BLKTAP2_IOCTL_ALLOC_TAP: { ++ struct blktap_handle h; ++ ++ tap = blktap_control_allocate_tap(); ++ if (!tap) { ++ BTERR("error allocating device\n"); ++ return -ENOMEM; ++ } ++ ++ h.ring = ring_major; ++ h.device = device_major; ++ h.minor = tap->minor; ++ ++ if (copy_to_user((struct blktap_handle __user *)arg, ++ &h, sizeof(h))) { ++ blktap_control_destroy_device(tap); ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ ++ case BLKTAP2_IOCTL_FREE_TAP: ++ dev = arg; ++ ++ if (dev > MAX_BLKTAP_DEVICE || !blktaps[dev]) ++ return -EINVAL; ++ ++ blktap_control_destroy_device(blktaps[dev]); ++ return 0; ++ } ++ ++ return -ENOIOCTLCMD; ++} ++ ++static struct file_operations blktap_control_file_operations = { ++ .owner = THIS_MODULE, ++ .ioctl = blktap_control_ioctl, ++}; ++ ++static struct miscdevice blktap_misc = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "blktap-control", ++ .fops = &blktap_control_file_operations, ++}; ++ ++int ++blktap_control_destroy_device(struct blktap *tap) ++{ ++ int err; ++ unsigned long inuse; ++ ++ if (!tap) ++ return 0; ++ ++ set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse); ++ ++ for (;;) { ++ inuse = tap->dev_inuse; ++ err = blktap_device_destroy(tap); ++ if (err) ++ goto wait; ++ ++ inuse = tap->dev_inuse; ++ err = blktap_ring_destroy(tap); ++ if (err) ++ goto wait; ++ ++ inuse = tap->dev_inuse; ++ err = blktap_sysfs_destroy(tap); ++ if (err) ++ goto wait; ++ ++ break; ++ ++ wait: ++ BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n", ++ inuse, tap->dev_inuse); ++ if (wait_event_interruptible(tap->wq, tap->dev_inuse != inuse)) ++ break; ++ } ++ ++ clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse); ++ ++ if (tap->dev_inuse == (1UL << BLKTAP_CONTROL)) { ++ err = 0; ++ clear_bit(BLKTAP_CONTROL, &tap->dev_inuse); ++ } ++ ++ return err; ++} ++ ++static int __init ++blktap_control_init(void) ++{ ++ int err; ++ ++ err = misc_register(&blktap_misc); ++ if (err) { ++ BTERR("misc_register failed for control device"); ++ return err; ++ } ++ ++ blktap_control_registered = 1; ++ return 0; ++} ++ ++static void ++blktap_control_free(void) ++{ ++ int i; ++ ++ for (i = 0; i < MAX_BLKTAP_DEVICE; i++) ++ blktap_control_destroy_device(blktaps[i]); ++ ++ if (blktap_control_registered) ++ if (misc_deregister(&blktap_misc) < 0) ++ BTERR("misc_deregister failed for control device"); ++} ++ ++static void ++blktap_exit(void) ++{ ++ blktap_control_free(); ++ blktap_ring_free(); ++ blktap_sysfs_free(); ++ blktap_device_free(); ++ blktap_request_pool_free(); ++} ++ ++static int __init ++blktap_init(void) ++{ ++ int err; ++ ++ if (!xen_domain()) ++ return -ENODEV; ++ ++ err = blktap_request_pool_init(); ++ if (err) ++ return err; ++ ++ err = 
blktap_device_init(&device_major); ++ if (err) ++ goto fail; ++ ++ err = blktap_ring_init(&ring_major); ++ if (err) ++ goto fail; ++ ++ err = blktap_sysfs_init(); ++ if (err) ++ goto fail; ++ ++ err = blktap_control_init(); ++ if (err) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ blktap_exit(); ++ return err; ++} ++ ++module_init(blktap_init); ++module_exit(blktap_exit); ++MODULE_LICENSE("Dual BSD/GPL"); +diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c +new file mode 100644 +index 0000000..a50b622 +--- /dev/null ++++ b/drivers/xen/blktap/device.c +@@ -0,0 +1,1138 @@ ++#include /* XXX Remove uses of VERSION instead. */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++#include "blktap.h" ++ ++#include "../blkback/blkback-pagemap.h" ++ ++#if 0 ++#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a) ++#else ++#define DPRINTK_IOCTL(_f, _a...) ((void)0) ++#endif ++ ++struct blktap_grant_table { ++ int cnt; ++ struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; ++}; ++ ++static int blktap_device_major; ++ ++static inline struct blktap * ++dev_to_blktap(struct blktap_device *dev) ++{ ++ return container_of(dev, struct blktap, device); ++} ++ ++static int ++blktap_device_open(struct block_device * bd, fmode_t mode) ++{ ++ struct blktap *tap; ++ struct blktap_device *dev = bd->bd_disk->private_data; ++ ++ if (!dev) ++ return -ENOENT; ++ ++ tap = dev_to_blktap(dev); ++ if (!blktap_active(tap) || ++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) ++ return -ENOENT; ++ ++ dev->users++; ++ ++ return 0; ++} ++ ++static int ++blktap_device_release(struct gendisk *gd, fmode_t mode) ++{ ++ struct blktap_device *dev = gd->private_data; ++ struct blktap *tap = dev_to_blktap(dev); ++ ++ dev->users--; ++ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) ++ blktap_device_destroy(tap); ++ ++ return 0; ++} ++ ++static int ++blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg) ++{ ++ /* We don't have real geometry info, but let's at least return ++ values consistent with the size of the device */ ++ sector_t nsect = get_capacity(bd->bd_disk); ++ sector_t cylinders = nsect; ++ ++ hg->heads = 0xff; ++ hg->sectors = 0x3f; ++ sector_div(cylinders, hg->heads * hg->sectors); ++ hg->cylinders = cylinders; ++ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) ++ hg->cylinders = 0xffff; ++ return 0; ++} ++ ++static int ++blktap_device_ioctl(struct block_device *bd, fmode_t mode, ++ unsigned command, unsigned long argument) ++{ ++ int i; ++ ++ DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", ++ command, (long)argument, inode->i_rdev); ++ ++ switch (command) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) ++ case HDIO_GETGEO: { ++ struct hd_geometry geo; ++ int ret; ++ ++ if (!argument) ++ return -EINVAL; ++ ++ geo.start = get_start_sect(bd); ++ ret = blktap_device_getgeo(bd, &geo); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user((struct hd_geometry __user *)argument, &geo, ++ sizeof(geo))) ++ return -EFAULT; ++ ++ return 0; ++ } ++#endif ++ case CDROMMULTISESSION: ++ BTDBG("FIXME: support multisession CDs later\n"); ++ for (i = 0; i < sizeof(struct cdrom_multisession); i++) ++ if (put_user(0, (char __user *)(argument + i))) ++ return -EFAULT; ++ return 0; ++ ++ case SCSI_IOCTL_GET_IDLUN: ++ if (!access_ok(VERIFY_WRITE, argument, ++ sizeof(struct scsi_idlun))) ++ return -EFAULT; ++ ++ /* return 0 for now. 
*/ ++ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id); ++ __put_user(0, ++ &((struct scsi_idlun __user *)argument)->host_unique_id); ++ return 0; ++ ++ default: ++ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", ++ command);*/ ++ return -EINVAL; /* same return as native Linux */ ++ } ++ ++ return 0; ++} ++ ++static struct block_device_operations blktap_device_file_operations = { ++ .owner = THIS_MODULE, ++ .open = blktap_device_open, ++ .release = blktap_device_release, ++ .ioctl = blktap_device_ioctl, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) ++ .getgeo = blktap_device_getgeo ++#endif ++}; ++ ++static int ++blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page, ++ unsigned long addr, void *data) ++{ ++ pte_t *pte = (pte_t *)data; ++ ++ BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte)); ++ set_pte(ptep, *pte); ++ return 0; ++} ++ ++static int ++blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte) ++{ ++ return apply_to_page_range(mm, address, ++ PAGE_SIZE, blktap_map_uaddr_fn, &pte); ++} ++ ++static int ++blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page, ++ unsigned long addr, void *data) ++{ ++ struct mm_struct *mm = (struct mm_struct *)data; ++ ++ BTDBG("ptep %p\n", ptep); ++ pte_clear(mm, addr, ptep); ++ return 0; ++} ++ ++static int ++blktap_umap_uaddr(struct mm_struct *mm, unsigned long address) ++{ ++ return apply_to_page_range(mm, address, ++ PAGE_SIZE, blktap_umap_uaddr_fn, mm); ++} ++ ++static inline void ++flush_tlb_kernel_page(unsigned long kvaddr) ++{ ++ flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE); ++} ++ ++static void ++blktap_device_end_dequeued_request(struct blktap_device *dev, ++ struct request *req, int error) ++{ ++ unsigned long flags; ++ int ret; ++ ++ //spin_lock_irq(&dev->lock); ++ spin_lock_irqsave(dev->gd->queue->queue_lock, flags); ++ ret = __blk_end_request(req, error, blk_rq_bytes(req)); ++ spin_unlock_irqrestore(dev->gd->queue->queue_lock, flags); ++ //spin_unlock_irq(&dev->lock); ++ ++ BUG_ON(ret); ++} ++ ++/* ++ * tap->tap_sem held on entry ++ */ ++static void ++blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request) ++{ ++ uint64_t ptep; ++ int ret, usr_idx; ++ unsigned int i, cnt; ++ struct page **map, *page; ++ struct blktap_ring *ring; ++ struct grant_handle_pair *khandle; ++ unsigned long kvaddr, uvaddr, offset; ++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; ++ ++ cnt = 0; ++ ring = &tap->ring; ++ usr_idx = request->usr_idx; ++ map = ring->foreign_map.map; ++ ++ if (!ring->vma) ++ return; ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ zap_page_range(ring->vma, ++ MMAP_VADDR(ring->user_vstart, usr_idx, 0), ++ request->nr_pages << PAGE_SHIFT, NULL); ++ ++ for (i = 0; i < request->nr_pages; i++) { ++ kvaddr = request_to_kaddr(request, i); ++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); ++ ++ khandle = request->handles + i; ++ ++ if (khandle->kernel != INVALID_GRANT_HANDLE) { ++ gnttab_set_unmap_op(&unmap[cnt], kvaddr, ++ GNTMAP_host_map, khandle->kernel); ++ cnt++; ++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, ++ INVALID_P2M_ENTRY); ++ } ++ ++ if (khandle->user != INVALID_GRANT_HANDLE) { ++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); ++ if (create_lookup_pte_addr(ring->vma->vm_mm, ++ uvaddr, &ptep) != 0) { ++ BTERR("Couldn't get a pte addr!\n"); ++ return; ++ } ++ ++ gnttab_set_unmap_op(&unmap[cnt], ptep, ++ GNTMAP_host_map ++ | GNTMAP_application_map ++ | GNTMAP_contains_pte, ++ 
khandle->user); ++ cnt++; ++ } ++ ++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT; ++ ++ BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, " ++ "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: " ++ "0x%08lx, handle: %u\n", offset, map[offset], request, ++ usr_idx, i, kvaddr, khandle->kernel, uvaddr, ++ khandle->user); ++ ++ page = map[offset]; ++ if (page) { ++ ClearPageReserved(map[offset]); ++ if (blkback_pagemap_contains_page(page)) ++ set_page_private(page, 0); ++ } ++ map[offset] = NULL; ++ ++ khandle->kernel = INVALID_GRANT_HANDLE; ++ khandle->user = INVALID_GRANT_HANDLE; ++ } ++ ++ if (cnt) { ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ++ unmap, cnt); ++ BUG_ON(ret); ++ } ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) ++ zap_page_range(ring->vma, ++ MMAP_VADDR(ring->user_vstart, usr_idx, 0), ++ request->nr_pages << PAGE_SHIFT, NULL); ++} ++ ++/* ++ * tap->tap_sem held on entry ++ */ ++static void ++blktap_unmap(struct blktap *tap, struct blktap_request *request) ++{ ++ int i, usr_idx; ++ unsigned long kvaddr; ++ ++ usr_idx = request->usr_idx; ++ down_write(&tap->ring.vma->vm_mm->mmap_sem); ++ ++ for (i = 0; i < request->nr_pages; i++) { ++ kvaddr = request_to_kaddr(request, i); ++ BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, " ++ "uvaddr: 0x%08lx, uhandle: %u\n", request, i, ++ kvaddr, request->handles[i].kernel, ++ MMAP_VADDR(tap->ring.user_vstart, usr_idx, i), ++ request->handles[i].user); ++ ++ if (request->handles[i].kernel == INVALID_GRANT_HANDLE) { ++ blktap_umap_uaddr(tap->ring.vma->vm_mm, kvaddr); ++ flush_tlb_kernel_page(kvaddr); ++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, ++ INVALID_P2M_ENTRY); ++ } ++ } ++ ++ blktap_device_fast_flush(tap, request); ++ up_write(&tap->ring.vma->vm_mm->mmap_sem); ++} ++ ++/* ++ * called if the tapdisk process dies unexpectedly. ++ * fail and release any pending requests and disable queue. ++ */ ++void ++blktap_device_fail_pending_requests(struct blktap *tap) ++{ ++ int usr_idx; ++ struct request *req; ++ struct blktap_device *dev; ++ struct blktap_request *request; ++ ++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) ++ return; ++ ++ down_write(&tap->tap_sem); ++ ++ dev = &tap->device; ++ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) { ++ request = tap->pending_requests[usr_idx]; ++ if (!request || request->status != BLKTAP_REQUEST_PENDING) ++ continue; ++ ++ BTERR("%u:%u: failing pending %s of %d pages\n", ++ blktap_device_major, tap->minor, ++ (request->operation == BLKIF_OP_READ ? ++ "read" : "write"), request->nr_pages); ++ ++ blktap_unmap(tap, request); ++ req = (struct request *)(unsigned long)request->id; ++ blktap_device_end_dequeued_request(dev, req, -EIO); ++ blktap_request_free(tap, request); ++ } ++ ++ up_write(&tap->tap_sem); ++ ++ spin_lock_irq(&dev->lock); ++ ++ /* fail any future requests */ ++ dev->gd->queue->queuedata = NULL; ++ blk_start_queue(dev->gd->queue); ++ ++ spin_unlock_irq(&dev->lock); ++} ++ ++/* ++ * tap->tap_sem held on entry ++ */ ++void ++blktap_device_finish_request(struct blktap *tap, ++ struct blkif_response *res, ++ struct blktap_request *request) ++{ ++ int ret; ++ struct request *req; ++ struct blktap_device *dev; ++ ++ dev = &tap->device; ++ ++ blktap_unmap(tap, request); ++ ++ req = (struct request *)(unsigned long)request->id; ++ ret = res->status == BLKIF_RSP_OKAY ? 
0 : -EIO; ++ ++ BTDBG("req %p res status %d operation %d/%d id %lld\n", req, ++ res->status, res->operation, request->operation, ++ (unsigned long long)res->id); ++ ++ switch (request->operation) { ++ case BLKIF_OP_READ: ++ case BLKIF_OP_WRITE: ++ if (unlikely(res->status != BLKIF_RSP_OKAY)) ++ BTERR("Bad return from device data " ++ "request: %x\n", res->status); ++ blktap_device_end_dequeued_request(dev, req, ret); ++ break; ++ default: ++ BUG(); ++ } ++ ++ blktap_request_free(tap, request); ++} ++ ++static int ++blktap_prep_foreign(struct blktap *tap, ++ struct blktap_request *request, ++ struct blkif_request *blkif_req, ++ unsigned int seg, struct page *page, ++ struct blktap_grant_table *table) ++{ ++ uint64_t ptep; ++ uint32_t flags; ++#ifdef BLKTAP_CHAINED_BLKTAP ++ struct page *tap_page; ++#endif ++ struct blktap_ring *ring; ++ struct blkback_pagemap map; ++ unsigned long uvaddr, kvaddr; ++ ++ ring = &tap->ring; ++ map = blkback_pagemap_read(page); ++ blkif_req->seg[seg].gref = map.gref; ++ ++ uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg); ++ kvaddr = request_to_kaddr(request, seg); ++ flags = GNTMAP_host_map | ++ (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0); ++ ++ gnttab_set_map_op(&table->grants[table->cnt], ++ kvaddr, flags, map.gref, map.domid); ++ table->cnt++; ++ ++ ++#ifdef BLKTAP_CHAINED_BLKTAP ++ /* enable chained tap devices */ ++ tap_page = request_to_page(request, seg); ++ set_page_private(tap_page, page_private(page)); ++ SetPageBlkback(tap_page); ++#endif ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ return 0; ++ ++ if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) { ++ BTERR("couldn't get a pte addr!\n"); ++ return -1; ++ } ++ ++ flags |= GNTMAP_application_map | GNTMAP_contains_pte; ++ gnttab_set_map_op(&table->grants[table->cnt], ++ ptep, flags, map.gref, map.domid); ++ table->cnt++; ++ ++ return 0; ++} ++ ++static int ++blktap_map_foreign(struct blktap *tap, ++ struct blktap_request *request, ++ struct blkif_request *blkif_req, ++ struct blktap_grant_table *table) ++{ ++ struct page *page; ++ int i, grant, err, usr_idx; ++ struct blktap_ring *ring; ++ unsigned long uvaddr, foreign_mfn; ++ ++ if (!table->cnt) ++ return 0; ++ ++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ table->grants, table->cnt); ++ BUG_ON(err); ++ ++ grant = 0; ++ usr_idx = request->usr_idx; ++ ring = &tap->ring; ++ ++ for (i = 0; i < request->nr_pages; i++) { ++ if (!blkif_req->seg[i].gref) ++ continue; ++ ++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); ++ ++ if (unlikely(table->grants[grant].status)) { ++ BTERR("invalid kernel buffer: could not remap it\n"); ++ err |= 1; ++ table->grants[grant].handle = INVALID_GRANT_HANDLE; ++ } ++ ++ request->handles[i].kernel = table->grants[grant].handle; ++ foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT; ++ grant++; ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ goto done; ++ ++ if (unlikely(table->grants[grant].status)) { ++ BTERR("invalid user buffer: could not remap it\n"); ++ err |= 1; ++ table->grants[grant].handle = INVALID_GRANT_HANDLE; ++ } ++ ++ request->handles[i].user = table->grants[grant].handle; ++ grant++; ++ ++ done: ++ if (err) ++ continue; ++ ++ page = request_to_page(request, i); ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) ++ set_phys_to_machine(page_to_pfn(page), ++ FOREIGN_FRAME(foreign_mfn)); ++ else if (vm_insert_page(ring->vma, uvaddr, page)) ++ err |= 1; ++ ++ BTDBG("pending_req: %p, seg: %d, page: %p, " ++ 
"kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, " ++ "uhandle: %u\n", request, i, page, ++ pfn_to_kaddr(page_to_pfn(page)), ++ request->handles[i].kernel, ++ uvaddr, request->handles[i].user); ++ } ++ ++ return err; ++} ++ ++static void ++blktap_map(struct blktap *tap, ++ struct blktap_request *request, ++ unsigned int seg, struct page *page) ++{ ++ pte_t pte; ++ int usr_idx; ++ struct blktap_ring *ring; ++ unsigned long uvaddr, kvaddr; ++ ++ ring = &tap->ring; ++ usr_idx = request->usr_idx; ++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg); ++ kvaddr = request_to_kaddr(request, seg); ++ ++ pte = mk_pte(page, ring->vma->vm_page_prot); ++ blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte)); ++ flush_tlb_page(ring->vma, uvaddr); ++ blktap_map_uaddr(ring->vma->vm_mm, kvaddr, mk_pte(page, PAGE_KERNEL)); ++ flush_tlb_kernel_page(kvaddr); ++ ++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte)); ++ request->handles[seg].kernel = INVALID_GRANT_HANDLE; ++ request->handles[seg].user = INVALID_GRANT_HANDLE; ++ ++ BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, " ++ "uvaddr: 0x%08lx\n", request, seg, page, kvaddr, ++ uvaddr); ++} ++ ++static int ++blktap_device_process_request(struct blktap *tap, ++ struct blktap_request *request, ++ struct request *req) ++{ ++ struct page *page; ++ int i, usr_idx, err; ++ struct blktap_ring *ring; ++ struct scatterlist *sg; ++ struct blktap_grant_table table; ++ unsigned int fsect, lsect, nr_sects; ++ unsigned long offset, uvaddr; ++ struct blkif_request blkif_req, *target; ++ ++ err = -1; ++ memset(&table, 0, sizeof(table)); ++ ++ if (!blktap_active(tap)) ++ goto out; ++ ++ ring = &tap->ring; ++ usr_idx = request->usr_idx; ++ blkif_req.id = usr_idx; ++ blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req); ++ blkif_req.handle = 0; ++ blkif_req.operation = rq_data_dir(req) ? ++ BLKIF_OP_WRITE : BLKIF_OP_READ; ++ ++ request->id = (unsigned long)req; ++ request->operation = blkif_req.operation; ++ request->status = BLKTAP_REQUEST_PENDING; ++ do_gettimeofday(&request->time); ++ ++ nr_sects = 0; ++ request->nr_pages = 0; ++ blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg); ++ BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ for (i = 0; i < blkif_req.nr_segments; ++i) { ++ sg = tap->sg + i; ++ fsect = sg->offset >> 9; ++ lsect = fsect + (sg->length >> 9) - 1; ++ nr_sects += sg->length >> 9; ++ ++ blkif_req.seg[i] = ++ (struct blkif_request_segment) { ++ .gref = 0, ++ .first_sect = fsect, ++ .last_sect = lsect }; ++ ++ if (blkback_pagemap_contains_page(sg_page(sg))) { ++ /* foreign page -- use xen */ ++ if (blktap_prep_foreign(tap, ++ request, ++ &blkif_req, ++ i, ++ sg_page(sg), ++ &table)) ++ goto out; ++ } else { ++ /* do it the old fashioned way */ ++ blktap_map(tap, ++ request, ++ i, ++ sg_page(sg)); ++ } ++ ++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i); ++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT; ++ page = request_to_page(request, i); ++ ring->foreign_map.map[offset] = page; ++ SetPageReserved(page); ++ ++ BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n", ++ uvaddr, page, page_to_pfn(page)); ++ BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, " ++ "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n", ++ offset, request, i, ++ page, pfn_to_kaddr(page_to_pfn(page)), uvaddr); ++ ++ request->nr_pages++; ++ } ++ ++ if (blktap_map_foreign(tap, request, &blkif_req, &table)) ++ goto out; ++ ++ /* Finally, write the request message to the user ring. 
*/ ++ target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt); ++ memcpy(target, &blkif_req, sizeof(blkif_req)); ++ target->id = request->usr_idx; ++ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ ++ ring->ring.req_prod_pvt++; ++ ++ if (rq_data_dir(req)) { ++ tap->stats.st_wr_sect += nr_sects; ++ tap->stats.st_wr_req++; ++ } else { ++ tap->stats.st_rd_sect += nr_sects; ++ tap->stats.st_rd_req++; ++ } ++ ++ err = 0; ++ ++out: ++ if (err) ++ blktap_device_fast_flush(tap, request); ++ return err; ++} ++ ++#ifdef ENABLE_PASSTHROUGH ++#define rq_for_each_bio_safe(_bio, _tmp, _req) \ ++ if ((_req)->bio) \ ++ for (_bio = (_req)->bio; \ ++ _bio && ((_tmp = _bio->bi_next) || 1); \ ++ _bio = _tmp) ++ ++static void ++blktap_device_forward_request(struct blktap *tap, struct request *req) ++{ ++ struct bio *bio, *tmp; ++ struct blktap_device *dev; ++ ++ dev = &tap->device; ++ ++ rq_for_each_bio_safe(bio, tmp, req) { ++ bio->bi_bdev = dev->bdev; ++ submit_bio(bio->bi_rw, bio); ++ } ++} ++ ++static void ++blktap_device_close_bdev(struct blktap *tap) ++{ ++ struct blktap_device *dev; ++ ++ dev = &tap->device; ++ ++ if (dev->bdev) ++ blkdev_put(dev->bdev); ++ ++ dev->bdev = NULL; ++ clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); ++} ++ ++static int ++blktap_device_open_bdev(struct blktap *tap, u32 pdev) ++{ ++ struct block_device *bdev; ++ struct blktap_device *dev; ++ ++ dev = &tap->device; ++ ++ bdev = open_by_devnum(pdev, FMODE_WRITE); ++ if (IS_ERR(bdev)) { ++ BTERR("opening device %x:%x failed: %ld\n", ++ MAJOR(pdev), MINOR(pdev), PTR_ERR(bdev)); ++ return PTR_ERR(bdev); ++ } ++ ++ if (!bdev->bd_disk) { ++ BTERR("device %x:%x doesn't exist\n", ++ MAJOR(pdev), MINOR(pdev)); ++ blkdev_put(dev->bdev); ++ return -ENOENT; ++ } ++ ++ dev->bdev = bdev; ++ set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse); ++ ++ /* TODO: readjust queue parameters */ ++ ++ BTINFO("set device %d to passthrough on %x:%x\n", ++ tap->minor, MAJOR(pdev), MINOR(pdev)); ++ ++ return 0; ++} ++ ++int ++blktap_device_enable_passthrough(struct blktap *tap, ++ unsigned major, unsigned minor) ++{ ++ u32 pdev; ++ struct blktap_device *dev; ++ ++ dev = &tap->device; ++ pdev = MKDEV(major, minor); ++ ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return -EINVAL; ++ ++ if (dev->bdev) { ++ if (pdev) ++ return -EINVAL; ++ blktap_device_close_bdev(tap); ++ return 0; ++ } ++ ++ return blktap_device_open_bdev(tap, pdev); ++} ++#endif ++ ++/* ++ * dev->lock held on entry ++ */ ++static void ++blktap_device_run_queue(struct blktap *tap) ++{ ++ int queued, err; ++ struct request_queue *rq; ++ struct request *req; ++ struct blktap_ring *ring; ++ struct blktap_device *dev; ++ struct blktap_request *request; ++ ++ queued = 0; ++ ring = &tap->ring; ++ dev = &tap->device; ++ rq = dev->gd->queue; ++ ++ BTDBG("running queue for %d\n", tap->minor); ++ ++ while ((req = blk_peek_request(rq)) != NULL) { ++ if (!blk_fs_request(req)) { ++ __blk_end_request_cur(req, 0); ++ continue; ++ } ++ ++ if (blk_barrier_rq(req)) { ++ __blk_end_request_cur(req, 0); ++ continue; ++ } ++ ++#ifdef ENABLE_PASSTHROUGH ++ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { ++ blkdev_dequeue_request(req); ++ blktap_device_forward_request(tap, req); ++ continue; ++ } ++#endif ++ ++ if (RING_FULL(&ring->ring)) { ++ wait: ++ /* Avoid pointless unplugs. 
*/ ++ blk_stop_queue(rq); ++ blktap_defer(tap); ++ break; ++ } ++ ++ request = blktap_request_allocate(tap); ++ if (!request) { ++ tap->stats.st_oo_req++; ++ goto wait; ++ } ++ ++ BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) " ++ "buffer:%p [%s], pending: %p\n", req, tap->minor, ++ req->cmd, (unsigned long long)blk_rq_pos(req), ++ blk_rq_cur_sectors(req), ++ blk_rq_sectors(req), req->buffer, ++ rq_data_dir(req) ? "write" : "read", request); ++ ++ blk_start_request(req); ++ ++ spin_unlock_irq(&dev->lock); ++ down_read(&tap->tap_sem); ++ ++ err = blktap_device_process_request(tap, request, req); ++ if (!err) ++ queued++; ++ else { ++ blktap_device_end_dequeued_request(dev, req, -EIO); ++ blktap_request_free(tap, request); ++ } ++ ++ up_read(&tap->tap_sem); ++ spin_lock_irq(&dev->lock); ++ } ++ ++ if (queued) ++ blktap_ring_kick_user(tap); ++} ++ ++/* ++ * dev->lock held on entry ++ */ ++static void ++blktap_device_do_request(struct request_queue *rq) ++{ ++ struct request *req; ++ struct blktap *tap; ++ struct blktap_device *dev; ++ ++ dev = rq->queuedata; ++ if (!dev) ++ goto fail; ++ ++ tap = dev_to_blktap(dev); ++ if (!blktap_active(tap)) ++ goto fail; ++ ++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || ++ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { ++ blktap_defer(tap); ++ return; ++ } ++ ++ blktap_device_run_queue(tap); ++ return; ++ ++fail: ++ while ((req = blk_peek_request(rq))) { ++ BTERR("device closed: failing secs %llu - %llu\n", ++ (unsigned long long)blk_rq_pos(req), ++ (unsigned long long)blk_rq_pos(req) + blk_rq_sectors(req)); ++ __blk_end_request_cur(req, 0); ++ } ++} ++ ++void ++blktap_device_restart(struct blktap *tap) ++{ ++ struct blktap_device *dev; ++ ++ dev = &tap->device; ++ ++ if (blktap_active(tap) && RING_FULL(&tap->ring.ring)) { ++ blktap_defer(tap); ++ return; ++ } ++ ++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) || ++ test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { ++ blktap_defer(tap); ++ return; ++ } ++ ++ spin_lock_irq(&dev->lock); ++ ++ /* Re-enable calldowns. */ ++ if (dev->gd) { ++ struct request_queue *rq = dev->gd->queue; ++ ++ if (blk_queue_stopped(rq)) ++ blk_start_queue(rq); ++ ++ /* Kick things off immediately. */ ++ blktap_device_do_request(rq); ++ } ++ ++ spin_unlock_irq(&dev->lock); ++} ++ ++static void ++blktap_device_configure(struct blktap *tap) ++{ ++ struct request_queue *rq; ++ struct blktap_device *dev = &tap->device; ++ ++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !dev->gd) ++ return; ++ ++ dev = &tap->device; ++ rq = dev->gd->queue; ++ ++ spin_lock_irq(&dev->lock); ++ ++ set_capacity(dev->gd, tap->params.capacity); ++ ++ /* Hard sector size and max sectors impersonate the equiv. hardware. */ ++ blk_queue_logical_block_size(rq, tap->params.sector_size); ++ blk_queue_max_sectors(rq, 512); ++ ++ /* Each segment in a request is up to an aligned page in size. */ ++ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); ++ blk_queue_max_segment_size(rq, PAGE_SIZE); ++ ++ /* Ensure a merged request will fit in a single I/O ring slot. */ ++ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ ++ /* Make sure buffer addresses are sector-aligned. 
*/ ++ blk_queue_dma_alignment(rq, 511); ++ ++ spin_unlock_irq(&dev->lock); ++} ++ ++int ++blktap_device_resume(struct blktap *tap) ++{ ++ int err; ++ ++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) ++ return -ENODEV; ++ ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return 0; ++ ++ err = blktap_ring_resume(tap); ++ if (err) ++ return err; ++ ++ /* device size may have changed */ ++ blktap_device_configure(tap); ++ ++ BTDBG("restarting device\n"); ++ blktap_device_restart(tap); ++ ++ return 0; ++} ++ ++int ++blktap_device_pause(struct blktap *tap) ++{ ++ unsigned long flags; ++ struct blktap_device *dev = &tap->device; ++ ++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse) || !blktap_active(tap)) ++ return -ENODEV; ++ ++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return 0; ++ ++ spin_lock_irqsave(&dev->lock, flags); ++ ++ blk_stop_queue(dev->gd->queue); ++ set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); ++ ++ spin_unlock_irqrestore(&dev->lock, flags); ++ ++ return blktap_ring_pause(tap); ++} ++ ++int ++blktap_device_destroy(struct blktap *tap) ++{ ++ struct blktap_device *dev = &tap->device; ++ struct gendisk *gd = dev->gd; ++ ++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) ++ return 0; ++ ++ BTINFO("destroy device %d users %d\n", tap->minor, dev->users); ++ ++ if (dev->users) ++ return -EBUSY; ++ ++ spin_lock_irq(&dev->lock); ++ /* No more blktap_device_do_request(). */ ++ blk_stop_queue(gd->queue); ++ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse); ++ dev->gd = NULL; ++ spin_unlock_irq(&dev->lock); ++ ++#ifdef ENABLE_PASSTHROUGH ++ if (dev->bdev) ++ blktap_device_close_bdev(tap); ++#endif ++ ++ del_gendisk(gd); ++ blk_cleanup_queue(gd->queue); ++ put_disk(gd); ++ ++ wake_up(&tap->wq); ++ ++ return 0; ++} ++ ++int ++blktap_device_create(struct blktap *tap) ++{ ++ int minor, err; ++ struct gendisk *gd; ++ struct request_queue *rq; ++ struct blktap_device *dev; ++ ++ gd = NULL; ++ rq = NULL; ++ dev = &tap->device; ++ minor = tap->minor; ++ ++ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) ++ return -EEXIST; ++ ++ if (blktap_validate_params(tap, &tap->params)) ++ return -EINVAL; ++ ++ BTINFO("minor %d sectors %Lu sector-size %lu\n", ++ minor, tap->params.capacity, tap->params.sector_size); ++ ++ err = -ENODEV; ++ ++ gd = alloc_disk(1); ++ if (!gd) ++ goto error; ++ ++ if (minor < 26) ++ sprintf(gd->disk_name, "tapdev%c", 'a' + minor); ++ else ++ sprintf(gd->disk_name, "tapdev%c%c", ++ 'a' + ((minor / 26) - 1), 'a' + (minor % 26)); ++ ++ gd->major = blktap_device_major; ++ gd->first_minor = minor; ++ gd->fops = &blktap_device_file_operations; ++ gd->private_data = dev; ++ ++ spin_lock_init(&dev->lock); ++ rq = blk_init_queue(blktap_device_do_request, &dev->lock); ++ if (!rq) ++ goto error; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) ++ elevator_init(rq, "noop"); ++#else ++ elevator_init(rq, &elevator_noop); ++#endif ++ ++ gd->queue = rq; ++ rq->queuedata = dev; ++ dev->gd = gd; ++ ++ set_bit(BLKTAP_DEVICE, &tap->dev_inuse); ++ blktap_device_configure(tap); ++ ++ add_disk(gd); ++ ++ err = 0; ++ goto out; ++ ++ error: ++ if (gd) ++ del_gendisk(gd); ++ if (rq) ++ blk_cleanup_queue(rq); ++ ++ out: ++ BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err); ++ return err; ++} ++ ++int __init ++blktap_device_init(int *maj) ++{ ++ int major; ++ ++ /* Dynamically allocate a major for this device */ ++ major = register_blkdev(0, "tapdev"); ++ if (major < 0) { ++ BTERR("Couldn't register blktap device\n"); ++ return -ENOMEM; ++ } ++ ++ 
blktap_device_major = *maj = major; ++ BTINFO("blktap device major %d\n", major); ++ ++ return 0; ++} ++ ++void ++blktap_device_free(void) ++{ ++ if (blktap_device_major) ++ unregister_blkdev(blktap_device_major, "tapdev"); ++} +diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c +new file mode 100644 +index 0000000..770736a +--- /dev/null ++++ b/drivers/xen/blktap/request.c +@@ -0,0 +1,297 @@ ++#include ++#include ++#include ++ ++#include "blktap.h" ++ ++#define MAX_BUCKETS 8 ++#define BUCKET_SIZE MAX_PENDING_REQS ++ ++#define BLKTAP_POOL_CLOSING 1 ++ ++struct blktap_request_bucket; ++ ++struct blktap_request_handle { ++ int slot; ++ uint8_t inuse; ++ struct blktap_request request; ++ struct blktap_request_bucket *bucket; ++}; ++ ++struct blktap_request_bucket { ++ atomic_t reqs_in_use; ++ struct blktap_request_handle handles[BUCKET_SIZE]; ++ struct page **foreign_pages; ++}; ++ ++struct blktap_request_pool { ++ spinlock_t lock; ++ uint8_t status; ++ struct list_head free_list; ++ atomic_t reqs_in_use; ++ wait_queue_head_t wait_queue; ++ struct blktap_request_bucket *buckets[MAX_BUCKETS]; ++}; ++ ++static struct blktap_request_pool pool; ++ ++static inline struct blktap_request_handle * ++blktap_request_to_handle(struct blktap_request *req) ++{ ++ return container_of(req, struct blktap_request_handle, request); ++} ++ ++static void ++blktap_request_pool_init_request(struct blktap_request *request) ++{ ++ int i; ++ ++ request->usr_idx = -1; ++ request->nr_pages = 0; ++ request->status = BLKTAP_REQUEST_FREE; ++ INIT_LIST_HEAD(&request->free_list); ++ for (i = 0; i < ARRAY_SIZE(request->handles); i++) { ++ request->handles[i].user = INVALID_GRANT_HANDLE; ++ request->handles[i].kernel = INVALID_GRANT_HANDLE; ++ } ++} ++ ++static int ++blktap_request_pool_allocate_bucket(void) ++{ ++ int i, idx; ++ unsigned long flags; ++ struct blktap_request *request; ++ struct blktap_request_handle *handle; ++ struct blktap_request_bucket *bucket; ++ ++ bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL); ++ if (!bucket) ++ goto fail; ++ ++ bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES); ++ if (!bucket->foreign_pages) ++ goto fail; ++ ++ spin_lock_irqsave(&pool.lock, flags); ++ ++ idx = -1; ++ for (i = 0; i < MAX_BUCKETS; i++) { ++ if (!pool.buckets[i]) { ++ idx = i; ++ pool.buckets[idx] = bucket; ++ break; ++ } ++ } ++ ++ if (idx == -1) { ++ spin_unlock_irqrestore(&pool.lock, flags); ++ goto fail; ++ } ++ ++ for (i = 0; i < BUCKET_SIZE; i++) { ++ handle = bucket->handles + i; ++ request = &handle->request; ++ ++ handle->slot = i; ++ handle->inuse = 0; ++ handle->bucket = bucket; ++ ++ blktap_request_pool_init_request(request); ++ list_add_tail(&request->free_list, &pool.free_list); ++ } ++ ++ spin_unlock_irqrestore(&pool.lock, flags); ++ ++ return 0; ++ ++fail: ++ if (bucket && bucket->foreign_pages) ++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); ++ kfree(bucket); ++ return -ENOMEM; ++} ++ ++static void ++blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket) ++{ ++ if (!bucket) ++ return; ++ ++ BTDBG("freeing bucket %p\n", bucket); ++ ++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES); ++ kfree(bucket); ++} ++ ++struct page * ++request_to_page(struct blktap_request *req, int seg) ++{ ++ struct blktap_request_handle *handle = blktap_request_to_handle(req); ++ int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; ++ return handle->bucket->foreign_pages[idx]; ++} ++ ++int 
++blktap_request_pool_shrink(void) ++{ ++ int i, err; ++ unsigned long flags; ++ struct blktap_request_bucket *bucket; ++ ++ err = -EAGAIN; ++ ++ spin_lock_irqsave(&pool.lock, flags); ++ ++ /* always keep at least one bucket */ ++ for (i = 1; i < MAX_BUCKETS; i++) { ++ bucket = pool.buckets[i]; ++ if (!bucket) ++ continue; ++ ++ if (atomic_read(&bucket->reqs_in_use)) ++ continue; ++ ++ blktap_request_pool_free_bucket(bucket); ++ pool.buckets[i] = NULL; ++ err = 0; ++ break; ++ } ++ ++ spin_unlock_irqrestore(&pool.lock, flags); ++ ++ return err; ++} ++ ++int ++blktap_request_pool_grow(void) ++{ ++ return blktap_request_pool_allocate_bucket(); ++} ++ ++struct blktap_request * ++blktap_request_allocate(struct blktap *tap) ++{ ++ int i; ++ uint16_t usr_idx; ++ unsigned long flags; ++ struct blktap_request *request; ++ ++ usr_idx = -1; ++ request = NULL; ++ ++ spin_lock_irqsave(&pool.lock, flags); ++ ++ if (pool.status == BLKTAP_POOL_CLOSING) ++ goto out; ++ ++ for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++) ++ if (!tap->pending_requests[i]) { ++ usr_idx = i; ++ break; ++ } ++ ++ if (usr_idx == (uint16_t)-1) ++ goto out; ++ ++ if (!list_empty(&pool.free_list)) { ++ request = list_entry(pool.free_list.next, ++ struct blktap_request, free_list); ++ list_del(&request->free_list); ++ } ++ ++ if (request) { ++ struct blktap_request_handle *handle; ++ ++ atomic_inc(&pool.reqs_in_use); ++ ++ handle = blktap_request_to_handle(request); ++ atomic_inc(&handle->bucket->reqs_in_use); ++ handle->inuse = 1; ++ ++ request->usr_idx = usr_idx; ++ ++ tap->pending_requests[usr_idx] = request; ++ tap->pending_cnt++; ++ } ++ ++out: ++ spin_unlock_irqrestore(&pool.lock, flags); ++ return request; ++} ++ ++void ++blktap_request_free(struct blktap *tap, struct blktap_request *request) ++{ ++ int free; ++ unsigned long flags; ++ struct blktap_request_handle *handle; ++ ++ BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests)); ++ handle = blktap_request_to_handle(request); ++ ++ spin_lock_irqsave(&pool.lock, flags); ++ ++ handle->inuse = 0; ++ tap->pending_requests[request->usr_idx] = NULL; ++ blktap_request_pool_init_request(request); ++ list_add(&request->free_list, &pool.free_list); ++ atomic_dec(&handle->bucket->reqs_in_use); ++ free = atomic_dec_and_test(&pool.reqs_in_use); ++ ++ spin_unlock_irqrestore(&pool.lock, flags); ++ ++ if (--tap->pending_cnt == 0) ++ wake_up_interruptible(&tap->wq); ++ ++ if (free) ++ wake_up(&pool.wait_queue); ++} ++ ++void ++blktap_request_pool_free(void) ++{ ++ int i; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pool.lock, flags); ++ ++ pool.status = BLKTAP_POOL_CLOSING; ++ while (atomic_read(&pool.reqs_in_use)) { ++ spin_unlock_irqrestore(&pool.lock, flags); ++ wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use)); ++ spin_lock_irqsave(&pool.lock, flags); ++ } ++ ++ for (i = 0; i < MAX_BUCKETS; i++) { ++ blktap_request_pool_free_bucket(pool.buckets[i]); ++ pool.buckets[i] = NULL; ++ } ++ ++ spin_unlock_irqrestore(&pool.lock, flags); ++} ++ ++int __init ++blktap_request_pool_init(void) ++{ ++ int i, err; ++ ++ memset(&pool, 0, sizeof(pool)); ++ ++ spin_lock_init(&pool.lock); ++ INIT_LIST_HEAD(&pool.free_list); ++ atomic_set(&pool.reqs_in_use, 0); ++ init_waitqueue_head(&pool.wait_queue); ++ ++ for (i = 0; i < 2; i++) { ++ err = blktap_request_pool_allocate_bucket(); ++ if (err) ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ blktap_request_pool_free(); ++ return err; ++} +diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c +new file mode 100644 
+index 0000000..74a7aa7 +--- /dev/null ++++ b/drivers/xen/blktap/ring.c +@@ -0,0 +1,615 @@ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "blktap.h" ++ ++#ifdef CONFIG_XEN_BLKDEV_BACKEND ++#include "../blkback/blkback-pagemap.h" ++#else ++#define blkback_pagemap_contains_page(page) 0 ++#endif ++ ++static int blktap_ring_major; ++ ++static inline struct blktap * ++vma_to_blktap(struct vm_area_struct *vma) ++{ ++ struct vm_foreign_map *m = vma->vm_private_data; ++ struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map); ++ return container_of(r, struct blktap, ring); ++} ++ ++ /* ++ * BLKTAP - immediately before the mmap area, ++ * we have a bunch of pages reserved for shared memory rings. ++ */ ++#define RING_PAGES 1 ++ ++static int ++blktap_read_ring(struct blktap *tap) ++{ ++ /* This is called to read responses from the ring. */ ++ int usr_idx; ++ RING_IDX rc, rp; ++ struct blkif_response res; ++ struct blktap_ring *ring; ++ struct blktap_request *request; ++ ++ down_read(&tap->tap_sem); ++ ++ ring = &tap->ring; ++ if (!ring->vma) { ++ up_read(&tap->tap_sem); ++ return 0; ++ } ++ ++ /* for each outstanding message on the ring */ ++ rp = ring->ring.sring->rsp_prod; ++ rmb(); ++ ++ for (rc = ring->ring.rsp_cons; rc != rp; rc++) { ++ memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res)); ++ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */ ++ ++ring->ring.rsp_cons; ++ ++ usr_idx = (int)res.id; ++ if (usr_idx >= MAX_PENDING_REQS || ++ !tap->pending_requests[usr_idx]) { ++ BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n", ++ rc, rp, usr_idx, tap->pid, ring->vma); ++ continue; ++ } ++ ++ request = tap->pending_requests[usr_idx]; ++ BTDBG("request %p response #%d id %x\n", request, rc, usr_idx); ++ blktap_device_finish_request(tap, &res, request); ++ } ++ ++ up_read(&tap->tap_sem); ++ ++ blktap_run_deferred(); ++ ++ return 0; ++} ++ ++static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ return VM_FAULT_SIGBUS; ++} ++ ++static pte_t ++blktap_ring_clear_pte(struct vm_area_struct *vma, ++ unsigned long uvaddr, ++ pte_t *ptep, int is_fullmm) ++{ ++ pte_t copy; ++ struct blktap *tap; ++ unsigned long kvaddr; ++ struct page **map, *page; ++ struct blktap_ring *ring; ++ struct blktap_request *request; ++ struct grant_handle_pair *khandle; ++ struct gnttab_unmap_grant_ref unmap[2]; ++ int offset, seg, usr_idx, count = 0; ++ ++ tap = vma_to_blktap(vma); ++ ring = &tap->ring; ++ map = ring->foreign_map.map; ++ BUG_ON(!map); /* TODO Should this be changed to if statement? */ ++ ++ /* ++ * Zap entry if the address is before the start of the grant ++ * mapped region. 
++ */ ++ if (uvaddr < ring->user_vstart) ++ return ptep_get_and_clear_full(vma->vm_mm, uvaddr, ++ ptep, is_fullmm); ++ ++ offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT); ++ usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ ++ offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT); ++ page = map[offset]; ++ if (page) { ++ ClearPageReserved(page); ++ if (blkback_pagemap_contains_page(page)) ++ set_page_private(page, 0); ++ } ++ map[offset] = NULL; ++ ++ request = tap->pending_requests[usr_idx]; ++ kvaddr = request_to_kaddr(request, seg); ++ khandle = request->handles + seg; ++ ++ if (khandle->kernel != INVALID_GRANT_HANDLE) { ++ gnttab_set_unmap_op(&unmap[count], kvaddr, ++ GNTMAP_host_map, khandle->kernel); ++ count++; ++ ++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, ++ INVALID_P2M_ENTRY); ++ } ++ ++ ++ if (khandle->user != INVALID_GRANT_HANDLE) { ++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); ++ ++ copy = *ptep; ++ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr, ++ GNTMAP_host_map ++ | GNTMAP_application_map ++ | GNTMAP_contains_pte, ++ khandle->user); ++ count++; ++ } else ++ copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep, ++ is_fullmm); ++ ++ if (count) ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ++ unmap, count)) ++ BUG(); ++ ++ khandle->kernel = INVALID_GRANT_HANDLE; ++ khandle->user = INVALID_GRANT_HANDLE; ++ ++ return copy; ++} ++ ++static void ++blktap_ring_vm_unmap(struct vm_area_struct *vma) ++{ ++ struct blktap *tap = vma_to_blktap(vma); ++ ++ down_write(&tap->tap_sem); ++ clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse); ++ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); ++ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); ++ up_write(&tap->tap_sem); ++} ++ ++static void ++blktap_ring_vm_close(struct vm_area_struct *vma) ++{ ++ struct blktap *tap = vma_to_blktap(vma); ++ struct blktap_ring *ring = &tap->ring; ++ ++ blktap_ring_vm_unmap(vma); /* fail future requests */ ++ blktap_device_fail_pending_requests(tap); /* fail pending requests */ ++ blktap_device_restart(tap); /* fail deferred requests */ ++ ++ down_write(&tap->tap_sem); ++ ++ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); ++ ++ kfree(ring->foreign_map.map); ++ ring->foreign_map.map = NULL; ++ ++ /* Free the ring page. 
*/ ++ ClearPageReserved(virt_to_page(ring->ring.sring)); ++ free_page((unsigned long)ring->ring.sring); ++ ++ BTINFO("unmapping ring %d\n", tap->minor); ++ ring->ring.sring = NULL; ++ ring->vma = NULL; ++ ++ up_write(&tap->tap_sem); ++ ++ wake_up(&tap->wq); ++} ++ ++static struct vm_operations_struct blktap_ring_vm_operations = { ++ .close = blktap_ring_vm_close, ++ .unmap = blktap_ring_vm_unmap, ++ .fault = blktap_ring_fault, ++ .zap_pte = blktap_ring_clear_pte, ++}; ++ ++static int ++blktap_ring_open(struct inode *inode, struct file *filp) ++{ ++ int idx; ++ struct blktap *tap; ++ ++ idx = iminor(inode); ++ if (idx < 0 || idx > MAX_BLKTAP_DEVICE || blktaps[idx] == NULL) { ++ BTERR("unable to open device blktap%d\n", idx); ++ return -ENODEV; ++ } ++ ++ tap = blktaps[idx]; ++ ++ BTINFO("opening device blktap%d\n", idx); ++ ++ if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse)) ++ return -ENODEV; ++ ++ /* Only one process can access ring at a time */ ++ if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse)) ++ return -EBUSY; ++ ++ filp->private_data = tap; ++ BTINFO("opened device %d\n", tap->minor); ++ ++ return 0; ++} ++ ++static int ++blktap_ring_release(struct inode *inode, struct file *filp) ++{ ++ struct blktap *tap = filp->private_data; ++ ++ BTINFO("freeing device %d\n", tap->minor); ++ clear_bit(BLKTAP_RING_FD, &tap->dev_inuse); ++ filp->private_data = NULL; ++ wake_up(&tap->wq); ++ return 0; ++} ++ ++/* Note on mmap: ++ * We need to map pages to user space in a way that will allow the block ++ * subsystem set up direct IO to them. This couldn't be done before, because ++ * there isn't really a sane way to translate a user virtual address down to a ++ * physical address when the page belongs to another domain. ++ * ++ * My first approach was to map the page in to kernel memory, add an entry ++ * for it in the physical frame list (using alloc_lomem_region as in blkback) ++ * and then attempt to map that page up to user space. This is disallowed ++ * by xen though, which realizes that we don't really own the machine frame ++ * underlying the physical page. ++ * ++ * The new approach is to provide explicit support for this in xen linux. ++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages ++ * mapped from other vms. vma->vm_private_data is set up as a mapping ++ * from pages to actual page structs. There is a new clause in get_user_pages ++ * that does the right thing for this sort of mapping. ++ */ ++static int ++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma) ++{ ++ int size, err; ++ struct page **map; ++ struct blktap *tap; ++ struct blkif_sring *sring; ++ struct blktap_ring *ring; ++ ++ tap = filp->private_data; ++ ring = &tap->ring; ++ map = NULL; ++ sring = NULL; ++ ++ if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) ++ return -ENOMEM; ++ ++ size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; ++ if (size != (MMAP_PAGES + RING_PAGES)) { ++ BTERR("you _must_ map exactly %lu pages!\n", ++ MMAP_PAGES + RING_PAGES); ++ return -EAGAIN; ++ } ++ ++ /* Allocate the fe ring. 
*/ ++ sring = (struct blkif_sring *)get_zeroed_page(GFP_KERNEL); ++ if (!sring) { ++ BTERR("Couldn't alloc sring.\n"); ++ goto fail_mem; ++ } ++ ++ map = kzalloc(size * sizeof(struct page *), GFP_KERNEL); ++ if (!map) { ++ BTERR("Couldn't alloc VM_FOREIGN map.\n"); ++ goto fail_mem; ++ } ++ ++ SetPageReserved(virt_to_page(sring)); ++ ++ SHARED_RING_INIT(sring); ++ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE); ++ ++ ring->ring_vstart = vma->vm_start; ++ ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT); ++ ++ /* Map the ring pages to the start of the region and reserve it. */ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ err = vm_insert_page(vma, vma->vm_start, ++ virt_to_page(ring->ring.sring)); ++ else ++ err = remap_pfn_range(vma, vma->vm_start, ++ __pa(ring->ring.sring) >> PAGE_SHIFT, ++ PAGE_SIZE, vma->vm_page_prot); ++ if (err) { ++ BTERR("Mapping user ring failed: %d\n", err); ++ goto fail; ++ } ++ ++ /* Mark this VM as containing foreign pages, and set up mappings. */ ++ ring->foreign_map.map = map; ++ vma->vm_private_data = &ring->foreign_map; ++ vma->vm_flags |= VM_FOREIGN; ++ vma->vm_flags |= VM_DONTCOPY; ++ vma->vm_flags |= VM_RESERVED; ++ vma->vm_ops = &blktap_ring_vm_operations; ++ ++#ifdef CONFIG_X86 ++ vma->vm_mm->context.has_foreign_mappings = 1; ++#endif ++ ++ tap->pid = current->pid; ++ BTINFO("blktap: mapping pid is %d\n", tap->pid); ++ ++ ring->vma = vma; ++ return 0; ++ ++ fail: ++ /* Clear any active mappings. */ ++ zap_page_range(vma, vma->vm_start, ++ vma->vm_end - vma->vm_start, NULL); ++ ClearPageReserved(virt_to_page(sring)); ++ fail_mem: ++ free_page((unsigned long)sring); ++ kfree(map); ++ ++ return -ENOMEM; ++} ++ ++static inline void ++blktap_ring_set_message(struct blktap *tap, int msg) ++{ ++ struct blktap_ring *ring = &tap->ring; ++ ++ down_read(&tap->tap_sem); ++ if (ring->ring.sring) ++ ring->ring.sring->pad[0] = msg; ++ up_read(&tap->tap_sem); ++} ++ ++static int ++blktap_ring_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct blktap_params params; ++ struct blktap *tap = filp->private_data; ++ ++ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg); ++ ++ switch(cmd) { ++ case BLKTAP2_IOCTL_KICK_FE: ++ /* There are fe messages to process. 
*/ ++ return blktap_read_ring(tap); ++ ++ case BLKTAP2_IOCTL_CREATE_DEVICE: ++ if (!arg) ++ return -EINVAL; ++ ++ if (copy_from_user(¶ms, (struct blktap_params __user *)arg, ++ sizeof(params))) { ++ BTERR("failed to get params\n"); ++ return -EFAULT; ++ } ++ ++ if (blktap_validate_params(tap, ¶ms)) { ++ BTERR("invalid params\n"); ++ return -EINVAL; ++ } ++ ++ tap->params = params; ++ return blktap_device_create(tap); ++ ++ case BLKTAP2_IOCTL_SET_PARAMS: ++ if (!arg) ++ return -EINVAL; ++ ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return -EINVAL; ++ ++ if (copy_from_user(¶ms, (struct blktap_params __user *)arg, ++ sizeof(params))) { ++ BTERR("failed to get params\n"); ++ return -EFAULT; ++ } ++ ++ if (blktap_validate_params(tap, ¶ms)) { ++ BTERR("invalid params\n"); ++ return -EINVAL; ++ } ++ ++ tap->params = params; ++ return 0; ++ ++ case BLKTAP2_IOCTL_PAUSE: ++ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) ++ return -EINVAL; ++ ++ set_bit(BLKTAP_PAUSED, &tap->dev_inuse); ++ clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse); ++ ++ blktap_ring_set_message(tap, 0); ++ wake_up_interruptible(&tap->wq); ++ ++ return 0; ++ ++ ++ case BLKTAP2_IOCTL_REOPEN: ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return -EINVAL; ++ ++ if (!arg) ++ return -EINVAL; ++ ++ if (copy_to_user((char __user *)arg, ++ tap->params.name, ++ strlen(tap->params.name) + 1)) ++ return -EFAULT; ++ ++ blktap_ring_set_message(tap, 0); ++ wake_up_interruptible(&tap->wq); ++ ++ return 0; ++ ++ case BLKTAP2_IOCTL_RESUME: ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return -EINVAL; ++ ++ tap->ring.response = (int)arg; ++ if (!tap->ring.response) ++ clear_bit(BLKTAP_PAUSED, &tap->dev_inuse); ++ ++ blktap_ring_set_message(tap, 0); ++ wake_up_interruptible(&tap->wq); ++ ++ return 0; ++ } ++ ++ return -ENOIOCTLCMD; ++} ++ ++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait) ++{ ++ struct blktap *tap = filp->private_data; ++ struct blktap_ring *ring = &tap->ring; ++ ++ poll_wait(filp, &ring->poll_wait, wait); ++ if (ring->ring.sring->pad[0] != 0 || ++ ring->ring.req_prod_pvt != ring->ring.sring->req_prod) { ++ RING_PUSH_REQUESTS(&ring->ring); ++ return POLLIN | POLLRDNORM; ++ } ++ ++ return 0; ++} ++ ++static struct file_operations blktap_ring_file_operations = { ++ .owner = THIS_MODULE, ++ .open = blktap_ring_open, ++ .release = blktap_ring_release, ++ .ioctl = blktap_ring_ioctl, ++ .mmap = blktap_ring_mmap, ++ .poll = blktap_ring_poll, ++}; ++ ++void ++blktap_ring_kick_user(struct blktap *tap) ++{ ++ wake_up_interruptible(&tap->ring.poll_wait); ++} ++ ++int ++blktap_ring_resume(struct blktap *tap) ++{ ++ int err; ++ struct blktap_ring *ring = &tap->ring; ++ ++ if (!blktap_active(tap)) ++ return -ENODEV; ++ ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return -EINVAL; ++ ++ /* set shared flag for resume */ ++ ring->response = 0; ++ ++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME); ++ blktap_ring_kick_user(tap); ++ ++ wait_event_interruptible(tap->wq, ring->response || ++ !test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); ++ ++ err = ring->response; ++ ring->response = 0; ++ ++ BTDBG("err: %d\n", err); ++ ++ if (err) ++ return err; ++ ++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return -EAGAIN; ++ ++ return 0; ++} ++ ++int ++blktap_ring_pause(struct blktap *tap) ++{ ++ if (!blktap_active(tap)) ++ return -ENODEV; ++ ++ if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) ++ return -EINVAL; ++ ++ BTDBG("draining queue\n"); ++ 
wait_event_interruptible(tap->wq, !tap->pending_cnt); ++ if (tap->pending_cnt) ++ return -EAGAIN; ++ ++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE); ++ blktap_ring_kick_user(tap); ++ ++ BTDBG("waiting for tapdisk response\n"); ++ wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse)); ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) ++ return -EAGAIN; ++ ++ return 0; ++} ++ ++int ++blktap_ring_destroy(struct blktap *tap) ++{ ++ if (!test_bit(BLKTAP_RING_FD, &tap->dev_inuse) && ++ !test_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) ++ return 0; ++ ++ BTDBG("sending tapdisk close message\n"); ++ blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE); ++ blktap_ring_kick_user(tap); ++ ++ return -EAGAIN; ++} ++ ++static void ++blktap_ring_initialize(struct blktap_ring *ring, int minor) ++{ ++ memset(ring, 0, sizeof(*ring)); ++ init_waitqueue_head(&ring->poll_wait); ++ ring->devno = MKDEV(blktap_ring_major, minor); ++} ++ ++int ++blktap_ring_create(struct blktap *tap) ++{ ++ struct blktap_ring *ring = &tap->ring; ++ blktap_ring_initialize(ring, tap->minor); ++ return blktap_sysfs_create(tap); ++} ++ ++int __init ++blktap_ring_init(int *major) ++{ ++ int err; ++ ++ err = register_chrdev(0, "blktap2", &blktap_ring_file_operations); ++ if (err < 0) { ++ BTERR("error registering blktap ring device: %d\n", err); ++ return err; ++ } ++ ++ blktap_ring_major = *major = err; ++ BTINFO("blktap ring major: %d\n", blktap_ring_major); ++ return 0; ++} ++ ++int ++blktap_ring_free(void) ++{ ++ if (blktap_ring_major) ++ unregister_chrdev(blktap_ring_major, "blktap2"); ++ ++ return 0; ++} +diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c +new file mode 100644 +index 0000000..23a3a51 +--- /dev/null ++++ b/drivers/xen/blktap/sysfs.c +@@ -0,0 +1,451 @@ ++#include ++#include ++#include ++#include ++ ++#include "blktap.h" ++ ++int blktap_debug_level = 1; ++ ++static struct class *class; ++static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq); ++ ++static inline void ++blktap_sysfs_get(struct blktap *tap) ++{ ++ atomic_inc(&tap->ring.sysfs_refcnt); ++} ++ ++static inline void ++blktap_sysfs_put(struct blktap *tap) ++{ ++ if (atomic_dec_and_test(&tap->ring.sysfs_refcnt)) ++ wake_up(&sysfs_wq); ++} ++ ++static inline void ++blktap_sysfs_enter(struct blktap *tap) ++{ ++ blktap_sysfs_get(tap); /* pin sysfs device */ ++ mutex_lock(&tap->ring.sysfs_mutex); /* serialize sysfs operations */ ++} ++ ++static inline void ++blktap_sysfs_exit(struct blktap *tap) ++{ ++ mutex_unlock(&tap->ring.sysfs_mutex); ++ blktap_sysfs_put(tap); ++} ++ ++#define CLASS_DEVICE_ATTR(a,b,c,d) DEVICE_ATTR(a,b,c,d) ++ ++static ssize_t blktap_sysfs_pause_device(struct device *, struct device_attribute *, const char *, size_t); ++CLASS_DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device); ++static ssize_t blktap_sysfs_resume_device(struct device *, struct device_attribute *, const char *, size_t); ++CLASS_DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device); ++ ++static ssize_t ++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) ++{ ++ int err; ++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev); ++ ++ blktap_sysfs_enter(tap); ++ ++ if (!tap->ring.dev || ++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { ++ err = -ENODEV; ++ goto out; ++ } ++ ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { ++ err = -EPERM; ++ goto out; ++ } ++ ++ if (size > BLKTAP2_MAX_MESSAGE_LEN) { ++ err = -ENAMETOOLONG; ++ goto out; ++ } ++ ++ if 
(strnlen(buf, BLKTAP2_MAX_MESSAGE_LEN) >= BLKTAP2_MAX_MESSAGE_LEN) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ snprintf(tap->params.name, sizeof(tap->params.name) - 1, "%s", buf); ++ err = size; ++ ++out: ++ blktap_sysfs_exit(tap); ++ return err; ++} ++ ++static ssize_t ++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ ssize_t size; ++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev); ++ ++ blktap_sysfs_enter(tap); ++ ++ if (!tap->ring.dev) ++ size = -ENODEV; ++ else if (tap->params.name[0]) ++ size = sprintf(buf, "%s\n", tap->params.name); ++ else ++ size = sprintf(buf, "%d\n", tap->minor); ++ ++ blktap_sysfs_exit(tap); ++ ++ return size; ++} ++CLASS_DEVICE_ATTR(name, S_IRUSR | S_IWUSR, ++ blktap_sysfs_get_name, blktap_sysfs_set_name); ++ ++static ssize_t ++blktap_sysfs_remove_device(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t size) ++{ ++ int err; ++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev); ++ ++ if (!tap->ring.dev) ++ return size; ++ ++ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) ++ return -EBUSY; ++ ++ err = blktap_control_destroy_device(tap); ++ ++ return (err ? : size); ++} ++CLASS_DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device); ++ ++static ssize_t ++blktap_sysfs_pause_device(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t size) ++{ ++ int err; ++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev); ++ ++ blktap_sysfs_enter(tap); ++ ++ BTDBG("pausing %u:%u: dev_inuse: %lu\n", ++ MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse); ++ ++ if (!tap->ring.dev || ++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { ++ err = -ENODEV; ++ goto out; ++ } ++ ++ if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { ++ err = 0; ++ goto out; ++ } ++ ++ err = blktap_device_pause(tap); ++ if (!err) { ++ device_remove_file(dev, &dev_attr_pause); ++ err = device_create_file(dev, &dev_attr_resume); ++ } ++ ++out: ++ blktap_sysfs_exit(tap); ++ ++ return (err ? err : size); ++} ++ ++static ssize_t ++blktap_sysfs_resume_device(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t size) ++{ ++ int err; ++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev); ++ ++ blktap_sysfs_enter(tap); ++ ++ if (!tap->ring.dev || ++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { ++ err = -ENODEV; ++ goto out; ++ } ++ ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = blktap_device_resume(tap); ++ if (!err) { ++ device_remove_file(dev, &dev_attr_resume); ++ err = device_create_file(dev, &dev_attr_pause); ++ } ++ ++out: ++ blktap_sysfs_exit(tap); ++ ++ BTDBG("returning %zd\n", (err ? err : size)); ++ return (err ? 
err : size); ++} ++ ++#ifdef ENABLE_PASSTHROUGH ++static ssize_t ++blktap_sysfs_enable_passthrough(struct device *dev, ++ const char *buf, size_t size) ++{ ++ int err; ++ unsigned major, minor; ++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev); ++ ++ BTINFO("passthrough request enabled\n"); ++ ++ blktap_sysfs_enter(tap); ++ ++ if (!tap->ring.dev || ++ test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) { ++ err = -ENODEV; ++ goto out; ++ } ++ ++ if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = sscanf(buf, "%x:%x", &major, &minor); ++ if (err != 2) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ err = blktap_device_enable_passthrough(tap, major, minor); ++ ++out: ++ blktap_sysfs_exit(tap); ++ BTDBG("returning %d\n", (err ? err : size)); ++ return (err ? err : size); ++} ++#endif ++ ++static ssize_t ++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ char *tmp; ++ int i, ret; ++ struct blktap *tap = (struct blktap *)dev_get_drvdata(dev); ++ ++ tmp = buf; ++ blktap_sysfs_get(tap); ++ ++ if (!tap->ring.dev) { ++ ret = sprintf(tmp, "no device\n"); ++ goto out; ++ } ++ ++ tmp += sprintf(tmp, "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n", ++ tap->params.name, MAJOR(tap->ring.devno), ++ MINOR(tap->ring.devno), atomic_read(&tap->refcnt), ++ tap->dev_inuse); ++ tmp += sprintf(tmp, "capacity: 0x%llx, sector size: 0x%lx, " ++ "device users: %d\n", tap->params.capacity, ++ tap->params.sector_size, tap->device.users); ++ ++ down_read(&tap->tap_sem); ++ ++ tmp += sprintf(tmp, "pending requests: %d\n", tap->pending_cnt); ++ for (i = 0; i < MAX_PENDING_REQS; i++) { ++ struct blktap_request *req = tap->pending_requests[i]; ++ if (!req) ++ continue; ++ ++ tmp += sprintf(tmp, "req %d: id: %llu, usr_idx: %d, " ++ "status: 0x%02x, pendcnt: %d, " ++ "nr_pages: %u, op: %d, time: %lu:%lu\n", ++ i, (unsigned long long)req->id, req->usr_idx, ++ req->status, atomic_read(&req->pendcnt), ++ req->nr_pages, req->operation, req->time.tv_sec, ++ req->time.tv_usec); ++ } ++ ++ up_read(&tap->tap_sem); ++ ret = (tmp - buf) + 1; ++ ++out: ++ blktap_sysfs_put(tap); ++ BTDBG("%s\n", buf); ++ ++ return ret; ++} ++CLASS_DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL); ++ ++int ++blktap_sysfs_create(struct blktap *tap) ++{ ++ struct blktap_ring *ring; ++ struct device *dev; ++ int err; ++ ++ if (!class) ++ return -ENODEV; ++ ++ ring = &tap->ring; ++ ++ dev = device_create(class, NULL, ring->devno, ++ tap, "blktap%d", tap->minor); ++ if (IS_ERR(dev)) ++ return PTR_ERR(dev); ++ ++ ring->dev = dev; ++ ++ mutex_init(&ring->sysfs_mutex); ++ atomic_set(&ring->sysfs_refcnt, 0); ++ ++ ++ printk(KERN_CRIT "%s: adding attributes for dev %p\n", __func__, dev); ++ err = device_create_file(dev, &dev_attr_name); ++ if (err) ++ goto out; ++ err = device_create_file(dev, &dev_attr_remove); ++ if (err) ++ goto out_unregister_name; ++ err = device_create_file(dev, &dev_attr_pause); ++ if (err) ++ goto out_unregister_remove; ++ err = device_create_file(dev, &dev_attr_debug); ++ if (err) ++ goto out_unregister_pause; ++ ++ return 0; ++ ++out_unregister_pause: ++ device_remove_file(dev, &dev_attr_pause); ++out_unregister_remove: ++ device_remove_file(dev, &dev_attr_remove); ++out_unregister_name: ++ device_remove_file(dev, &dev_attr_name); ++out: ++ return err; ++} ++ ++int ++blktap_sysfs_destroy(struct blktap *tap) ++{ ++ struct blktap_ring 
*ring; ++ struct device *dev; ++ ++ printk(KERN_CRIT "%s\n", __func__); ++ ++ ring = &tap->ring; ++ dev = ring->dev; ++ if (!class || !dev) ++ return 0; ++ ++ ring->dev = NULL; ++ if (wait_event_interruptible(sysfs_wq, ++ !atomic_read(&tap->ring.sysfs_refcnt))) ++ return -EAGAIN; ++ ++ device_schedule_callback(dev, device_unregister); ++ ++ return 0; ++} ++ ++static ssize_t ++blktap_sysfs_show_verbosity(struct class *class, char *buf) ++{ ++ return sprintf(buf, "%d\n", blktap_debug_level); ++} ++ ++static ssize_t ++blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size) ++{ ++ int level; ++ ++ if (sscanf(buf, "%d", &level) == 1) { ++ blktap_debug_level = level; ++ return size; ++ } ++ ++ return -EINVAL; ++} ++CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR, ++ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity); ++ ++static ssize_t ++blktap_sysfs_show_devices(struct class *class, char *buf) ++{ ++ int i, ret; ++ struct blktap *tap; ++ ++ ret = 0; ++ for (i = 0; i < MAX_BLKTAP_DEVICE; i++) { ++ tap = blktaps[i]; ++ if (!tap) ++ continue; ++ ++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse)) ++ continue; ++ ++ ret += sprintf(buf + ret, "%d ", tap->minor); ++ ret += snprintf(buf + ret, sizeof(tap->params.name) - 1, ++ tap->params.name); ++ ret += sprintf(buf + ret, "\n"); ++ } ++ ++ return ret; ++} ++CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL); ++ ++void ++blktap_sysfs_free(void) ++{ ++ if (!class) ++ return; ++ ++ class_remove_file(class, &class_attr_verbosity); ++ class_remove_file(class, &class_attr_devices); ++ ++ class_destroy(class); ++} ++ ++int __init ++blktap_sysfs_init(void) ++{ ++ struct class *cls; ++ int err; ++ ++ if (class) ++ return -EEXIST; ++ ++ cls = class_create(THIS_MODULE, "blktap2"); ++ if (IS_ERR(cls)) ++ return PTR_ERR(cls); ++ ++ err = class_create_file(cls, &class_attr_verbosity); ++ if (err) ++ goto out_unregister; ++ err = class_create_file(cls, &class_attr_devices); ++ if (err) ++ goto out_unregister; ++ ++ class = cls; ++ return 0; ++out_unregister: ++ class_destroy(cls); ++ return err; ++} +diff --git a/drivers/xen/blktap/wait_queue.c b/drivers/xen/blktap/wait_queue.c +new file mode 100644 +index 0000000..f8995aa +--- /dev/null ++++ b/drivers/xen/blktap/wait_queue.c +@@ -0,0 +1,40 @@ ++#include ++#include ++ ++#include "blktap.h" ++ ++static LIST_HEAD(deferred_work_queue); ++static DEFINE_SPINLOCK(deferred_work_lock); ++ ++void ++blktap_run_deferred(void) ++{ ++ LIST_HEAD(queue); ++ struct blktap *tap; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&deferred_work_lock, flags); ++ list_splice_init(&deferred_work_queue, &queue); ++ list_for_each_entry(tap, &queue, deferred_queue) ++ clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse); ++ spin_unlock_irqrestore(&deferred_work_lock, flags); ++ ++ while (!list_empty(&queue)) { ++ tap = list_entry(queue.next, struct blktap, deferred_queue); ++ list_del_init(&tap->deferred_queue); ++ blktap_device_restart(tap); ++ } ++} ++ ++void ++blktap_defer(struct blktap *tap) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&deferred_work_lock, flags); ++ if (!test_bit(BLKTAP_DEFERRED, &tap->dev_inuse)) { ++ set_bit(BLKTAP_DEFERRED, &tap->dev_inuse); ++ list_add_tail(&tap->deferred_queue, &deferred_work_queue); ++ } ++ spin_unlock_irqrestore(&deferred_work_lock, flags); ++} +diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c +index bdfd584..6625ffe 100644 +--- a/drivers/xen/cpu_hotplug.c ++++ b/drivers/xen/cpu_hotplug.c +@@ -1,5 +1,6 @@ + #include + ++#include + #include + + 
#include +diff --git a/drivers/xen/events.c b/drivers/xen/events.c +index ce602dd..925e7a1 100644 +--- a/drivers/xen/events.c ++++ b/drivers/xen/events.c +@@ -16,7 +16,7 @@ + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. +- * 4. Hardware interrupts. Not supported at present. ++ * 4. PIRQs - Hardware interrupts. + * + * Jeremy Fitzhardinge , XenSource Inc, 2007 + */ +@@ -27,10 +27,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include + #include + #include ++#include + #include + #include + #include +@@ -40,6 +45,8 @@ + #include + #include + ++#include "../pci/msi.h" ++ + /* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. +@@ -67,7 +74,7 @@ enum xen_irq_type { + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: +- * PIRQ - vector, with MSB being "needs EIO" ++ * PIRQ - with MSB being "needs EIO" + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - +@@ -82,21 +89,26 @@ struct irq_info + unsigned short virq; + enum ipi_vector ipi; + struct { +- unsigned short gsi; +- unsigned short vector; ++ unsigned short nr; ++ unsigned char flags; + } pirq; + } u; + }; ++#define PIRQ_NEEDS_EOI (1 << 0) ++#define PIRQ_SHAREABLE (1 << 1) + +-static struct irq_info irq_info[NR_IRQS]; ++static struct irq_info *irq_info; + +-static int evtchn_to_irq[NR_EVENT_CHANNELS] = { +- [0 ... NR_EVENT_CHANNELS-1] = -1 +-}; ++static int *evtchn_to_irq; + struct cpu_evtchn_s { + unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; + }; +-static struct cpu_evtchn_s *cpu_evtchn_mask_p; ++ ++static __initdata struct cpu_evtchn_s init_evtchn_mask = { ++ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, ++}; ++static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; ++ + static inline unsigned long *cpu_evtchn_mask(int cpu) + { + return cpu_evtchn_mask_p[cpu].bits; +@@ -106,6 +118,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu) + #define VALID_EVTCHN(chn) ((chn) != 0) + + static struct irq_chip xen_dynamic_chip; ++static struct irq_chip xen_pirq_chip; + + /* Constructor for packed IRQ information. 
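With PIRQs now listed as a supported event type, the additions further down in this hunk (xen_allocate_pirq(), startup_pirq() and the xen_pirq_chip) let dom0 take hardware interrupts through event channels. The sketch below shows the intended calling pattern for a driver that already knows its GSI; the device name and handler are placeholders, and the ACPI/IO-APIC glue that normally performs this lookup is not part of this patch.

/*
 * Sketch only: route a physical GSI through the Xen PIRQ machinery.
 * xen_allocate_pirq() is added later in this file; request_irq() is the
 * ordinary kernel API. "mydev" is a made-up name.
 */
static irqreturn_t mydev_interrupt(int irq, void *dev_id)
{
	/* ... service the device ... */
	return IRQ_HANDLED;
}

static int mydev_setup_irq(unsigned gsi, void *mydev)
{
	/* Shareable: startup_pirq() will then pass BIND_PIRQ__WILL_SHARE. */
	int irq = xen_allocate_pirq(gsi, 1, "mydev");

	if (irq < 0)
		return irq;
	/* The event channel itself is bound lazily in startup_pirq(). */
	return request_irq(irq, mydev_interrupt, IRQF_SHARED, "mydev", mydev);
}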
*/ + static struct irq_info mk_unbound_info(void) +@@ -132,10 +145,10 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq) + } + + static struct irq_info mk_pirq_info(unsigned short evtchn, +- unsigned short gsi, unsigned short vector) ++ unsigned short pirq) + { + return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, +- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } }; ++ .cpu = 0, .u.pirq = { .nr = pirq } }; + } + + /* +@@ -184,17 +197,7 @@ static unsigned gsi_from_irq(unsigned irq) + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_PIRQ); + +- return info->u.pirq.gsi; +-} +- +-static unsigned vector_from_irq(unsigned irq) +-{ +- struct irq_info *info = info_for_irq(irq); +- +- BUG_ON(info == NULL); +- BUG_ON(info->type != IRQT_PIRQ); +- +- return info->u.pirq.vector; ++ return info->u.pirq.nr; + } + + static enum xen_irq_type type_from_irq(unsigned irq) +@@ -218,6 +221,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) + return ret; + } + ++static bool pirq_needs_eoi(unsigned irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ return info->u.pirq.flags & PIRQ_NEEDS_EOI; ++} ++ + static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +@@ -329,12 +341,24 @@ static void unmask_evtchn(int port) + put_cpu(); + } + ++static int get_nr_hw_irqs(void) ++{ ++ int ret = 1; ++ ++#ifdef CONFIG_X86_IO_APIC ++ ret = get_nr_irqs_gsi(); ++#endif ++ ++ return ret; ++} ++ + static int find_unbound_irq(void) + { + int irq; + struct irq_desc *desc; ++ int start = get_nr_hw_irqs(); + +- for (irq = 0; irq < nr_irqs; irq++) ++ for (irq = start; irq < nr_irqs; irq++) + if (irq_info[irq].type == IRQT_UNBOUND) + break; + +@@ -350,6 +374,290 @@ static int find_unbound_irq(void) + return irq; + } + ++static bool identity_mapped_irq(unsigned irq) ++{ ++ /* identity map all the hardware irqs */ ++ return irq < get_nr_hw_irqs(); ++} ++ ++static void pirq_unmask_notify(int irq) ++{ ++ struct irq_info *info = info_for_irq(irq); ++ struct physdev_eoi eoi = { .irq = info->u.pirq.nr }; ++ ++ if (unlikely(pirq_needs_eoi(irq))) { ++ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); ++ WARN_ON(rc); ++ } ++} ++ ++static void pirq_query_unmask(int irq) ++{ ++ struct physdev_irq_status_query irq_status; ++ struct irq_info *info = info_for_irq(irq); ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ irq_status.irq = info->u.pirq.nr; ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) ++ irq_status.flags = 0; ++ ++ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; ++ if (irq_status.flags & XENIRQSTAT_needs_eoi) ++ info->u.pirq.flags |= PIRQ_NEEDS_EOI; ++} ++ ++static bool probing_irq(int irq) ++{ ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ return desc && desc->action == NULL; ++} ++ ++static unsigned int startup_pirq(unsigned int irq) ++{ ++ struct evtchn_bind_pirq bind_pirq; ++ struct irq_info *info = info_for_irq(irq); ++ int evtchn = evtchn_from_irq(irq); ++ int rc; ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ if (VALID_EVTCHN(evtchn)) ++ goto out; ++ ++ bind_pirq.pirq = info->u.pirq.nr; ++ /* NB. We are happy to share unless we are probing. */ ++ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 
++ BIND_PIRQ__WILL_SHARE : 0; ++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); ++ if (rc != 0) { ++ if (!probing_irq(irq)) ++ printk(KERN_INFO "Failed to obtain physical IRQ %d\n", ++ irq); ++ return 0; ++ } ++ evtchn = bind_pirq.port; ++ ++ pirq_query_unmask(irq); ++ ++ evtchn_to_irq[evtchn] = irq; ++ bind_evtchn_to_cpu(evtchn, 0); ++ info->evtchn = evtchn; ++ ++ out: ++ unmask_evtchn(evtchn); ++ pirq_unmask_notify(irq); ++ ++ return 0; ++} ++ ++static void shutdown_pirq(unsigned int irq) ++{ ++ struct evtchn_close close; ++ struct irq_info *info = info_for_irq(irq); ++ int evtchn = evtchn_from_irq(irq); ++ ++ BUG_ON(info->type != IRQT_PIRQ); ++ ++ if (!VALID_EVTCHN(evtchn)) ++ return; ++ ++ mask_evtchn(evtchn); ++ ++ close.port = evtchn; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) ++ BUG(); ++ ++ bind_evtchn_to_cpu(evtchn, 0); ++ evtchn_to_irq[evtchn] = -1; ++ info->evtchn = 0; ++} ++ ++static void enable_pirq(unsigned int irq) ++{ ++ startup_pirq(irq); ++} ++ ++static void disable_pirq(unsigned int irq) ++{ ++} ++ ++static void ack_pirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ move_native_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) { ++ mask_evtchn(evtchn); ++ clear_evtchn(evtchn); ++ } ++} ++ ++static void end_pirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ struct irq_desc *desc = irq_to_desc(irq); ++ ++ if (WARN_ON(!desc)) ++ return; ++ ++ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == ++ (IRQ_DISABLED|IRQ_PENDING)) { ++ shutdown_pirq(irq); ++ } else if (VALID_EVTCHN(evtchn)) { ++ unmask_evtchn(evtchn); ++ pirq_unmask_notify(irq); ++ } ++} ++ ++static int find_irq_by_gsi(unsigned gsi) ++{ ++ int irq; ++ ++ for (irq = 0; irq < nr_irqs; irq++) { ++ struct irq_info *info = info_for_irq(irq); ++ ++ if (info == NULL || info->type != IRQT_PIRQ) ++ continue; ++ ++ if (gsi_from_irq(irq) == gsi) ++ return irq; ++ } ++ ++ return -1; ++} ++ ++/* ++ * Allocate a physical irq. We don't assign an event channel ++ * until the irq actually started up. Return an ++ * existing irq if we've already got one for the gsi. ++ */ ++int xen_allocate_pirq(unsigned gsi, int shareable, char *name) ++{ ++ int irq; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ irq = find_irq_by_gsi(gsi); ++ if (irq != -1) { ++ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n", ++ irq, gsi); ++ goto out; /* XXX need refcount? */ ++ } ++ ++ if (identity_mapped_irq(gsi)) { ++ irq = gsi; ++ irq_to_desc_alloc_node(irq, 0); ++ dynamic_irq_init(irq); ++ } else ++ irq = find_unbound_irq(); ++ ++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip, ++ handle_level_irq, name); ++ ++ irq_info[irq] = mk_pirq_info(0, gsi); ++ irq_info[irq].u.pirq.flags |= shareable ? 
PIRQ_SHAREABLE : 0; ++out: ++ spin_unlock(&irq_mapping_update_lock); ++ return irq; ++} ++ ++#ifdef CONFIG_PCI_MSI ++int xen_destroy_irq(int irq) ++{ ++ struct irq_desc *desc; ++ struct physdev_unmap_pirq unmap_irq; ++ struct irq_info *info = info_for_irq(irq); ++ int rc = -ENOENT; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ desc = irq_to_desc(irq); ++ if (!desc) ++ goto out; ++ ++ unmap_irq.pirq = info->u.pirq.nr; ++ unmap_irq.domid = DOMID_SELF; ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); ++ if (rc) { ++ printk(KERN_WARNING "unmap irq failed %d\n", rc); ++ goto out; ++ } ++ ++ irq_info[irq] = mk_unbound_info(); ++ ++ dynamic_irq_cleanup(irq); ++ ++out: ++ spin_unlock(&irq_mapping_update_lock); ++ return rc; ++} ++ ++int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) ++{ ++ int irq = 0; ++ struct physdev_map_pirq map_irq; ++ int rc; ++ domid_t domid = DOMID_SELF; ++ int pos; ++ u32 table_offset, bir; ++ ++ memset(&map_irq, 0, sizeof(map_irq)); ++ map_irq.domid = domid; ++ map_irq.type = MAP_PIRQ_TYPE_MSI; ++ map_irq.index = -1; ++ map_irq.pirq = -1; ++ map_irq.bus = dev->bus->number; ++ map_irq.devfn = dev->devfn; ++ ++ if (type == PCI_CAP_ID_MSIX) { ++ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); ++ ++ pci_read_config_dword(dev, msix_table_offset_reg(pos), ++ &table_offset); ++ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); ++ ++ map_irq.table_base = pci_resource_start(dev, bir); ++ map_irq.entry_nr = msidesc->msi_attrib.entry_nr; ++ } ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ irq = find_unbound_irq(); ++ ++ if (irq == -1) ++ goto out; ++ ++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); ++ if (rc) { ++ ++ printk(KERN_WARNING "xen map irq failed %d\n", rc); ++ ++ dynamic_irq_cleanup(irq); ++ ++ irq = -1; ++ goto out; ++ } ++ irq_info[irq] = mk_pirq_info(0, map_irq.pirq); ++ ++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip, ++ handle_level_irq, ++ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi"); ++ ++out: ++ spin_unlock(&irq_mapping_update_lock); ++ return irq; ++} ++#endif ++ ++int xen_gsi_from_irq(unsigned irq) ++{ ++ return gsi_from_irq(irq); ++} ++EXPORT_SYMBOL_GPL(xen_gsi_from_irq); ++ + int bind_evtchn_to_irq(unsigned int evtchn) + { + int irq; +@@ -409,8 +717,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) + return irq; + } + ++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, ++ unsigned int remote_port) ++{ ++ struct evtchn_bind_interdomain bind_interdomain; ++ int err; ++ ++ bind_interdomain.remote_dom = remote_domain; ++ bind_interdomain.remote_port = remote_port; ++ ++ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, ++ &bind_interdomain); ++ ++ return err ? 
: bind_evtchn_to_irq(bind_interdomain.local_port); ++} ++ + +-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) ++int bind_virq_to_irq(unsigned int virq, unsigned int cpu) + { + struct evtchn_bind_virq bind_virq; + int evtchn, irq; +@@ -504,6 +827,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, + } + EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + ++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, ++ unsigned int remote_port, ++ irq_handler_t handler, ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id) ++{ ++ int irq, retval; ++ ++ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); ++ if (irq < 0) ++ return irq; ++ ++ retval = request_irq(irq, handler, irqflags, devname, dev_id); ++ if (retval != 0) { ++ unbind_from_irq(irq); ++ return retval; ++ } ++ ++ return irq; ++} ++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); ++ + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +@@ -649,9 +995,13 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; ++ struct irq_desc *desc; + +- if (irq != -1) +- handle_irq(irq, regs); ++ if (irq != -1) { ++ desc = irq_to_desc(irq); ++ if (desc) ++ generic_handle_irq_desc(irq, desc); ++ } + } + } + +@@ -928,13 +1278,37 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { + .retrigger = retrigger_dynirq, + }; + ++static struct irq_chip xen_pirq_chip __read_mostly = { ++ .name = "xen-pirq", ++ ++ .startup = startup_pirq, ++ .shutdown = shutdown_pirq, ++ ++ .enable = enable_pirq, ++ .unmask = enable_pirq, ++ ++ .disable = disable_pirq, ++ .mask = disable_pirq, ++ ++ .ack = ack_pirq, ++ .end = end_pirq, ++ ++ .set_affinity = set_affinity_irq, ++ ++ .retrigger = retrigger_dynirq, ++}; ++ + void __init xen_init_IRQ(void) + { + int i; + + cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s), + GFP_KERNEL); +- BUG_ON(cpu_evtchn_mask_p == NULL); ++ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL); ++ ++ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL); ++ for(i = 0; i < NR_EVENT_CHANNELS; i++) ++ evtchn_to_irq[i] = -1; + + init_evtchn_cpu_bindings(); + +@@ -943,4 +1317,6 @@ void __init xen_init_IRQ(void) + mask_evtchn(i); + + irq_ctx_init(smp_processor_id()); ++ ++ xen_setup_pirqs(); + } +diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c +index 79bedba..f70a4f4 100644 +--- a/drivers/xen/evtchn.c ++++ b/drivers/xen/evtchn.c +@@ -48,6 +48,8 @@ + #include + #include + #include ++ ++#include + #include + #include + #include +diff --git a/drivers/xen/features.c b/drivers/xen/features.c +index 99eda16..9e2b64f 100644 +--- a/drivers/xen/features.c ++++ b/drivers/xen/features.c +@@ -18,7 +18,7 @@ + u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; + EXPORT_SYMBOL_GPL(xen_features); + +-void xen_setup_features(void) ++void __init xen_setup_features(void) + { + struct xen_feature_info fi; + int i, j; +diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c +new file mode 100644 +index 0000000..ddc59cc +--- /dev/null ++++ b/drivers/xen/gntdev.c +@@ -0,0 +1,626 @@ ++/****************************************************************************** ++ * gntdev.c ++ * ++ * Device for accessing (in user-space) pages that have been granted by other ++ * domains. ++ * ++ * Copyright (c) 2006-2007, D G Murray. 
++ * (c) 2009 Gerd Hoffmann ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Derek G. Murray , " ++ "Gerd Hoffmann "); ++MODULE_DESCRIPTION("User-space granted page access driver"); ++ ++static int debug = 0; ++module_param(debug, int, 0644); ++static int limit = 1024; ++module_param(limit, int, 0644); ++ ++struct gntdev_priv { ++ struct list_head maps; ++ uint32_t used; ++ uint32_t limit; ++ struct rw_semaphore sem; ++ struct mm_struct *mm; ++ struct mmu_notifier mn; ++}; ++ ++struct grant_map { ++ struct list_head next; ++ struct gntdev_priv *priv; ++ struct vm_area_struct *vma; ++ int index; ++ int count; ++ int flags; ++ int is_mapped; ++ struct ioctl_gntdev_grant_ref *grants; ++ struct gnttab_map_grant_ref *map_ops; ++ struct gnttab_unmap_grant_ref *unmap_ops; ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static void gntdev_print_maps(struct gntdev_priv *priv, ++ char *text, int text_index) ++{ ++ struct grant_map *map; ++ ++ printk("%s: maps list (priv %p, usage %d/%d)\n", ++ __FUNCTION__, priv, priv->used, priv->limit); ++ list_for_each_entry(map, &priv->maps, next) ++ printk(" index %2d, count %2d %s\n", ++ map->index, map->count, ++ map->index == text_index && text ? 
text : ""); ++} ++ ++static struct grant_map *gntdev_add_map(struct gntdev_priv *priv, int count) ++{ ++ struct grant_map *map, *add; ++ ++ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); ++ if (NULL == add) ++ return NULL; ++ ++ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); ++ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); ++ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); ++ if (NULL == add->grants || ++ NULL == add->map_ops || ++ NULL == add->unmap_ops) ++ goto err; ++ ++ add->index = 0; ++ add->count = count; ++ add->priv = priv; ++ ++ if (add->count + priv->used > priv->limit) ++ goto err; ++ ++ list_for_each_entry(map, &priv->maps, next) { ++ if (add->index + add->count < map->index) { ++ list_add_tail(&add->next, &map->next); ++ goto done; ++ } ++ add->index = map->index + map->count; ++ } ++ list_add_tail(&add->next, &priv->maps); ++ ++done: ++ priv->used += add->count; ++ if (debug) ++ gntdev_print_maps(priv, "[new]", add->index); ++ return add; ++ ++err: ++ kfree(add->grants); ++ kfree(add->map_ops); ++ kfree(add->unmap_ops); ++ kfree(add); ++ return NULL; ++} ++ ++static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index, ++ int count) ++{ ++ struct grant_map *map; ++ ++ list_for_each_entry(map, &priv->maps, next) { ++ if (map->index != index) ++ continue; ++ if (map->count != count) ++ continue; ++ return map; ++ } ++ return NULL; ++} ++ ++static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, ++ unsigned long vaddr) ++{ ++ struct grant_map *map; ++ ++ list_for_each_entry(map, &priv->maps, next) { ++ if (!map->vma) ++ continue; ++ if (vaddr < map->vma->vm_start) ++ continue; ++ if (vaddr >= map->vma->vm_end) ++ continue; ++ return map; ++ } ++ return NULL; ++} ++ ++static int gntdev_del_map(struct grant_map *map) ++{ ++ int i; ++ ++ if (map->vma) ++ return -EBUSY; ++ for (i = 0; i < map->count; i++) ++ if (map->unmap_ops[i].handle) ++ return -EBUSY; ++ ++ map->priv->used -= map->count; ++ list_del(&map->next); ++ kfree(map->grants); ++ kfree(map->map_ops); ++ kfree(map->unmap_ops); ++ kfree(map); ++ return 0; ++} ++ ++/* ------------------------------------------------------------------ */ ++ ++static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data) ++{ ++ struct grant_map *map = data; ++ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; ++ u64 pte_maddr; ++ ++ BUG_ON(pgnr >= map->count); ++ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT; ++ pte_maddr += (unsigned long)pte & ~PAGE_MASK; ++ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags, ++ map->grants[pgnr].ref, ++ map->grants[pgnr].domid); ++ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags, ++ 0 /* handle */); ++ return 0; ++} ++ ++static int map_grant_pages(struct grant_map *map) ++{ ++ int i, err = 0; ++ ++ if (debug) ++ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count); ++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ map->map_ops, map->count); ++ if (WARN_ON(err)) ++ return err; ++ ++ for (i = 0; i < map->count; i++) { ++ if (map->map_ops[i].status) ++ err = -EINVAL; ++ map->unmap_ops[i].handle = map->map_ops[i].handle; ++ } ++ return err; ++} ++ ++static int unmap_grant_pages(struct grant_map *map, int offset, int pages) ++{ ++ int i, err = 0; ++ ++ if (debug) ++ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__, ++ map->index, map->count, offset, pages); ++ err = 
HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ++ map->unmap_ops + offset, pages); ++ if (WARN_ON(err)) ++ return err; ++ ++ for (i = 0; i < pages; i++) { ++ if (map->unmap_ops[offset+i].status) ++ err = -EINVAL; ++ map->unmap_ops[offset+i].handle = 0; ++ } ++ return err; ++} ++ ++/* ------------------------------------------------------------------ */ ++ ++static void gntdev_vma_close(struct vm_area_struct *vma) ++{ ++ struct grant_map *map = vma->vm_private_data; ++ ++ if (debug) ++ printk("%s\n", __FUNCTION__); ++ map->is_mapped = 0; ++ map->vma = NULL; ++ vma->vm_private_data = NULL; ++} ++ ++static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ if (debug) ++ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n", ++ __FUNCTION__, vmf->virtual_address, vmf->pgoff); ++ vmf->flags = VM_FAULT_ERROR; ++ return 0; ++} ++ ++static struct vm_operations_struct gntdev_vmops = { ++ .close = gntdev_vma_close, ++ .fault = gntdev_vma_fault, ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static void mn_invl_range_start(struct mmu_notifier *mn, ++ struct mm_struct *mm, ++ unsigned long start, unsigned long end) ++{ ++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); ++ struct grant_map *map; ++ unsigned long mstart, mend; ++ int err; ++ ++ down_read(&priv->sem); ++ list_for_each_entry(map, &priv->maps, next) { ++ if (!map->vma) ++ continue; ++ if (!map->is_mapped) ++ continue; ++ if (map->vma->vm_start >= end) ++ continue; ++ if (map->vma->vm_end <= start) ++ continue; ++ mstart = max(start, map->vma->vm_start); ++ mend = min(end, map->vma->vm_end); ++ if (debug) ++ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", ++ __FUNCTION__, map->index, map->count, ++ map->vma->vm_start, map->vma->vm_end, ++ start, end, mstart, mend); ++ err = unmap_grant_pages(map, ++ (mstart - map->vma->vm_start) >> PAGE_SHIFT, ++ (mend - mstart) >> PAGE_SHIFT); ++ WARN_ON(err); ++ } ++ up_read(&priv->sem); ++} ++ ++static void mn_invl_page(struct mmu_notifier *mn, ++ struct mm_struct *mm, ++ unsigned long address) ++{ ++ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); ++} ++ ++static void mn_release(struct mmu_notifier *mn, ++ struct mm_struct *mm) ++{ ++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); ++ struct grant_map *map; ++ int err; ++ ++ down_read(&priv->sem); ++ list_for_each_entry(map, &priv->maps, next) { ++ if (!map->vma) ++ continue; ++ if (debug) ++ printk("%s: map %d+%d (%lx %lx)\n", ++ __FUNCTION__, map->index, map->count, ++ map->vma->vm_start, map->vma->vm_end); ++ err = unmap_grant_pages(map, 0, map->count); ++ WARN_ON(err); ++ } ++ up_read(&priv->sem); ++} ++ ++struct mmu_notifier_ops gntdev_mmu_ops = { ++ .release = mn_release, ++ .invalidate_page = mn_invl_page, ++ .invalidate_range_start = mn_invl_range_start, ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static int gntdev_open(struct inode *inode, struct file *flip) ++{ ++ struct gntdev_priv *priv; ++ ++ priv = kzalloc(sizeof(*priv), GFP_KERNEL); ++ if (!priv) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&priv->maps); ++ init_rwsem(&priv->sem); ++ priv->limit = limit; ++ ++ priv->mm = get_task_mm(current); ++ if (!priv->mm) { ++ kfree(priv); ++ return -ENOMEM; ++ } ++ priv->mn.ops = &gntdev_mmu_ops; ++ mmu_notifier_register(&priv->mn, priv->mm); ++ mmput(priv->mm); ++ ++ flip->private_data = priv; ++ if (debug) ++ printk("%s: priv %p\n", __FUNCTION__, priv); ++ ++ return 
0; ++} ++ ++static int gntdev_release(struct inode *inode, struct file *flip) ++{ ++ struct gntdev_priv *priv = flip->private_data; ++ struct grant_map *map; ++ int err; ++ ++ if (debug) ++ printk("%s: priv %p\n", __FUNCTION__, priv); ++ ++ down_write(&priv->sem); ++ while (!list_empty(&priv->maps)) { ++ map = list_entry(priv->maps.next, struct grant_map, next); ++ err = gntdev_del_map(map); ++ WARN_ON(err); ++ } ++ up_write(&priv->sem); ++ mmu_notifier_unregister(&priv->mn, priv->mm); ++ kfree(priv); ++ return 0; ++} ++ ++static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, ++ struct ioctl_gntdev_map_grant_ref __user *u) ++{ ++ struct ioctl_gntdev_map_grant_ref op; ++ struct grant_map *map; ++ int err; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, add %d\n", __FUNCTION__, priv, ++ op.count); ++ if (unlikely(op.count <= 0)) ++ return -EINVAL; ++ if (unlikely(op.count > priv->limit)) ++ return -EINVAL; ++ ++ down_write(&priv->sem); ++ err = -ENOMEM; ++ map = gntdev_add_map(priv, op.count); ++ if (!map) ++ goto err_unlock; ++ ++ err = -ENOMEM; ++ if (copy_from_user(map->grants, &u->refs, ++ sizeof(map->grants[0]) * op.count) != 0) ++ goto err_free; ++ op.index = map->index << PAGE_SHIFT; ++ if (copy_to_user(u, &op, sizeof(op)) != 0) ++ goto err_free; ++ up_write(&priv->sem); ++ return 0; ++ ++err_free: ++ gntdev_del_map(map); ++err_unlock: ++ up_write(&priv->sem); ++ return err; ++} ++ ++static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, ++ struct ioctl_gntdev_unmap_grant_ref __user *u) ++{ ++ struct ioctl_gntdev_unmap_grant_ref op; ++ struct grant_map *map; ++ int err = -EINVAL; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv, ++ (int)op.index, (int)op.count); ++ ++ down_write(&priv->sem); ++ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); ++ if (map) ++ err = gntdev_del_map(map); ++ up_write(&priv->sem); ++ return err; ++} ++ ++static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, ++ struct ioctl_gntdev_get_offset_for_vaddr __user *u) ++{ ++ struct ioctl_gntdev_get_offset_for_vaddr op; ++ struct grant_map *map; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv, ++ (unsigned long)op.vaddr); ++ ++ down_read(&priv->sem); ++ map = gntdev_find_map_vaddr(priv, op.vaddr); ++ if (map == NULL || ++ map->vma->vm_start != op.vaddr) { ++ up_read(&priv->sem); ++ return -EINVAL; ++ } ++ op.offset = map->index << PAGE_SHIFT; ++ op.count = map->count; ++ up_read(&priv->sem); ++ ++ if (copy_to_user(u, &op, sizeof(op)) != 0) ++ return -EFAULT; ++ return 0; ++} ++ ++static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, ++ struct ioctl_gntdev_set_max_grants __user *u) ++{ ++ struct ioctl_gntdev_set_max_grants op; ++ ++ if (copy_from_user(&op, u, sizeof(op)) != 0) ++ return -EFAULT; ++ if (debug) ++ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count); ++ if (op.count > limit) ++ return -EINVAL; ++ ++ down_write(&priv->sem); ++ priv->limit = op.count; ++ up_write(&priv->sem); ++ return 0; ++} ++ ++static long gntdev_ioctl(struct file *flip, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct gntdev_priv *priv = flip->private_data; ++ void __user *ptr = (void __user *)arg; ++ ++ switch (cmd) { ++ case IOCTL_GNTDEV_MAP_GRANT_REF: ++ return 
gntdev_ioctl_map_grant_ref(priv, ptr); ++ ++ case IOCTL_GNTDEV_UNMAP_GRANT_REF: ++ return gntdev_ioctl_unmap_grant_ref(priv, ptr); ++ ++ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: ++ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); ++ ++ case IOCTL_GNTDEV_SET_MAX_GRANTS: ++ return gntdev_ioctl_set_max_grants(priv, ptr); ++ ++ default: ++ if (debug) ++ printk("%s: priv %p, unknown cmd %x\n", ++ __FUNCTION__, priv, cmd); ++ return -ENOIOCTLCMD; ++ } ++ ++ return 0; ++} ++ ++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) ++{ ++ struct gntdev_priv *priv = flip->private_data; ++ int index = vma->vm_pgoff; ++ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; ++ struct grant_map *map; ++ int err = -EINVAL; ++ ++ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) ++ return -EINVAL; ++ ++ if (debug) ++ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__, ++ index, count, vma->vm_start, vma->vm_pgoff); ++ ++ down_read(&priv->sem); ++ map = gntdev_find_map_index(priv, index, count); ++ if (!map) ++ goto unlock_out; ++ if (map->vma) ++ goto unlock_out; ++ if (priv->mm != vma->vm_mm) { ++ printk("%s: Huh? Other mm?\n", __FUNCTION__); ++ goto unlock_out; ++ } ++ ++ vma->vm_ops = &gntdev_vmops; ++ ++ vma->vm_flags |= VM_RESERVED; ++ vma->vm_flags |= VM_DONTCOPY; ++ vma->vm_flags |= VM_DONTEXPAND; ++ ++ vma->vm_private_data = map; ++ map->vma = vma; ++ ++ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte; ++ if (!(vma->vm_flags & VM_WRITE)) ++ map->flags |= GNTMAP_readonly; ++ ++ err = apply_to_page_range(vma->vm_mm, vma->vm_start, ++ vma->vm_end - vma->vm_start, ++ find_grant_ptes, map); ++ if (err) { ++ if (debug) ++ printk("%s: find_grant_ptes() failure.\n", __FUNCTION__); ++ goto unlock_out; ++ } ++ ++ err = map_grant_pages(map); ++ if (err) { ++ if (debug) ++ printk("%s: map_grant_pages() failure.\n", __FUNCTION__); ++ goto unlock_out; ++ } ++ map->is_mapped = 1; ++ ++unlock_out: ++ up_read(&priv->sem); ++ return err; ++} ++ ++static const struct file_operations gntdev_fops = { ++ .owner = THIS_MODULE, ++ .open = gntdev_open, ++ .release = gntdev_release, ++ .mmap = gntdev_mmap, ++ .unlocked_ioctl = gntdev_ioctl ++}; ++ ++static struct miscdevice gntdev_miscdev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "gntdev", ++ .fops = &gntdev_fops, ++}; ++ ++/* ------------------------------------------------------------------ */ ++ ++static int __init gntdev_init(void) ++{ ++ int err; ++ ++ if (!xen_domain()) ++ return -ENODEV; ++ ++ err = misc_register(&gntdev_miscdev); ++ if (err != 0) { ++ printk(KERN_ERR "Could not register gntdev device\n"); ++ return err; ++ } ++ return 0; ++} ++ ++static void __exit gntdev_exit(void) ++{ ++ misc_deregister(&gntdev_miscdev); ++} ++ ++module_init(gntdev_init); ++module_exit(gntdev_exit); ++ ++/* ------------------------------------------------------------------ */ +diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c +index 7d8f531..76fe621 100644 +--- a/drivers/xen/grant-table.c ++++ b/drivers/xen/grant-table.c +@@ -37,6 +37,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -472,6 +473,111 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) + return 0; + } + ++static void gnttab_page_free(struct page *page, unsigned int order) ++{ ++ BUG_ON(order); ++ ClearPageForeign(page); ++ gnttab_reset_grant_page(page); ++ put_page(page); ++} ++ ++/* ++ * Must not be called with IRQs off. This should only be used on the ++ * slow path.
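Taken together, gntdev_ioctl_map_grant_ref() and gntdev_mmap() above give user space a two-step interface: the ioctl reserves a slot range and returns a byte offset in op.index (map->index << PAGE_SHIFT), and an mmap() of that offset maps the granted pages. The user-space sketch below assumes that the ioctl structures and IOCTL_GNTDEV_MAP_GRANT_REF come from the gntdev public header, which is not part of this hunk, and that udev exposes the misc device as /dev/gntdev; both are assumptions rather than something this patch guarantees.

/*
 * User-space sketch (not part of the patch): map a single page granted by
 * a foreign domain. Header location, struct layout and device path are
 * assumptions; error handling is abbreviated. The file descriptor must stay
 * open for the lifetime of the mapping, since gntdev_release() tears the
 * maps down.
 */
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <xen/gntdev.h>		/* assumed: declares the IOCTL_GNTDEV_* ops */

static void *map_foreign_page(int fd, int domid, int gref)
{
	struct ioctl_gntdev_map_grant_ref op;
	void *addr;

	memset(&op, 0, sizeof(op));
	op.count = 1;
	op.refs[0].domid = domid;
	op.refs[0].ref = gref;
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) < 0)
		return NULL;

	/* op.index is the mmap offset handed back by the ioctl above. */
	addr = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, op.index);
	return addr == MAP_FAILED ? NULL : addr;
}

/* Caller: int fd = open("/dev/gntdev", O_RDWR); then map_foreign_page(fd, ...). */

Unmapping is the reverse: munmap() the range, then issue IOCTL_GNTDEV_UNMAP_GRANT_REF with the same index and count so gntdev_del_map() can release the slots.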
++ * ++ * Copy a foreign granted page to local memory. ++ */ ++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep) ++{ ++ struct gnttab_unmap_and_replace unmap; ++ struct mmu_update mmu; ++ struct page *page; ++ struct page *new_page; ++ void *new_addr; ++ void *addr; ++ unsigned long pfn; ++ unsigned long mfn; ++ unsigned long new_mfn; ++ int err; ++ ++ page = *pagep; ++ if (!get_page_unless_zero(page)) ++ return -ENOENT; ++ ++ err = -ENOMEM; ++ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); ++ if (!new_page) ++ goto out; ++ ++ new_addr = page_address(new_page); ++ addr = page_address(page); ++ memcpy(new_addr, addr, PAGE_SIZE); ++ ++ pfn = page_to_pfn(page); ++ mfn = pfn_to_mfn(pfn); ++ new_mfn = virt_to_mfn(new_addr); ++ ++// write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */ ++ ++ /* Make seq visible before checking page_mapped. */ ++ smp_mb(); ++ ++ /* Has the page been DMA-mapped? */ ++ if (unlikely(page_mapped(page))) { ++ //write_sequnlock(&gnttab_dma_lock); ++ put_page(new_page); ++ err = -EBUSY; ++ goto out; ++ } ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) ++ set_phys_to_machine(pfn, new_mfn); ++ ++ //gnttab_set_replace_op(&unmap, (unsigned long)addr, ++ // (unsigned long)new_addr, ref); ++ unmap.host_addr = (unsigned long)addr; ++ unmap.new_addr = (unsigned long)new_addr; ++ unmap.handle = ref; ++ ++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace, ++ &unmap, 1); ++ BUG_ON(err); ++ BUG_ON(unmap.status); ++ ++// write_sequnlock(&gnttab_dma_lock); ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY); ++ ++ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; ++ mmu.val = pfn; ++ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF); ++ BUG_ON(err); ++ } ++ ++ new_page->mapping = page->mapping; ++ SetPageForeign(new_page, _PageForeignDestructor(page)); ++ if (PageReserved(page)) ++ SetPageReserved(new_page); ++ *pagep = new_page; ++ ++ SetPageForeign(page, gnttab_page_free); ++ ClearPageReserved(page); ++ page->mapping = NULL; ++ ++out: ++ put_page(page); ++ return err; ++} ++EXPORT_SYMBOL_GPL(gnttab_copy_grant_page); ++ ++void gnttab_reset_grant_page(struct page *page) ++{ ++ init_page_count(page); ++ reset_page_mapcount(page); ++} ++EXPORT_SYMBOL_GPL(gnttab_reset_grant_page); ++ + int gnttab_resume(void) + { + if (max_nr_grant_frames() < nr_grant_frames) +diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile +new file mode 100644 +index 0000000..e346e81 +--- /dev/null ++++ b/drivers/xen/netback/Makefile +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o ++ ++xen-netback-y := netback.o xenbus.o interface.o +diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h +new file mode 100644 +index 0000000..51f97c0 +--- /dev/null ++++ b/drivers/xen/netback/common.h +@@ -0,0 +1,227 @@ ++/****************************************************************************** ++ * arch/xen/drivers/netif/backend/common.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the 
Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __NETIF__BACKEND__COMMON_H__ ++#define __NETIF__BACKEND__COMMON_H__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DPRINTK(_f, _a...) \ ++ pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++#define IPRINTK(fmt, args...) \ ++ printk(KERN_INFO "xen_net: " fmt, ##args) ++#define WPRINTK(fmt, args...) \ ++ printk(KERN_WARNING "xen_net: " fmt, ##args) ++ ++struct xen_netif { ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ unsigned int handle; ++ ++ u8 fe_dev_addr[6]; ++ ++ /* Physical parameters of the comms window. */ ++ grant_handle_t tx_shmem_handle; ++ grant_ref_t tx_shmem_ref; ++ grant_handle_t rx_shmem_handle; ++ grant_ref_t rx_shmem_ref; ++ unsigned int irq; ++ ++ /* The shared rings and indexes. */ ++ struct xen_netif_tx_back_ring tx; ++ struct xen_netif_rx_back_ring rx; ++ struct vm_struct *tx_comms_area; ++ struct vm_struct *rx_comms_area; ++ ++ /* Set of features that can be turned on in dev->features. */ ++ int features; ++ ++ int smart_poll; ++ ++ /* Internal feature information. */ ++ u8 can_queue:1; /* can queue packets for receiver? */ ++ ++ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */ ++ RING_IDX rx_req_cons_peek; ++ ++ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ ++ unsigned long credit_bytes; ++ unsigned long credit_usec; ++ unsigned long remaining_credit; ++ struct timer_list credit_timeout; ++ ++ /* Enforce draining of the transmit queue. */ ++ struct timer_list tx_queue_timeout; ++ ++ /* Statistics */ ++ int nr_copied_skbs; ++ ++ /* Miscellaneous private stuff. */ ++ struct list_head list; /* scheduling list */ ++ atomic_t refcnt; ++ struct net_device *dev; ++ struct net_device_stats stats; ++ ++ unsigned int carrier; ++ ++ wait_queue_head_t waiting_to_free; ++}; ++ ++/* ++ * Implement our own carrier flag: the network stack's version causes delays ++ * when the carrier is re-enabled (in particular, dev_activate() may not ++ * immediately be called, which can cause packet loss; also the etherbridge ++ * can be rather lazy in activating its port). 
++ */ ++#define netback_carrier_on(netif) ((netif)->carrier = 1) ++#define netback_carrier_off(netif) ((netif)->carrier = 0) ++#define netback_carrier_ok(netif) ((netif)->carrier) ++ ++enum { ++ NETBK_DONT_COPY_SKB, ++ NETBK_DELAYED_COPY_SKB, ++ NETBK_ALWAYS_COPY_SKB, ++}; ++ ++extern int netbk_copy_skb_mode; ++ ++/* Function pointers into netback accelerator plugin modules */ ++struct netback_accel_hooks { ++ struct module *owner; ++ int (*probe)(struct xenbus_device *dev); ++ int (*remove)(struct xenbus_device *dev); ++}; ++ ++/* Structure to track the state of a netback accelerator plugin */ ++struct netback_accelerator { ++ struct list_head link; ++ int id; ++ char *eth_name; ++ atomic_t use_count; ++ struct netback_accel_hooks *hooks; ++}; ++ ++struct backend_info { ++ struct xenbus_device *dev; ++ struct xen_netif *netif; ++ enum xenbus_state frontend_state; ++ struct xenbus_watch hotplug_status_watch; ++ int have_hotplug_status_watch:1; ++ ++ /* State relating to the netback accelerator */ ++ void *netback_accel_priv; ++ /* The accelerator that this backend is currently using */ ++ struct netback_accelerator *accelerator; ++}; ++ ++#define NETBACK_ACCEL_VERSION 0x00010001 ++ ++/* ++ * Connect an accelerator plugin module to netback. Returns zero on ++ * success, < 0 on error, > 0 (with highest version number supported) ++ * if version mismatch. ++ */ ++extern int netback_connect_accelerator(unsigned version, ++ int id, const char *eth_name, ++ struct netback_accel_hooks *hooks); ++/* Disconnect a previously connected accelerator plugin module */ ++extern void netback_disconnect_accelerator(int id, const char *eth_name); ++ ++ ++extern ++void netback_probe_accelerators(struct backend_info *be, ++ struct xenbus_device *dev); ++extern ++void netback_remove_accelerators(struct backend_info *be, ++ struct xenbus_device *dev); ++extern ++void netif_accel_init(void); ++ ++ ++#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE) ++#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE) ++ ++void netif_disconnect(struct xen_netif *netif); ++ ++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle); ++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref, ++ unsigned long rx_ring_ref, unsigned int evtchn); ++ ++static inline void netif_get(struct xen_netif *netif) ++{ ++ atomic_inc(&netif->refcnt); ++} ++ ++static inline void netif_put(struct xen_netif *netif) ++{ ++ if (atomic_dec_and_test(&netif->refcnt)) ++ wake_up(&netif->waiting_to_free); ++} ++ ++int netif_xenbus_init(void); ++ ++#define netif_schedulable(netif) \ ++ (netif_running((netif)->dev) && netback_carrier_ok(netif)) ++ ++void netif_schedule_work(struct xen_netif *netif); ++void netif_deschedule_work(struct xen_netif *netif); ++ ++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); ++struct net_device_stats *netif_be_get_stats(struct net_device *dev); ++irqreturn_t netif_be_int(int irq, void *dev_id); ++ ++static inline int netbk_can_queue(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ return netif->can_queue; ++} ++ ++static inline int netbk_can_sg(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ return netif->features & NETIF_F_SG; ++} ++ ++#endif /* __NETIF__BACKEND__COMMON_H__ */ +diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c +new file mode 100644 +index 0000000..b23b14d +--- /dev/null ++++ 
b/drivers/xen/netback/interface.c +@@ -0,0 +1,405 @@ ++/****************************************************************************** ++ * arch/xen/drivers/netif/backend/interface.c ++ * ++ * Network-device interface management. ++ * ++ * Copyright (c) 2004-2005, Keir Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++#include ++#include ++ ++#include ++#include ++ ++/* ++ * Module parameter 'queue_length': ++ * ++ * Enables queuing in the network stack when a client has run out of receive ++ * descriptors. Although this feature can improve receive bandwidth by avoiding ++ * packet loss, it can also result in packets sitting in the 'tx_queue' for ++ * unbounded time. This is bad if those packets hold onto foreign resources. ++ * For example, consider a packet that holds onto resources belonging to the ++ * guest for which it is queued (e.g., packet received on vif1.0, destined for ++ * vif1.1 which is not activated in the guest): in this situation the guest ++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we ++ * run a timer (tx_queue_timeout) to drain the queue when the interface is ++ * blocked. ++ */ ++static unsigned long netbk_queue_length = 32; ++module_param_named(queue_length, netbk_queue_length, ulong, 0644); ++ ++static void __netif_up(struct xen_netif *netif) ++{ ++ enable_irq(netif->irq); ++ netif_schedule_work(netif); ++} ++ ++static void __netif_down(struct xen_netif *netif) ++{ ++ disable_irq(netif->irq); ++ netif_deschedule_work(netif); ++} ++ ++static int net_open(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ if (netback_carrier_ok(netif)) { ++ __netif_up(netif); ++ netif_start_queue(dev); ++ } ++ return 0; ++} ++ ++static int net_close(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ if (netback_carrier_ok(netif)) ++ __netif_down(netif); ++ netif_stop_queue(dev); ++ return 0; ++} ++ ++static int netbk_change_mtu(struct net_device *dev, int mtu) ++{ ++ int max = netbk_can_sg(dev) ? 
65535 - ETH_HLEN : ETH_DATA_LEN; ++ ++ if (mtu > max) ++ return -EINVAL; ++ dev->mtu = mtu; ++ return 0; ++} ++ ++static int netbk_set_sg(struct net_device *dev, u32 data) ++{ ++ if (data) { ++ struct xen_netif *netif = netdev_priv(dev); ++ ++ if (!(netif->features & NETIF_F_SG)) ++ return -ENOSYS; ++ } ++ ++ if (dev->mtu > ETH_DATA_LEN) ++ dev->mtu = ETH_DATA_LEN; ++ ++ return ethtool_op_set_sg(dev, data); ++} ++ ++static int netbk_set_tso(struct net_device *dev, u32 data) ++{ ++ if (data) { ++ struct xen_netif *netif = netdev_priv(dev); ++ ++ if (!(netif->features & NETIF_F_TSO)) ++ return -ENOSYS; ++ } ++ ++ return ethtool_op_set_tso(dev, data); ++} ++ ++static void netbk_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ strcpy(info->driver, "netbk"); ++ strcpy(info->bus_info, dev_name(dev->dev.parent)); ++} ++ ++static const struct netif_stat { ++ char name[ETH_GSTRING_LEN]; ++ u16 offset; ++} netbk_stats[] = { ++ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) }, ++}; ++ ++static int netbk_get_stats_count(struct net_device *dev) ++{ ++ return ARRAY_SIZE(netbk_stats); ++} ++ ++static void netbk_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, u64 * data) ++{ ++ void *netif = netdev_priv(dev); ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++) ++ data[i] = *(int *)(netif + netbk_stats[i].offset); ++} ++ ++static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data) ++{ ++ int i; ++ ++ switch (stringset) { ++ case ETH_SS_STATS: ++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++) ++ memcpy(data + i * ETH_GSTRING_LEN, ++ netbk_stats[i].name, ETH_GSTRING_LEN); ++ break; ++ } ++} ++ ++static struct ethtool_ops network_ethtool_ops = ++{ ++ .get_drvinfo = netbk_get_drvinfo, ++ ++ .get_tx_csum = ethtool_op_get_tx_csum, ++ .set_tx_csum = ethtool_op_set_tx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = netbk_set_sg, ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = netbk_set_tso, ++ .get_link = ethtool_op_get_link, ++ ++ .get_stats_count = netbk_get_stats_count, ++ .get_ethtool_stats = netbk_get_ethtool_stats, ++ .get_strings = netbk_get_strings, ++}; ++ ++static struct net_device_ops netback_ops = ++{ ++ .ndo_start_xmit = netif_be_start_xmit, ++ .ndo_get_stats = netif_be_get_stats, ++ .ndo_open = net_open, ++ .ndo_stop = net_close, ++ .ndo_change_mtu = netbk_change_mtu, ++}; ++ ++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle) ++{ ++ int err = 0; ++ struct net_device *dev; ++ struct xen_netif *netif; ++ char name[IFNAMSIZ] = {}; ++ ++ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); ++ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup); ++ if (dev == NULL) { ++ DPRINTK("Could not create netif: out of memory\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ SET_NETDEV_DEV(dev, parent); ++ ++ netif = netdev_priv(dev); ++ memset(netif, 0, sizeof(*netif)); ++ netif->domid = domid; ++ netif->handle = handle; ++ netif->features = NETIF_F_SG; ++ atomic_set(&netif->refcnt, 1); ++ init_waitqueue_head(&netif->waiting_to_free); ++ netif->dev = dev; ++ INIT_LIST_HEAD(&netif->list); ++ ++ netback_carrier_off(netif); ++ ++ netif->credit_bytes = netif->remaining_credit = ~0UL; ++ netif->credit_usec = 0UL; ++ init_timer(&netif->credit_timeout); ++ /* Initialize 'expires' now: it's used to track the credit window. 
*/ ++ netif->credit_timeout.expires = jiffies; ++ ++ init_timer(&netif->tx_queue_timeout); ++ ++ dev->netdev_ops = &netback_ops; ++ dev->features = NETIF_F_IP_CSUM|NETIF_F_SG; ++ ++ SET_ETHTOOL_OPS(dev, &network_ethtool_ops); ++ ++ dev->tx_queue_len = netbk_queue_length; ++ ++ /* ++ * Initialise a dummy MAC address. We choose the numerically ++ * largest non-broadcast address to prevent the address getting ++ * stolen by an Ethernet bridge for STP purposes. ++ * (FE:FF:FF:FF:FF:FF) ++ */ ++ memset(dev->dev_addr, 0xFF, ETH_ALEN); ++ dev->dev_addr[0] &= ~0x01; ++ ++ rtnl_lock(); ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ if (err) { ++ DPRINTK("Could not register new net device %s: err=%d\n", ++ dev->name, err); ++ free_netdev(dev); ++ return ERR_PTR(err); ++ } ++ ++ DPRINTK("Successfully created netif\n"); ++ return netif; ++} ++ ++static int map_frontend_pages( ++ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, tx_ring_ref, netif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n"); ++ return op.status; ++ } ++ ++ netif->tx_shmem_ref = tx_ring_ref; ++ netif->tx_shmem_handle = op.handle; ++ ++ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr, ++ GNTMAP_host_map, rx_ring_ref, netif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ struct gnttab_unmap_grant_ref unop; ++ ++ gnttab_set_unmap_op(&unop, ++ (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, netif->tx_shmem_handle); ++ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1); ++ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n"); ++ return op.status; ++ } ++ ++ netif->rx_shmem_ref = rx_ring_ref; ++ netif->rx_shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_pages(struct xen_netif *netif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, netif->tx_shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++ ++ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr, ++ GNTMAP_host_map, netif->rx_shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref, ++ unsigned long rx_ring_ref, unsigned int evtchn) ++{ ++ int err = -ENOMEM; ++ struct xen_netif_tx_sring *txs; ++ struct xen_netif_rx_sring *rxs; ++ ++ /* Already connected through? 
*/ ++ if (netif->irq) ++ return 0; ++ ++ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE); ++ if (netif->tx_comms_area == NULL) ++ return -ENOMEM; ++ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE); ++ if (netif->rx_comms_area == NULL) ++ goto err_rx; ++ ++ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref); ++ if (err) ++ goto err_map; ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ netif->domid, evtchn, netif_be_int, 0, ++ netif->dev->name, netif); ++ if (err < 0) ++ goto err_hypervisor; ++ netif->irq = err; ++ disable_irq(netif->irq); ++ ++ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr; ++ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE); ++ ++ rxs = (struct xen_netif_rx_sring *) ++ ((char *)netif->rx_comms_area->addr); ++ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE); ++ ++ netif->rx_req_cons_peek = 0; ++ ++ netif_get(netif); ++ ++ rtnl_lock(); ++ netback_carrier_on(netif); ++ if (netif_running(netif->dev)) ++ __netif_up(netif); ++ rtnl_unlock(); ++ ++ return 0; ++err_hypervisor: ++ unmap_frontend_pages(netif); ++err_map: ++ free_vm_area(netif->rx_comms_area); ++err_rx: ++ free_vm_area(netif->tx_comms_area); ++ return err; ++} ++ ++void netif_disconnect(struct xen_netif *netif) ++{ ++ if (netback_carrier_ok(netif)) { ++ rtnl_lock(); ++ netback_carrier_off(netif); ++ netif_carrier_off(netif->dev); /* discard queued packets */ ++ if (netif_running(netif->dev)) ++ __netif_down(netif); ++ rtnl_unlock(); ++ netif_put(netif); ++ } ++ ++ atomic_dec(&netif->refcnt); ++ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0); ++ ++ del_timer_sync(&netif->credit_timeout); ++ del_timer_sync(&netif->tx_queue_timeout); ++ ++ if (netif->irq) ++ unbind_from_irqhandler(netif->irq, netif); ++ ++ unregister_netdev(netif->dev); ++ ++ if (netif->tx.sring) { ++ unmap_frontend_pages(netif); ++ free_vm_area(netif->tx_comms_area); ++ free_vm_area(netif->rx_comms_area); ++ } ++ ++ free_netdev(netif->dev); ++} +diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c +new file mode 100644 +index 0000000..0bc6398 +--- /dev/null ++++ b/drivers/xen/netback/netback.c +@@ -0,0 +1,1613 @@ ++/****************************************************************************** ++ * drivers/xen/netback/netback.c ++ * ++ * Back-end of the driver for virtual network devices. This portion of the ++ * driver exports a 'unified' network-device interface that can be accessed ++ * by any operating system that implements a compatible front end. 
A ++ * reference front-end implementation can be found in: ++ * drivers/xen/netfront/netfront.c ++ * ++ * Copyright (c) 2002-2005, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++ ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++/*define NETBE_DEBUG_INTERRUPT*/ ++ ++struct netbk_rx_meta { ++ skb_frag_t frag; ++ int id; ++}; ++ ++struct netbk_tx_pending_inuse { ++ struct list_head list; ++ unsigned long alloc_time; ++}; ++ ++ ++static void netif_idx_release(u16 pending_idx); ++static void make_tx_response(struct xen_netif *netif, ++ struct xen_netif_tx_request *txp, ++ s8 st); ++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif, ++ u16 id, ++ s8 st, ++ u16 offset, ++ u16 size, ++ u16 flags); ++ ++static void net_tx_action(unsigned long unused); ++static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); ++ ++static void net_rx_action(unsigned long unused); ++static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); ++ ++static struct timer_list net_timer; ++static struct timer_list netbk_tx_pending_timer; ++ ++#define MAX_PENDING_REQS 256 ++ ++static struct sk_buff_head rx_queue; ++ ++static struct page **mmap_pages; ++static inline unsigned long idx_to_pfn(unsigned int idx) ++{ ++ return page_to_pfn(mmap_pages[idx]); ++} ++ ++static inline unsigned long idx_to_kaddr(unsigned int idx) ++{ ++ return (unsigned long)pfn_to_kaddr(idx_to_pfn(idx)); ++} ++ ++/* extra field used in struct page */ ++static inline void netif_set_page_index(struct page *pg, unsigned int index) ++{ ++ *(unsigned long *)&pg->mapping = index + 1; ++} ++ ++static inline int netif_page_index(struct page *pg) ++{ ++ unsigned long idx = (unsigned long)pg->mapping - 1; ++ ++ if (!PageForeign(pg)) ++ return -1; ++ ++ if ((idx >= MAX_PENDING_REQS) || (mmap_pages[idx] != pg)) ++ return -1; ++ ++ return idx; ++} ++ ++/* ++ * This is the amount of packet we copy rather than map, so that the ++ * guest can't fiddle with the contents of the headers while we do ++ * packet processing on them (netfilter, routing, etc). 72 is enough ++ * to cover TCP+IP headers including options. 
++ */ ++#define PKT_PROT_LEN 72 ++ ++static struct pending_tx_info { ++ struct xen_netif_tx_request req; ++ struct xen_netif *netif; ++} pending_tx_info[MAX_PENDING_REQS]; ++static u16 pending_ring[MAX_PENDING_REQS]; ++typedef unsigned int pending_ring_idx_t; ++ ++static inline pending_ring_idx_t pending_index(unsigned i) ++{ ++ return i & (MAX_PENDING_REQS-1); ++} ++ ++static pending_ring_idx_t pending_prod, pending_cons; ++ ++static inline pending_ring_idx_t nr_pending_reqs(void) ++{ ++ return MAX_PENDING_REQS - pending_prod + pending_cons; ++} ++ ++/* Freed TX SKBs get batched on this ring before return to pending_ring. */ ++static u16 dealloc_ring[MAX_PENDING_REQS]; ++static pending_ring_idx_t dealloc_prod, dealloc_cons; ++ ++/* Doubly-linked list of in-use pending entries. */ ++static struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS]; ++static LIST_HEAD(pending_inuse_head); ++ ++static struct sk_buff_head tx_queue; ++ ++static grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; ++static struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS]; ++static struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS]; ++ ++static LIST_HEAD(net_schedule_list); ++static DEFINE_SPINLOCK(net_schedule_list_lock); ++ ++#define MAX_MFN_ALLOC 64 ++static unsigned long mfn_list[MAX_MFN_ALLOC]; ++static unsigned int alloc_index = 0; ++ ++/* Setting this allows the safe use of this driver without netloop. */ ++static int MODPARM_copy_skb = 1; ++module_param_named(copy_skb, MODPARM_copy_skb, bool, 0); ++MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop"); ++ ++int netbk_copy_skb_mode; ++ ++static inline unsigned long alloc_mfn(void) ++{ ++ BUG_ON(alloc_index == 0); ++ return mfn_list[--alloc_index]; ++} ++ ++static inline void maybe_schedule_tx_action(void) ++{ ++ smp_mb(); ++ if ((nr_pending_reqs() < (MAX_PENDING_REQS/2)) && ++ !list_empty(&net_schedule_list)) ++ tasklet_schedule(&net_tx_tasklet); ++} ++ ++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) ++{ ++ struct skb_shared_info *ninfo; ++ struct sk_buff *nskb; ++ unsigned long offset; ++ int ret; ++ int len; ++ int headlen; ++ ++ BUG_ON(skb_shinfo(skb)->frag_list != NULL); ++ ++ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN); ++ headlen = skb_end_pointer(nskb) - nskb->data; ++ if (headlen > skb_headlen(skb)) ++ headlen = skb_headlen(skb); ++ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen); ++ BUG_ON(ret); ++ ++ ninfo = skb_shinfo(nskb); ++ ninfo->gso_size = skb_shinfo(skb)->gso_size; ++ ninfo->gso_type = skb_shinfo(skb)->gso_type; ++ ++ offset = headlen; ++ len = skb->len - headlen; ++ ++ nskb->len = skb->len; ++ nskb->data_len = len; ++ nskb->truesize += len; ++ ++ while (len) { ++ struct page *page; ++ int copy; ++ int zero; ++ ++ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) { ++ dump_stack(); ++ goto err_free; ++ } ++ ++ copy = len >= PAGE_SIZE ? PAGE_SIZE : len; ++ zero = len >= PAGE_SIZE ? 
0 : __GFP_ZERO; ++ ++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero); ++ if (unlikely(!page)) ++ goto err_free; ++ ++ ret = skb_copy_bits(skb, offset, page_address(page), copy); ++ BUG_ON(ret); ++ ++ ninfo->frags[ninfo->nr_frags].page = page; ++ ninfo->frags[ninfo->nr_frags].page_offset = 0; ++ ninfo->frags[ninfo->nr_frags].size = copy; ++ ninfo->nr_frags++; ++ ++ offset += copy; ++ len -= copy; ++ } ++ ++ offset = nskb->data - skb->data; ++ ++ nskb->transport_header = skb->transport_header + offset; ++ nskb->network_header = skb->network_header + offset; ++ nskb->mac_header = skb->mac_header + offset; ++ ++ return nskb; ++ ++ err_free: ++ kfree_skb(nskb); ++ err: ++ return NULL; ++} ++ ++static inline int netbk_max_required_rx_slots(struct xen_netif *netif) ++{ ++ if (netif->features & (NETIF_F_SG|NETIF_F_TSO)) ++ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */ ++ return 1; /* all in one */ ++} ++ ++static inline int netbk_queue_full(struct xen_netif *netif) ++{ ++ RING_IDX peek = netif->rx_req_cons_peek; ++ RING_IDX needed = netbk_max_required_rx_slots(netif); ++ ++ return ((netif->rx.sring->req_prod - peek) < needed) || ++ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed); ++} ++ ++static void tx_queue_callback(unsigned long data) ++{ ++ struct xen_netif *netif = (struct xen_netif *)data; ++ if (netif_schedulable(netif)) ++ netif_wake_queue(netif->dev); ++} ++ ++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ ++ BUG_ON(skb->dev != dev); ++ ++ /* Drop the packet if the target domain has no receive buffers. */ ++ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif))) ++ goto drop; ++ ++ /* ++ * XXX For now we also copy skbuffs whose head crosses a page ++ * boundary, because netbk_gop_skb can't handle them. ++ */ ++ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) { ++ struct sk_buff *nskb = netbk_copy_skb(skb); ++ if ( unlikely(nskb == NULL) ) ++ goto drop; ++ /* Copy only the header fields we use in this driver. */ ++ nskb->dev = skb->dev; ++ nskb->ip_summed = skb->ip_summed; ++ dev_kfree_skb(skb); ++ skb = nskb; ++ } ++ ++ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 + ++ !!skb_shinfo(skb)->gso_size; ++ netif_get(netif); ++ ++ if (netbk_can_queue(dev) && netbk_queue_full(netif)) { ++ netif->rx.sring->req_event = netif->rx_req_cons_peek + ++ netbk_max_required_rx_slots(netif); ++ mb(); /* request notification /then/ check & stop the queue */ ++ if (netbk_queue_full(netif)) { ++ netif_stop_queue(dev); ++ /* ++ * Schedule 500ms timeout to restart the queue, thus ++ * ensuring that an inactive queue will be drained. ++ * Packets will be immediately be dropped until more ++ * receive buffers become available (see ++ * netbk_queue_full() check above). 
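
netbk_queue_full() above works on free-running ring indices: the frontend's req_prod minus the backend's peek counts the requests still unconsumed, and rsp_prod_pvt plus the ring size minus peek bounds how many responses can still be queued. A minimal model of that test (plain C; the fields of rx_ring_view are simplified stand-ins, and 256 stands in for NET_RX_RING_SIZE):

#include <stdbool.h>
#include <stdio.h>

#define RX_RING_SIZE 256u

struct rx_ring_view {
    unsigned int req_prod;      /* requests posted by the frontend */
    unsigned int rsp_prod_pvt;  /* responses the backend has produced */
    unsigned int req_peek;      /* next request index the backend will use */
};

/* 'needed' is the worst case for one skb: header + GSO extra info + frags. */
static bool queue_full(const struct rx_ring_view *r, unsigned int needed)
{
    return (r->req_prod - r->req_peek) < needed ||
           (r->rsp_prod_pvt + RX_RING_SIZE - r->req_peek) < needed;
}

int main(void)
{
    struct rx_ring_view r = { .req_prod = 1000, .rsp_prod_pvt = 990, .req_peek = 998 };

    /* 2 unconsumed requests and 248 slots of response headroom available. */
    printf("need 2: %s\n", queue_full(&r, 2) ? "full" : "ok");
    printf("need 20: %s\n", queue_full(&r, 20) ? "full" : "ok");
    return 0;
}
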
++ */ ++ netif->tx_queue_timeout.data = (unsigned long)netif; ++ netif->tx_queue_timeout.function = tx_queue_callback; ++ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); ++ } ++ } ++ ++ skb_queue_tail(&rx_queue, skb); ++ tasklet_schedule(&net_rx_tasklet); ++ ++ return 0; ++ ++ drop: ++ netif->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ return 0; ++} ++ ++struct netrx_pending_operations { ++ unsigned trans_prod, trans_cons; ++ unsigned mmu_prod, mmu_mcl; ++ unsigned mcl_prod, mcl_cons; ++ unsigned copy_prod, copy_cons; ++ unsigned meta_prod, meta_cons; ++ struct mmu_update *mmu; ++ struct gnttab_transfer *trans; ++ struct gnttab_copy *copy; ++ struct multicall_entry *mcl; ++ struct netbk_rx_meta *meta; ++}; ++ ++/* Set up the grant operations for this fragment. If it's a flipping ++ interface, we also set up the unmap request from here. */ ++static u16 netbk_gop_frag(struct xen_netif *netif, struct netbk_rx_meta *meta, ++ int i, struct netrx_pending_operations *npo, ++ struct page *page, unsigned long size, ++ unsigned long offset) ++{ ++ struct gnttab_copy *copy_gop; ++ struct xen_netif_rx_request *req; ++ unsigned long old_mfn; ++ int idx = netif_page_index(page); ++ ++ old_mfn = virt_to_mfn(page_address(page)); ++ ++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i); ++ ++ copy_gop = npo->copy + npo->copy_prod++; ++ copy_gop->flags = GNTCOPY_dest_gref; ++ if (idx > -1) { ++ struct pending_tx_info *src_pend = &pending_tx_info[idx]; ++ copy_gop->source.domid = src_pend->netif->domid; ++ copy_gop->source.u.ref = src_pend->req.gref; ++ copy_gop->flags |= GNTCOPY_source_gref; ++ } else { ++ copy_gop->source.domid = DOMID_SELF; ++ copy_gop->source.u.gmfn = old_mfn; ++ } ++ copy_gop->source.offset = offset; ++ copy_gop->dest.domid = netif->domid; ++ copy_gop->dest.offset = 0; ++ copy_gop->dest.u.ref = req->gref; ++ copy_gop->len = size; ++ ++ return req->id; ++} ++ ++static void netbk_gop_skb(struct sk_buff *skb, ++ struct netrx_pending_operations *npo) ++{ ++ struct xen_netif *netif = netdev_priv(skb->dev); ++ int nr_frags = skb_shinfo(skb)->nr_frags; ++ int i; ++ int extra; ++ struct netbk_rx_meta *head_meta, *meta; ++ ++ head_meta = npo->meta + npo->meta_prod++; ++ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type; ++ head_meta->frag.size = skb_shinfo(skb)->gso_size; ++ extra = !!head_meta->frag.size + 1; ++ ++ for (i = 0; i < nr_frags; i++) { ++ meta = npo->meta + npo->meta_prod++; ++ meta->frag = skb_shinfo(skb)->frags[i]; ++ meta->id = netbk_gop_frag(netif, meta, i + extra, npo, ++ meta->frag.page, ++ meta->frag.size, ++ meta->frag.page_offset); ++ } ++ ++ /* ++ * This must occur at the end to ensure that we don't trash skb_shinfo ++ * until we're done. We know that the head doesn't cross a page ++ * boundary because such packets get copied in netif_be_start_xmit. ++ */ ++ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo, ++ virt_to_page(skb->data), ++ skb_headlen(skb), ++ offset_in_page(skb->data)); ++ ++ netif->rx.req_cons += nr_frags + extra; ++} ++ ++static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta) ++{ ++ int i; ++ ++ for (i = 0; i < nr_frags; i++) ++ put_page(meta[i].frag.page); ++} ++ ++/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was ++ used to set up the operations on the top of ++ netrx_pending_operations, which have since been done. Check that ++ they didn't give any errors and advance over them. 
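
netbk_gop_frag() above picks the grant-copy source depending on whether the fragment still lives in a foreign guest page (recognised through the pending index stored in the struct page) or in dom0's own memory. The sketch below models only that branch with simplified local types — struct gnt_copy_desc, DOMID_SELF_ID and the flag macros are illustrative stand-ins, not the real grant-table ABI:

#include <stdint.h>
#include <stdio.h>

#define DOMID_SELF_ID   0x7ff0u   /* stand-in for DOMID_SELF */
#define COPY_DEST_GREF  (1u << 0) /* destination named by grant reference */
#define COPY_SRC_GREF   (1u << 1) /* source named by grant reference */

struct gnt_copy_desc {
    uint16_t src_domid, dst_domid;
    uint32_t src_ref_or_mfn;      /* grant ref if COPY_SRC_GREF, else frame */
    uint32_t dst_ref;
    uint32_t flags;
};

/* idx >= 0 means the page is a guest TX page we mapped; -1 means local memory. */
static void fill_copy(struct gnt_copy_desc *op, int foreign_idx,
                      uint16_t guest_domid, uint32_t guest_gref,
                      uint32_t local_mfn, uint16_t dst_domid, uint32_t dst_gref)
{
    op->flags = COPY_DEST_GREF;
    if (foreign_idx >= 0) {
        /* Copy straight from the sending guest's granted page. */
        op->src_domid = guest_domid;
        op->src_ref_or_mfn = guest_gref;
        op->flags |= COPY_SRC_GREF;
    } else {
        /* Locally generated data: name the source by machine frame. */
        op->src_domid = DOMID_SELF_ID;
        op->src_ref_or_mfn = local_mfn;
    }
    op->dst_domid = dst_domid;
    op->dst_ref = dst_gref;
}

int main(void)
{
    struct gnt_copy_desc op;

    fill_copy(&op, 7, 3, 42, 0, 5, 99);       /* foreign TX page from domain 3 */
    printf("src dom %u, flags %#x\n", (unsigned)op.src_domid, (unsigned)op.flags);

    fill_copy(&op, -1, 0, 0, 0x12345, 5, 99); /* locally generated packet */
    printf("src dom %#x, flags %#x\n", (unsigned)op.src_domid, (unsigned)op.flags);
    return 0;
}
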
*/ ++static int netbk_check_gop(int nr_frags, domid_t domid, ++ struct netrx_pending_operations *npo) ++{ ++ struct gnttab_copy *copy_op; ++ int status = NETIF_RSP_OKAY; ++ int i; ++ ++ for (i = 0; i <= nr_frags; i++) { ++ copy_op = npo->copy + npo->copy_cons++; ++ if (copy_op->status != GNTST_okay) { ++ DPRINTK("Bad status %d from copy to DOM%d.\n", ++ copy_op->status, domid); ++ status = NETIF_RSP_ERROR; ++ } ++ } ++ ++ return status; ++} ++ ++static void netbk_add_frag_responses(struct xen_netif *netif, int status, ++ struct netbk_rx_meta *meta, int nr_frags) ++{ ++ int i; ++ unsigned long offset; ++ ++ for (i = 0; i < nr_frags; i++) { ++ int id = meta[i].id; ++ int flags = (i == nr_frags - 1) ? 0 : NETRXF_more_data; ++ ++ offset = 0; ++ make_rx_response(netif, id, status, offset, ++ meta[i].frag.size, flags); ++ } ++} ++ ++static void net_rx_action(unsigned long unused) ++{ ++ struct xen_netif *netif = NULL; ++ s8 status; ++ u16 id, irq, flags; ++ struct xen_netif_rx_response *resp; ++ struct multicall_entry *mcl; ++ struct sk_buff_head rxq; ++ struct sk_buff *skb; ++ int notify_nr = 0; ++ int ret; ++ int nr_frags; ++ int count; ++ unsigned long offset; ++ ++ /* ++ * Putting hundreds of bytes on the stack is considered rude. ++ * Static works because a tasklet can only be on one CPU at any time. ++ */ ++ static struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3]; ++ static struct mmu_update rx_mmu[NET_RX_RING_SIZE]; ++ static struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE]; ++ static struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE]; ++ static unsigned char rx_notify[NR_IRQS]; ++ static u16 notify_list[NET_RX_RING_SIZE]; ++ static struct netbk_rx_meta meta[NET_RX_RING_SIZE]; ++ ++ struct netrx_pending_operations npo = { ++ mmu: rx_mmu, ++ trans: grant_trans_op, ++ copy: grant_copy_op, ++ mcl: rx_mcl, ++ meta: meta}; ++ ++ skb_queue_head_init(&rxq); ++ ++ count = 0; ++ ++ while ((skb = skb_dequeue(&rx_queue)) != NULL) { ++ nr_frags = skb_shinfo(skb)->nr_frags; ++ *(int *)skb->cb = nr_frags; ++ ++ netbk_gop_skb(skb, &npo); ++ ++ count += nr_frags + 1; ++ ++ __skb_queue_tail(&rxq, skb); ++ ++ /* Filled the batch queue? */ ++ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE) ++ break; ++ } ++ ++ BUG_ON(npo.meta_prod > ARRAY_SIZE(meta)); ++ ++ npo.mmu_mcl = npo.mcl_prod; ++ if (npo.mcl_prod) { ++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); ++ BUG_ON(npo.mmu_prod > ARRAY_SIZE(rx_mmu)); ++ mcl = npo.mcl + npo.mcl_prod++; ++ ++ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping); ++ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; ++ ++ mcl->op = __HYPERVISOR_mmu_update; ++ mcl->args[0] = (unsigned long)rx_mmu; ++ mcl->args[1] = npo.mmu_prod; ++ mcl->args[2] = 0; ++ mcl->args[3] = DOMID_SELF; ++ } ++ ++ if (npo.trans_prod) { ++ BUG_ON(npo.trans_prod > ARRAY_SIZE(grant_trans_op)); ++ mcl = npo.mcl + npo.mcl_prod++; ++ mcl->op = __HYPERVISOR_grant_table_op; ++ mcl->args[0] = GNTTABOP_transfer; ++ mcl->args[1] = (unsigned long)grant_trans_op; ++ mcl->args[2] = npo.trans_prod; ++ } ++ ++ if (npo.copy_prod) { ++ BUG_ON(npo.copy_prod > ARRAY_SIZE(grant_copy_op)); ++ mcl = npo.mcl + npo.mcl_prod++; ++ mcl->op = __HYPERVISOR_grant_table_op; ++ mcl->args[0] = GNTTABOP_copy; ++ mcl->args[1] = (unsigned long)grant_copy_op; ++ mcl->args[2] = npo.copy_prod; ++ } ++ ++ /* Nothing to do? 
*/ ++ if (!npo.mcl_prod) ++ return; ++ ++ BUG_ON(npo.mcl_prod > ARRAY_SIZE(rx_mcl)); ++ ++ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod); ++ BUG_ON(ret != 0); ++ /* The mmu_machphys_update() must not fail. */ ++ BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0); ++ ++ while ((skb = __skb_dequeue(&rxq)) != NULL) { ++ nr_frags = *(int *)skb->cb; ++ ++ netif = netdev_priv(skb->dev); ++ ++ netif->stats.tx_bytes += skb->len; ++ netif->stats.tx_packets++; ++ ++ status = netbk_check_gop(nr_frags, netif->domid, &npo); ++ ++ id = meta[npo.meta_cons].id; ++ flags = nr_frags ? NETRXF_more_data : 0; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */ ++ flags |= NETRXF_csum_blank | NETRXF_data_validated; ++ else if (skb->ip_summed == CHECKSUM_UNNECESSARY) ++ /* remote but checksummed. */ ++ flags |= NETRXF_data_validated; ++ ++ offset = 0; ++ resp = make_rx_response(netif, id, status, offset, ++ skb_headlen(skb), flags); ++ ++ if (meta[npo.meta_cons].frag.size) { ++ struct xen_netif_extra_info *gso = ++ (struct xen_netif_extra_info *) ++ RING_GET_RESPONSE(&netif->rx, ++ netif->rx.rsp_prod_pvt++); ++ ++ resp->flags |= NETRXF_extra_info; ++ ++ gso->u.gso.size = meta[npo.meta_cons].frag.size; ++ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; ++ gso->u.gso.pad = 0; ++ gso->u.gso.features = 0; ++ ++ gso->type = XEN_NETIF_EXTRA_TYPE_GSO; ++ gso->flags = 0; ++ } ++ ++ netbk_add_frag_responses(netif, status, ++ meta + npo.meta_cons + 1, ++ nr_frags); ++ ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret); ++ irq = netif->irq; ++ if (ret && !rx_notify[irq] && ++ (netif->smart_poll != 1)) { ++ rx_notify[irq] = 1; ++ notify_list[notify_nr++] = irq; ++ } ++ ++ if (netif_queue_stopped(netif->dev) && ++ netif_schedulable(netif) && ++ !netbk_queue_full(netif)) ++ netif_wake_queue(netif->dev); ++ ++ /* ++ * netfront_smartpoll_active indicates whether ++ * netfront timer is active. ++ */ ++ if ((netif->smart_poll == 1)) { ++ if (!(netif->rx.sring->netfront_smartpoll_active)) { ++ notify_remote_via_irq(irq); ++ netif->rx.sring->netfront_smartpoll_active = 1; ++ } ++ } ++ ++ netif_put(netif); ++ dev_kfree_skb(skb); ++ npo.meta_cons += nr_frags + 1; ++ } ++ ++ while (notify_nr != 0) { ++ irq = notify_list[--notify_nr]; ++ rx_notify[irq] = 0; ++ notify_remote_via_irq(irq); ++ } ++ ++ /* More work to do? 
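
The flag selection in net_rx_action() above encodes the checksum state for the frontend: a locally generated packet (CHECKSUM_PARTIAL) is marked both "checksum blank" and "data validated", while a packet already verified elsewhere (CHECKSUM_UNNECESSARY) only gets "data validated". A compact restatement of that mapping (plain C; the constants are illustrative stand-ins for the NETRXF_* and CHECKSUM_* values):

#include <stdio.h>

enum csum_state { CSUM_NONE, CSUM_UNNECESSARY, CSUM_PARTIAL };

#define RXF_MORE_DATA       (1u << 0)
#define RXF_CSUM_BLANK      (1u << 1)
#define RXF_DATA_VALIDATED  (1u << 2)

static unsigned int rx_flags(enum csum_state cs, int nr_frags)
{
    unsigned int flags = nr_frags ? RXF_MORE_DATA : 0;

    if (cs == CSUM_PARTIAL)          /* local packet: checksum not filled in yet */
        flags |= RXF_CSUM_BLANK | RXF_DATA_VALIDATED;
    else if (cs == CSUM_UNNECESSARY) /* remote but already checksummed */
        flags |= RXF_DATA_VALIDATED;

    return flags;
}

int main(void)
{
    printf("partial, 2 frags: %#x\n", rx_flags(CSUM_PARTIAL, 2));
    printf("unnecessary, 0 frags: %#x\n", rx_flags(CSUM_UNNECESSARY, 0));
    printf("none, 0 frags: %#x\n", rx_flags(CSUM_NONE, 0));
    return 0;
}
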
*/ ++ if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer)) ++ tasklet_schedule(&net_rx_tasklet); ++} ++ ++static void net_alarm(unsigned long unused) ++{ ++ tasklet_schedule(&net_rx_tasklet); ++} ++ ++static void netbk_tx_pending_timeout(unsigned long unused) ++{ ++ tasklet_schedule(&net_tx_tasklet); ++} ++ ++struct net_device_stats *netif_be_get_stats(struct net_device *dev) ++{ ++ struct xen_netif *netif = netdev_priv(dev); ++ return &netif->stats; ++} ++ ++static int __on_net_schedule_list(struct xen_netif *netif) ++{ ++ return !list_empty(&netif->list); ++} ++ ++static void remove_from_net_schedule_list(struct xen_netif *netif) ++{ ++ spin_lock_irq(&net_schedule_list_lock); ++ if (likely(__on_net_schedule_list(netif))) { ++ list_del_init(&netif->list); ++ netif_put(netif); ++ } ++ spin_unlock_irq(&net_schedule_list_lock); ++} ++ ++static void add_to_net_schedule_list_tail(struct xen_netif *netif) ++{ ++ if (__on_net_schedule_list(netif)) ++ return; ++ ++ spin_lock_irq(&net_schedule_list_lock); ++ if (!__on_net_schedule_list(netif) && ++ likely(netif_schedulable(netif))) { ++ list_add_tail(&netif->list, &net_schedule_list); ++ netif_get(netif); ++ } ++ spin_unlock_irq(&net_schedule_list_lock); ++} ++ ++void netif_schedule_work(struct xen_netif *netif) ++{ ++ int more_to_do; ++ ++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); ++ ++ if (more_to_do) { ++ add_to_net_schedule_list_tail(netif); ++ maybe_schedule_tx_action(); ++ } ++} ++ ++void netif_deschedule_work(struct xen_netif *netif) ++{ ++ remove_from_net_schedule_list(netif); ++} ++ ++ ++static void tx_add_credit(struct xen_netif *netif) ++{ ++ unsigned long max_burst, max_credit; ++ ++ /* ++ * Allow a burst big enough to transmit a jumbo packet of up to 128kB. ++ * Otherwise the interface can seize up due to insufficient credit. ++ */ ++ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size; ++ max_burst = min(max_burst, 131072UL); ++ max_burst = max(max_burst, netif->credit_bytes); ++ ++ /* Take care that adding a new chunk of credit doesn't wrap to zero. */ ++ max_credit = netif->remaining_credit + netif->credit_bytes; ++ if (max_credit < netif->remaining_credit) ++ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */ ++ ++ netif->remaining_credit = min(max_credit, max_burst); ++} ++ ++static void tx_credit_callback(unsigned long data) ++{ ++ struct xen_netif *netif = (struct xen_netif *)data; ++ tx_add_credit(netif); ++ netif_schedule_work(netif); ++} ++ ++static inline int copy_pending_req(pending_ring_idx_t pending_idx) ++{ ++ return gnttab_copy_grant_page(grant_tx_handle[pending_idx], ++ &mmap_pages[pending_idx]); ++} ++ ++inline static void net_tx_action_dealloc(void) ++{ ++ struct netbk_tx_pending_inuse *inuse, *n; ++ struct gnttab_unmap_grant_ref *gop; ++ u16 pending_idx; ++ pending_ring_idx_t dc, dp; ++ struct xen_netif *netif; ++ int ret; ++ LIST_HEAD(list); ++ ++ dc = dealloc_cons; ++ gop = tx_unmap_ops; ++ ++ /* ++ * Free up any grants we have finished using ++ */ ++ do { ++ dp = dealloc_prod; ++ ++ /* Ensure we see all indices enqueued by netif_idx_release(). */ ++ smp_rmb(); ++ ++ while (dc != dp) { ++ unsigned long pfn; ++ ++ pending_idx = dealloc_ring[pending_index(dc++)]; ++ list_move_tail(&pending_inuse[pending_idx].list, &list); ++ ++ pfn = idx_to_pfn(pending_idx); ++ /* Already unmapped? 
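
tx_add_credit() above refills the per-interface byte credit while guarding two corner cases: the allowed burst is the next request's size capped at a 128kB jumbo frame but never below the configured credit_bytes, and the addition is clamped so an overflowing unsigned sum cannot shrink the credit. The same arithmetic, extracted into a standalone sketch (function and parameter names are mine):

#include <limits.h>
#include <stdio.h>

static unsigned long refill_credit(unsigned long remaining,
                                   unsigned long credit_bytes,
                                   unsigned long next_req_size)
{
    unsigned long max_burst, max_credit;

    /* Burst: the pending request, capped at a 128kB jumbo frame,
     * but never below the configured per-interval credit. */
    max_burst = next_req_size < 131072UL ? next_req_size : 131072UL;
    if (max_burst < credit_bytes)
        max_burst = credit_bytes;

    /* Add a fresh chunk of credit, clamping if the sum wraps to zero. */
    max_credit = remaining + credit_bytes;
    if (max_credit < remaining)
        max_credit = ULONG_MAX;

    return max_credit < max_burst ? max_credit : max_burst;
}

int main(void)
{
    /* Normal case: 10kB left, 64kB per interval, 1500-byte request pending. */
    printf("%lu\n", refill_credit(10000, 65536, 1500));

    /* Near-overflow case: the clamp keeps the credit from wrapping. */
    printf("%lu\n", refill_credit(ULONG_MAX - 10, 65536, 131072));
    return 0;
}
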
*/ ++ if (!phys_to_machine_mapping_valid(pfn)) ++ continue; ++ ++ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx), ++ GNTMAP_host_map, ++ grant_tx_handle[pending_idx]); ++ gop++; ++ } ++ ++ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB || ++ list_empty(&pending_inuse_head)) ++ break; ++ ++ /* Copy any entries that have been pending for too long. */ ++ list_for_each_entry_safe(inuse, n, &pending_inuse_head, list) { ++ if (time_after(inuse->alloc_time + HZ / 2, jiffies)) ++ break; ++ ++ pending_idx = inuse - pending_inuse; ++ ++ pending_tx_info[pending_idx].netif->nr_copied_skbs++; ++ ++ switch (copy_pending_req(pending_idx)) { ++ case 0: ++ list_move_tail(&inuse->list, &list); ++ continue; ++ case -EBUSY: ++ list_del_init(&inuse->list); ++ continue; ++ case -ENOENT: ++ continue; ++ } ++ ++ break; ++ } ++ } while (dp != dealloc_prod); ++ ++ dealloc_cons = dc; ++ ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops); ++ BUG_ON(ret); ++ ++ list_for_each_entry_safe(inuse, n, &list, list) { ++ pending_idx = inuse - pending_inuse; ++ ++ netif = pending_tx_info[pending_idx].netif; ++ ++ make_tx_response(netif, &pending_tx_info[pending_idx].req, ++ NETIF_RSP_OKAY); ++ ++ /* Ready for next use. */ ++ gnttab_reset_grant_page(mmap_pages[pending_idx]); ++ ++ pending_ring[pending_index(pending_prod++)] = pending_idx; ++ ++ netif_put(netif); ++ ++ list_del_init(&inuse->list); ++ } ++} ++ ++static void netbk_tx_err(struct xen_netif *netif, struct xen_netif_tx_request *txp, RING_IDX end) ++{ ++ RING_IDX cons = netif->tx.req_cons; ++ ++ do { ++ make_tx_response(netif, txp, NETIF_RSP_ERROR); ++ if (cons >= end) ++ break; ++ txp = RING_GET_REQUEST(&netif->tx, cons++); ++ } while (1); ++ netif->tx.req_cons = cons; ++ netif_schedule_work(netif); ++ netif_put(netif); ++} ++ ++static int netbk_count_requests(struct xen_netif *netif, ++ struct xen_netif_tx_request *first, ++ struct xen_netif_tx_request *txp, int work_to_do) ++{ ++ RING_IDX cons = netif->tx.req_cons; ++ int frags = 0; ++ ++ if (!(first->flags & NETTXF_more_data)) ++ return 0; ++ ++ do { ++ if (frags >= work_to_do) { ++ DPRINTK("Need more frags\n"); ++ return -frags; ++ } ++ ++ if (unlikely(frags >= MAX_SKB_FRAGS)) { ++ DPRINTK("Too many frags\n"); ++ return -frags; ++ } ++ ++ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags), ++ sizeof(*txp)); ++ if (txp->size > first->size) { ++ DPRINTK("Frags galore\n"); ++ return -frags; ++ } ++ ++ first->size -= txp->size; ++ frags++; ++ ++ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { ++ DPRINTK("txp->offset: %x, size: %u\n", ++ txp->offset, txp->size); ++ return -frags; ++ } ++ } while ((txp++)->flags & NETTXF_more_data); ++ ++ return frags; ++} ++ ++static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netif *netif, ++ struct sk_buff *skb, ++ struct xen_netif_tx_request *txp, ++ struct gnttab_map_grant_ref *mop) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ skb_frag_t *frags = shinfo->frags; ++ unsigned long pending_idx = *((u16 *)skb->data); ++ int i, start; ++ ++ /* Skip first skb fragment if it is on same page as header fragment. 
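
netbk_count_requests() above walks the chained TX requests and rejects chains that outrun the remaining ring work, exceed MAX_SKB_FRAGS, grow instead of shrink the remaining size, or cross a page boundary. A self-contained model of the same validation over an array of fragments (plain C; struct tx_frag and the limits are illustrative, and a single -1 stands in for the driver's -frags return):

#include <stddef.h>
#include <stdio.h>

#define PAGE_SZ    4096u
#define MAX_FRAGS  18          /* stand-in for MAX_SKB_FRAGS */

struct tx_frag { unsigned int offset, size; };

/* Returns the number of extra fragments, or -1 if the chain is malformed. */
static int count_frags(unsigned int first_size,
                       const struct tx_frag *frags, size_t n, size_t work_to_do)
{
    size_t i;

    for (i = 0; i < n; i++) {
        if (i >= work_to_do || i >= MAX_FRAGS)
            return -1;                       /* chain longer than allowed */
        if (frags[i].size > first_size)
            return -1;                       /* fragments must consume the total */
        first_size -= frags[i].size;
        if (frags[i].offset + frags[i].size > PAGE_SZ)
            return -1;                       /* a fragment may not cross a page */
    }
    return (int)i;
}

int main(void)
{
    struct tx_frag ok[]  = { { 0, 1000 }, { 0, 500 } };
    struct tx_frag bad[] = { { 4000, 200 } };            /* crosses a page */

    printf("ok chain: %d\n", count_frags(2000, ok, 2, 16));
    printf("bad chain: %d\n", count_frags(2000, bad, 1, 16));
    return 0;
}
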
*/ ++ start = ((unsigned long)shinfo->frags[0].page == pending_idx); ++ ++ for (i = start; i < shinfo->nr_frags; i++, txp++) { ++ pending_idx = pending_ring[pending_index(pending_cons++)]; ++ ++ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx), ++ GNTMAP_host_map | GNTMAP_readonly, ++ txp->gref, netif->domid); ++ ++ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp)); ++ netif_get(netif); ++ pending_tx_info[pending_idx].netif = netif; ++ frags[i].page = (void *)pending_idx; ++ } ++ ++ return mop; ++} ++ ++static int netbk_tx_check_mop(struct sk_buff *skb, ++ struct gnttab_map_grant_ref **mopp) ++{ ++ struct gnttab_map_grant_ref *mop = *mopp; ++ int pending_idx = *((u16 *)skb->data); ++ struct xen_netif *netif = pending_tx_info[pending_idx].netif; ++ struct xen_netif_tx_request *txp; ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int nr_frags = shinfo->nr_frags; ++ int i, err, start; ++ ++ /* Check status of header. */ ++ err = mop->status; ++ if (unlikely(err)) { ++ txp = &pending_tx_info[pending_idx].req; ++ make_tx_response(netif, txp, NETIF_RSP_ERROR); ++ pending_ring[pending_index(pending_prod++)] = pending_idx; ++ netif_put(netif); ++ } else { ++ set_phys_to_machine( ++ __pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT, ++ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT)); ++ grant_tx_handle[pending_idx] = mop->handle; ++ } ++ ++ /* Skip first skb fragment if it is on same page as header fragment. */ ++ start = ((unsigned long)shinfo->frags[0].page == pending_idx); ++ ++ for (i = start; i < nr_frags; i++) { ++ int j, newerr; ++ ++ pending_idx = (unsigned long)shinfo->frags[i].page; ++ ++ /* Check error status: if okay then remember grant handle. */ ++ newerr = (++mop)->status; ++ if (likely(!newerr)) { ++ set_phys_to_machine( ++ __pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT, ++ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT)); ++ grant_tx_handle[pending_idx] = mop->handle; ++ /* Had a previous error? Invalidate this fragment. */ ++ if (unlikely(err)) ++ netif_idx_release(pending_idx); ++ continue; ++ } ++ ++ /* Error on this fragment: respond to client with an error. */ ++ txp = &pending_tx_info[pending_idx].req; ++ make_tx_response(netif, txp, NETIF_RSP_ERROR); ++ pending_ring[pending_index(pending_prod++)] = pending_idx; ++ netif_put(netif); ++ ++ /* Not the first error? Preceding frags already invalidated. */ ++ if (err) ++ continue; ++ ++ /* First error: invalidate header and preceding fragments. */ ++ pending_idx = *((u16 *)skb->data); ++ netif_idx_release(pending_idx); ++ for (j = start; j < i; j++) { ++ pending_idx = (unsigned long)shinfo->frags[i].page; ++ netif_idx_release(pending_idx); ++ } ++ ++ /* Remember the error: invalidate all subsequent fragments. 
*/ ++ err = newerr; ++ } ++ ++ *mopp = mop + 1; ++ return err; ++} ++ ++static void netbk_fill_frags(struct sk_buff *skb) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int nr_frags = shinfo->nr_frags; ++ int i; ++ ++ for (i = 0; i < nr_frags; i++) { ++ skb_frag_t *frag = shinfo->frags + i; ++ struct xen_netif_tx_request *txp; ++ unsigned long pending_idx; ++ ++ pending_idx = (unsigned long)frag->page; ++ ++ pending_inuse[pending_idx].alloc_time = jiffies; ++ list_add_tail(&pending_inuse[pending_idx].list, ++ &pending_inuse_head); ++ ++ txp = &pending_tx_info[pending_idx].req; ++ frag->page = virt_to_page(idx_to_kaddr(pending_idx)); ++ frag->size = txp->size; ++ frag->page_offset = txp->offset; ++ ++ skb->len += txp->size; ++ skb->data_len += txp->size; ++ skb->truesize += txp->size; ++ } ++} ++ ++int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras, ++ int work_to_do) ++{ ++ struct xen_netif_extra_info extra; ++ RING_IDX cons = netif->tx.req_cons; ++ ++ do { ++ if (unlikely(work_to_do-- <= 0)) { ++ DPRINTK("Missing extra info\n"); ++ return -EBADR; ++ } ++ ++ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons), ++ sizeof(extra)); ++ if (unlikely(!extra.type || ++ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) { ++ netif->tx.req_cons = ++cons; ++ DPRINTK("Invalid extra type: %d\n", extra.type); ++ return -EINVAL; ++ } ++ ++ memcpy(&extras[extra.type - 1], &extra, sizeof(extra)); ++ netif->tx.req_cons = ++cons; ++ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE); ++ ++ return work_to_do; ++} ++ ++static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso) ++{ ++ if (!gso->u.gso.size) { ++ DPRINTK("GSO size must not be zero.\n"); ++ return -EINVAL; ++ } ++ ++ /* Currently only TCPv4 S.O. is supported. */ ++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { ++ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type); ++ return -EINVAL; ++ } ++ ++ skb_shinfo(skb)->gso_size = gso->u.gso.size; ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; ++ ++ /* Header must be checked, and gso_segs computed. */ ++ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; ++ skb_shinfo(skb)->gso_segs = 0; ++ ++ return 0; ++} ++ ++static int skb_checksum_setup(struct sk_buff *skb) ++{ ++ struct iphdr *iph; ++ unsigned char *th; ++ int err = -EPROTO; ++ ++ if (skb->protocol != htons(ETH_P_IP)) ++ goto out; ++ ++ iph = (void *)skb->data; ++ th = skb->data + 4 * iph->ihl; ++ if (th >= skb_tail_pointer(skb)) ++ goto out; ++ ++ skb->csum_start = th - skb->head; ++ switch (iph->protocol) { ++ case IPPROTO_TCP: ++ skb->csum_offset = offsetof(struct tcphdr, check); ++ break; ++ case IPPROTO_UDP: ++ skb->csum_offset = offsetof(struct udphdr, check); ++ break; ++ default: ++ if (net_ratelimit()) ++ printk(KERN_ERR "Attempting to checksum a non-" ++ "TCP/UDP packet, dropping a protocol" ++ " %d packet", iph->protocol); ++ goto out; ++ } ++ ++ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb)) ++ goto out; ++ ++ err = 0; ++ ++out: ++ return err; ++} ++ ++static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size) ++{ ++ unsigned long now = jiffies; ++ unsigned long next_credit = ++ netif->credit_timeout.expires + ++ msecs_to_jiffies(netif->credit_usec / 1000); ++ ++ /* Timer could already be pending in rare cases. */ ++ if (timer_pending(&netif->credit_timeout)) ++ return true; ++ ++ /* Passed the point where we can replenish credit? 
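
skb_checksum_setup() above converts a CHECKSUM_PARTIAL packet into "the checksum starts at the transport header, and the device must fill in the field at this offset". The sketch below recomputes those two numbers for a raw IPv4 buffer, reading the IP header by byte offset and using locally defined transport-header layouts (they mirror, but are not, the kernel's struct tcphdr/udphdr):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct tcp_hdr { uint16_t source, dest; uint32_t seq, ack_seq;
                 uint16_t flags, window, check, urg_ptr; };
struct udp_hdr { uint16_t source, dest, len, check; };

/* Fills csum_start (offset of the transport header in the buffer) and
 * csum_offset (where inside that header the checksum field sits).
 * Returns 0 on success, -1 for protocols we cannot offload. */
static int checksum_setup(const uint8_t *pkt, size_t *csum_start, size_t *csum_offset)
{
    size_t ihl = (pkt[0] & 0x0f) * 4u;   /* IHL: IPv4 header length in 32-bit words */
    uint8_t protocol = pkt[9];           /* IPv4 protocol field */

    *csum_start = ihl;
    switch (protocol) {
    case 6:  /* TCP */
        *csum_offset = offsetof(struct tcp_hdr, check);
        return 0;
    case 17: /* UDP */
        *csum_offset = offsetof(struct udp_hdr, check);
        return 0;
    default:
        return -1;
    }
}

int main(void)
{
    uint8_t pkt[64] = { 0x45, 0, 0, 0, 0, 0, 0, 0, 64, 6 /* TCP */ };
    size_t start, off;

    if (checksum_setup(pkt, &start, &off) == 0)
        printf("checksum field at byte %zu\n", start + off);
    return 0;
}
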
*/ ++ if (time_after_eq(now, next_credit)) { ++ netif->credit_timeout.expires = now; ++ tx_add_credit(netif); ++ } ++ ++ /* Still too big to send right now? Set a callback. */ ++ if (size > netif->remaining_credit) { ++ netif->credit_timeout.data = ++ (unsigned long)netif; ++ netif->credit_timeout.function = ++ tx_credit_callback; ++ mod_timer(&netif->credit_timeout, ++ next_credit); ++ ++ return true; ++ } ++ ++ return false; ++} ++ ++static unsigned net_tx_build_mops(void) ++{ ++ struct gnttab_map_grant_ref *mop; ++ struct sk_buff *skb; ++ int ret; ++ ++ mop = tx_map_ops; ++ while (((nr_pending_reqs() + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && ++ !list_empty(&net_schedule_list)) { ++ struct xen_netif *netif; ++ struct xen_netif_tx_request txreq; ++ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS]; ++ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; ++ u16 pending_idx; ++ RING_IDX idx; ++ int work_to_do; ++ unsigned int data_len; ++ ++ /* Get a netif from the list with work to do. */ ++ netif = list_first_entry(&net_schedule_list, struct xen_netif, list); ++ netif_get(netif); ++ remove_from_net_schedule_list(netif); ++ ++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do); ++ if (!work_to_do) { ++ netif_put(netif); ++ continue; ++ } ++ ++ idx = netif->tx.req_cons; ++ rmb(); /* Ensure that we see the request before we copy it. */ ++ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq)); ++ ++ /* Credit-based scheduling. */ ++ if (txreq.size > netif->remaining_credit && ++ tx_credit_exceeded(netif, txreq.size)) { ++ netif_put(netif); ++ continue; ++ } ++ ++ netif->remaining_credit -= txreq.size; ++ ++ work_to_do--; ++ netif->tx.req_cons = ++idx; ++ ++ memset(extras, 0, sizeof(extras)); ++ if (txreq.flags & NETTXF_extra_info) { ++ work_to_do = netbk_get_extras(netif, extras, ++ work_to_do); ++ idx = netif->tx.req_cons; ++ if (unlikely(work_to_do < 0)) { ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ } ++ ++ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do); ++ if (unlikely(ret < 0)) { ++ netbk_tx_err(netif, &txreq, idx - ret); ++ continue; ++ } ++ idx += ret; ++ ++ if (unlikely(txreq.size < ETH_HLEN)) { ++ DPRINTK("Bad packet size: %d\n", txreq.size); ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ ++ /* No crossing a page as the payload mustn't fragment. */ ++ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) { ++ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", ++ txreq.offset, txreq.size, ++ (txreq.offset &~PAGE_MASK) + txreq.size); ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ ++ pending_idx = pending_ring[pending_index(pending_cons)]; ++ ++ data_len = (txreq.size > PKT_PROT_LEN && ++ ret < MAX_SKB_FRAGS) ? ++ PKT_PROT_LEN : txreq.size; ++ ++ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN, ++ GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(skb == NULL)) { ++ DPRINTK("Can't allocate a skb in start_xmit.\n"); ++ netbk_tx_err(netif, &txreq, idx); ++ break; ++ } ++ ++ /* Packets passed to netif_rx() must have some headroom. 
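
tx_credit_exceeded() above leans on the kernel's time_after_eq() to compare free-running jiffies values safely across wraparound before deciding whether to replenish credit or defer the packet. A small sketch of that comparison and decision (plain C; jiffies_t and the helpers are local stand-ins for the kernel primitives):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long jiffies_t;

/* Wrap-safe "a >= b" for free-running counters: the signed difference is
 * non-negative as long as the values are less than half the range apart. */
static bool time_after_eq_(jiffies_t a, jiffies_t b)
{
    return (long)(a - b) >= 0;
}

/* Decide whether a packet of 'size' bytes must wait for more credit. */
static bool credit_exceeded(jiffies_t now, jiffies_t next_credit,
                            unsigned long *remaining, unsigned long credit_bytes,
                            unsigned long size)
{
    if (time_after_eq_(now, next_credit))
        *remaining += credit_bytes;        /* replenish window has passed */

    return size > *remaining;              /* still short: caller arms a timer */
}

int main(void)
{
    unsigned long remaining = 100;

    /* Replenish interval has passed even though 'now' wrapped past zero. */
    printf("%s\n", credit_exceeded(5, (jiffies_t)-10, &remaining, 4000, 1500)
                   ? "defer" : "send");
    printf("remaining: %lu\n", remaining);
    return 0;
}

The real path also resets the timer base and recomputes the credit through tx_add_credit(); the sketch folds that step into a plain addition.
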
*/ ++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); ++ ++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { ++ struct xen_netif_extra_info *gso; ++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; ++ ++ if (netbk_set_skb_gso(skb, gso)) { ++ kfree_skb(skb); ++ netbk_tx_err(netif, &txreq, idx); ++ continue; ++ } ++ } ++ ++ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx), ++ GNTMAP_host_map | GNTMAP_readonly, ++ txreq.gref, netif->domid); ++ mop++; ++ ++ memcpy(&pending_tx_info[pending_idx].req, ++ &txreq, sizeof(txreq)); ++ pending_tx_info[pending_idx].netif = netif; ++ *((u16 *)skb->data) = pending_idx; ++ ++ __skb_put(skb, data_len); ++ ++ skb_shinfo(skb)->nr_frags = ret; ++ if (data_len < txreq.size) { ++ skb_shinfo(skb)->nr_frags++; ++ skb_shinfo(skb)->frags[0].page = ++ (void *)(unsigned long)pending_idx; ++ } else { ++ /* Discriminate from any valid pending_idx value. */ ++ skb_shinfo(skb)->frags[0].page = (void *)~0UL; ++ } ++ ++ __skb_queue_tail(&tx_queue, skb); ++ ++ pending_cons++; ++ ++ mop = netbk_get_requests(netif, skb, txfrags, mop); ++ ++ netif->tx.req_cons = idx; ++ netif_schedule_work(netif); ++ ++ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) ++ break; ++ } ++ ++ return mop - tx_map_ops; ++} ++ ++static void net_tx_submit(void) ++{ ++ struct gnttab_map_grant_ref *mop; ++ struct sk_buff *skb; ++ ++ mop = tx_map_ops; ++ while ((skb = __skb_dequeue(&tx_queue)) != NULL) { ++ struct xen_netif_tx_request *txp; ++ struct xen_netif *netif; ++ u16 pending_idx; ++ unsigned data_len; ++ ++ pending_idx = *((u16 *)skb->data); ++ netif = pending_tx_info[pending_idx].netif; ++ txp = &pending_tx_info[pending_idx].req; ++ ++ /* Check the remap error code. */ ++ if (unlikely(netbk_tx_check_mop(skb, &mop))) { ++ DPRINTK("netback grant failed.\n"); ++ skb_shinfo(skb)->nr_frags = 0; ++ kfree_skb(skb); ++ continue; ++ } ++ ++ data_len = skb->len; ++ memcpy(skb->data, ++ (void *)(idx_to_kaddr(pending_idx)|txp->offset), ++ data_len); ++ if (data_len < txp->size) { ++ /* Append the packet payload as a fragment. */ ++ txp->offset += data_len; ++ txp->size -= data_len; ++ } else { ++ /* Schedule a response immediately. */ ++ netif_idx_release(pending_idx); ++ } ++ ++ /* ++ * Old frontends do not assert data_validated but we ++ * can infer it from csum_blank so test both flags. ++ */ ++ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) ++ skb->ip_summed = CHECKSUM_PARTIAL; ++ else ++ skb->ip_summed = CHECKSUM_NONE; ++ ++ netbk_fill_frags(skb); ++ ++ /* ++ * If the initial fragment was < PKT_PROT_LEN then ++ * pull through some bytes from the other fragments to ++ * increase the linear region to PKT_PROT_LEN bytes. 
++ */ ++ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) { ++ int target = min_t(int, skb->len, PKT_PROT_LEN); ++ __pskb_pull_tail(skb, target - skb_headlen(skb)); ++ } ++ ++ skb->dev = netif->dev; ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ ++ netif->stats.rx_bytes += skb->len; ++ netif->stats.rx_packets++; ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++ if (skb_checksum_setup(skb)) { ++ DPRINTK("Can't setup checksum in net_tx_action\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ } ++ ++ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) && ++ unlikely(skb_linearize(skb))) { ++ DPRINTK("Can't linearize skb in net_tx_action.\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ ++ netif_rx(skb); ++ netif->dev->last_rx = jiffies; ++ } ++ ++ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB && ++ !list_empty(&pending_inuse_head)) { ++ struct netbk_tx_pending_inuse *oldest; ++ ++ oldest = list_entry(pending_inuse_head.next, ++ struct netbk_tx_pending_inuse, list); ++ mod_timer(&netbk_tx_pending_timer, oldest->alloc_time + HZ); ++ } ++} ++ ++/* Called after netfront has transmitted */ ++static void net_tx_action(unsigned long unused) ++{ ++ unsigned nr_mops; ++ int ret; ++ ++ if (dealloc_cons != dealloc_prod) ++ net_tx_action_dealloc(); ++ ++ nr_mops = net_tx_build_mops(); ++ ++ if (nr_mops == 0) ++ return; ++ ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ tx_map_ops, nr_mops); ++ BUG_ON(ret); ++ ++ net_tx_submit(); ++} ++ ++static void netif_idx_release(u16 pending_idx) ++{ ++ static DEFINE_SPINLOCK(_lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&_lock, flags); ++ dealloc_ring[pending_index(dealloc_prod)] = pending_idx; ++ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */ ++ smp_wmb(); ++ dealloc_prod++; ++ spin_unlock_irqrestore(&_lock, flags); ++ ++ tasklet_schedule(&net_tx_tasklet); ++} ++ ++static void netif_page_release(struct page *page, unsigned int order) ++{ ++ int idx = netif_page_index(page); ++ BUG_ON(order); ++ BUG_ON(idx < 0); ++ netif_idx_release(idx); ++} ++ ++irqreturn_t netif_be_int(int irq, void *dev_id) ++{ ++ struct xen_netif *netif = dev_id; ++ ++ add_to_net_schedule_list_tail(netif); ++ maybe_schedule_tx_action(); ++ ++ if (netif_schedulable(netif) && !netbk_queue_full(netif)) ++ netif_wake_queue(netif->dev); ++ ++ return IRQ_HANDLED; ++} ++ ++static void make_tx_response(struct xen_netif *netif, ++ struct xen_netif_tx_request *txp, ++ s8 st) ++{ ++ RING_IDX i = netif->tx.rsp_prod_pvt; ++ struct xen_netif_tx_response *resp; ++ int notify; ++ ++ resp = RING_GET_RESPONSE(&netif->tx, i); ++ resp->id = txp->id; ++ resp->status = st; ++ ++ if (txp->flags & NETTXF_extra_info) ++ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL; ++ ++ netif->tx.rsp_prod_pvt = ++i; ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify); ++ ++ /* ++ * netfront_smartpoll_active indicates whether netfront timer ++ * is active. 
++ */ ++ if ((netif->smart_poll == 1)) { ++ if (!(netif->rx.sring->netfront_smartpoll_active)) { ++ notify_remote_via_irq(netif->irq); ++ netif->rx.sring->netfront_smartpoll_active = 1; ++ } ++ } else if (notify) ++ notify_remote_via_irq(netif->irq); ++} ++ ++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif, ++ u16 id, ++ s8 st, ++ u16 offset, ++ u16 size, ++ u16 flags) ++{ ++ RING_IDX i = netif->rx.rsp_prod_pvt; ++ struct xen_netif_rx_response *resp; ++ ++ resp = RING_GET_RESPONSE(&netif->rx, i); ++ resp->offset = offset; ++ resp->flags = flags; ++ resp->id = id; ++ resp->status = (s16)size; ++ if (st < 0) ++ resp->status = (s16)st; ++ ++ netif->rx.rsp_prod_pvt = ++i; ++ ++ return resp; ++} ++ ++#ifdef NETBE_DEBUG_INTERRUPT ++static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct list_head *ent; ++ struct xen_netif *netif; ++ int i = 0; ++ ++ printk(KERN_ALERT "netif_schedule_list:\n"); ++ spin_lock_irq(&net_schedule_list_lock); ++ ++ list_for_each (ent, &net_schedule_list) { ++ netif = list_entry(ent, struct xen_netif, list); ++ printk(KERN_ALERT " %d: private(rx_req_cons=%08x " ++ "rx_resp_prod=%08x\n", ++ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt); ++ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", ++ netif->tx.req_cons, netif->tx.rsp_prod_pvt); ++ printk(KERN_ALERT " shared(rx_req_prod=%08x " ++ "rx_resp_prod=%08x\n", ++ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod); ++ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", ++ netif->rx.sring->rsp_event, netif->tx.sring->req_prod); ++ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", ++ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event); ++ i++; ++ } ++ ++ spin_unlock_irq(&net_schedule_list_lock); ++ printk(KERN_ALERT " ** End of netif_schedule_list **\n"); ++ ++ return IRQ_HANDLED; ++} ++#endif ++ ++static int __init netback_init(void) ++{ ++ int i; ++ struct page *page; ++ int rc = 0; ++ ++ if (!xen_domain()) ++ return -ENODEV; ++ ++ /* We can increase reservation by this much in net_rx_action(). 
*/ ++// balloon_update_driver_allowance(NET_RX_RING_SIZE); ++ ++ skb_queue_head_init(&rx_queue); ++ skb_queue_head_init(&tx_queue); ++ ++ init_timer(&net_timer); ++ net_timer.data = 0; ++ net_timer.function = net_alarm; ++ ++ init_timer(&netbk_tx_pending_timer); ++ netbk_tx_pending_timer.data = 0; ++ netbk_tx_pending_timer.function = netbk_tx_pending_timeout; ++ ++ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); ++ if (mmap_pages == NULL) { ++ printk("%s: out of memory\n", __FUNCTION__); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < MAX_PENDING_REQS; i++) { ++ page = mmap_pages[i]; ++ SetPageForeign(page, netif_page_release); ++ netif_set_page_index(page, i); ++ INIT_LIST_HEAD(&pending_inuse[i].list); ++ } ++ ++ pending_cons = 0; ++ pending_prod = MAX_PENDING_REQS; ++ for (i = 0; i < MAX_PENDING_REQS; i++) ++ pending_ring[i] = i; ++ ++ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB; ++ if (MODPARM_copy_skb) { ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace, ++ NULL, 0)) ++ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB; ++ else ++ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB; ++ } ++ ++ //netif_accel_init(); ++ ++ rc = netif_xenbus_init(); ++ if (rc) ++ goto failed_init; ++ ++#ifdef NETBE_DEBUG_INTERRUPT ++ (void)bind_virq_to_irqhandler(VIRQ_DEBUG, ++ 0, ++ netif_be_dbg, ++ SA_SHIRQ, ++ "net-be-dbg", ++ &netif_be_dbg); ++#endif ++ ++ return 0; ++ ++failed_init: ++ free_empty_pages_and_pagevec(mmap_pages, MAX_PENDING_REQS); ++ del_timer(&netbk_tx_pending_timer); ++ del_timer(&net_timer); ++ return rc; ++ ++} ++ ++module_init(netback_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c +new file mode 100644 +index 0000000..70636d0 +--- /dev/null ++++ b/drivers/xen/netback/xenbus.c +@@ -0,0 +1,523 @@ ++/* Xenbus code for netif backend ++ Copyright (C) 2005 Rusty Russell ++ Copyright (C) 2005 XenSource Ltd ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++*/ ++ ++#include ++#include ++#include ++#include "common.h" ++ ++#if 0 ++#undef DPRINTK ++#define DPRINTK(fmt, args...) 
\ ++ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) ++#endif ++ ++ ++static int connect_rings(struct backend_info *); ++static void connect(struct backend_info *); ++static void backend_create_netif(struct backend_info *be); ++static void unregister_hotplug_status_watch(struct backend_info *be); ++ ++static int netback_remove(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ //netback_remove_accelerators(be, dev); ++ ++ unregister_hotplug_status_watch(be); ++ if (be->netif) { ++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); ++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); ++ netif_disconnect(be->netif); ++ be->netif = NULL; ++ } ++ kfree(be); ++ dev_set_drvdata(&dev->dev, NULL); ++ return 0; ++} ++ ++ ++/** ++ * Entry point to this code when a new device is created. Allocate the basic ++ * structures and switch to InitWait. ++ */ ++static int netback_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ const char *message; ++ struct xenbus_transaction xbt; ++ int err; ++ int sg; ++ struct backend_info *be = kzalloc(sizeof(struct backend_info), ++ GFP_KERNEL); ++ if (!be) { ++ xenbus_dev_fatal(dev, -ENOMEM, ++ "allocating backend structure"); ++ return -ENOMEM; ++ } ++ ++ be->dev = dev; ++ dev_set_drvdata(&dev->dev, be); ++ ++ sg = 1; ++ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) ++ sg = 0; ++ ++ do { ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "starting transaction"); ++ goto fail; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg); ++ if (err) { ++ message = "writing feature-sg"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", ++ "%d", sg); ++ if (err) { ++ message = "writing feature-gso-tcpv4"; ++ goto abort_transaction; ++ } ++ ++ /* We support rx-copy path. */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-rx-copy", "%d", 1); ++ if (err) { ++ message = "writing feature-rx-copy"; ++ goto abort_transaction; ++ } ++ ++ /* ++ * We don't support rx-flip path (except old guests who don't ++ * grok this feature flag). ++ */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-rx-flip", "%d", 0); ++ if (err) { ++ message = "writing feature-rx-flip"; ++ goto abort_transaction; ++ } ++ ++ /* We support data smart poll mechanism */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-smart-poll", "%d", 1); ++ if (err) { ++ message = "writing feature-smart-poll"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_transaction_end(xbt, 0); ++ } while (err == -EAGAIN); ++ ++ if (err) { ++ xenbus_dev_fatal(dev, err, "completing transaction"); ++ goto fail; ++ } ++ ++ //netback_probe_accelerators(be, dev); ++ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto fail; ++ ++ /* This kicks hotplug scripts, so do it immediately. */ ++ backend_create_netif(be); ++ ++ return 0; ++ ++abort_transaction: ++ xenbus_transaction_end(xbt, 1); ++ xenbus_dev_fatal(dev, err, "%s", message); ++fail: ++ DPRINTK("failed"); ++ netback_remove(dev); ++ return err; ++} ++ ++ ++/** ++ * Handle the creation of the hotplug script environment. We add the script ++ * and vif variables to the environment, for the benefit of the vif-* hotplug ++ * scripts. 
++ */ ++static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env) ++{ ++ struct backend_info *be = dev_get_drvdata(&xdev->dev); ++ struct xen_netif *netif = be->netif; ++ char *val; ++ ++ DPRINTK("netback_uevent"); ++ ++ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL); ++ if (IS_ERR(val)) { ++ int err = PTR_ERR(val); ++ xenbus_dev_fatal(xdev, err, "reading script"); ++ return err; ++ } ++ else { ++ if (add_uevent_var(env, "script=%s", val)) { ++ kfree(val); ++ return -ENOMEM; ++ } ++ kfree(val); ++ } ++ ++ if (add_uevent_var(env, "vif=%s", netif->dev->name)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++ ++static void backend_create_netif(struct backend_info *be) ++{ ++ int err; ++ long handle; ++ struct xenbus_device *dev = be->dev; ++ ++ if (be->netif != NULL) ++ return; ++ ++ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle); ++ if (err != 1) { ++ xenbus_dev_fatal(dev, err, "reading handle"); ++ return; ++ } ++ ++ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle); ++ if (IS_ERR(be->netif)) { ++ err = PTR_ERR(be->netif); ++ be->netif = NULL; ++ xenbus_dev_fatal(dev, err, "creating interface"); ++ return; ++ } ++ ++ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); ++} ++ ++ ++static void disconnect_backend(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ if (be->netif) { ++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status"); ++ netif_disconnect(be->netif); ++ be->netif = NULL; ++ } ++} ++ ++/** ++ * Callback received when the frontend's state changes. ++ */ ++static void frontend_changed(struct xenbus_device *dev, ++ enum xenbus_state frontend_state) ++{ ++ struct backend_info *be = dev_get_drvdata(&dev->dev); ++ ++ DPRINTK("%s", xenbus_strstate(frontend_state)); ++ ++ be->frontend_state = frontend_state; ++ ++ switch (frontend_state) { ++ case XenbusStateInitialising: ++ if (dev->state == XenbusStateClosed) { ++ printk(KERN_INFO "%s: %s: prepare for reconnect\n", ++ __FUNCTION__, dev->nodename); ++ xenbus_switch_state(dev, XenbusStateInitWait); ++ } ++ break; ++ ++ case XenbusStateInitialised: ++ break; ++ ++ case XenbusStateConnected: ++ if (dev->state == XenbusStateConnected) ++ break; ++ backend_create_netif(be); ++ if (be->netif) ++ connect(be); ++ break; ++ ++ case XenbusStateClosing: ++ if (be->netif) ++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); ++ disconnect_backend(dev); ++ xenbus_switch_state(dev, XenbusStateClosing); ++ break; ++ ++ case XenbusStateClosed: ++ xenbus_switch_state(dev, XenbusStateClosed); ++ if (xenbus_dev_is_online(dev)) ++ break; ++ /* fall through if not online */ ++ case XenbusStateUnknown: ++ device_unregister(&dev->dev); ++ break; ++ ++ default: ++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", ++ frontend_state); ++ break; ++ } ++} ++ ++ ++static void xen_net_read_rate(struct xenbus_device *dev, ++ unsigned long *bytes, unsigned long *usec) ++{ ++ char *s, *e; ++ unsigned long b, u; ++ char *ratestr; ++ ++ /* Default to unlimited bandwidth. */ ++ *bytes = ~0UL; ++ *usec = 0; ++ ++ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL); ++ if (IS_ERR(ratestr)) ++ return; ++ ++ s = ratestr; ++ b = simple_strtoul(s, &e, 10); ++ if ((s == e) || (*e != ',')) ++ goto fail; ++ ++ s = e + 1; ++ u = simple_strtoul(s, &e, 10); ++ if ((s == e) || (*e != '\0')) ++ goto fail; ++ ++ *bytes = b; ++ *usec = u; ++ ++ kfree(ratestr); ++ return; ++ ++ fail: ++ WPRINTK("Failed to parse network rate limit. 
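
xen_net_read_rate() above accepts the xenstore "rate" node in the form "<bytes>,<interval-usec>" and falls back to unlimited bandwidth on any parse error. The same parsing as a standalone userspace sketch, using strtoul in place of simple_strtoul (parse_rate is an illustrative name):

#include <stdio.h>
#include <stdlib.h>

/* Parse "<bytes>,<usec>"; on malformed input leave the caller's defaults
 * (unlimited bytes, zero interval) untouched and report failure. */
static int parse_rate(const char *s, unsigned long *bytes, unsigned long *usec)
{
    char *e;
    unsigned long b, u;

    b = strtoul(s, &e, 10);
    if (e == s || *e != ',')
        return -1;

    s = e + 1;
    u = strtoul(s, &e, 10);
    if (e == s || *e != '\0')
        return -1;

    *bytes = b;
    *usec = u;
    return 0;
}

int main(void)
{
    unsigned long bytes = ~0UL, usec = 0;

    if (parse_rate("10000000,50000", &bytes, &usec) == 0)
        printf("limit: %lu bytes per %lu usec\n", bytes, usec);

    if (parse_rate("10Mbit", &bytes, &usec) != 0)
        printf("malformed rate string, traffic unlimited\n");
    return 0;
}
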
Traffic unlimited.\n"); ++ kfree(ratestr); ++} ++ ++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) ++{ ++ char *s, *e, *macstr; ++ int i; ++ ++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); ++ if (IS_ERR(macstr)) ++ return PTR_ERR(macstr); ++ ++ for (i = 0; i < ETH_ALEN; i++) { ++ mac[i] = simple_strtoul(s, &e, 16); ++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { ++ kfree(macstr); ++ return -ENOENT; ++ } ++ s = e+1; ++ } ++ ++ kfree(macstr); ++ return 0; ++} ++ ++static void unregister_hotplug_status_watch(struct backend_info *be) ++{ ++ if (be->have_hotplug_status_watch) { ++ unregister_xenbus_watch(&be->hotplug_status_watch); ++ kfree(be->hotplug_status_watch.node); ++ } ++ be->have_hotplug_status_watch = 0; ++} ++ ++static void hotplug_status_changed(struct xenbus_watch *watch, ++ const char **vec, ++ unsigned int vec_size) ++{ ++ struct backend_info *be = container_of(watch, ++ struct backend_info, ++ hotplug_status_watch); ++ char *str; ++ unsigned int len; ++ ++ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len); ++ if (IS_ERR(str)) ++ return; ++ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) { ++ xenbus_switch_state(be->dev, XenbusStateConnected); ++ /* Not interested in this watch anymore. */ ++ unregister_hotplug_status_watch(be); ++ } ++ kfree(str); ++} ++ ++static void connect(struct backend_info *be) ++{ ++ int err; ++ struct xenbus_device *dev = be->dev; ++ ++ err = connect_rings(be); ++ if (err) ++ return; ++ ++ err = xen_net_read_mac(dev, be->netif->fe_dev_addr); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); ++ return; ++ } ++ ++ xen_net_read_rate(dev, &be->netif->credit_bytes, ++ &be->netif->credit_usec); ++ be->netif->remaining_credit = be->netif->credit_bytes; ++ ++ unregister_hotplug_status_watch(be); ++ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, ++ hotplug_status_changed, ++ "%s/%s", dev->nodename, "hotplug-status"); ++ if (err) { ++ /* Switch now, since we can't do a watch. */ ++ xenbus_switch_state(dev, XenbusStateConnected); ++ } else { ++ be->have_hotplug_status_watch = 1; ++ } ++ ++ netif_wake_queue(be->netif->dev); ++} ++ ++ ++static int connect_rings(struct backend_info *be) ++{ ++ struct xenbus_device *dev = be->dev; ++ unsigned long tx_ring_ref, rx_ring_ref; ++ unsigned int evtchn, rx_copy; ++ int err; ++ int val; ++ ++ DPRINTK(""); ++ ++ err = xenbus_gather(XBT_NIL, dev->otherend, ++ "tx-ring-ref", "%lu", &tx_ring_ref, ++ "rx-ring-ref", "%lu", &rx_ring_ref, ++ "event-channel", "%u", &evtchn, NULL); ++ if (err) { ++ xenbus_dev_fatal(dev, err, ++ "reading %s/ring-ref and event-channel", ++ dev->otherend); ++ return err; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u", ++ &rx_copy); ++ if (err == -ENOENT) { ++ err = 0; ++ rx_copy = 0; ++ } ++ if (err < 0) { ++ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy", ++ dev->otherend); ++ return err; ++ } ++ if (!rx_copy) ++ return -EOPNOTSUPP; ++ ++ if (be->netif->dev->tx_queue_len != 0) { ++ if (xenbus_scanf(XBT_NIL, dev->otherend, ++ "feature-rx-notify", "%d", &val) < 0) ++ val = 0; ++ if (val) ++ be->netif->can_queue = 1; ++ else ++ /* Must be non-zero for pfifo_fast to work. 
*/ ++ be->netif->dev->tx_queue_len = 1; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0) ++ val = 0; ++ if (!val) { ++ be->netif->features &= ~NETIF_F_SG; ++ be->netif->dev->features &= ~NETIF_F_SG; ++ if (be->netif->dev->mtu > ETH_DATA_LEN) ++ be->netif->dev->mtu = ETH_DATA_LEN; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d", ++ &val) < 0) ++ val = 0; ++ if (val) { ++ be->netif->features |= NETIF_F_TSO; ++ be->netif->dev->features |= NETIF_F_TSO; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload", ++ "%d", &val) < 0) ++ val = 0; ++ if (val) { ++ be->netif->features &= ~NETIF_F_IP_CSUM; ++ be->netif->dev->features &= ~NETIF_F_IP_CSUM; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll", ++ "%d", &val) < 0) ++ val = 0; ++ if (val) ++ be->netif->smart_poll = 1; ++ else ++ be->netif->smart_poll = 0; ++ ++ /* Map the shared frame, irq etc. */ ++ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn); ++ if (err) { ++ xenbus_dev_fatal(dev, err, ++ "mapping shared-frames %lu/%lu port %u", ++ tx_ring_ref, rx_ring_ref, evtchn); ++ return err; ++ } ++ return 0; ++} ++ ++ ++/* ** Driver Registration ** */ ++ ++ ++static const struct xenbus_device_id netback_ids[] = { ++ { "vif" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver netback = { ++ .name = "vif", ++ .owner = THIS_MODULE, ++ .ids = netback_ids, ++ .probe = netback_probe, ++ .remove = netback_remove, ++ .uevent = netback_uevent, ++ .otherend_changed = frontend_changed, ++}; ++ ++ ++int netif_xenbus_init(void) ++{ ++ printk(KERN_CRIT "registering netback\n"); ++ return xenbus_register_backend(&netback); ++} +diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c +new file mode 100644 +index 0000000..ae693e7 +--- /dev/null ++++ b/drivers/xen/pci.c +@@ -0,0 +1,124 @@ ++/* ++ * Copyright (c) 2009, Intel Corporation. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple ++ * Place - Suite 330, Boston, MA 02111-1307 USA. 
++ * ++ * Author: Weidong Han ++ */ ++ ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++#include "../pci/pci.h" ++ ++ ++#ifdef CONFIG_PCI_IOV ++#define HANDLE_PCI_IOV 1 ++#else ++#define HANDLE_PCI_IOV 0 ++#endif ++ ++static int xen_add_device(struct device *dev) ++{ ++ int r; ++ struct pci_dev *pci_dev = to_pci_dev(dev); ++ ++ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) { ++ struct physdev_manage_pci_ext manage_pci_ext = { ++ .bus = pci_dev->bus->number, ++ .devfn = pci_dev->devfn, ++ .is_virtfn = 1, ++#ifdef CONFIG_PCI_IOV ++ .physfn.bus = pci_dev->physfn->bus->number, ++ .physfn.devfn = pci_dev->physfn->devfn, ++#endif ++ }; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, ++ &manage_pci_ext); ++ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { ++ struct physdev_manage_pci_ext manage_pci_ext = { ++ .bus = pci_dev->bus->number, ++ .devfn = pci_dev->devfn, ++ .is_extfn = 1, ++ }; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, ++ &manage_pci_ext); ++ } else { ++ struct physdev_manage_pci manage_pci = { ++ .bus = pci_dev->bus->number, ++ .devfn = pci_dev->devfn, ++ }; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, ++ &manage_pci); ++ } ++ ++ return r; ++} ++ ++static int xen_remove_device(struct device *dev) ++{ ++ int r; ++ struct pci_dev *pci_dev = to_pci_dev(dev); ++ struct physdev_manage_pci manage_pci; ++ ++ manage_pci.bus = pci_dev->bus->number; ++ manage_pci.devfn = pci_dev->devfn; ++ ++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, ++ &manage_pci); ++ ++ return r; ++} ++ ++static int xen_pci_notifier(struct notifier_block *nb, ++ unsigned long action, void *data) ++{ ++ struct device *dev = data; ++ int r = 0; ++ ++ switch (action) { ++ case BUS_NOTIFY_ADD_DEVICE: ++ r = xen_add_device(dev); ++ break; ++ case BUS_NOTIFY_DEL_DEVICE: ++ r = xen_remove_device(dev); ++ break; ++ default: ++ break; ++ } ++ ++ return r; ++} ++ ++struct notifier_block device_nb = { ++ .notifier_call = xen_pci_notifier, ++}; ++ ++static int __init register_xen_pci_notifier(void) ++{ ++ if (!xen_pv_domain()) ++ return 0; ++ ++ return bus_register_notifier(&pci_bus_type, &device_nb); ++} ++ ++arch_initcall(register_xen_pci_notifier); +diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c +index 88a60e0..ae5cb05 100644 +--- a/drivers/xen/sys-hypervisor.c ++++ b/drivers/xen/sys-hypervisor.c +@@ -14,6 +14,7 @@ + #include + #include + ++#include + #include + #include + #include +diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile +index 5571f5b..8dca685 100644 +--- a/drivers/xen/xenbus/Makefile ++++ b/drivers/xen/xenbus/Makefile +@@ -5,3 +5,8 @@ xenbus-objs += xenbus_client.o + xenbus-objs += xenbus_comms.o + xenbus-objs += xenbus_xs.o + xenbus-objs += xenbus_probe.o ++ ++xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o ++xenbus-objs += $(xenbus-be-objs-y) ++ ++obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o +diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c +index 090c61e..700dc77 100644 +--- a/drivers/xen/xenbus/xenbus_comms.c ++++ b/drivers/xen/xenbus/xenbus_comms.c +@@ -49,6 +49,7 @@ static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); + static irqreturn_t wake_waiting(int irq, void *unused) + { + if (unlikely(xenstored_ready == 0)) { ++ printk(KERN_CRIT "xenbus_probe wake_waiting\n"); + xenstored_ready = 1; + schedule_work(&probe_work); + } +diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c +index 
649fcdf..a90e0bf 100644 +--- a/drivers/xen/xenbus/xenbus_probe.c ++++ b/drivers/xen/xenbus/xenbus_probe.c +@@ -49,6 +49,8 @@ + #include + #include + #include ++ ++#include + #include + #include + #include +@@ -58,22 +60,15 @@ + + + int xen_store_evtchn; +-EXPORT_SYMBOL(xen_store_evtchn); ++EXPORT_SYMBOL_GPL(xen_store_evtchn); + + struct xenstore_domain_interface *xen_store_interface; ++EXPORT_SYMBOL_GPL(xen_store_interface); ++ + static unsigned long xen_store_mfn; + + static BLOCKING_NOTIFIER_HEAD(xenstore_chain); + +-static void wait_for_devices(struct xenbus_driver *xendrv); +- +-static int xenbus_probe_frontend(const char *type, const char *name); +- +-static void xenbus_dev_shutdown(struct device *_dev); +- +-static int xenbus_dev_suspend(struct device *dev, pm_message_t state); +-static int xenbus_dev_resume(struct device *dev); +- + /* If something in array of ids matches this device, return it. */ + static const struct xenbus_device_id * + match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) +@@ -94,34 +89,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv) + + return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; + } +- +-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env) +-{ +- struct xenbus_device *dev = to_xenbus_device(_dev); +- +- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) +- return -ENOMEM; +- +- return 0; +-} +- +-/* device// => - */ +-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +-{ +- nodename = strchr(nodename, '/'); +- if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { +- printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); +- return -EINVAL; +- } +- +- strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); +- if (!strchr(bus_id, '/')) { +- printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); +- return -EINVAL; +- } +- *strchr(bus_id, '/') = '-'; +- return 0; +-} ++EXPORT_SYMBOL_GPL(xenbus_match); + + + static void free_otherend_details(struct xenbus_device *dev) +@@ -141,7 +109,28 @@ static void free_otherend_watch(struct xenbus_device *dev) + } + + +-int read_otherend_details(struct xenbus_device *xendev, ++static int talk_to_otherend(struct xenbus_device *dev) ++{ ++ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); ++ ++ free_otherend_watch(dev); ++ free_otherend_details(dev); ++ ++ return drv->read_otherend_details(dev); ++} ++ ++ ++ ++static int watch_otherend(struct xenbus_device *dev) ++{ ++ struct xen_bus_type *bus = container_of(dev->dev.bus, struct xen_bus_type, bus); ++ ++ return xenbus_watch_pathfmt(dev, &dev->otherend_watch, bus->otherend_changed, ++ "%s/%s", dev->otherend, "state"); ++} ++ ++ ++int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node) + { + int err = xenbus_gather(XBT_NIL, xendev->nodename, +@@ -166,39 +155,11 @@ int read_otherend_details(struct xenbus_device *xendev, + + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); + +- +-static int read_backend_details(struct xenbus_device *xendev) +-{ +- return read_otherend_details(xendev, "backend-id", "backend"); +-} +- +-static struct device_attribute xenbus_dev_attrs[] = { +- __ATTR_NULL +-}; +- +-/* Bus type for frontend drivers. 
*/ +-static struct xen_bus_type xenbus_frontend = { +- .root = "device", +- .levels = 2, /* device/type/ */ +- .get_bus_id = frontend_bus_id, +- .probe = xenbus_probe_frontend, +- .bus = { +- .name = "xen", +- .match = xenbus_match, +- .uevent = xenbus_uevent, +- .probe = xenbus_dev_probe, +- .remove = xenbus_dev_remove, +- .shutdown = xenbus_dev_shutdown, +- .dev_attrs = xenbus_dev_attrs, +- +- .suspend = xenbus_dev_suspend, +- .resume = xenbus_dev_resume, +- }, +-}; +- +-static void otherend_changed(struct xenbus_watch *watch, +- const char **vec, unsigned int len) ++void xenbus_otherend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len, ++ int ignore_on_shutdown) + { + struct xenbus_device *dev = + container_of(watch, struct xenbus_device, otherend_watch); +@@ -226,11 +187,7 @@ static void otherend_changed(struct xenbus_watch *watch, + * work that can fail e.g., when the rootfs is gone. + */ + if (system_state > SYSTEM_RUNNING) { +- struct xen_bus_type *bus = bus; +- bus = container_of(dev->dev.bus, struct xen_bus_type, bus); +- /* If we're frontend, drive the state machine to Closed. */ +- /* This should cause the backend to release our resources. */ +- if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) ++ if (ignore_on_shutdown && (state == XenbusStateClosing)) + xenbus_frontend_closed(dev); + return; + } +@@ -238,25 +195,7 @@ static void otherend_changed(struct xenbus_watch *watch, + if (drv->otherend_changed) + drv->otherend_changed(dev, state); + } +- +- +-static int talk_to_otherend(struct xenbus_device *dev) +-{ +- struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); +- +- free_otherend_watch(dev); +- free_otherend_details(dev); +- +- return drv->read_otherend_details(dev); +-} +- +- +-static int watch_otherend(struct xenbus_device *dev) +-{ +- return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed, +- "%s/%s", dev->otherend, "state"); +-} +- ++EXPORT_SYMBOL_GPL(xenbus_otherend_changed); + + int xenbus_dev_probe(struct device *_dev) + { +@@ -300,8 +239,9 @@ int xenbus_dev_probe(struct device *_dev) + fail: + xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); + xenbus_switch_state(dev, XenbusStateClosed); +- return -ENODEV; ++ return err; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_probe); + + int xenbus_dev_remove(struct device *_dev) + { +@@ -319,8 +259,9 @@ int xenbus_dev_remove(struct device *_dev) + xenbus_switch_state(dev, XenbusStateClosed); + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_remove); + +-static void xenbus_dev_shutdown(struct device *_dev) ++void xenbus_dev_shutdown(struct device *_dev) + { + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned long timeout = 5*HZ; +@@ -341,6 +282,7 @@ static void xenbus_dev_shutdown(struct device *_dev) + out: + put_device(&dev->dev); + } ++EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); + + int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus, +@@ -354,25 +296,7 @@ int xenbus_register_driver_common(struct xenbus_driver *drv, + + return driver_register(&drv->driver); + } +- +-int __xenbus_register_frontend(struct xenbus_driver *drv, +- struct module *owner, const char *mod_name) +-{ +- int ret; +- +- drv->read_otherend_details = read_backend_details; +- +- ret = xenbus_register_driver_common(drv, &xenbus_frontend, +- owner, mod_name); +- if (ret) +- return ret; +- +- /* If this driver is loaded as a module wait for devices to attach. 
*/ +- wait_for_devices(drv); +- +- return 0; +-} +-EXPORT_SYMBOL_GPL(__xenbus_register_frontend); ++EXPORT_SYMBOL_GPL(xenbus_register_driver_common); + + void xenbus_unregister_driver(struct xenbus_driver *drv) + { +@@ -543,24 +467,7 @@ fail: + kfree(xendev); + return err; + } +- +-/* device// */ +-static int xenbus_probe_frontend(const char *type, const char *name) +-{ +- char *nodename; +- int err; +- +- nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", +- xenbus_frontend.root, type, name); +- if (!nodename) +- return -ENOMEM; +- +- DPRINTK("%s", nodename); +- +- err = xenbus_probe_node(&xenbus_frontend, type, nodename); +- kfree(nodename); +- return err; +-} ++EXPORT_SYMBOL_GPL(xenbus_probe_node); + + static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) + { +@@ -569,15 +476,23 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) + unsigned int dir_n = 0; + int i; + ++ printk(KERN_CRIT "%s type %s\n", __func__, type); ++ + dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n); +- if (IS_ERR(dir)) ++ if (IS_ERR(dir)) { ++ printk(KERN_CRIT "%s failed xenbus_directory\n", __func__); + return PTR_ERR(dir); ++ } + + for (i = 0; i < dir_n; i++) { +- err = bus->probe(type, dir[i]); +- if (err) ++ printk(KERN_CRIT "%s %d/%d %s\n", __func__, i+1,dir_n, dir[i]); ++ err = bus->probe(bus, type, dir[i]); ++ if (err) { ++ printk(KERN_CRIT "%s failed\n", __func__); + break; ++ } + } ++ printk("%s done\n", __func__); + kfree(dir); + return err; + } +@@ -588,18 +503,27 @@ int xenbus_probe_devices(struct xen_bus_type *bus) + char **dir; + unsigned int i, dir_n; + ++ printk(KERN_CRIT "%s %s\n", __func__, bus->root); ++ + dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n); +- if (IS_ERR(dir)) ++ if (IS_ERR(dir)) { ++ printk(KERN_CRIT "%s failed xenbus_directory\n", __func__); + return PTR_ERR(dir); ++ } + + for (i = 0; i < dir_n; i++) { ++ printk(KERN_CRIT "%s %d/%d %s\n", __func__, i+1,dir_n, dir[i]); + err = xenbus_probe_device_type(bus, dir[i]); +- if (err) ++ if (err) { ++ printk(KERN_CRIT "%s failed\n", __func__); + break; ++ } + } ++ printk("%s done\n", __func__); + kfree(dir); + return err; + } ++EXPORT_SYMBOL_GPL(xenbus_probe_devices); + + static unsigned int char_count(const char *str, char c) + { +@@ -662,32 +586,17 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) + } + EXPORT_SYMBOL_GPL(xenbus_dev_changed); + +-static void frontend_changed(struct xenbus_watch *watch, +- const char **vec, unsigned int len) +-{ +- DPRINTK(""); +- +- xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); +-} +- +-/* We watch for devices appearing and vanishing. 
*/ +-static struct xenbus_watch fe_watch = { +- .node = "device", +- .callback = frontend_changed, +-}; +- +-static int xenbus_dev_suspend(struct device *dev, pm_message_t state) ++int xenbus_dev_suspend(struct device *dev, pm_message_t state) + { + int err = 0; + struct xenbus_driver *drv; +- struct xenbus_device *xdev; ++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + +- DPRINTK(""); ++ DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); +- xdev = container_of(dev, struct xenbus_device, dev); + if (drv->suspend) + err = drv->suspend(xdev, state); + if (err) +@@ -695,21 +604,19 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state) + "xenbus: suspend %s failed: %i\n", dev_name(dev), err); + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_suspend); + +-static int xenbus_dev_resume(struct device *dev) ++int xenbus_dev_resume(struct device *dev) + { + int err; + struct xenbus_driver *drv; +- struct xenbus_device *xdev; ++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev); + +- DPRINTK(""); ++ DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; +- + drv = to_xenbus_driver(dev->driver); +- xdev = container_of(dev, struct xenbus_device, dev); +- + err = talk_to_otherend(xdev); + if (err) { + printk(KERN_WARNING +@@ -740,6 +647,7 @@ static int xenbus_dev_resume(struct device *dev) + + return 0; + } ++EXPORT_SYMBOL_GPL(xenbus_dev_resume); + + /* A flag to determine if xenstored is 'ready' (i.e. has started) */ + int xenstored_ready = 0; +@@ -768,10 +676,7 @@ void xenbus_probe(struct work_struct *unused) + { + BUG_ON((xenstored_ready <= 0)); + +- /* Enumerate devices in xenstore and watch for changes. */ +- xenbus_probe_devices(&xenbus_frontend); +- register_xenbus_watch(&fe_watch); +- xenbus_backend_probe_and_watch(); ++ printk(KERN_CRIT "xenbus_probe wake_waiting\n"); + + /* Notify others that xenstore is up */ + blocking_notifier_call_chain(&xenstore_chain, 0, NULL); +@@ -780,27 +685,43 @@ void xenbus_probe(struct work_struct *unused) + static int __init xenbus_probe_init(void) + { + int err = 0; ++ unsigned long page = 0; + + DPRINTK(""); + + err = -ENODEV; + if (!xen_domain()) +- goto out_error; +- +- /* Register ourselves with the kernel bus subsystem */ +- err = bus_register(&xenbus_frontend.bus); +- if (err) +- goto out_error; +- +- err = xenbus_backend_bus_register(); +- if (err) +- goto out_unreg_front; ++ return err; + + /* + * Domain0 doesn't have a store_evtchn or store_mfn yet. 
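With xenbus_probe() above reduced to firing the xenstore_chain notifier, device enumeration now happens behind register_xenstore_notifier(): the frontend and backend files added later in this patch hook that chain, and other code that must wait for the store can follow the same pattern. A minimal sketch with hypothetical demo_* names:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <xen/xenbus.h>

/* Runs once xenstored is reachable (or immediately if it already is). */
static int demo_xenstore_ready(struct notifier_block *nb,
			       unsigned long event, void *data)
{
	/* xenbus_scanf()/xenbus_directory() are safe to call from here. */
	printk(KERN_INFO "demo: xenstore is up\n");
	return NOTIFY_DONE;
}

static struct notifier_block demo_xenstore_nb = {
	.notifier_call = demo_xenstore_ready,
};

static int __init demo_init(void)
{
	register_xenstore_notifier(&demo_xenstore_nb);
	return 0;
}
subsys_initcall(demo_init);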
+ */ + if (xen_initial_domain()) { +- /* dom0 not yet supported */ ++ struct evtchn_alloc_unbound alloc_unbound; ++ ++ /* Allocate Xenstore page */ ++ page = get_zeroed_page(GFP_KERNEL); ++ if (!page) ++ goto out_error; ++ ++ xen_store_mfn = xen_start_info->store_mfn = ++ pfn_to_mfn(virt_to_phys((void *)page) >> ++ PAGE_SHIFT); ++ ++ /* Next allocate a local port which xenstored can bind to */ ++ alloc_unbound.dom = DOMID_SELF; ++ alloc_unbound.remote_dom = 0; ++ ++ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, ++ &alloc_unbound); ++ if (err == -ENOSYS) ++ goto out_error; ++ ++ BUG_ON(err); ++ xen_store_evtchn = xen_start_info->store_evtchn = ++ alloc_unbound.port; ++ ++ xen_store_interface = mfn_to_virt(xen_store_mfn); + } else { + xenstored_ready = 1; + xen_store_evtchn = xen_start_info->store_evtchn; +@@ -813,7 +734,7 @@ static int __init xenbus_probe_init(void) + if (err) { + printk(KERN_WARNING + "XENBUS: Error initializing xenstore comms: %i\n", err); +- goto out_unreg_back; ++ goto out_error; + } + + if (!xen_initial_domain()) +@@ -827,130 +748,17 @@ static int __init xenbus_probe_init(void) + proc_mkdir("xen", NULL); + #endif + ++ printk(KERN_CRIT "%s ok\n", __func__); + return 0; + +- out_unreg_back: +- xenbus_backend_bus_unregister(); +- +- out_unreg_front: +- bus_unregister(&xenbus_frontend.bus); +- + out_error: ++ if (page != 0) ++ free_page(page); ++ ++ printk(KERN_CRIT "err %d in %s\n", err, __func__); + return err; + } + + postcore_initcall(xenbus_probe_init); + + MODULE_LICENSE("GPL"); +- +-static int is_device_connecting(struct device *dev, void *data) +-{ +- struct xenbus_device *xendev = to_xenbus_device(dev); +- struct device_driver *drv = data; +- struct xenbus_driver *xendrv; +- +- /* +- * A device with no driver will never connect. We care only about +- * devices which should currently be in the process of connecting. +- */ +- if (!dev->driver) +- return 0; +- +- /* Is this search limited to a particular driver? */ +- if (drv && (dev->driver != drv)) +- return 0; +- +- xendrv = to_xenbus_driver(dev->driver); +- return (xendev->state < XenbusStateConnected || +- (xendev->state == XenbusStateConnected && +- xendrv->is_ready && !xendrv->is_ready(xendev))); +-} +- +-static int exists_connecting_device(struct device_driver *drv) +-{ +- return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, +- is_device_connecting); +-} +- +-static int print_device_status(struct device *dev, void *data) +-{ +- struct xenbus_device *xendev = to_xenbus_device(dev); +- struct device_driver *drv = data; +- +- /* Is this operation limited to a particular driver? */ +- if (drv && (dev->driver != drv)) +- return 0; +- +- if (!dev->driver) { +- /* Information only: is this too noisy? */ +- printk(KERN_INFO "XENBUS: Device with no driver: %s\n", +- xendev->nodename); +- } else if (xendev->state < XenbusStateConnected) { +- enum xenbus_state rstate = XenbusStateUnknown; +- if (xendev->otherend) +- rstate = xenbus_read_driver_state(xendev->otherend); +- printk(KERN_WARNING "XENBUS: Timeout connecting " +- "to device: %s (local state %d, remote state %d)\n", +- xendev->nodename, xendev->state, rstate); +- } +- +- return 0; +-} +- +-/* We only wait for device setup after most initcalls have run. */ +-static int ready_to_wait_for_devices; +- +-/* +- * On a 5-minute timeout, wait for all devices currently configured. We need +- * to do this to guarantee that the filesystems and / or network devices +- * needed for boot are available, before we can allow the boot to proceed. 
+- * +- * This needs to be on a late_initcall, to happen after the frontend device +- * drivers have been initialised, but before the root fs is mounted. +- * +- * A possible improvement here would be to have the tools add a per-device +- * flag to the store entry, indicating whether it is needed at boot time. +- * This would allow people who knew what they were doing to accelerate their +- * boot slightly, but of course needs tools or manual intervention to set up +- * those flags correctly. +- */ +-static void wait_for_devices(struct xenbus_driver *xendrv) +-{ +- unsigned long start = jiffies; +- struct device_driver *drv = xendrv ? &xendrv->driver : NULL; +- unsigned int seconds_waited = 0; +- +- if (!ready_to_wait_for_devices || !xen_domain()) +- return; +- +- while (exists_connecting_device(drv)) { +- if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { +- if (!seconds_waited) +- printk(KERN_WARNING "XENBUS: Waiting for " +- "devices to initialise: "); +- seconds_waited += 5; +- printk("%us...", 300 - seconds_waited); +- if (seconds_waited == 300) +- break; +- } +- +- schedule_timeout_interruptible(HZ/10); +- } +- +- if (seconds_waited) +- printk("\n"); +- +- bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, +- print_device_status); +-} +- +-#ifndef MODULE +-static int __init boot_wait_for_devices(void) +-{ +- ready_to_wait_for_devices = 1; +- wait_for_devices(NULL); +- return 0; +-} +- +-late_initcall(boot_wait_for_devices); +-#endif +diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h +index 6c5e318..0e5fc4c 100644 +--- a/drivers/xen/xenbus/xenbus_probe.h ++++ b/drivers/xen/xenbus/xenbus_probe.h +@@ -36,26 +36,13 @@ + + #define XEN_BUS_ID_SIZE 20 + +-#ifdef CONFIG_XEN_BACKEND +-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); +-extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); +-extern void xenbus_backend_probe_and_watch(void); +-extern int xenbus_backend_bus_register(void); +-extern void xenbus_backend_bus_unregister(void); +-#else +-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} +-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} +-static inline void xenbus_backend_probe_and_watch(void) {} +-static inline int xenbus_backend_bus_register(void) { return 0; } +-static inline void xenbus_backend_bus_unregister(void) {} +-#endif +- + struct xen_bus_type + { + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); +- int (*probe)(const char *type, const char *dir); ++ int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir); ++ void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, unsigned int len); + struct bus_type bus; + }; + +@@ -73,4 +60,16 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus); + + extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); + ++extern void xenbus_dev_shutdown(struct device *_dev); ++ ++extern int xenbus_dev_suspend(struct device *dev, pm_message_t state); ++extern int xenbus_dev_resume(struct device *dev); ++ ++extern void xenbus_otherend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len, ++ int ignore_on_shutdown); ++ ++extern int xenbus_read_otherend_details(struct xenbus_device *xendev, ++ char *id_node, char *path_node); ++ + #endif +diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c +new file mode 100644 +index 
0000000..a3cc535 +--- /dev/null ++++ b/drivers/xen/xenbus/xenbus_probe_backend.c +@@ -0,0 +1,298 @@ ++/****************************************************************************** ++ * Talks to Xen Store to figure out what devices we have (backend half). ++ * ++ * Copyright (C) 2005 Rusty Russell, IBM Corporation ++ * Copyright (C) 2005 Mike Wray, Hewlett-Packard ++ * Copyright (C) 2005, 2006 XenSource Ltd ++ * Copyright (C) 2007 Solarflare Communications, Inc. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#define DPRINTK(fmt, args...) 
\ ++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ ++ __func__, __LINE__, ##args) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "xenbus_comms.h" ++#include "xenbus_probe.h" ++ ++/* backend/// => -- */ ++static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) ++{ ++ int domid, err; ++ const char *devid, *type, *frontend; ++ unsigned int typelen; ++ ++ type = strchr(nodename, '/'); ++ if (!type) ++ return -EINVAL; ++ type++; ++ typelen = strcspn(type, "/"); ++ if (!typelen || type[typelen] != '/') ++ return -EINVAL; ++ ++ devid = strrchr(nodename, '/') + 1; ++ ++ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, ++ "frontend", NULL, &frontend, ++ NULL); ++ if (err) ++ return err; ++ if (strlen(frontend) == 0) ++ err = -ERANGE; ++ if (!err && !xenbus_exists(XBT_NIL, frontend, "")) ++ err = -ENOENT; ++ kfree(frontend); ++ ++ if (err) ++ return err; ++ ++ if (snprintf(bus_id, XEN_BUS_ID_SIZE, ++ "%.*s-%i-%s", typelen, type, domid, devid) >= XEN_BUS_ID_SIZE) ++ return -ENOSPC; ++ return 0; ++} ++ ++static int xenbus_uevent_backend(struct device *dev, ++ struct kobj_uevent_env *env) ++{ ++ struct xenbus_device *xdev; ++ struct xenbus_driver *drv; ++ struct xen_bus_type *bus; ++ ++ DPRINTK(""); ++ ++ if (dev == NULL) ++ return -ENODEV; ++ ++ xdev = to_xenbus_device(dev); ++ bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); ++ if (xdev == NULL) ++ return -ENODEV; ++ ++ /* stuff we want to pass to /sbin/hotplug */ ++ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype)) ++ return -ENOMEM; ++ ++ if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename)) ++ return -ENOMEM; ++ ++ if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root)) ++ return -ENOMEM; ++ ++ if (dev->driver) { ++ drv = to_xenbus_driver(dev->driver); ++ if (drv && drv->uevent) ++ return drv->uevent(xdev, env); ++ } ++ ++ return 0; ++} ++ ++/* backend/// */ ++static int xenbus_probe_backend_unit(struct xen_bus_type *bus, ++ const char *dir, ++ const char *type, ++ const char *name) ++{ ++ char *nodename; ++ int err; ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); ++ if (!nodename) ++ return -ENOMEM; ++ ++ DPRINTK("%s\n", nodename); ++ ++ err = xenbus_probe_node(bus, type, nodename); ++ kfree(nodename); ++ return err; ++} ++ ++/* backend// */ ++static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, const char *domid) ++{ ++ char *nodename; ++ int err = 0; ++ char **dir; ++ unsigned int i, dir_n = 0; ++ ++ DPRINTK(""); ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid); ++ if (!nodename) ++ return -ENOMEM; ++ ++ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); ++ if (IS_ERR(dir)) { ++ kfree(nodename); ++ return PTR_ERR(dir); ++ } ++ ++ for (i = 0; i < dir_n; i++) { ++ err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]); ++ if (err) ++ break; ++ } ++ kfree(dir); ++ kfree(nodename); ++ return err; ++} ++ ++static void frontend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ xenbus_otherend_changed(watch, vec, len, 0); ++} ++ ++static struct device_attribute xenbus_backend_dev_attrs[] = { ++ __ATTR_NULL ++}; ++ ++static struct xen_bus_type xenbus_backend = { ++ .root = "backend", ++ .levels = 3, /* backend/type// */ ++ .get_bus_id = backend_bus_id, ++ .probe = xenbus_probe_backend, ++ .otherend_changed = frontend_changed, ++ .bus = { ++ .name = "xen-backend", ++ 
.match = xenbus_match, ++ .uevent = xenbus_uevent_backend, ++ .probe = xenbus_dev_probe, ++ .remove = xenbus_dev_remove, ++ .shutdown = xenbus_dev_shutdown, ++ .dev_attrs = xenbus_backend_dev_attrs, ++ }, ++}; ++ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ DPRINTK(""); ++ ++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); ++} ++ ++static struct xenbus_watch be_watch = { ++ .node = "backend", ++ .callback = backend_changed, ++}; ++ ++static int read_frontend_details(struct xenbus_device *xendev) ++{ ++ return xenbus_read_otherend_details(xendev, "frontend-id", "frontend"); ++} ++ ++//void xenbus_backend_suspend(int (*fn)(struct device *, void *)) ++//{ ++// DPRINTK(""); ++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); ++//} ++ ++//void xenbus_backend_resume(int (*fn)(struct device *, void *)) ++//{ ++// DPRINTK(""); ++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); ++//} ++ ++//int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *)) ++//{ ++// return bus_for_each_dev(&xenbus_backend.bus, NULL, arg, fn); ++//} ++//EXPORT_SYMBOL_GPL(xenbus_for_each_backend); ++ ++int xenbus_dev_is_online(struct xenbus_device *dev) ++{ ++ int rc, val; ++ ++ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val); ++ if (rc != 1) ++ val = 0; /* no online node present */ ++ ++ return val; ++} ++EXPORT_SYMBOL_GPL(xenbus_dev_is_online); ++ ++int __xenbus_register_backend(struct xenbus_driver *drv, ++ struct module *owner, const char *mod_name) ++{ ++ drv->read_otherend_details = read_frontend_details; ++ ++ return xenbus_register_driver_common(drv, &xenbus_backend, ++ owner, mod_name); ++} ++EXPORT_SYMBOL_GPL(__xenbus_register_backend); ++ ++static int backend_probe_and_watch(struct notifier_block *notifier, ++ unsigned long event, ++ void *data) ++{ ++ /* Enumerate devices in xenstore and watch for changes. */ ++ xenbus_probe_devices(&xenbus_backend); ++ printk(KERN_CRIT "%s devices probed ok\n", __func__); ++ register_xenbus_watch(&be_watch); ++ printk(KERN_CRIT "%s watch add ok ok\n", __func__); ++ printk(KERN_CRIT "%s all done\n", __func__); ++ return NOTIFY_DONE; ++} ++ ++static int __init xenbus_probe_backend_init(void) ++{ ++ static struct notifier_block xenstore_notifier = { ++ .notifier_call = backend_probe_and_watch ++ }; ++ int err; ++ ++ DPRINTK(""); ++ ++ /* Register ourselves with the kernel bus subsystem */ ++ err = bus_register(&xenbus_backend.bus); ++ if (err) { ++ printk(KERN_CRIT "%s didn't register bus!\n", __func__); ++ return err; ++ } ++ printk(KERN_CRIT "%s bus registered ok\n", __func__); ++ ++ register_xenstore_notifier(&xenstore_notifier); ++ ++ return 0; ++} ++subsys_initcall(xenbus_probe_backend_init); +diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c +new file mode 100644 +index 0000000..47be902 +--- /dev/null ++++ b/drivers/xen/xenbus/xenbus_probe_frontend.c +@@ -0,0 +1,292 @@ ++#define DPRINTK(fmt, args...) 
\ ++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ ++ __func__, __LINE__, ##args) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "xenbus_comms.h" ++#include "xenbus_probe.h" ++ ++/* device// => - */ ++static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) ++{ ++ nodename = strchr(nodename, '/'); ++ if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { ++ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); ++ return -EINVAL; ++ } ++ ++ strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); ++ if (!strchr(bus_id, '/')) { ++ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); ++ return -EINVAL; ++ } ++ *strchr(bus_id, '/') = '-'; ++ return 0; ++} ++ ++/* device// */ ++static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, const char *name) ++{ ++ char *nodename; ++ int err; ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name); ++ if (!nodename) ++ return -ENOMEM; ++ ++ DPRINTK("%s", nodename); ++ ++ err = xenbus_probe_node(bus, type, nodename); ++ kfree(nodename); ++ return err; ++} ++ ++static int xenbus_uevent_frontend(struct device *_dev, struct kobj_uevent_env *env) ++{ ++ struct xenbus_device *dev = to_xenbus_device(_dev); ++ ++ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ xenbus_otherend_changed(watch, vec, len, 1); ++} ++ ++static struct device_attribute xenbus_frontend_dev_attrs[] = { ++ __ATTR_NULL ++}; ++ ++ ++static struct xen_bus_type xenbus_frontend = { ++ .root = "device", ++ .levels = 2, /* device/type/ */ ++ .get_bus_id = frontend_bus_id, ++ .probe = xenbus_probe_frontend, ++ .otherend_changed = backend_changed, ++ .bus = { ++ .name = "xen", ++ .match = xenbus_match, ++ .uevent = xenbus_uevent_frontend, ++ .probe = xenbus_dev_probe, ++ .remove = xenbus_dev_remove, ++ .shutdown = xenbus_dev_shutdown, ++ .dev_attrs= xenbus_frontend_dev_attrs, ++ ++ .suspend = xenbus_dev_suspend, ++ .resume = xenbus_dev_resume, ++ }, ++}; ++ ++static void frontend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ DPRINTK(""); ++ ++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); ++} ++ ++ ++/* We watch for devices appearing and vanishing. */ ++static struct xenbus_watch fe_watch = { ++ .node = "device", ++ .callback = frontend_changed, ++}; ++ ++static int read_backend_details(struct xenbus_device *xendev) ++{ ++ return xenbus_read_otherend_details(xendev, "backend-id", "backend"); ++} ++ ++static int is_device_connecting(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct device_driver *drv = data; ++ struct xenbus_driver *xendrv; ++ ++ /* ++ * A device with no driver will never connect. We care only about ++ * devices which should currently be in the process of connecting. ++ */ ++ if (!dev->driver) ++ return 0; ++ ++ /* Is this search limited to a particular driver? 
*/ ++ if (drv && (dev->driver != drv)) ++ return 0; ++ ++ xendrv = to_xenbus_driver(dev->driver); ++ return (xendev->state < XenbusStateConnected || ++ (xendev->state == XenbusStateConnected && ++ xendrv->is_ready && !xendrv->is_ready(xendev))); ++} ++ ++static int exists_connecting_device(struct device_driver *drv) ++{ ++ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, ++ is_device_connecting); ++} ++ ++static int print_device_status(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct device_driver *drv = data; ++ ++ /* Is this operation limited to a particular driver? */ ++ if (drv && (dev->driver != drv)) ++ return 0; ++ ++ if (!dev->driver) { ++ /* Information only: is this too noisy? */ ++ printk(KERN_INFO "XENBUS: Device with no driver: %s\n", ++ xendev->nodename); ++ } else if (xendev->state < XenbusStateConnected) { ++ enum xenbus_state rstate = XenbusStateUnknown; ++ if (xendev->otherend) ++ rstate = xenbus_read_driver_state(xendev->otherend); ++ printk(KERN_WARNING "XENBUS: Timeout connecting " ++ "to device: %s (local state %d, remote state %d)\n", ++ xendev->nodename, xendev->state, rstate); ++ } ++ ++ return 0; ++} ++ ++/* We only wait for device setup after most initcalls have run. */ ++static int ready_to_wait_for_devices; ++ ++/* ++ * On a 5-minute timeout, wait for all devices currently configured. We need ++ * to do this to guarantee that the filesystems and / or network devices ++ * needed for boot are available, before we can allow the boot to proceed. ++ * ++ * This needs to be on a late_initcall, to happen after the frontend device ++ * drivers have been initialised, but before the root fs is mounted. ++ * ++ * A possible improvement here would be to have the tools add a per-device ++ * flag to the store entry, indicating whether it is needed at boot time. ++ * This would allow people who knew what they were doing to accelerate their ++ * boot slightly, but of course needs tools or manual intervention to set up ++ * those flags correctly. ++ */ ++static void wait_for_devices(struct xenbus_driver *xendrv) ++{ ++ unsigned long start = jiffies; ++ struct device_driver *drv = xendrv ? &xendrv->driver : NULL; ++ unsigned int seconds_waited = 0; ++ ++ if (!ready_to_wait_for_devices || !xen_domain()) ++ return; ++ ++ while (exists_connecting_device(drv)) { ++ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { ++ if (!seconds_waited) ++ printk(KERN_WARNING "XENBUS: Waiting for " ++ "devices to initialise: "); ++ seconds_waited += 5; ++ printk("%us...", 300 - seconds_waited); ++ if (seconds_waited == 300) ++ break; ++ } ++ ++ schedule_timeout_interruptible(HZ/10); ++ } ++ ++ if (seconds_waited) ++ printk("\n"); ++ ++ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, ++ print_device_status); ++} ++ ++int __xenbus_register_frontend(struct xenbus_driver *drv, ++ struct module *owner, const char *mod_name) ++{ ++ int ret; ++ ++ drv->read_otherend_details = read_backend_details; ++ ++ ret = xenbus_register_driver_common(drv, &xenbus_frontend, ++ owner, mod_name); ++ if (ret) ++ return ret; ++ ++ /* If this driver is loaded as a module wait for devices to attach. */ ++ wait_for_devices(drv); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(__xenbus_register_frontend); ++ ++static int frontend_probe_and_watch(struct notifier_block *notifier, ++ unsigned long event, ++ void *data) ++{ ++ /* Enumerate devices in xenstore and watch for changes. 
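__xenbus_register_frontend() above keeps the driver-facing interface unchanged after the frontend/backend split (the backend side is the symmetric __xenbus_register_backend() in xenbus_probe_backend.c). A minimal sketch of a frontend driver built against it, with hypothetical demofront_* names and a deliberately trivial state machine:

#include <linux/module.h>
#include <xen/xenbus.h>

static const struct xenbus_device_id demofront_ids[] = {
	{ "demo" },	/* matches device/demo/<id> nodes in xenstore */
	{ "" }
};

static int demofront_probe(struct xenbus_device *dev,
			   const struct xenbus_device_id *id)
{
	return xenbus_switch_state(dev, XenbusStateInitialised);
}

/* Reached through xenbus_otherend_changed() when the backend flips its
 * "state" node; just mirror a closing backend. */
static void demofront_backend_changed(struct xenbus_device *dev,
				      enum xenbus_state backend_state)
{
	if (backend_state == XenbusStateClosing)
		xenbus_switch_state(dev, XenbusStateClosed);
}

static struct xenbus_driver demofront_driver = {
	.name = "demofront",
	.ids = demofront_ids,
	.probe = demofront_probe,
	.otherend_changed = demofront_backend_changed,
};

static int __init demofront_init(void)
{
	return __xenbus_register_frontend(&demofront_driver, THIS_MODULE,
					  KBUILD_MODNAME);
}
module_init(demofront_init);

static void __exit demofront_exit(void)
{
	xenbus_unregister_driver(&demofront_driver);
}
module_exit(demofront_exit);

MODULE_LICENSE("GPL");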
*/ ++ xenbus_probe_devices(&xenbus_frontend); ++ printk(KERN_CRIT "%s devices probed ok\n", __func__); ++ register_xenbus_watch(&fe_watch); ++ printk(KERN_CRIT "%s watch add ok ok\n", __func__); ++ printk(KERN_CRIT "%s all done\n", __func__); ++ return NOTIFY_DONE; ++} ++ ++ ++static int __init xenbus_probe_frontend_init(void) ++{ ++ static struct notifier_block xenstore_notifier = { ++ .notifier_call = frontend_probe_and_watch ++ }; ++ int err; ++ ++ DPRINTK(""); ++ ++ /* Register ourselves with the kernel bus subsystem */ ++ err = bus_register(&xenbus_frontend.bus); ++ if (err) { ++ printk(KERN_CRIT "%s didn't register bus!\n", __func__); ++ return err; ++ } ++ printk(KERN_CRIT "%s bus registered ok\n", __func__); ++ ++ register_xenstore_notifier(&xenstore_notifier); ++ ++ return 0; ++} ++subsys_initcall(xenbus_probe_frontend_init); ++ ++#ifndef MODULE ++static int __init boot_wait_for_devices(void) ++{ ++ ready_to_wait_for_devices = 1; ++ wait_for_devices(NULL); ++ return 0; ++} ++ ++late_initcall(boot_wait_for_devices); ++#endif ++ ++MODULE_LICENSE("GPL"); +diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c +index eab33f1..6f91e8c 100644 +--- a/drivers/xen/xenbus/xenbus_xs.c ++++ b/drivers/xen/xenbus/xenbus_xs.c +@@ -76,6 +76,14 @@ struct xs_handle { + /* + * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. + * response_mutex is never taken simultaneously with the other three. ++ * ++ * transaction_mutex must be held before incrementing ++ * transaction_count. The mutex is held when a suspend is in ++ * progress to prevent new transactions starting. ++ * ++ * When decrementing transaction_count to zero the wait queue ++ * should be woken up, the suspend code waits for count to ++ * reach zero. + */ + + /* One request at a time. */ +@@ -85,7 +93,9 @@ struct xs_handle { + struct mutex response_mutex; + + /* Protect transactions against save/restore. */ +- struct rw_semaphore transaction_mutex; ++ struct mutex transaction_mutex; ++ atomic_t transaction_count; ++ wait_queue_head_t transaction_wq; + + /* Protect watch (de)register against save/restore. 
*/ + struct rw_semaphore watch_mutex; +@@ -157,6 +167,31 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) + return body; + } + ++static void transaction_start(void) ++{ ++ mutex_lock(&xs_state.transaction_mutex); ++ atomic_inc(&xs_state.transaction_count); ++ mutex_unlock(&xs_state.transaction_mutex); ++} ++ ++static void transaction_end(void) ++{ ++ if (atomic_dec_and_test(&xs_state.transaction_count)) ++ wake_up(&xs_state.transaction_wq); ++} ++ ++static void transaction_suspend(void) ++{ ++ mutex_lock(&xs_state.transaction_mutex); ++ wait_event(xs_state.transaction_wq, ++ atomic_read(&xs_state.transaction_count) == 0); ++} ++ ++static void transaction_resume(void) ++{ ++ mutex_unlock(&xs_state.transaction_mutex); ++} ++ + void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) + { + void *ret; +@@ -164,7 +199,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) + int err; + + if (req_msg.type == XS_TRANSACTION_START) +- down_read(&xs_state.transaction_mutex); ++ transaction_start(); + + mutex_lock(&xs_state.request_mutex); + +@@ -180,7 +215,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) + if ((msg->type == XS_TRANSACTION_END) || + ((req_msg.type == XS_TRANSACTION_START) && + (msg->type == XS_ERROR))) +- up_read(&xs_state.transaction_mutex); ++ transaction_end(); + + return ret; + } +@@ -432,11 +467,11 @@ int xenbus_transaction_start(struct xenbus_transaction *t) + { + char *id_str; + +- down_read(&xs_state.transaction_mutex); ++ transaction_start(); + + id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); + if (IS_ERR(id_str)) { +- up_read(&xs_state.transaction_mutex); ++ transaction_end(); + return PTR_ERR(id_str); + } + +@@ -461,7 +496,7 @@ int xenbus_transaction_end(struct xenbus_transaction t, int abort) + + err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); + +- up_read(&xs_state.transaction_mutex); ++ transaction_end(); + + return err; + } +@@ -662,7 +697,7 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch); + + void xs_suspend(void) + { +- down_write(&xs_state.transaction_mutex); ++ transaction_suspend(); + down_write(&xs_state.watch_mutex); + mutex_lock(&xs_state.request_mutex); + mutex_lock(&xs_state.response_mutex); +@@ -677,7 +712,7 @@ void xs_resume(void) + + mutex_unlock(&xs_state.response_mutex); + mutex_unlock(&xs_state.request_mutex); +- up_write(&xs_state.transaction_mutex); ++ transaction_resume(); + + /* No need for watches_lock: the watch_mutex is sufficient. 
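The transaction_count/wait-queue pair introduced above lets xs_suspend() wait for in-flight transactions to drain instead of holding a rwsem across them. The transactions being counted are the ordinary driver-side kind; a sketch with a hypothetical "ring-ref" key:

#include <xen/xenbus.h>

/* Publish a ring reference under the device's own node, retrying if the
 * store reports that a concurrent transaction won the race. */
static int demo_publish_ring(struct xenbus_device *dev, unsigned long ref)
{
	struct xenbus_transaction xbt;
	int err;

again:
	err = xenbus_transaction_start(&xbt);
	if (err)
		return err;

	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%lu", ref);
	if (err) {
		xenbus_transaction_end(xbt, 1);		/* abort */
		return err;
	}

	err = xenbus_transaction_end(xbt, 0);		/* commit */
	if (err == -EAGAIN)
		goto again;
	return err;
}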
*/ + list_for_each_entry(watch, &watches, list) { +@@ -693,7 +728,7 @@ void xs_suspend_cancel(void) + mutex_unlock(&xs_state.response_mutex); + mutex_unlock(&xs_state.request_mutex); + up_write(&xs_state.watch_mutex); +- up_write(&xs_state.transaction_mutex); ++ mutex_unlock(&xs_state.transaction_mutex); + } + + static int xenwatch_thread(void *unused) +@@ -843,8 +878,10 @@ int xs_init(void) + + mutex_init(&xs_state.request_mutex); + mutex_init(&xs_state.response_mutex); +- init_rwsem(&xs_state.transaction_mutex); ++ mutex_init(&xs_state.transaction_mutex); + init_rwsem(&xs_state.watch_mutex); ++ atomic_set(&xs_state.transaction_count, 0); ++ init_waitqueue_head(&xs_state.transaction_wq); + + /* Initialize the shared memory rings to talk to xenstored */ + err = xb_init_comms(); +diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile +index 25275c3..4a0be9a 100644 +--- a/drivers/xen/xenfs/Makefile ++++ b/drivers/xen/xenfs/Makefile +@@ -1,3 +1,4 @@ + obj-$(CONFIG_XENFS) += xenfs.o + +-xenfs-objs = super.o xenbus.o +\ No newline at end of file ++xenfs-y = super.o xenbus.o ++xenfs-$(CONFIG_XEN_DOM0) += xenstored.o privcmd.o +diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c +new file mode 100644 +index 0000000..f80be7f +--- /dev/null ++++ b/drivers/xen/xenfs/privcmd.c +@@ -0,0 +1,404 @@ ++/****************************************************************************** ++ * privcmd.c ++ * ++ * Interface to privileged domain-0 commands. ++ * ++ * Copyright (c) 2002-2004, K A Fraser, B Dragovic ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifndef HAVE_ARCH_PRIVCMD_MMAP ++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); ++#endif ++ ++static long privcmd_ioctl_hypercall(void __user *udata) ++{ ++ struct privcmd_hypercall hypercall; ++ long ret; ++ ++ if (copy_from_user(&hypercall, udata, sizeof(hypercall))) ++ return -EFAULT; ++ ++ ret = privcmd_call(hypercall.op, ++ hypercall.arg[0], hypercall.arg[1], ++ hypercall.arg[2], hypercall.arg[3], ++ hypercall.arg[4]); ++ ++ return ret; ++} ++ ++static void free_page_list(struct list_head *pages) ++{ ++ struct page *p, *n; ++ ++ list_for_each_entry_safe(p, n, pages, lru) ++ __free_page(p); ++ ++ INIT_LIST_HEAD(pages); ++} ++ ++/* ++ * Given an array of items in userspace, return a list of pages ++ * containing the data. If copying fails, either because of memory ++ * allocation failure or a problem reading user memory, return an ++ * error code; its up to the caller to dispose of any partial list. 
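privcmd_ioctl_hypercall() above is the path the Xen toolstack takes through the "privcmd" node that the xenfs changes later in this patch expose (conventionally mounted at /proc/xen). A hedged user-space sketch; it assumes the exported xen/privcmd.h header from elsewhere in this series provides IOCTL_PRIVCMD_HYPERCALL and a struct privcmd_hypercall with the op/arg[5] layout this handler consumes:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/privcmd.h>	/* IOCTL_PRIVCMD_HYPERCALL, struct privcmd_hypercall */

#define __HYPERVISOR_xen_version	17
#define XENVER_version			0

int main(void)
{
	struct privcmd_hypercall call = {
		.op  = __HYPERVISOR_xen_version,
		.arg = { XENVER_version, 0, 0, 0, 0 },
	};
	long ver;
	int fd = open("/proc/xen/privcmd", O_RDWR);

	if (fd < 0) {
		perror("open /proc/xen/privcmd");
		return 1;
	}

	/* The ioctl return value is the hypercall's return value;
	 * XENVER_version encodes (major << 16) | minor. */
	ver = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
	if (ver < 0)
		perror("IOCTL_PRIVCMD_HYPERCALL");
	else
		printf("running on Xen %ld.%ld\n", ver >> 16, ver & 0xffff);

	close(fd);
	return 0;
}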
++ */ ++static int gather_array(struct list_head *pagelist, ++ unsigned nelem, size_t size, ++ void __user *data) ++{ ++ unsigned pageidx; ++ void *pagedata; ++ int ret; ++ ++ if (size > PAGE_SIZE) ++ return 0; ++ ++ pageidx = PAGE_SIZE; ++ pagedata = NULL; /* quiet, gcc */ ++ while (nelem--) { ++ if (pageidx > PAGE_SIZE-size) { ++ struct page *page = alloc_page(GFP_KERNEL); ++ ++ ret = -ENOMEM; ++ if (page == NULL) ++ goto fail; ++ ++ pagedata = page_address(page); ++ ++ list_add_tail(&page->lru, pagelist); ++ pageidx = 0; ++ } ++ ++ ret = -EFAULT; ++ if (copy_from_user(pagedata + pageidx, data, size)) ++ goto fail; ++ ++ data += size; ++ pageidx += size; ++ } ++ ++ ret = 0; ++ ++fail: ++ return ret; ++} ++ ++/* ++ * Call function "fn" on each element of the array fragmented ++ * over a list of pages. ++ */ ++static int traverse_pages(unsigned nelem, size_t size, ++ struct list_head *pos, ++ int (*fn)(void *data, void *state), ++ void *state) ++{ ++ void *pagedata; ++ unsigned pageidx; ++ int ret = 0; ++ ++ BUG_ON(size > PAGE_SIZE); ++ ++ pageidx = PAGE_SIZE; ++ pagedata = NULL; /* hush, gcc */ ++ ++ while (nelem--) { ++ if (pageidx > PAGE_SIZE-size) { ++ struct page *page; ++ pos = pos->next; ++ page = list_entry(pos, struct page, lru); ++ pagedata = page_address(page); ++ pageidx = 0; ++ } ++ ++ ret = (*fn)(pagedata + pageidx, state); ++ if (ret) ++ break; ++ pageidx += size; ++ } ++ ++ return ret; ++} ++ ++struct mmap_mfn_state { ++ unsigned long va; ++ struct vm_area_struct *vma; ++ domid_t domain; ++}; ++ ++static int mmap_mfn_range(void *data, void *state) ++{ ++ struct privcmd_mmap_entry *msg = data; ++ struct mmap_mfn_state *st = state; ++ struct vm_area_struct *vma = st->vma; ++ int rc; ++ ++ /* Do not allow range to wrap the address space. */ ++ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || ++ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) ++ return -EINVAL; ++ ++ /* Range chunks must be contiguous in va space. 
*/ ++ if ((msg->va != st->va) || ++ ((msg->va+(msg->npages< vma->vm_end)) ++ return -EINVAL; ++ ++ rc = xen_remap_domain_mfn_range(vma, ++ msg->va & PAGE_MASK, ++ msg->mfn, msg->npages, ++ vma->vm_page_prot, ++ st->domain); ++ if (rc < 0) ++ return rc; ++ ++ st->va += msg->npages << PAGE_SHIFT; ++ ++ return 0; ++} ++ ++static long privcmd_ioctl_mmap(void __user *udata) ++{ ++ struct privcmd_mmap mmapcmd; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ int rc; ++ LIST_HEAD(pagelist); ++ struct mmap_mfn_state state; ++ ++ if (!xen_initial_domain()) ++ return -EPERM; ++ ++ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) ++ return -EFAULT; ++ ++ rc = gather_array(&pagelist, ++ mmapcmd.num, sizeof(struct privcmd_mmap_entry), ++ mmapcmd.entry); ++ ++ if (rc || list_empty(&pagelist)) ++ goto out; ++ ++ down_write(&mm->mmap_sem); ++ ++ { ++ struct page *page = list_first_entry(&pagelist, ++ struct page, lru); ++ struct privcmd_mmap_entry *msg = page_address(page); ++ ++ vma = find_vma(mm, msg->va); ++ rc = -EINVAL; ++ ++ if (!vma || (msg->va != vma->vm_start) || ++ !privcmd_enforce_singleshot_mapping(vma)) ++ goto out_up; ++ } ++ ++ state.va = vma->vm_start; ++ state.vma = vma; ++ state.domain = mmapcmd.dom; ++ ++ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), ++ &pagelist, ++ mmap_mfn_range, &state); ++ ++ ++out_up: ++ up_write(&mm->mmap_sem); ++ ++out: ++ free_page_list(&pagelist); ++ ++ return rc; ++} ++ ++struct mmap_batch_state { ++ domid_t domain; ++ unsigned long va; ++ struct vm_area_struct *vma; ++ int err; ++ ++ xen_pfn_t __user *user; ++}; ++ ++static int mmap_batch_fn(void *data, void *state) ++{ ++ xen_pfn_t *mfnp = data; ++ struct mmap_batch_state *st = state; ++ ++ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, ++ st->vma->vm_page_prot, st->domain) < 0) { ++ *mfnp |= 0xf0000000U; ++ st->err++; ++ } ++ st->va += PAGE_SIZE; ++ ++ return 0; ++} ++ ++static int mmap_return_errors(void *data, void *state) ++{ ++ xen_pfn_t *mfnp = data; ++ struct mmap_batch_state *st = state; ++ ++ put_user(*mfnp, st->user++); ++ ++ return 0; ++} ++ ++static struct vm_operations_struct privcmd_vm_ops; ++ ++static long privcmd_ioctl_mmap_batch(void __user *udata) ++{ ++ int ret; ++ struct privcmd_mmapbatch m; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long nr_pages; ++ LIST_HEAD(pagelist); ++ struct mmap_batch_state state; ++ ++ if (!xen_initial_domain()) ++ return -EPERM; ++ ++ if (copy_from_user(&m, udata, sizeof(m))) ++ return -EFAULT; ++ ++ nr_pages = m.num; ++ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) ++ return -EINVAL; ++ ++ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), ++ m.arr); ++ ++ if (ret || list_empty(&pagelist)) ++ goto out; ++ ++ down_write(&mm->mmap_sem); ++ ++ vma = find_vma(mm, m.addr); ++ ret = -EINVAL; ++ if (!vma || ++ vma->vm_ops != &privcmd_vm_ops || ++ (m.addr != vma->vm_start) || ++ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || ++ !privcmd_enforce_singleshot_mapping(vma)) { ++ up_write(&mm->mmap_sem); ++ goto out; ++ } ++ ++ state.domain = m.dom; ++ state.vma = vma; ++ state.va = m.addr; ++ state.err = 0; ++ ++ ret = traverse_pages(m.num, sizeof(xen_pfn_t), ++ &pagelist, mmap_batch_fn, &state); ++ ++ up_write(&mm->mmap_sem); ++ ++ if (state.err > 0) { ++ ret = 0; ++ ++ state.user = m.arr; ++ traverse_pages(m.num, sizeof(xen_pfn_t), ++ &pagelist, ++ mmap_return_errors, &state); ++ } ++ ++out: ++ free_page_list(&pagelist); ++ ++ return 
ret; ++} ++ ++static long privcmd_ioctl(struct file *file, ++ unsigned int cmd, unsigned long data) ++{ ++ int ret = -ENOSYS; ++ void __user *udata = (void __user *) data; ++ ++ switch (cmd) { ++ case IOCTL_PRIVCMD_HYPERCALL: ++ ret = privcmd_ioctl_hypercall(udata); ++ break; ++ ++ case IOCTL_PRIVCMD_MMAP: ++ ret = privcmd_ioctl_mmap(udata); ++ break; ++ ++ case IOCTL_PRIVCMD_MMAPBATCH: ++ ret = privcmd_ioctl_mmap_batch(udata); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++#ifndef HAVE_ARCH_PRIVCMD_MMAP ++static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", ++ vma, vma->vm_start, vma->vm_end, ++ vmf->pgoff, vmf->virtual_address); ++ ++ return VM_FAULT_SIGBUS; ++} ++ ++static struct vm_operations_struct privcmd_vm_ops = { ++ .fault = privcmd_fault ++}; ++ ++static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ /* Unsupported for auto-translate guests. */ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ return -ENOSYS; ++ ++ /* DONTCOPY is essential for Xen as copy_page_range is broken. */ ++ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; ++ vma->vm_ops = &privcmd_vm_ops; ++ vma->vm_private_data = NULL; ++ ++ return 0; ++} ++ ++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) ++{ ++ return (xchg(&vma->vm_private_data, (void *)1) == NULL); ++} ++#endif ++ ++const struct file_operations privcmd_file_ops = { ++ .unlocked_ioctl = privcmd_ioctl, ++ .mmap = privcmd_mmap, ++}; +diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c +index 6559e0c..229c831 100644 +--- a/drivers/xen/xenfs/super.c ++++ b/drivers/xen/xenfs/super.c +@@ -12,6 +12,10 @@ + #include + #include + #include ++#include ++#include ++ ++#include + + #include "xenfs.h" + +@@ -20,6 +24,62 @@ + MODULE_DESCRIPTION("Xen filesystem"); + MODULE_LICENSE("GPL"); + ++static int xenfs_set_page_dirty(struct page *page) ++{ ++ return !TestSetPageDirty(page); ++} ++ ++static const struct address_space_operations xenfs_aops = { ++ .set_page_dirty = xenfs_set_page_dirty, ++}; ++ ++static struct backing_dev_info xenfs_backing_dev_info = { ++ .ra_pages = 0, /* No readahead */ ++ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, ++}; ++ ++static struct inode *xenfs_make_inode(struct super_block *sb, int mode) ++{ ++ struct inode *ret = new_inode(sb); ++ ++ if (ret) { ++ ret->i_mode = mode; ++ ret->i_mapping->a_ops = &xenfs_aops; ++ ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info; ++ ret->i_uid = ret->i_gid = 0; ++ ret->i_blocks = 0; ++ ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; ++ } ++ return ret; ++} ++ ++static struct dentry *xenfs_create_file(struct super_block *sb, ++ struct dentry *parent, ++ const char *name, ++ const struct file_operations *fops, ++ void *data, ++ int mode) ++{ ++ struct dentry *dentry; ++ struct inode *inode; ++ ++ dentry = d_alloc_name(parent, name); ++ if (!dentry) ++ return NULL; ++ ++ inode = xenfs_make_inode(sb, S_IFREG | mode); ++ if (!inode) { ++ dput(dentry); ++ return NULL; ++ } ++ ++ inode->i_fop = fops; ++ inode->i_private = data; ++ ++ d_add(dentry, inode); ++ return dentry; ++} ++ + static ssize_t capabilities_read(struct file *file, char __user *buf, + size_t size, loff_t *off) + { +@@ -43,8 +103,22 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) + { "capabilities", &capabilities_file_ops, S_IRUGO }, + {""}, + }; +- +- return simple_fill_super(sb, 
XENFS_SUPER_MAGIC, xenfs_files); ++ int rc; ++ ++ rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); ++ if (rc < 0) ++ return rc; ++ ++ if (xen_initial_domain()) { ++ xenfs_create_file(sb, sb->s_root, "xsd_kva", ++ &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); ++ xenfs_create_file(sb, sb->s_root, "xsd_port", ++ &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); ++ xenfs_create_file(sb, sb->s_root, "privcmd", ++ &privcmd_file_ops, NULL, S_IRUSR|S_IWUSR); ++ } ++ ++ return rc; + } + + static int xenfs_get_sb(struct file_system_type *fs_type, +@@ -63,11 +137,25 @@ static struct file_system_type xenfs_type = { + + static int __init xenfs_init(void) + { +- if (xen_pv_domain()) +- return register_filesystem(&xenfs_type); ++ int err; ++ if (!xen_pv_domain()) { ++ printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n"); ++ return 0; ++ } ++ ++ err = register_filesystem(&xenfs_type); ++ if (err) { ++ printk(KERN_ERR "xenfs: Unable to register filesystem!\n"); ++ goto out; ++ } ++ ++ err = bdi_init(&xenfs_backing_dev_info); ++ if (err) ++ unregister_filesystem(&xenfs_type); ++ ++ out: + +- printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); +- return 0; ++ return err; + } + + static void __exit xenfs_exit(void) +diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h +index 51f08b2..b68aa62 100644 +--- a/drivers/xen/xenfs/xenfs.h ++++ b/drivers/xen/xenfs/xenfs.h +@@ -2,5 +2,8 @@ + #define _XENFS_XENBUS_H + + extern const struct file_operations xenbus_file_ops; ++extern const struct file_operations privcmd_file_ops; ++extern const struct file_operations xsd_kva_file_ops; ++extern const struct file_operations xsd_port_file_ops; + + #endif /* _XENFS_XENBUS_H */ +diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c +new file mode 100644 +index 0000000..af10804 +--- /dev/null ++++ b/drivers/xen/xenfs/xenstored.c +@@ -0,0 +1,67 @@ ++#include ++#include ++#include ++ ++#include ++ ++#include "xenfs.h" ++#include "../xenbus/xenbus_comms.h" ++ ++static ssize_t xsd_read(struct file *file, char __user *buf, ++ size_t size, loff_t *off) ++{ ++ const char *str = (const char *)file->private_data; ++ return simple_read_from_buffer(buf, size, off, str, strlen(str)); ++} ++ ++static int xsd_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static int xsd_kva_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p", ++ xen_store_interface); ++ if (!file->private_data) ++ return -ENOMEM; ++ return 0; ++} ++ ++static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ size_t size = vma->vm_end - vma->vm_start; ++ ++ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) ++ return -EINVAL; ++ ++ if (remap_pfn_range(vma, vma->vm_start, ++ virt_to_pfn(xen_store_interface), ++ size, vma->vm_page_prot)) ++ return -EAGAIN; ++ ++ return 0; ++} ++ ++const struct file_operations xsd_kva_file_ops = { ++ .open = xsd_kva_open, ++ .mmap = xsd_kva_mmap, ++ .read = xsd_read, ++ .release = xsd_release, ++}; ++ ++static int xsd_port_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = (void *)kasprintf(GFP_KERNEL, "%d", ++ xen_store_evtchn); ++ if (!file->private_data) ++ return -ENOMEM; ++ return 0; ++} ++ ++const struct file_operations xsd_port_file_ops = { ++ .open = xsd_port_open, ++ .read = xsd_read, ++ .release = xsd_release, ++}; +diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h +index 
26373cf..9fb4270 100644 +--- a/include/asm-generic/pci.h ++++ b/include/asm-generic/pci.h +@@ -43,6 +43,8 @@ pcibios_select_root(struct pci_dev *pdev, struct resource *res) + return root; + } + ++#ifndef HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS ++#endif + #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ + static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) + { +diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h +index dd97fb8..b10ec49 100644 +--- a/include/linux/bootmem.h ++++ b/include/linux/bootmem.h +@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat, + unsigned long addr, + unsigned long size); + extern void free_bootmem(unsigned long addr, unsigned long size); ++extern void free_bootmem_late(unsigned long addr, unsigned long size); + + /* + * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, +diff --git a/include/linux/dmar.h b/include/linux/dmar.h +index 4a2b162..5de4c9e 100644 +--- a/include/linux/dmar.h ++++ b/include/linux/dmar.h +@@ -208,16 +208,9 @@ struct dmar_atsr_unit { + u8 include_all:1; /* include all ports */ + }; + +-/* Intel DMAR initialization functions */ + extern int intel_iommu_init(void); +-#else +-static inline int intel_iommu_init(void) +-{ +-#ifdef CONFIG_INTR_REMAP +- return dmar_dev_scope_init(); +-#else +- return -ENODEV; +-#endif +-} +-#endif /* !CONFIG_DMAR */ ++#else /* !CONFIG_DMAR: */ ++static inline int intel_iommu_init(void) { return -ENODEV; } ++#endif /* CONFIG_DMAR */ ++ + #endif /* __DMAR_H__ */ +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 24c3956..3d74515 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -105,6 +105,12 @@ extern unsigned int kobjsize(const void *objp); + #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ + #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ + #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ ++#ifdef CONFIG_XEN ++#define VM_FOREIGN 0x20000000 /* Has pages belonging to another VM */ ++struct vm_foreign_map { ++ struct page **map; ++}; ++#endif + + #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ + #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +@@ -195,6 +201,15 @@ struct vm_operations_struct { + */ + int (*access)(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write); ++ ++ /* Area-specific function for clearing the PTE at @ptep. Returns the ++ * original value of @ptep. */ ++ pte_t (*zap_pte)(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *ptep, int is_fullmm); ++ ++ /* called before close() to indicate no more pages should be mapped */ ++ void (*unmap)(struct vm_area_struct *area); ++ + #ifdef CONFIG_NUMA + /* + * set_policy() op must add a reference to any non-NULL @new mempolicy +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index 6b202b1..b03950e 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -105,6 +105,9 @@ enum pageflags { + #ifdef CONFIG_ARCH_USES_PG_UNCACHED + PG_uncached, /* Page has been mapped as uncached */ + #endif ++#ifdef CONFIG_XEN ++ PG_foreign, ++#endif + #ifdef CONFIG_MEMORY_FAILURE + PG_hwpoison, /* hardware poisoned page. 
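The zap_pte and unmap hooks added to vm_operations_struct in the mm.h hunk above give mapping drivers (grant-table style devices elsewhere in this featureset, not in this hunk) a chance to tear down foreign mappings when the core zaps or closes a VMA. A hedged sketch of the expected shape of a user, with placeholder bodies:

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Per the comment added in mm.h: clear the PTE at @ptep and return its
 * original value.  A real driver would also drop its grant mapping here. */
static pte_t demo_zap_pte(struct vm_area_struct *vma, unsigned long addr,
			  pte_t *ptep, int is_fullmm)
{
	return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
}

/* Last notification before close(): no further pages will be mapped. */
static void demo_unmap(struct vm_area_struct *vma)
{
}

static const struct vm_operations_struct demo_vm_ops = {
	.zap_pte = demo_zap_pte,
	.unmap	 = demo_unmap,
};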
Don't touch */ + #endif +@@ -275,6 +278,23 @@ PAGEFLAG(Uncached, uncached) + PAGEFLAG_FALSE(Uncached) + #endif + ++#ifdef CONFIG_XEN ++TESTPAGEFLAG(Foreign, foreign) ++__SETPAGEFLAG(Foreign, foreign) ++CLEARPAGEFLAG(Foreign, foreign) ++#define SetPageForeign(_page, dtor) do { \ ++ __SetPageForeign(_page); \ ++ BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \ ++ (_page)->index = (long)(dtor); \ ++} while (0) ++#define _PageForeignDestructor(_page) \ ++ ((void (*)(struct page *, unsigned int))(_page)->index) ++#define PageForeignDestructor(_page, order) \ ++ _PageForeignDestructor(_page)(_page, order) ++#else ++PAGEFLAG_FALSE(Foreign) ++#endif ++ + #ifdef CONFIG_MEMORY_FAILURE + PAGEFLAG(HWPoison, hwpoison) + TESTSETFLAG(HWPoison, hwpoison) +diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h +index 73b1f1c..113585a 100644 +--- a/include/linux/swiotlb.h ++++ b/include/linux/swiotlb.h +@@ -7,6 +7,8 @@ struct device; + struct dma_attrs; + struct scatterlist; + ++extern int swiotlb_force; ++ + /* + * Maximum allowable number of contiguous slabs to map, + * must be a power of 2. What is the appropriate value ? +@@ -20,9 +22,46 @@ struct scatterlist; + */ + #define IO_TLB_SHIFT 11 + +-extern void +-swiotlb_init(void); +- ++/* swiotlb-core.c */ ++extern void swiotlb_init(int verbose); ++#ifdef CONFIG_SWIOTLB ++extern void __init swiotlb_free(void); ++#else ++static inline void swiotlb_free(void) { } ++#endif ++extern void swiotlb_print_info(void); ++ ++/* swiotlb-core.c: Internal book-keeping functions. ++ * Must be linked against the library to take advantage of them.*/ ++#ifdef CONFIG_SWIOTLB ++/* ++ * Enumeration for sync targets ++ */ ++enum dma_sync_target { ++ SYNC_FOR_CPU = 0, ++ SYNC_FOR_DEVICE = 1, ++}; ++extern char *io_tlb_start; ++extern char *io_tlb_end; ++extern unsigned long io_tlb_nslabs; ++extern void *io_tlb_overflow_buffer; ++extern unsigned long io_tlb_overflow; ++extern int is_swiotlb_buffer(phys_addr_t paddr); ++extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, ++ enum dma_data_direction dir); ++extern void *do_map_single(struct device *hwdev, phys_addr_t phys, ++ unsigned long start_dma_addr, size_t size, int dir); ++ ++extern void do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, ++ int dir); ++ ++extern void do_sync_single(struct device *hwdev, char *dma_addr, size_t size, ++ int dir, int target); ++extern void swiotlb_full(struct device *dev, size_t size, int dir, int do_panic); ++extern void __init swiotlb_init_early(size_t default_size, int verbose); ++#endif ++ ++/* swiotlb.c: dma_ops functions. */ + extern void + *swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags); +@@ -88,4 +127,74 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); + extern int + swiotlb_dma_supported(struct device *hwdev, u64 mask); + ++/* swiotlb-xen.c: dma_ops functions. 
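The PG_foreign helpers in the page-flags.h hunk above park a per-page destructor in page->index; the allocator-side call to PageForeignDestructor() on free presumably comes from other patches in this featureset and is not visible here, so the sketch below only illustrates how a backend would mark and later release such a page (hypothetical demo_* names):

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Expected to run via PageForeignDestructor() when the page is freed. */
static void demo_foreign_dtor(struct page *page, unsigned int order)
{
	ClearPageForeign(page);
	__free_pages(page, order);
}

static struct page *demo_alloc_foreign_page(void)
{
	struct page *page = alloc_page(GFP_KERNEL);

	if (page)
		SetPageForeign(page, demo_foreign_dtor);
	return page;
}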
*/ ++extern void xen_swiotlb_init(int verbose); ++extern void ++*xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, ++ dma_addr_t *dma_handle, gfp_t flags); ++ ++extern void ++xen_swiotlb_free_coherent(struct device *hwdev, size_t size, ++ void *vaddr, dma_addr_t dma_handle); ++ ++extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++extern void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ ++extern int ++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, ++ int direction); ++ ++extern void ++xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, ++ int direction); ++ ++extern int ++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ ++extern void ++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs); ++ ++extern void ++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, ++ size_t size, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir); ++ ++extern void ++xen_swiotlb_sync_single_range_for_device(struct device *hwdev, ++ dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir); ++ ++extern int ++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); ++ ++extern int ++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask); ++ ++ + #endif /* __LINUX_SWIOTLB_H */ +diff --git a/include/xen/Kbuild b/include/xen/Kbuild +index 4e65c16..84ad8f0 100644 +--- a/include/xen/Kbuild ++++ b/include/xen/Kbuild +@@ -1 +1,2 @@ + header-y += evtchn.h ++header-y += privcmd.h +diff --git a/include/xen/balloon.h b/include/xen/balloon.h +new file mode 100644 +index 0000000..e751514 +--- /dev/null ++++ b/include/xen/balloon.h +@@ -0,0 +1,8 @@ ++#ifndef _XEN_BALLOON_H ++#define _XEN_BALLOON_H ++ ++/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). 
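/*
 * Illustrative sketch, not part of the patch itself: the xen_swiotlb_* entry
 * points declared above are shaped to drop straight into a 2.6.32 struct
 * dma_map_ops.  The variable name and the choice of the *_attrs scatterlist
 * variants are assumptions.
 */
#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>

static struct dma_map_ops xen_swiotlb_dma_ops = {
        .alloc_coherent         = xen_swiotlb_alloc_coherent,
        .free_coherent          = xen_swiotlb_free_coherent,
        .map_page               = xen_swiotlb_map_page,
        .unmap_page             = xen_swiotlb_unmap_page,
        .map_sg                 = xen_swiotlb_map_sg_attrs,
        .unmap_sg               = xen_swiotlb_unmap_sg_attrs,
        .sync_single_for_cpu    = xen_swiotlb_sync_single_for_cpu,
        .sync_single_for_device = xen_swiotlb_sync_single_for_device,
        .sync_sg_for_cpu        = xen_swiotlb_sync_sg_for_cpu,
        .sync_sg_for_device     = xen_swiotlb_sync_sg_for_device,
        .mapping_error          = xen_swiotlb_dma_mapping_error,
        .dma_supported          = xen_swiotlb_dma_supported,
};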
*/ ++struct page **alloc_empty_pages_and_pagevec(int nr_pages); ++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages); ++ ++#endif +diff --git a/include/xen/blkif.h b/include/xen/blkif.h +new file mode 100644 +index 0000000..7172081 +--- /dev/null ++++ b/include/xen/blkif.h +@@ -0,0 +1,123 @@ ++/* ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef __XEN_BLKIF_H__ ++#define __XEN_BLKIF_H__ ++ ++#include ++#include ++#include ++#include ++ ++/* Not a real protocol. Used to generate ring structs which contain ++ * the elements common to all protocols only. This way we get a ++ * compiler-checkable way to use common struct elements, so we can ++ * avoid using switch(protocol) in a number of places. */ ++struct blkif_common_request { ++ char dummy; ++}; ++struct blkif_common_response { ++ char dummy; ++}; ++ ++/* i386 protocol version */ ++#pragma pack(push, 4) ++struct blkif_x86_32_request { ++ uint8_t operation; /* BLKIF_OP_??? */ ++ uint8_t nr_segments; /* number of segments */ ++ blkif_vdev_t handle; /* only for read/write requests */ ++ uint64_t id; /* private guest value, echoed in resp */ ++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ ++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++}; ++struct blkif_x86_32_response { ++ uint64_t id; /* copied from request */ ++ uint8_t operation; /* copied from request */ ++ int16_t status; /* BLKIF_RSP_??? */ ++}; ++typedef struct blkif_x86_32_request blkif_x86_32_request_t; ++typedef struct blkif_x86_32_response blkif_x86_32_response_t; ++#pragma pack(pop) ++ ++/* x86_64 protocol version */ ++struct blkif_x86_64_request { ++ uint8_t operation; /* BLKIF_OP_??? */ ++ uint8_t nr_segments; /* number of segments */ ++ blkif_vdev_t handle; /* only for read/write requests */ ++ uint64_t __attribute__((__aligned__(8))) id; ++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ ++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++}; ++struct blkif_x86_64_response { ++ uint64_t __attribute__((__aligned__(8))) id; ++ uint8_t operation; /* copied from request */ ++ int16_t status; /* BLKIF_RSP_??? 
*/ ++}; ++typedef struct blkif_x86_64_request blkif_x86_64_request_t; ++typedef struct blkif_x86_64_response blkif_x86_64_response_t; ++ ++DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); ++DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); ++DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); ++ ++union blkif_back_rings { ++ struct blkif_back_ring native; ++ struct blkif_common_back_ring common; ++ struct blkif_x86_32_back_ring x86_32; ++ struct blkif_x86_64_back_ring x86_64; ++}; ++ ++enum blkif_protocol { ++ BLKIF_PROTOCOL_NATIVE = 1, ++ BLKIF_PROTOCOL_X86_32 = 2, ++ BLKIF_PROTOCOL_X86_64 = 3, ++}; ++ ++static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) ++{ ++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ dst->operation = src->operation; ++ dst->nr_segments = src->nr_segments; ++ dst->handle = src->handle; ++ dst->id = src->id; ++ dst->sector_number = src->sector_number; ++ barrier(); ++ if (n > dst->nr_segments) ++ n = dst->nr_segments; ++ for (i = 0; i < n; i++) ++ dst->seg[i] = src->seg[i]; ++} ++ ++static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) ++{ ++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ dst->operation = src->operation; ++ dst->nr_segments = src->nr_segments; ++ dst->handle = src->handle; ++ dst->id = src->id; ++ dst->sector_number = src->sector_number; ++ barrier(); ++ if (n > dst->nr_segments) ++ n = dst->nr_segments; ++ for (i = 0; i < n; i++) ++ dst->seg[i] = src->seg[i]; ++} ++ ++#endif /* __XEN_BLKIF_H__ */ +diff --git a/include/xen/events.h b/include/xen/events.h +index e68d59a..4a934a7 100644 +--- a/include/xen/events.h ++++ b/include/xen/events.h +@@ -12,6 +12,8 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, + irq_handler_t handler, + unsigned long irqflags, const char *devname, + void *dev_id); ++int bind_virq_to_irq(unsigned int virq, unsigned int cpu); ++ + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, const char *devname, +@@ -22,6 +24,12 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned long irqflags, + const char *devname, + void *dev_id); ++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, ++ unsigned int remote_port, ++ irq_handler_t handler, ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id); + + /* + * Common unbind function for all event sources. Takes IRQ to unbind from. +@@ -56,4 +64,23 @@ void xen_poll_irq(int irq); + /* Determine the IRQ which is bound to an event channel */ + unsigned irq_from_evtchn(unsigned int evtchn); + ++/* Allocate an irq for a physical interrupt, given a gsi. "Legacy" ++ GSIs are identity mapped; others are dynamically allocated as ++ usual. 
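/*
 * Illustrative sketch, not part of the patch itself: a block backend pulling
 * one request off a foreign ring with the converters defined above.  The
 * function name and the caller-supplied consumer index are placeholders.
 */
#include <linux/string.h>
#include <xen/blkif.h>

static void example_copy_request(union blkif_back_rings *rings,
                                 enum blkif_protocol protocol,
                                 RING_IDX rc, struct blkif_request *req)
{
        switch (protocol) {
        case BLKIF_PROTOCOL_NATIVE:
                memcpy(req, RING_GET_REQUEST(&rings->native, rc),
                       sizeof(*req));
                break;
        case BLKIF_PROTOCOL_X86_32:
                blkif_get_x86_32_req(req,
                                     RING_GET_REQUEST(&rings->x86_32, rc));
                break;
        case BLKIF_PROTOCOL_X86_64:
                blkif_get_x86_64_req(req,
                                     RING_GET_REQUEST(&rings->x86_64, rc));
                break;
        }
}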
*/ ++int xen_allocate_pirq(unsigned gsi, int shareable, char *name); ++ ++/* Return vector allocated to pirq */ ++int xen_vector_from_irq(unsigned pirq); ++ ++/* Return gsi allocated to pirq */ ++int xen_gsi_from_irq(unsigned pirq); ++ ++#ifdef CONFIG_XEN_DOM0_PCI ++void xen_setup_pirqs(void); ++#else ++static inline void xen_setup_pirqs(void) ++{ ++} ++#endif ++ + #endif /* _XEN_EVENTS_H */ +diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h +new file mode 100644 +index 0000000..8bd1467 +--- /dev/null ++++ b/include/xen/gntdev.h +@@ -0,0 +1,119 @@ ++/****************************************************************************** ++ * gntdev.h ++ * ++ * Interface to /dev/xen/gntdev. ++ * ++ * Copyright (c) 2007, D G Murray ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __LINUX_PUBLIC_GNTDEV_H__ ++#define __LINUX_PUBLIC_GNTDEV_H__ ++ ++struct ioctl_gntdev_grant_ref { ++ /* The domain ID of the grant to be mapped. */ ++ uint32_t domid; ++ /* The grant reference of the grant to be mapped. */ ++ uint32_t ref; ++}; ++ ++/* ++ * Inserts the grant references into the mapping table of an instance ++ * of gntdev. N.B. This does not perform the mapping, which is deferred ++ * until mmap() is called with @index as the offset. ++ */ ++#define IOCTL_GNTDEV_MAP_GRANT_REF \ ++_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) ++struct ioctl_gntdev_map_grant_ref { ++ /* IN parameters */ ++ /* The number of grants to be mapped. */ ++ uint32_t count; ++ uint32_t pad; ++ /* OUT parameters */ ++ /* The offset to be used on a subsequent call to mmap(). */ ++ uint64_t index; ++ /* Variable IN parameter. */ ++ /* Array of grant references, of size @count. */ ++ struct ioctl_gntdev_grant_ref refs[1]; ++}; ++ ++/* ++ * Removes the grant references from the mapping table of an instance of ++ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) ++ * before this ioctl is called, or an error will result. 
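/*
 * Illustrative user-space sketch, not part of the patch itself: mapping a
 * single grant through the gntdev character device with the ioctl above.
 * Per the header comment, the mapping itself is deferred to mmap() with
 * @index as the offset.  The device-node path, the installed location of
 * <xen/gntdev.h> and the 4 KiB page size are assumptions; error handling
 * is minimal.
 */
#include <stdint.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>

static void *map_one_grant(int gntdev_fd, uint32_t domid, uint32_t ref)
{
        struct ioctl_gntdev_map_grant_ref op = {
                .count   = 1,
                .refs[0] = { .domid = domid, .ref = ref },
        };
        void *addr;

        if (ioctl(gntdev_fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) < 0)
                return NULL;

        addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
                    gntdev_fd, op.index);
        return addr == MAP_FAILED ? NULL : addr;
}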
++ */ ++#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ ++_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) ++struct ioctl_gntdev_unmap_grant_ref { ++ /* IN parameters */ ++ /* The offset was returned by the corresponding map operation. */ ++ uint64_t index; ++ /* The number of pages to be unmapped. */ ++ uint32_t count; ++ uint32_t pad; ++}; ++ ++/* ++ * Returns the offset in the driver's address space that corresponds ++ * to @vaddr. This can be used to perform a munmap(), followed by an ++ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by ++ * the caller. The number of pages that were allocated at the same time as ++ * @vaddr is returned in @count. ++ * ++ * N.B. Where more than one page has been mapped into a contiguous range, the ++ * supplied @vaddr must correspond to the start of the range; otherwise ++ * an error will result. It is only possible to munmap() the entire ++ * contiguously-allocated range at once, and not any subrange thereof. ++ */ ++#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ ++_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) ++struct ioctl_gntdev_get_offset_for_vaddr { ++ /* IN parameters */ ++ /* The virtual address of the first mapped page in a range. */ ++ uint64_t vaddr; ++ /* OUT parameters */ ++ /* The offset that was used in the initial mmap() operation. */ ++ uint64_t offset; ++ /* The number of pages mapped in the VM area that begins at @vaddr. */ ++ uint32_t count; ++ uint32_t pad; ++}; ++ ++/* ++ * Sets the maximum number of grants that may mapped at once by this gntdev ++ * instance. ++ * ++ * N.B. This must be called before any other ioctl is performed on the device. ++ */ ++#define IOCTL_GNTDEV_SET_MAX_GRANTS \ ++_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) ++struct ioctl_gntdev_set_max_grants { ++ /* IN parameter */ ++ /* The maximum number of grants that may be mapped at once. 
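/*
 * Illustrative user-space sketch, not part of the patch itself: tearing the
 * mapping made above back down.  As the header comment requires, munmap()
 * must be issued before IOCTL_GNTDEV_UNMAP_GRANT_REF.  Header location and
 * page size are the same assumptions as in the mapping sketch.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>

static int unmap_one_grant(int gntdev_fd, void *addr, uint64_t index)
{
        struct ioctl_gntdev_unmap_grant_ref op = {
                .index = index,
                .count = 1,
        };

        if (munmap(addr, 4096) < 0)
                return -1;

        return ioctl(gntdev_fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &op);
}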
*/ ++ uint32_t count; ++}; ++ ++#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ +diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h +index a40f1cd..9e54167 100644 +--- a/include/xen/grant_table.h ++++ b/include/xen/grant_table.h +@@ -37,10 +37,16 @@ + #ifndef __ASM_GNTTAB_H__ + #define __ASM_GNTTAB_H__ + +-#include ++#include ++ ++#include + #include ++ ++#include + #include + ++#include ++ + /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ + #define NR_GRANT_FRAMES 4 + +@@ -51,6 +57,8 @@ struct gnttab_free_callback { + u16 count; + }; + ++void gnttab_reset_grant_page(struct page *page); ++ + int gnttab_suspend(void); + int gnttab_resume(void); + +@@ -80,6 +88,8 @@ unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); + + int gnttab_query_foreign_access(grant_ref_t ref); + ++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep); ++ + /* + * operations on reserved batches of grant references + */ +@@ -106,6 +116,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, + unsigned long pfn); + ++static inline void ++gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr, ++ uint32_t flags, grant_ref_t ref, domid_t domid) ++{ ++ if (flags & GNTMAP_contains_pte) ++ map->host_addr = addr; ++ else if (xen_feature(XENFEAT_auto_translated_physmap)) ++ map->host_addr = __pa(addr); ++ else ++ map->host_addr = addr; ++ ++ map->flags = flags; ++ map->ref = ref; ++ map->dom = domid; ++} ++ ++static inline void ++gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, ++ uint32_t flags, grant_handle_t handle) ++{ ++ if (flags & GNTMAP_contains_pte) ++ unmap->host_addr = addr; ++ else if (xen_feature(XENFEAT_auto_translated_physmap)) ++ unmap->host_addr = __pa(addr); ++ else ++ unmap->host_addr = addr; ++ ++ unmap->handle = handle; ++ unmap->dev_bus_addr = 0; ++} ++ + int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + struct grant_entry **__shared); +diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h +index 39da93c..8211af8 100644 +--- a/include/xen/interface/grant_table.h ++++ b/include/xen/interface/grant_table.h +@@ -321,6 +321,28 @@ struct gnttab_query_size { + DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size); + + /* ++ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings ++ * tracked by but atomically replace the page table entry with one ++ * pointing to the machine address under . will be ++ * redirected to the null entry. ++ * NOTES: ++ * 1. The call may fail in an undefined manner if either mapping is not ++ * tracked by . ++ * 2. After executing a batch of unmaps, it is guaranteed that no stale ++ * mappings will remain in the device or host TLBs. ++ */ ++#define GNTTABOP_unmap_and_replace 7 ++struct gnttab_unmap_and_replace { ++ /* IN parameters. */ ++ uint64_t host_addr; ++ uint64_t new_addr; ++ grant_handle_t handle; ++ /* OUT parameters. */ ++ int16_t status; /* GNTST_* */ ++}; ++DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace); ++ ++/* + * Bitfield values for update_pin_status.flags. + */ + /* Map the grant entry for access by I/O devices. 
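/*
 * Illustrative sketch, not part of the patch itself: using the new
 * gnttab_set_map_op() helper to map another domain's grant at a page-aligned
 * kernel virtual address, the way a backend mapping a shared ring would.
 * The function and parameter names are placeholders.
 */
#include <linux/kernel.h>
#include <xen/grant_table.h>
#include <asm/xen/hypercall.h>

static int example_map_ring(unsigned long vaddr, grant_ref_t gnt_ref,
                            domid_t otherend_id, grant_handle_t *handle)
{
        struct gnttab_map_grant_ref op;

        gnttab_set_map_op(&op, vaddr, GNTMAP_host_map, gnt_ref, otherend_id);

        if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
                BUG();

        if (op.status != GNTST_okay)
                return op.status;

        *handle = op.handle;
        return 0;
}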
*/ +diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h +index e8cbf43..865dcf0 100644 +--- a/include/xen/interface/io/ring.h ++++ b/include/xen/interface/io/ring.h +@@ -73,7 +73,8 @@ union __name##_sring_entry { \ + struct __name##_sring { \ + RING_IDX req_prod, req_event; \ + RING_IDX rsp_prod, rsp_event; \ +- uint8_t pad[48]; \ ++ uint8_t netfront_smartpoll_active; \ ++ uint8_t pad[47]; \ + union __name##_sring_entry ring[1]; /* variable-length */ \ + }; \ + \ +diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h +index af36ead..eac3ce1 100644 +--- a/include/xen/interface/memory.h ++++ b/include/xen/interface/memory.h +@@ -9,6 +9,8 @@ + #ifndef __XEN_PUBLIC_MEMORY_H__ + #define __XEN_PUBLIC_MEMORY_H__ + ++#include ++ + /* + * Increase or decrease the specified domain's memory reservation. Returns a + * -ve errcode on failure, or the # extents successfully allocated or freed. +@@ -53,6 +55,48 @@ struct xen_memory_reservation { + DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation); + + /* ++ * An atomic exchange of memory pages. If return code is zero then ++ * @out.extent_list provides GMFNs of the newly-allocated memory. ++ * Returns zero on complete success, otherwise a negative error code. ++ * On complete success then always @nr_exchanged == @in.nr_extents. ++ * On partial success @nr_exchanged indicates how much work was done. ++ */ ++#define XENMEM_exchange 11 ++struct xen_memory_exchange { ++ /* ++ * [IN] Details of memory extents to be exchanged (GMFN bases). ++ * Note that @in.address_bits is ignored and unused. ++ */ ++ struct xen_memory_reservation in; ++ ++ /* ++ * [IN/OUT] Details of new memory extents. ++ * We require that: ++ * 1. @in.domid == @out.domid ++ * 2. @in.nr_extents << @in.extent_order == ++ * @out.nr_extents << @out.extent_order ++ * 3. @in.extent_start and @out.extent_start lists must not overlap ++ * 4. @out.extent_start lists GPFN bases to be populated ++ * 5. @out.extent_start is overwritten with allocated GMFN bases ++ */ ++ struct xen_memory_reservation out; ++ ++ /* ++ * [OUT] Number of input extents that were successfully exchanged: ++ * 1. The first @nr_exchanged input extents were successfully ++ * deallocated. ++ * 2. The corresponding first entries in the output extent list correctly ++ * indicate the GMFNs that were successfully exchanged. ++ * 3. All other input and output extents are untouched. ++ * 4. If not all input exents are exchanged then the return code of this ++ * command will be non-zero. ++ * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER! ++ */ ++ unsigned long nr_exchanged; ++}; ++ ++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange); ++/* + * Returns the maximum machine frame number of mapped RAM in this system. + * This command always succeeds (it never returns an error code). + * arg == NULL. +@@ -97,6 +141,19 @@ struct xen_machphys_mfn_list { + DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list); + + /* ++ * Returns the location in virtual address space of the machine_to_phys ++ * mapping table. Architectures which do not have a m2p table, or which do not ++ * map it by default into guest address space, do not implement this command. ++ * arg == addr of xen_machphys_mapping_t. ++ */ ++#define XENMEM_machphys_mapping 12 ++struct xen_machphys_mapping { ++ unsigned long v_start, v_end; /* Start and end virtual addresses. */ ++ unsigned long max_mfn; /* Maximum MFN that can be looked up. 
*/ ++}; ++DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t); ++ ++/* + * Sets the GPFN at which a particular page appears in the specified guest's + * pseudophysical address space. + * arg == addr of xen_add_to_physmap_t. +@@ -142,4 +199,39 @@ struct xen_translate_gpfn_list { + }; + DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list); + ++/* ++ * Returns the pseudo-physical memory map as it was when the domain ++ * was started (specified by XENMEM_set_memory_map). ++ * arg == addr of struct xen_memory_map. ++ */ ++#define XENMEM_memory_map 9 ++struct xen_memory_map { ++ /* ++ * On call the number of entries which can be stored in buffer. On ++ * return the number of entries which have been stored in ++ * buffer. ++ */ ++ unsigned int nr_entries; ++ ++ /* ++ * Entries in the buffer are in the same format as returned by the ++ * BIOS INT 0x15 EAX=0xE820 call. ++ */ ++ GUEST_HANDLE(void) buffer; ++}; ++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map); ++ ++/* ++ * Returns the real physical memory map. Passes the same structure as ++ * XENMEM_memory_map. ++ * arg == addr of struct xen_memory_map. ++ */ ++#define XENMEM_machine_memory_map 10 ++ ++ ++/* ++ * Prevent the balloon driver from changing the memory reservation ++ * during a driver critical region. ++ */ ++extern spinlock_t xen_reservation_lock; + #endif /* __XEN_PUBLIC_MEMORY_H__ */ +diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h +index cd69391..39c2b51 100644 +--- a/include/xen/interface/physdev.h ++++ b/include/xen/interface/physdev.h +@@ -106,6 +106,57 @@ struct physdev_irq { + uint32_t vector; + }; + ++#define MAP_PIRQ_TYPE_MSI 0x0 ++#define MAP_PIRQ_TYPE_GSI 0x1 ++#define MAP_PIRQ_TYPE_UNKNOWN 0x2 ++ ++#define PHYSDEVOP_map_pirq 13 ++struct physdev_map_pirq { ++ domid_t domid; ++ /* IN */ ++ int type; ++ /* IN */ ++ int index; ++ /* IN or OUT */ ++ int pirq; ++ /* IN */ ++ int bus; ++ /* IN */ ++ int devfn; ++ /* IN */ ++ int entry_nr; ++ /* IN */ ++ uint64_t table_base; ++}; ++ ++#define PHYSDEVOP_unmap_pirq 14 ++struct physdev_unmap_pirq { ++ domid_t domid; ++ /* IN */ ++ int pirq; ++}; ++ ++#define PHYSDEVOP_manage_pci_add 15 ++#define PHYSDEVOP_manage_pci_remove 16 ++struct physdev_manage_pci { ++ /* IN */ ++ uint8_t bus; ++ uint8_t devfn; ++}; ++ ++#define PHYSDEVOP_manage_pci_add_ext 20 ++struct physdev_manage_pci_ext { ++ /* IN */ ++ uint8_t bus; ++ uint8_t devfn; ++ unsigned is_extfn; ++ unsigned is_virtfn; ++ struct { ++ uint8_t bus; ++ uint8_t devfn; ++ } physfn; ++}; ++ + /* + * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op() + * hypercall since 0x00030202. +@@ -121,6 +172,16 @@ struct physdev_op { + } u; + }; + ++#define PHYSDEVOP_setup_gsi 21 ++struct physdev_setup_gsi { ++ int gsi; ++ /* IN */ ++ uint8_t triggering; ++ /* IN */ ++ uint8_t polarity; ++ /* IN */ ++}; ++ + /* + * Notify that some PIRQ-bound event channels have been unmasked. + * ** This command is obsolete since interface version 0x00030202 and is ** +diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h +new file mode 100644 +index 0000000..83e4714 +--- /dev/null ++++ b/include/xen/interface/platform.h +@@ -0,0 +1,222 @@ ++/****************************************************************************** ++ * platform.h ++ * ++ * Hardware platform operations. Intended for use by domain-0 kernel. 
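/*
 * Illustrative sketch, not part of the patch itself: dom0 code fetching the
 * real E820 table with the XENMEM_machine_memory_map command defined above.
 * struct e820entry comes from arch/x86; the function name is a placeholder.
 */
#include <xen/interface/memory.h>
#include <asm/xen/hypercall.h>
#include <asm/e820.h>

static int example_fetch_machine_e820(struct e820entry *map,
                                      unsigned int *nr_entries)
{
        struct xen_memory_map memmap;
        int rc;

        memmap.nr_entries = *nr_entries;
        set_xen_guest_handle(memmap.buffer, map);

        rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap);
        if (rc)
                return rc;

        *nr_entries = memmap.nr_entries;
        return 0;
}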
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this software and associated documentation files (the "Software"), to ++ * deal in the Software without restriction, including without limitation the ++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or ++ * sell copies of the Software, and to permit persons to whom the Software is ++ * furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ * ++ * Copyright (c) 2002-2006, K Fraser ++ */ ++ ++#ifndef __XEN_PUBLIC_PLATFORM_H__ ++#define __XEN_PUBLIC_PLATFORM_H__ ++ ++#include "xen.h" ++ ++#define XENPF_INTERFACE_VERSION 0x03000001 ++ ++/* ++ * Set clock such that it would read after 00:00:00 UTC, ++ * 1 January, 1970 if the current system time was . ++ */ ++#define XENPF_settime 17 ++struct xenpf_settime { ++ /* IN variables. */ ++ uint32_t secs; ++ uint32_t nsecs; ++ uint64_t system_time; ++}; ++typedef struct xenpf_settime xenpf_settime_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t); ++ ++/* ++ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type. ++ * On x86, @type is an architecture-defined MTRR memory type. ++ * On success, returns the MTRR that was used (@reg) and a handle that can ++ * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting. ++ * (x86-specific). ++ */ ++#define XENPF_add_memtype 31 ++struct xenpf_add_memtype { ++ /* IN variables. */ ++ unsigned long mfn; ++ uint64_t nr_mfns; ++ uint32_t type; ++ /* OUT variables. */ ++ uint32_t handle; ++ uint32_t reg; ++}; ++typedef struct xenpf_add_memtype xenpf_add_memtype_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_add_memtype_t); ++ ++/* ++ * Tear down an existing memory-range type. If @handle is remembered then it ++ * should be passed in to accurately tear down the correct setting (in case ++ * of overlapping memory regions with differing types). If it is not known ++ * then @handle should be set to zero. In all cases @reg must be set. ++ * (x86-specific). ++ */ ++#define XENPF_del_memtype 32 ++struct xenpf_del_memtype { ++ /* IN variables. */ ++ uint32_t handle; ++ uint32_t reg; ++}; ++typedef struct xenpf_del_memtype xenpf_del_memtype_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_del_memtype_t); ++ ++/* Read current type of an MTRR (x86-specific). */ ++#define XENPF_read_memtype 33 ++struct xenpf_read_memtype { ++ /* IN variables. */ ++ uint32_t reg; ++ /* OUT variables. */ ++ unsigned long mfn; ++ uint64_t nr_mfns; ++ uint32_t type; ++}; ++typedef struct xenpf_read_memtype xenpf_read_memtype_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_read_memtype_t); ++ ++#define XENPF_microcode_update 35 ++struct xenpf_microcode_update { ++ /* IN variables. */ ++ GUEST_HANDLE(void) data; /* Pointer to microcode data */ ++ uint32_t length; /* Length of microcode data. 
*/ ++}; ++typedef struct xenpf_microcode_update xenpf_microcode_update_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_microcode_update_t); ++ ++#define XENPF_platform_quirk 39 ++#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */ ++#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */ ++#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */ ++struct xenpf_platform_quirk { ++ /* IN variables. */ ++ uint32_t quirk_id; ++}; ++typedef struct xenpf_platform_quirk xenpf_platform_quirk_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_platform_quirk_t); ++ ++#define XENPF_firmware_info 50 ++#define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */ ++#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */ ++#define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */ ++struct xenpf_firmware_info { ++ /* IN variables. */ ++ uint32_t type; ++ uint32_t index; ++ /* OUT variables. */ ++ union { ++ struct { ++ /* Int13, Fn48: Check Extensions Present. */ ++ uint8_t device; /* %dl: bios device number */ ++ uint8_t version; /* %ah: major version */ ++ uint16_t interface_support; /* %cx: support bitmap */ ++ /* Int13, Fn08: Legacy Get Device Parameters. */ ++ uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */ ++ uint8_t legacy_max_head; /* %dh: max head # */ ++ uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */ ++ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */ ++ /* NB. First uint16_t of buffer must be set to buffer size. */ ++ GUEST_HANDLE(void) edd_params; ++ } disk_info; /* XEN_FW_DISK_INFO */ ++ struct { ++ uint8_t device; /* bios device number */ ++ uint32_t mbr_signature; /* offset 0x1b8 in mbr */ ++ } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */ ++ struct { ++ /* Int10, AX=4F15: Get EDID info. */ ++ uint8_t capabilities; ++ uint8_t edid_transfer_time; ++ /* must refer to 128-byte buffer */ ++ GUEST_HANDLE(uchar) edid; ++ } vbeddc_info; /* XEN_FW_VBEDDC_INFO */ ++ } u; ++}; ++typedef struct xenpf_firmware_info xenpf_firmware_info_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_firmware_info_t); ++ ++#define XENPF_enter_acpi_sleep 51 ++struct xenpf_enter_acpi_sleep { ++ /* IN variables */ ++ uint16_t pm1a_cnt_val; /* PM1a control value. */ ++ uint16_t pm1b_cnt_val; /* PM1b control value. */ ++ uint32_t sleep_state; /* Which state to enter (Sn). */ ++ uint32_t flags; /* Must be zero. */ ++}; ++typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_enter_acpi_sleep_t); ++ ++#define XENPF_change_freq 52 ++struct xenpf_change_freq { ++ /* IN variables */ ++ uint32_t flags; /* Must be zero. */ ++ uint32_t cpu; /* Physical cpu. */ ++ uint64_t freq; /* New frequency (Hz). */ ++}; ++typedef struct xenpf_change_freq xenpf_change_freq_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_change_freq_t); ++ ++/* ++ * Get idle times (nanoseconds since boot) for physical CPUs specified in the ++ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is ++ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap ++ * bit set are written to. On return, @cpumap_bitmap is modified so that any ++ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry ++ * cleared. ++ */ ++#define XENPF_getidletime 53 ++struct xenpf_getidletime { ++ /* IN/OUT variables */ ++ /* IN: CPUs to interrogate; OUT: subset of IN which are present */ ++ GUEST_HANDLE(uchar) cpumap_bitmap; ++ /* IN variables */ ++ /* Size of cpumap bitmap. 
*/ ++ uint32_t cpumap_nr_cpus; ++ /* Must be indexable for every cpu in cpumap_bitmap. */ ++ GUEST_HANDLE(uint64_t) idletime; ++ /* OUT variables */ ++ /* System time when the idletime snapshots were taken. */ ++ uint64_t now; ++}; ++typedef struct xenpf_getidletime xenpf_getidletime_t; ++DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t); ++ ++struct xen_platform_op { ++ uint32_t cmd; ++ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ ++ union { ++ struct xenpf_settime settime; ++ struct xenpf_add_memtype add_memtype; ++ struct xenpf_del_memtype del_memtype; ++ struct xenpf_read_memtype read_memtype; ++ struct xenpf_microcode_update microcode; ++ struct xenpf_platform_quirk platform_quirk; ++ struct xenpf_firmware_info firmware_info; ++ struct xenpf_enter_acpi_sleep enter_acpi_sleep; ++ struct xenpf_change_freq change_freq; ++ struct xenpf_getidletime getidletime; ++ uint8_t pad[128]; ++ } u; ++}; ++typedef struct xen_platform_op xen_platform_op_t; ++DEFINE_GUEST_HANDLE_STRUCT(xen_platform_op_t); ++ ++#endif /* __XEN_PUBLIC_PLATFORM_H__ */ +diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h +index 2befa3e..327db61 100644 +--- a/include/xen/interface/xen.h ++++ b/include/xen/interface/xen.h +@@ -184,6 +184,8 @@ + #define MMUEXT_NEW_USER_BASEPTR 15 + + #ifndef __ASSEMBLY__ ++#include ++ + struct mmuext_op { + unsigned int cmd; + union { +@@ -449,6 +451,45 @@ struct start_info { + int8_t cmd_line[MAX_GUEST_CMDLINE]; + }; + ++struct dom0_vga_console_info { ++ uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */ ++#define XEN_VGATYPE_TEXT_MODE_3 0x03 ++#define XEN_VGATYPE_VESA_LFB 0x23 ++ ++ union { ++ struct { ++ /* Font height, in pixels. */ ++ uint16_t font_height; ++ /* Cursor location (column, row). */ ++ uint16_t cursor_x, cursor_y; ++ /* Number of rows and columns (dimensions in characters). */ ++ uint16_t rows, columns; ++ } text_mode_3; ++ ++ struct { ++ /* Width and height, in pixels. */ ++ uint16_t width, height; ++ /* Bytes per scan line. */ ++ uint16_t bytes_per_line; ++ /* Bits per pixel. */ ++ uint16_t bits_per_pixel; ++ /* LFB physical address, and size (in units of 64kB). */ ++ uint32_t lfb_base; ++ uint32_t lfb_size; ++ /* RGB mask offsets and sizes, as defined by VBE 1.2+ */ ++ uint8_t red_pos, red_size; ++ uint8_t green_pos, green_size; ++ uint8_t blue_pos, blue_size; ++ uint8_t rsvd_pos, rsvd_size; ++ ++ /* VESA capabilities (offset 0xa, VESA command 0x4f00). */ ++ uint32_t gbl_caps; ++ /* Mode attributes (offset 0x0, VESA command 0x4f01). */ ++ uint16_t mode_attrs; ++ } vesa_lfb; ++ } u; ++}; ++ + /* These flags are passed in the 'flags' field of start_info_t. */ + #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ + #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +@@ -461,6 +502,8 @@ typedef uint8_t xen_domain_handle_t[16]; + #define __mk_unsigned_long(x) x ## UL + #define mk_unsigned_long(x) __mk_unsigned_long(x) + ++DEFINE_GUEST_HANDLE(uint64_t); ++ + #else /* __ASSEMBLY__ */ + + /* In assembly code we cannot use C numeric constant suffixes. */ +diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h +new file mode 100644 +index 0000000..b42cdfd +--- /dev/null ++++ b/include/xen/privcmd.h +@@ -0,0 +1,80 @@ ++/****************************************************************************** ++ * privcmd.h ++ * ++ * Interface to /proc/xen/privcmd. 
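/*
 * Illustrative sketch, not part of the patch itself: a dom0-only caller
 * pushing the wallclock to Xen with XENPF_settime.  The HYPERVISOR_dom0_op()
 * wrapper name is an assumption; the featureset's actual platform-op
 * hypercall helper may be spelled differently.
 */
#include <linux/types.h>
#include <xen/interface/platform.h>
#include <asm/xen/hypercall.h>

static int example_set_xen_wallclock(u32 secs, u32 nsecs, u64 system_time)
{
        struct xen_platform_op op = {
                .cmd               = XENPF_settime,
                .interface_version = XENPF_INTERFACE_VERSION,
                .u.settime = {
                        .secs        = secs,
                        .nsecs       = nsecs,
                        .system_time = system_time,
                },
        };

        return HYPERVISOR_dom0_op(&op);        /* assumed wrapper name */
}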
++ * ++ * Copyright (c) 2003-2005, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __LINUX_PUBLIC_PRIVCMD_H__ ++#define __LINUX_PUBLIC_PRIVCMD_H__ ++ ++#include ++ ++typedef unsigned long xen_pfn_t; ++ ++#ifndef __user ++#define __user ++#endif ++ ++struct privcmd_hypercall { ++ __u64 op; ++ __u64 arg[5]; ++}; ++ ++struct privcmd_mmap_entry { ++ __u64 va; ++ __u64 mfn; ++ __u64 npages; ++}; ++ ++struct privcmd_mmap { ++ int num; ++ domid_t dom; /* target domain */ ++ struct privcmd_mmap_entry __user *entry; ++}; ++ ++struct privcmd_mmapbatch { ++ int num; /* number of pages to populate */ ++ domid_t dom; /* target domain */ ++ __u64 addr; /* virtual address */ ++ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */ ++}; ++ ++/* ++ * @cmd: IOCTL_PRIVCMD_HYPERCALL ++ * @arg: &privcmd_hypercall_t ++ * Return: Value returned from execution of the specified hypercall. 
++ */ ++#define IOCTL_PRIVCMD_HYPERCALL \ ++ _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall)) ++#define IOCTL_PRIVCMD_MMAP \ ++ _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap)) ++#define IOCTL_PRIVCMD_MMAPBATCH \ ++ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) ++ ++#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ +diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h +index 883a21b..c7b3ce9 100644 +--- a/include/xen/xen-ops.h ++++ b/include/xen/xen-ops.h +@@ -14,4 +14,15 @@ void xen_mm_unpin_all(void); + void xen_timer_resume(void); + void xen_arch_resume(void); + ++int xen_remap_domain_mfn_range(struct vm_area_struct *vma, ++ unsigned long addr, ++ unsigned long mfn, int nr, ++ pgprot_t prot, unsigned domid); ++ ++extern unsigned long *xen_contiguous_bitmap; ++int xen_create_contiguous_region(unsigned long vstart, unsigned int order, ++ unsigned int address_bits); ++ ++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); ++ + #endif /* INCLUDE_XEN_OPS_H */ +diff --git a/include/xen/xen.h b/include/xen/xen.h +new file mode 100644 +index 0000000..a164024 +--- /dev/null ++++ b/include/xen/xen.h +@@ -0,0 +1,32 @@ ++#ifndef _XEN_XEN_H ++#define _XEN_XEN_H ++ ++enum xen_domain_type { ++ XEN_NATIVE, /* running on bare hardware */ ++ XEN_PV_DOMAIN, /* running in a PV domain */ ++ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ ++}; ++ ++#ifdef CONFIG_XEN ++extern enum xen_domain_type xen_domain_type; ++#else ++#define xen_domain_type XEN_NATIVE ++#endif ++ ++#define xen_domain() (xen_domain_type != XEN_NATIVE) ++#define xen_pv_domain() (xen_domain() && \ ++ xen_domain_type == XEN_PV_DOMAIN) ++#define xen_hvm_domain() (xen_domain() && \ ++ xen_domain_type == XEN_HVM_DOMAIN) ++ ++#ifdef CONFIG_XEN_DOM0 ++#include ++#include ++ ++#define xen_initial_domain() (xen_pv_domain() && \ ++ xen_start_info->flags & SIF_INITDOMAIN) ++#else /* !CONFIG_XEN_DOM0 */ ++#define xen_initial_domain() (0) ++#endif /* CONFIG_XEN_DOM0 */ ++ ++#endif /* _XEN_XEN_H */ +diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h +index b9763ba..542ca7c 100644 +--- a/include/xen/xenbus.h ++++ b/include/xen/xenbus.h +@@ -93,7 +93,7 @@ struct xenbus_driver { + int (*remove)(struct xenbus_device *dev); + int (*suspend)(struct xenbus_device *dev, pm_message_t state); + int (*resume)(struct xenbus_device *dev); +- int (*uevent)(struct xenbus_device *, char **, int, char *, int); ++ int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *); + struct device_driver driver; + int (*read_otherend_details)(struct xenbus_device *dev); + int (*is_ready)(struct xenbus_device *dev); +diff --git a/lib/Makefile b/lib/Makefile +index 2e78277..7c31e3d 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -77,7 +77,8 @@ obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o + obj-$(CONFIG_SMP) += percpu_counter.o + obj-$(CONFIG_AUDIT_GENERIC) += audit.o + +-obj-$(CONFIG_SWIOTLB) += swiotlb.o ++obj-$(CONFIG_SWIOTLB) += swiotlb-core.o swiotlb.o ++obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o + obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o + obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o + +diff --git a/lib/swiotlb-core.c b/lib/swiotlb-core.c +new file mode 100644 +index 0000000..a17c89e +--- /dev/null ++++ b/lib/swiotlb-core.c +@@ -0,0 +1,572 @@ ++/* ++ * Dynamic DMA mapping support. ++ * ++ * This implementation is a fallback for platforms that do not support ++ * I/O TLBs (aka DMA address translation hardware). 
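/*
 * Illustrative user-space sketch, not part of the patch itself: issuing a
 * hypercall through the privileged-command node using the
 * IOCTL_PRIVCMD_HYPERCALL definition above.  The hypercall number and
 * arguments are caller-supplied; the device path follows the header comment,
 * and the installed location of <xen/privcmd.h> is an assumption.
 */
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xen/privcmd.h>

static long example_privcmd_hypercall(uint64_t op, uint64_t arg0,
                                      uint64_t arg1)
{
        struct privcmd_hypercall call = {
                .op  = op,
                .arg = { arg0, arg1, 0, 0, 0 },
        };
        int fd = open("/proc/xen/privcmd", O_RDWR);
        long ret;

        if (fd < 0)
                return -1;

        ret = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
        close(fd);
        return ret;
}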
++ * Copyright (C) 2000 Asit Mallick ++ * Copyright (C) 2000 Goutham Rao ++ * Copyright (C) 2000, 2003 Hewlett-Packard Co ++ * David Mosberger-Tang ++ * ++ * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. ++ * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid ++ * unnecessary i-cache flushing. ++ * 04/07/.. ak Better overflow handling. Assorted fixes. ++ * 05/09/10 linville Add support for syncing ranges, support syncing for ++ * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. ++ * 08/12/11 beckyb Add highmem support ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#define OFFSET(val, align) ((unsigned long) ((val) & ((align) - 1))) ++ ++#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) ++ ++/* ++ * Minimum IO TLB size to bother booting with. Systems with mainly ++ * 64bit capable cards will only lightly use the swiotlb. If we can't ++ * allocate a contiguous 1MB, we're probably in trouble anyway. ++ */ ++#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) ++ ++int swiotlb_force; ++ ++/* ++ * Used to do a quick range check in do_unmap_single and ++ * do_sync_single_*, to see if the memory was in fact allocated by this ++ * API. ++ */ ++char *io_tlb_start, *io_tlb_end; ++ ++/* ++ * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and ++ * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. ++ */ ++unsigned long io_tlb_nslabs; ++ ++/* ++ * When the IOMMU overflows we return a fallback buffer. This sets the size. ++ */ ++unsigned long io_tlb_overflow = 32*1024; ++ ++void *io_tlb_overflow_buffer; ++ ++/* ++ * This is a free list describing the number of free entries available from ++ * each index ++ */ ++static unsigned int *io_tlb_list; ++static unsigned int io_tlb_index; ++ ++/* ++ * We need to save away the original address corresponding to a mapped entry ++ * for the sync operations. 
++ */ ++static phys_addr_t *io_tlb_orig_addr; ++ ++/* ++ * Protect the above data structures in the map and unmap calls ++ */ ++static DEFINE_SPINLOCK(io_tlb_lock); ++ ++static int late_alloc; ++ ++static int __init ++setup_io_tlb_npages(char *str) ++{ ++ int get_value(const char *token, char *str, char **endp) ++ { ++ ssize_t len; ++ int val = 0; ++ ++ len = strlen(token); ++ if (!strncmp(str, token, len)) { ++ str += len; ++ if (*str == '=') ++ ++str; ++ if (*str != '\0') ++ val = simple_strtoul(str, endp, 0); ++ } ++ *endp = str; ++ return val; ++ } ++ ++ int val; ++ ++ while (*str) { ++ /* The old syntax */ ++ if (isdigit(*str)) { ++ io_tlb_nslabs = simple_strtoul(str, &str, 0); ++ /* avoid tail segment of size < IO_TLB_SEGSIZE */ ++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); ++ } ++ if (!strncmp(str, "force", 5)) ++ swiotlb_force = 1; ++ /* The new syntax: swiotlb=nslabs=16384,overflow=32768,force */ ++ val = get_value("nslabs", str, &str); ++ if (val) ++ io_tlb_nslabs = ALIGN(val, IO_TLB_SEGSIZE); ++ ++ val = get_value("overflow", str, &str); ++ if (val) ++ io_tlb_overflow = val; ++ str = strpbrk(str, ","); ++ if (!str) ++ break; ++ str++; /* skip ',' */ ++ } ++ return 1; ++} ++__setup("swiotlb=", setup_io_tlb_npages); ++ ++void swiotlb_print_info(void) ++{ ++ unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ phys_addr_t pstart, pend; ++ ++ pstart = virt_to_phys(io_tlb_start); ++ pend = virt_to_phys(io_tlb_end); ++ ++ printk(KERN_INFO "DMA: Placing %luMB software IO TLB between %p - %p\n", ++ bytes >> 20, io_tlb_start, io_tlb_end); ++ printk(KERN_INFO "DMA: software IO TLB at phys %#llx - %#llx\n", ++ (unsigned long long)pstart, ++ (unsigned long long)pend); ++} ++ ++/* ++ * Statically reserve bounce buffer space and initialize bounce buffer data ++ * structures for the software IO TLB used to implement the DMA API. ++ */ ++void __init ++swiotlb_init_early(size_t default_size, int verbose) ++{ ++ unsigned long i, bytes; ++ ++ if (!io_tlb_nslabs) { ++ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); ++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); ++ } ++ ++ bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ ++ /* ++ * Get IO TLB memory from the low pages ++ */ ++ io_tlb_start = alloc_bootmem_low_pages(bytes); ++ if (!io_tlb_start) ++ panic("DMA: Cannot allocate SWIOTLB buffer"); ++ io_tlb_end = io_tlb_start + bytes; ++ ++ /* ++ * Allocate and initialize the free list array. This array is used ++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE ++ * between io_tlb_start and io_tlb_end. ++ */ ++ io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); ++ for (i = 0; i < io_tlb_nslabs; i++) ++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); ++ io_tlb_index = 0; ++ io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); ++ ++ /* ++ * Get the overflow emergency buffer ++ */ ++ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); ++ if (!io_tlb_overflow_buffer) ++ panic("DMA: Cannot allocate SWIOTLB overflow buffer!\n"); ++ if (verbose) ++ swiotlb_print_info(); ++} ++ ++void __init ++swiotlb_init(int verbose) ++{ ++ swiotlb_init_early(64 * (1<<20), verbose); /* default to 64MB */ ++} ++ ++/* ++ * Systems with larger DMA zones (those that don't support ISA) can ++ * initialize the swiotlb later using the slab allocator if needed. ++ * This should be just like above, but with some error catching. 
++ */ ++int ++swiotlb_init_late(size_t default_size) ++{ ++ unsigned long i, bytes, req_nslabs = io_tlb_nslabs; ++ unsigned int order; ++ ++ if (!io_tlb_nslabs) { ++ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); ++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); ++ } ++ ++ /* ++ * Get IO TLB memory from the low pages ++ */ ++ order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); ++ io_tlb_nslabs = SLABS_PER_PAGE << order; ++ bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ ++ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { ++ io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, ++ order); ++ if (io_tlb_start) ++ break; ++ order--; ++ } ++ ++ if (!io_tlb_start) ++ goto cleanup1; ++ ++ if (order != get_order(bytes)) { ++ printk(KERN_WARNING "DMA: Warning: only able to allocate %ld MB" ++ " for software IO TLB\n", (PAGE_SIZE << order) >> 20); ++ io_tlb_nslabs = SLABS_PER_PAGE << order; ++ bytes = io_tlb_nslabs << IO_TLB_SHIFT; ++ } ++ io_tlb_end = io_tlb_start + bytes; ++ memset(io_tlb_start, 0, bytes); ++ ++ /* ++ * Allocate and initialize the free list array. This array is used ++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE ++ * between io_tlb_start and io_tlb_end. ++ */ ++ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, ++ get_order(io_tlb_nslabs * sizeof(int))); ++ if (!io_tlb_list) ++ goto cleanup2; ++ ++ for (i = 0; i < io_tlb_nslabs; i++) ++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); ++ io_tlb_index = 0; ++ ++ io_tlb_orig_addr = (phys_addr_t *) __get_free_pages(GFP_KERNEL, ++ get_order(io_tlb_nslabs * sizeof(phys_addr_t))); ++ if (!io_tlb_orig_addr) ++ goto cleanup3; ++ ++ memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t)); ++ ++ /* ++ * Get the overflow emergency buffer ++ */ ++ io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, ++ get_order(io_tlb_overflow)); ++ if (!io_tlb_overflow_buffer) ++ goto cleanup4; ++ ++ swiotlb_print_info(); ++ ++ late_alloc = 1; ++ ++ return 0; ++ ++cleanup4: ++ free_pages((unsigned long)io_tlb_orig_addr, ++ get_order(io_tlb_nslabs * sizeof(phys_addr_t))); ++ io_tlb_orig_addr = NULL; ++cleanup3: ++ free_pages((unsigned long)io_tlb_list, ++ get_order(io_tlb_nslabs * sizeof(int))); ++ io_tlb_list = NULL; ++cleanup2: ++ io_tlb_end = NULL; ++ free_pages((unsigned long)io_tlb_start, order); ++ io_tlb_start = NULL; ++cleanup1: ++ io_tlb_nslabs = req_nslabs; ++ return -ENOMEM; ++} ++ ++void __init swiotlb_free(void) ++{ ++ if (!io_tlb_overflow_buffer) ++ return; ++ ++ if (late_alloc) { ++ free_pages((unsigned long)io_tlb_overflow_buffer, ++ get_order(io_tlb_overflow)); ++ free_pages((unsigned long)io_tlb_orig_addr, ++ get_order(io_tlb_nslabs * sizeof(phys_addr_t))); ++ free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * ++ sizeof(int))); ++ free_pages((unsigned long)io_tlb_start, ++ get_order(io_tlb_nslabs << IO_TLB_SHIFT)); ++ } else { ++ free_bootmem_late(__pa(io_tlb_overflow_buffer), ++ io_tlb_overflow); ++ free_bootmem_late(__pa(io_tlb_orig_addr), ++ io_tlb_nslabs * sizeof(phys_addr_t)); ++ free_bootmem_late(__pa(io_tlb_list), ++ io_tlb_nslabs * sizeof(int)); ++ free_bootmem_late(__pa(io_tlb_start), ++ io_tlb_nslabs << IO_TLB_SHIFT); ++ } ++} ++ ++int is_swiotlb_buffer(phys_addr_t paddr) ++{ ++ return paddr >= virt_to_phys(io_tlb_start) && ++ paddr < virt_to_phys(io_tlb_end); ++} ++ ++/* ++ * Bounce: copy the swiotlb buffer back to the original dma location ++ */ ++void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, ++ enum 
dma_data_direction dir) ++{ ++ unsigned long pfn = PFN_DOWN(phys); ++ ++ if (PageHighMem(pfn_to_page(pfn))) { ++ /* The buffer does not have a mapping. Map it in and copy */ ++ unsigned int offset = phys & ~PAGE_MASK; ++ char *buffer; ++ unsigned int sz = 0; ++ unsigned long flags; ++ ++ while (size) { ++ sz = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ local_irq_save(flags); ++ buffer = kmap_atomic(pfn_to_page(pfn), ++ KM_BOUNCE_READ); ++ if (dir == DMA_TO_DEVICE) ++ memcpy(dma_addr, buffer + offset, sz); ++ else ++ memcpy(buffer + offset, dma_addr, sz); ++ kunmap_atomic(buffer, KM_BOUNCE_READ); ++ local_irq_restore(flags); ++ ++ size -= sz; ++ pfn++; ++ dma_addr += sz; ++ offset = 0; ++ } ++ } else { ++ if (dir == DMA_TO_DEVICE) ++ memcpy(dma_addr, phys_to_virt(phys), size); ++ else ++ memcpy(phys_to_virt(phys), dma_addr, size); ++ } ++} ++ ++/* ++ * Allocates bounce buffer and returns its kernel virtual address. ++ */ ++void * ++do_map_single(struct device *hwdev, phys_addr_t phys, ++ unsigned long start_dma_addr, size_t size, int dir) ++{ ++ unsigned long flags; ++ char *dma_addr; ++ unsigned int nslots, stride, index, wrap; ++ int i; ++ unsigned long mask; ++ unsigned long offset_slots; ++ unsigned long max_slots; ++ ++ mask = dma_get_seg_boundary(hwdev); ++ start_dma_addr = start_dma_addr & mask; ++ offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; ++ ++ /* ++ * Carefully handle integer overflow which can occur when mask == ~0UL. ++ */ ++ max_slots = mask + 1 ++ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT ++ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); ++ ++ /* ++ * For mappings greater than a page, we limit the stride (and ++ * hence alignment) to a page size. ++ */ ++ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; ++ if (size > PAGE_SIZE) ++ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); ++ else ++ stride = 1; ++ ++ BUG_ON(!nslots); ++ ++ /* ++ * Find suitable number of IO TLB entries size that will fit this ++ * request and allocate a buffer from that IO TLB pool. ++ */ ++ spin_lock_irqsave(&io_tlb_lock, flags); ++ index = ALIGN(io_tlb_index, stride); ++ if (index >= io_tlb_nslabs) ++ index = 0; ++ wrap = index; ++ ++ do { ++ while (iommu_is_span_boundary(index, nslots, offset_slots, ++ max_slots)) { ++ index += stride; ++ if (index >= io_tlb_nslabs) ++ index = 0; ++ if (index == wrap) ++ goto not_found; ++ } ++ ++ /* ++ * If we find a slot that indicates we have 'nslots' number of ++ * contiguous buffers, we allocate the buffers from that slot ++ * and mark the entries as '0' indicating unavailable. ++ */ ++ if (io_tlb_list[index] >= nslots) { ++ int count = 0; ++ ++ for (i = index; i < (int) (index + nslots); i++) ++ io_tlb_list[i] = 0; ++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) ++ != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) ++ io_tlb_list[i] = ++count; ++ dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); ++ ++ /* ++ * Update the indices to avoid searching in the next ++ * round. ++ */ ++ io_tlb_index = ((index + nslots) < io_tlb_nslabs ++ ? (index + nslots) : 0); ++ ++ goto found; ++ } ++ index += stride; ++ if (index >= io_tlb_nslabs) ++ index = 0; ++ } while (index != wrap); ++ ++not_found: ++ spin_unlock_irqrestore(&io_tlb_lock, flags); ++ return NULL; ++found: ++ spin_unlock_irqrestore(&io_tlb_lock, flags); ++ ++ /* ++ * Save away the mapping from the original address to the DMA address. ++ * This is needed when we sync the memory. Then we sync the buffer if ++ * needed. 
++ */ ++ for (i = 0; i < nslots; i++) ++ io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); ++ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) ++ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); ++ ++ return dma_addr; ++} ++ ++/* ++ * dma_addr is the kernel virtual address of the bounce buffer to unmap. ++ */ ++void ++do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) ++{ ++ unsigned long flags; ++ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; ++ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; ++ phys_addr_t phys = io_tlb_orig_addr[index]; ++ ++ /* ++ * First, sync the memory before unmapping the entry ++ */ ++ if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) ++ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); ++ ++ /* ++ * Return the buffer to the free list by setting the corresponding ++ * entries to indicate the number of contigous entries available. ++ * While returning the entries to the free list, we merge the entries ++ * with slots below and above the pool being returned. ++ */ ++ spin_lock_irqsave(&io_tlb_lock, flags); ++ { ++ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? ++ io_tlb_list[index + nslots] : 0); ++ /* ++ * Step 1: return the slots to the free list, merging the ++ * slots with superceeding slots ++ */ ++ for (i = index + nslots - 1; i >= index; i--) ++ io_tlb_list[i] = ++count; ++ /* ++ * Step 2: merge the returned slots with the preceding slots, ++ * if available (non zero) ++ */ ++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != ++ IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) ++ io_tlb_list[i] = ++count; ++ } ++ spin_unlock_irqrestore(&io_tlb_lock, flags); ++} ++ ++void ++do_sync_single(struct device *hwdev, char *dma_addr, size_t size, ++ int dir, int target) ++{ ++ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; ++ phys_addr_t phys = io_tlb_orig_addr[index]; ++ ++ phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); ++ ++ switch (target) { ++ case SYNC_FOR_CPU: ++ if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) ++ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); ++ else ++ BUG_ON(dir != DMA_TO_DEVICE); ++ break; ++ case SYNC_FOR_DEVICE: ++ if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) ++ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); ++ else ++ BUG_ON(dir != DMA_FROM_DEVICE); ++ break; ++ default: ++ BUG(); ++ } ++} ++void ++swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) ++{ ++ /* ++ * Ran out of IOMMU space for this operation. This is very bad. ++ * Unfortunately the drivers cannot handle this operation properly. ++ * unless they check for dma_mapping_error (most don't) ++ * When the mapping is small enough return a static buffer to limit ++ * the damage, or panic when the transfer is too big. ++ */ ++ dev_err(dev, "DMA: Out of SW-IOMMU space for %zu bytes.", size); ++ ++ if (size <= io_tlb_overflow || !do_panic) ++ return; ++ ++ if (dir == DMA_BIDIRECTIONAL) ++ panic("DMA: Random memory could be DMA accessed\n"); ++ if (dir == DMA_FROM_DEVICE) ++ panic("DMA: Random memory could be DMA written\n"); ++ if (dir == DMA_TO_DEVICE) ++ panic("DMA: Random memory could be DMA read\n"); ++} +diff --git a/lib/swiotlb-xen.c b/lib/swiotlb-xen.c +new file mode 100644 +index 0000000..bee577f +--- /dev/null ++++ b/lib/swiotlb-xen.c +@@ -0,0 +1,504 @@ ++/* An software based IOMMU that utilizes the swiotlb-core fuctionality. 
++ * It can function on Xen when there are PCI devices present.*/ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) ++{ ++ return phys_to_machine(XPADDR(paddr)).maddr;; ++} ++ ++static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) ++{ ++ return machine_to_phys(XMADDR(baddr)).paddr; ++} ++ ++static dma_addr_t xen_virt_to_bus(void *address) ++{ ++ return xen_phys_to_bus(virt_to_phys(address)); ++} ++ ++static int check_pages_physically_contiguous(unsigned long pfn, ++ unsigned int offset, ++ size_t length) ++{ ++ unsigned long next_mfn; ++ int i; ++ int nr_pages; ++ ++ next_mfn = pfn_to_mfn(pfn); ++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; ++ ++ for (i = 1; i < nr_pages; i++) { ++ if (pfn_to_mfn(++pfn) != ++next_mfn) ++ return 0; ++ } ++ return 1; ++} ++ ++static int range_straddles_page_boundary(phys_addr_t p, size_t size) ++{ ++ unsigned long pfn = PFN_DOWN(p); ++ unsigned int offset = p & ~PAGE_MASK; ++ ++ if (offset + size <= PAGE_SIZE) ++ return 0; ++ if (check_pages_physically_contiguous(pfn, offset, size)) ++ return 0; ++ return 1; ++} ++ ++ ++bool xen_dma_capable(struct device *dev, dma_addr_t dev_addr, ++ phys_addr_t phys, size_t size) ++{ ++ int rc = 0; ++ ++ rc = dma_capable(dev, dev_addr, size) && ++ !range_straddles_page_boundary(phys, size); ++ return rc; ++} ++ ++static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) ++{ ++ unsigned long mfn = PFN_DOWN(dma_addr); ++ unsigned long pfn = mfn_to_local_pfn(mfn); ++ ++ /* If the address is outside our domain, it CAN have the same virtual ++ * address as another address in our domain. Hence only check address ++ * within our domain. */ ++ if (pfn_valid(pfn)) ++ return is_swiotlb_buffer(PFN_PHYS(pfn)); ++ ++ return 0; ++} ++void * ++xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, ++ dma_addr_t *dma_handle, gfp_t flags) ++{ ++ void *ret; ++ int order = get_order(size); ++ u64 dma_mask = DMA_BIT_MASK(32); ++ unsigned long vstart; ++ ++ /* ++ * Ignore region specifiers - the kernel's ideas of ++ * pseudo-phys memory layout has nothing to do with the ++ * machine physical layout. We can't allocate highmem ++ * because we can't return a pointer to it. 
++ */ ++ flags &= ~(__GFP_DMA | __GFP_HIGHMEM); ++ ++ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) ++ return ret; ++ ++ vstart = __get_free_pages(flags, order); ++ ret = (void *)vstart; ++ ++ if (hwdev && hwdev->coherent_dma_mask) ++ dma_mask = dma_alloc_coherent_mask(hwdev, flags); ++ ++ if (ret) { ++ if (xen_create_contiguous_region(vstart, order, ++ fls64(dma_mask)) != 0) { ++ free_pages(vstart, order); ++ return NULL; ++ } ++ memset(ret, 0, size); ++ *dma_handle = virt_to_machine(ret).maddr; ++ } ++ return ret; ++} ++EXPORT_SYMBOL(xen_swiotlb_alloc_coherent); ++ ++void ++xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, ++ dma_addr_t dev_addr) ++{ ++ int order = get_order(size); ++ ++ if (dma_release_from_coherent(hwdev, order, vaddr)) ++ return; ++ ++ xen_destroy_contiguous_region((unsigned long)vaddr, order); ++ free_pages((unsigned long)vaddr, order); ++} ++EXPORT_SYMBOL(xen_swiotlb_free_coherent); ++ ++ ++static int max_dma_bits = 32; ++ ++static int ++xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) ++{ ++ int i, rc; ++ int dma_bits; ++ ++ printk(KERN_INFO "xen_swiotlb_fixup: buf=%p size=%zu\n", ++ buf, size); ++ ++ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; ++ ++ i = 0; ++ do { ++ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); ++ ++ do { ++ rc = xen_create_contiguous_region( ++ (unsigned long)buf + (i << IO_TLB_SHIFT), ++ get_order(slabs << IO_TLB_SHIFT), ++ dma_bits); ++ } while (rc && dma_bits++ < max_dma_bits); ++ if (rc) ++ return rc; ++ ++ i += slabs; ++ } while(i < nslabs); ++ return 0; ++} ++ ++void __init xen_swiotlb_init(int verbose) ++{ ++ int rc = 0; ++ ++ swiotlb_init_early(64 * (1<<20), verbose); ++ ++ if ((rc = xen_swiotlb_fixup(io_tlb_start, ++ io_tlb_nslabs << IO_TLB_SHIFT, ++ io_tlb_nslabs))) ++ goto error; ++ ++ if ((rc = xen_swiotlb_fixup(io_tlb_overflow_buffer, ++ io_tlb_overflow, ++ io_tlb_overflow >> IO_TLB_SHIFT))) ++ goto error; ++ ++ return; ++error: ++ panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\ ++ "We either don't have the permission or you do not have enough"\ ++ "free memory under 4GB!\n", rc); ++} ++ ++/* ++ * Map a single buffer of the indicated size for DMA in streaming mode. The ++ * physical address to use is returned. ++ * ++ * Once the device is given the dma address, the device owns this memory until ++ * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed. ++ */ ++dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ unsigned long start_dma_addr; ++ phys_addr_t phys = page_to_phys(page) + offset; ++ dma_addr_t dev_addr = xen_phys_to_bus(phys); ++ void *map; ++ ++ BUG_ON(dir == DMA_NONE); ++ /* ++ * If the address happens to be in the device's DMA window, ++ * we can safely return the device addr and not worry about bounce ++ * buffering it. ++ */ ++ if (dma_capable(dev, dev_addr, size) && ++ !range_straddles_page_boundary(phys, size) && !swiotlb_force) ++ return dev_addr; ++ ++ /* ++ * Oh well, have to allocate and map a bounce buffer. 
++ */
++	start_dma_addr = xen_virt_to_bus(io_tlb_start);
++	map = do_map_single(dev, phys, start_dma_addr, size, dir);
++	if (!map) {
++		swiotlb_full(dev, size, dir, 1);
++		map = io_tlb_overflow_buffer;
++	}
++
++	dev_addr = xen_virt_to_bus(map);
++
++	/*
++	 * Ensure that the address returned is DMA'ble
++	 */
++	if (!dma_capable(dev, dev_addr, size))
++		panic("DMA: xen_swiotlb_map_page: bounce buffer is not " \
++		      "DMA'ble\n");
++	return dev_addr;
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
++
++/*
++ * Unmap a single streaming mode DMA translation. The dma_addr and size must
++ * match what was provided for in a previous xen_swiotlb_map_page call. All
++ * other usages are undefined.
++ *
++ * After this call, reads by the cpu to the buffer are guaranteed to see
++ * whatever the device wrote there.
++ */
++static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
++			 size_t size, int dir)
++{
++	phys_addr_t paddr = xen_bus_to_phys(dev_addr);
++
++	BUG_ON(dir == DMA_NONE);
++
++	/* NOTE: We use dev_addr here, not paddr! */
++	if (is_xen_swiotlb_buffer(dev_addr)) {
++		do_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
++		return;
++	}
++
++	if (dir != DMA_FROM_DEVICE)
++		return;
++
++	/*
++	 * phys_to_virt doesn't work with highmem pages but we could
++	 * call dma_mark_clean() with a highmem page here. However, we
++	 * are fine since dma_mark_clean() is a no-op on POWERPC. We can
++	 * make dma_mark_clean() take a physical address if necessary.
++	 */
++	dma_mark_clean(phys_to_virt(paddr), size);
++}
++
++void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
++			    size_t size, enum dma_data_direction dir,
++			    struct dma_attrs *attrs)
++{
++	unmap_single(hwdev, dev_addr, size, dir);
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
++
++/*
++ * Make physical memory consistent for a single streaming mode DMA translation
++ * after a transfer.
++ *
++ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
++ * using the cpu, yet do not wish to tear down the DMA mapping, you must
++ * call this function before doing so. At the next point you give the DMA
++ * address back to the card, you must first perform a
++ * xen_swiotlb_sync_single_for_device(), and then the device again owns the buffer.
++ */
++static void
++xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
++			size_t size, int dir, int target)
++{
++	phys_addr_t paddr = xen_bus_to_phys(dev_addr);
++
++	BUG_ON(dir == DMA_NONE);
++
++	if (is_xen_swiotlb_buffer(dev_addr)) {
++		do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
++		return;
++	}
++
++	if (dir != DMA_FROM_DEVICE)
++		return;
++
++	dma_mark_clean(phys_to_virt(paddr), size);
++}
++
++void
++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++				size_t size, enum dma_data_direction dir)
++{
++	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_cpu);
++
++void
++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
++				   size_t size, enum dma_data_direction dir)
++{
++	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_device);
++
++/*
++ * Same as above, but for a sub-range of the mapping.
++ */ ++static void ++xen_swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ int dir, int target) ++{ ++ xen_swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target); ++} ++ ++void ++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, ++ SYNC_FOR_CPU); ++} ++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_cpu); ++ ++void ++xen_swiotlb_sync_single_range_for_device(struct device *hwdev, ++ dma_addr_t dev_addr, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir, ++ SYNC_FOR_DEVICE); ++} ++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_device); ++ ++/* ++ * Map a set of buffers described by scatterlist in streaming mode for DMA. ++ * This is the scatter-gather version of the above xen_swiotlb_map_page ++ * interface. Here the scatter gather list elements are each tagged with the ++ * appropriate dma address and length. They are obtained via ++ * sg_dma_{address,length}(SG). ++ * ++ * NOTE: An implementation may be able to use a smaller number of ++ * DMA address/length pairs than there are SG table elements. ++ * (for example via virtual mapping capabilities) ++ * The routine returns the number of addr/length pairs actually ++ * used, at most nents. ++ * ++ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the ++ * same here. ++ */ ++int ++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ unsigned long start_dma_addr; ++ struct scatterlist *sg; ++ int i; ++ BUG_ON(dir == DMA_NONE); ++ ++ start_dma_addr = xen_virt_to_bus(io_tlb_start); ++ for_each_sg(sgl, sg, nelems, i) { ++ phys_addr_t paddr = sg_phys(sg); ++ dma_addr_t dev_addr = xen_phys_to_bus(paddr); ++ ++ if (swiotlb_force || ++ !dma_capable(hwdev, dev_addr, sg->length) || ++ range_straddles_page_boundary(paddr, sg->length)) { ++ void *map = do_map_single(hwdev, sg_phys(sg), ++ start_dma_addr, ++ sg->length, dir); ++ if (!map) { ++ /* Don't panic here, we expect map_sg users ++ to do proper error handling. */ ++ swiotlb_full(hwdev, sg->length, dir, 0); ++ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, ++ attrs); ++ sgl[0].dma_length = 0; ++ return 0; ++ } ++ sg->dma_address = xen_virt_to_bus(map); ++ } else ++ sg->dma_address = dev_addr; ++ sg->dma_length = sg->length; ++ } ++ return nelems; ++} ++EXPORT_SYMBOL(xen_swiotlb_map_sg_attrs); ++ ++int ++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, ++ int dir) ++{ ++ return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); ++} ++EXPORT_SYMBOL(xen_swiotlb_map_sg); ++ ++/* ++ * Unmap a set of streaming mode DMA translations. Again, cpu read rules ++ * concerning calls here are the same as for xen_swiotlb_unmap_page() above. 
++ */ ++void ++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ BUG_ON(dir == DMA_NONE); ++ ++ for_each_sg(sgl, sg, nelems, i) ++ unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); ++ ++} ++EXPORT_SYMBOL(xen_swiotlb_unmap_sg_attrs); ++ ++void ++xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, ++ int dir) ++{ ++ return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); ++} ++EXPORT_SYMBOL(xen_swiotlb_unmap_sg); ++ ++/* ++ * Make physical memory consistent for a set of streaming mode DMA translations ++ * after a transfer. ++ * ++ * The same as xen_swiotlb_sync_single_* but for a scatter-gather list, ++ * same rules and usage. ++ */ ++static void ++xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, ++ int nelems, int dir, int target) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ for_each_sg(sgl, sg, nelems, i) ++ xen_swiotlb_sync_single(hwdev, sg->dma_address, ++ sg->dma_length, dir, target); ++} ++ ++void ++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); ++} ++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_cpu); ++ ++void ++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, ++ int nelems, enum dma_data_direction dir) ++{ ++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); ++} ++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_device); ++ ++int ++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) ++{ ++ return (dma_addr == xen_virt_to_bus(io_tlb_overflow_buffer)); ++} ++EXPORT_SYMBOL(xen_swiotlb_dma_mapping_error); ++ ++/* ++ * Return whether the given device DMA address mask can be supported ++ * properly. For example, if your device can only drive the low 24-bits ++ * during bus mastering, then you would pass 0x00ffffff as the mask to ++ * this function. ++ */ ++int ++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) ++{ ++ return xen_virt_to_bus(io_tlb_end - 1) <= mask; ++} ++EXPORT_SYMBOL(xen_swiotlb_dma_supported); +diff --git a/lib/swiotlb.c b/lib/swiotlb.c +index ac25cd2..f6bbcd1 100644 +--- a/lib/swiotlb.c ++++ b/lib/swiotlb.c +@@ -1,118 +1,11 @@ +-/* +- * Dynamic DMA mapping support. +- * +- * This implementation is a fallback for platforms that do not support +- * I/O TLBs (aka DMA address translation hardware). +- * Copyright (C) 2000 Asit Mallick +- * Copyright (C) 2000 Goutham Rao +- * Copyright (C) 2000, 2003 Hewlett-Packard Co +- * David Mosberger-Tang +- * +- * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. +- * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid +- * unnecessary i-cache flushing. +- * 04/07/.. ak Better overflow handling. Assorted fixes. +- * 05/09/10 linville Add support for syncing ranges, support syncing for +- * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. +- * 08/12/11 beckyb Add highmem support +- */ + +-#include + #include +-#include + #include +-#include +-#include + #include +-#include +-#include +-#include +-#include + +-#include +-#include + #include +- +-#include +-#include + #include + +-#define OFFSET(val,align) ((unsigned long) \ +- ( (val) & ( (align) - 1))) +- +-#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) +- +-/* +- * Minimum IO TLB size to bother booting with. 
Systems with mainly +- * 64bit capable cards will only lightly use the swiotlb. If we can't +- * allocate a contiguous 1MB, we're probably in trouble anyway. +- */ +-#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) +- +-/* +- * Enumeration for sync targets +- */ +-enum dma_sync_target { +- SYNC_FOR_CPU = 0, +- SYNC_FOR_DEVICE = 1, +-}; +- +-int swiotlb_force; +- +-/* +- * Used to do a quick range check in unmap_single and +- * sync_single_*, to see if the memory was in fact allocated by this +- * API. +- */ +-static char *io_tlb_start, *io_tlb_end; +- +-/* +- * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and +- * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. +- */ +-static unsigned long io_tlb_nslabs; +- +-/* +- * When the IOMMU overflows we return a fallback buffer. This sets the size. +- */ +-static unsigned long io_tlb_overflow = 32*1024; +- +-void *io_tlb_overflow_buffer; +- +-/* +- * This is a free list describing the number of free entries available from +- * each index +- */ +-static unsigned int *io_tlb_list; +-static unsigned int io_tlb_index; +- +-/* +- * We need to save away the original address corresponding to a mapped entry +- * for the sync operations. +- */ +-static phys_addr_t *io_tlb_orig_addr; +- +-/* +- * Protect the above data structures in the map and unmap calls +- */ +-static DEFINE_SPINLOCK(io_tlb_lock); +- +-static int __init +-setup_io_tlb_npages(char *str) +-{ +- if (isdigit(*str)) { +- io_tlb_nslabs = simple_strtoul(str, &str, 0); +- /* avoid tail segment of size < IO_TLB_SEGSIZE */ +- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); +- } +- if (*str == ',') +- ++str; +- if (!strcmp(str, "force")) +- swiotlb_force = 1; +- return 1; +-} +-__setup("swiotlb=", setup_io_tlb_npages); +-/* make io_tlb_overflow tunable too? */ + + /* Note that this doesn't work with highmem page */ + static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, +@@ -120,390 +13,6 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, + { + return phys_to_dma(hwdev, virt_to_phys(address)); + } +- +-static void swiotlb_print_info(unsigned long bytes) +-{ +- phys_addr_t pstart, pend; +- +- pstart = virt_to_phys(io_tlb_start); +- pend = virt_to_phys(io_tlb_end); +- +- printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n", +- bytes >> 20, io_tlb_start, io_tlb_end); +- printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n", +- (unsigned long long)pstart, +- (unsigned long long)pend); +-} +- +-/* +- * Statically reserve bounce buffer space and initialize bounce buffer data +- * structures for the software IO TLB used to implement the DMA API. +- */ +-void __init +-swiotlb_init_with_default_size(size_t default_size) +-{ +- unsigned long i, bytes; +- +- if (!io_tlb_nslabs) { +- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); +- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); +- } +- +- bytes = io_tlb_nslabs << IO_TLB_SHIFT; +- +- /* +- * Get IO TLB memory from the low pages +- */ +- io_tlb_start = alloc_bootmem_low_pages(bytes); +- if (!io_tlb_start) +- panic("Cannot allocate SWIOTLB buffer"); +- io_tlb_end = io_tlb_start + bytes; +- +- /* +- * Allocate and initialize the free list array. This array is used +- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE +- * between io_tlb_start and io_tlb_end. 
+- */ +- io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); +- for (i = 0; i < io_tlb_nslabs; i++) +- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); +- io_tlb_index = 0; +- io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); +- +- /* +- * Get the overflow emergency buffer +- */ +- io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); +- if (!io_tlb_overflow_buffer) +- panic("Cannot allocate SWIOTLB overflow buffer!\n"); +- +- swiotlb_print_info(bytes); +-} +- +-void __init +-swiotlb_init(void) +-{ +- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */ +-} +- +-/* +- * Systems with larger DMA zones (those that don't support ISA) can +- * initialize the swiotlb later using the slab allocator if needed. +- * This should be just like above, but with some error catching. +- */ +-int +-swiotlb_late_init_with_default_size(size_t default_size) +-{ +- unsigned long i, bytes, req_nslabs = io_tlb_nslabs; +- unsigned int order; +- +- if (!io_tlb_nslabs) { +- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); +- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); +- } +- +- /* +- * Get IO TLB memory from the low pages +- */ +- order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); +- io_tlb_nslabs = SLABS_PER_PAGE << order; +- bytes = io_tlb_nslabs << IO_TLB_SHIFT; +- +- while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { +- io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, +- order); +- if (io_tlb_start) +- break; +- order--; +- } +- +- if (!io_tlb_start) +- goto cleanup1; +- +- if (order != get_order(bytes)) { +- printk(KERN_WARNING "Warning: only able to allocate %ld MB " +- "for software IO TLB\n", (PAGE_SIZE << order) >> 20); +- io_tlb_nslabs = SLABS_PER_PAGE << order; +- bytes = io_tlb_nslabs << IO_TLB_SHIFT; +- } +- io_tlb_end = io_tlb_start + bytes; +- memset(io_tlb_start, 0, bytes); +- +- /* +- * Allocate and initialize the free list array. This array is used +- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE +- * between io_tlb_start and io_tlb_end. 
+- */ +- io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, +- get_order(io_tlb_nslabs * sizeof(int))); +- if (!io_tlb_list) +- goto cleanup2; +- +- for (i = 0; i < io_tlb_nslabs; i++) +- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); +- io_tlb_index = 0; +- +- io_tlb_orig_addr = (phys_addr_t *) +- __get_free_pages(GFP_KERNEL, +- get_order(io_tlb_nslabs * +- sizeof(phys_addr_t))); +- if (!io_tlb_orig_addr) +- goto cleanup3; +- +- memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t)); +- +- /* +- * Get the overflow emergency buffer +- */ +- io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, +- get_order(io_tlb_overflow)); +- if (!io_tlb_overflow_buffer) +- goto cleanup4; +- +- swiotlb_print_info(bytes); +- +- return 0; +- +-cleanup4: +- free_pages((unsigned long)io_tlb_orig_addr, +- get_order(io_tlb_nslabs * sizeof(phys_addr_t))); +- io_tlb_orig_addr = NULL; +-cleanup3: +- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * +- sizeof(int))); +- io_tlb_list = NULL; +-cleanup2: +- io_tlb_end = NULL; +- free_pages((unsigned long)io_tlb_start, order); +- io_tlb_start = NULL; +-cleanup1: +- io_tlb_nslabs = req_nslabs; +- return -ENOMEM; +-} +- +-static int is_swiotlb_buffer(phys_addr_t paddr) +-{ +- return paddr >= virt_to_phys(io_tlb_start) && +- paddr < virt_to_phys(io_tlb_end); +-} +- +-/* +- * Bounce: copy the swiotlb buffer back to the original dma location +- */ +-static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, +- enum dma_data_direction dir) +-{ +- unsigned long pfn = PFN_DOWN(phys); +- +- if (PageHighMem(pfn_to_page(pfn))) { +- /* The buffer does not have a mapping. Map it in and copy */ +- unsigned int offset = phys & ~PAGE_MASK; +- char *buffer; +- unsigned int sz = 0; +- unsigned long flags; +- +- while (size) { +- sz = min_t(size_t, PAGE_SIZE - offset, size); +- +- local_irq_save(flags); +- buffer = kmap_atomic(pfn_to_page(pfn), +- KM_BOUNCE_READ); +- if (dir == DMA_TO_DEVICE) +- memcpy(dma_addr, buffer + offset, sz); +- else +- memcpy(buffer + offset, dma_addr, sz); +- kunmap_atomic(buffer, KM_BOUNCE_READ); +- local_irq_restore(flags); +- +- size -= sz; +- pfn++; +- dma_addr += sz; +- offset = 0; +- } +- } else { +- if (dir == DMA_TO_DEVICE) +- memcpy(dma_addr, phys_to_virt(phys), size); +- else +- memcpy(phys_to_virt(phys), dma_addr, size); +- } +-} +- +-/* +- * Allocates bounce buffer and returns its kernel virtual address. +- */ +-static void * +-map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir) +-{ +- unsigned long flags; +- char *dma_addr; +- unsigned int nslots, stride, index, wrap; +- int i; +- unsigned long start_dma_addr; +- unsigned long mask; +- unsigned long offset_slots; +- unsigned long max_slots; +- +- mask = dma_get_seg_boundary(hwdev); +- start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask; +- +- offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +- +- /* +- * Carefully handle integer overflow which can occur when mask == ~0UL. +- */ +- max_slots = mask + 1 +- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT +- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); +- +- /* +- * For mappings greater than a page, we limit the stride (and +- * hence alignment) to a page size. 
+- */ +- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +- if (size > PAGE_SIZE) +- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); +- else +- stride = 1; +- +- BUG_ON(!nslots); +- +- /* +- * Find suitable number of IO TLB entries size that will fit this +- * request and allocate a buffer from that IO TLB pool. +- */ +- spin_lock_irqsave(&io_tlb_lock, flags); +- index = ALIGN(io_tlb_index, stride); +- if (index >= io_tlb_nslabs) +- index = 0; +- wrap = index; +- +- do { +- while (iommu_is_span_boundary(index, nslots, offset_slots, +- max_slots)) { +- index += stride; +- if (index >= io_tlb_nslabs) +- index = 0; +- if (index == wrap) +- goto not_found; +- } +- +- /* +- * If we find a slot that indicates we have 'nslots' number of +- * contiguous buffers, we allocate the buffers from that slot +- * and mark the entries as '0' indicating unavailable. +- */ +- if (io_tlb_list[index] >= nslots) { +- int count = 0; +- +- for (i = index; i < (int) (index + nslots); i++) +- io_tlb_list[i] = 0; +- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) +- io_tlb_list[i] = ++count; +- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); +- +- /* +- * Update the indices to avoid searching in the next +- * round. +- */ +- io_tlb_index = ((index + nslots) < io_tlb_nslabs +- ? (index + nslots) : 0); +- +- goto found; +- } +- index += stride; +- if (index >= io_tlb_nslabs) +- index = 0; +- } while (index != wrap); +- +-not_found: +- spin_unlock_irqrestore(&io_tlb_lock, flags); +- return NULL; +-found: +- spin_unlock_irqrestore(&io_tlb_lock, flags); +- +- /* +- * Save away the mapping from the original address to the DMA address. +- * This is needed when we sync the memory. Then we sync the buffer if +- * needed. +- */ +- for (i = 0; i < nslots; i++) +- io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); +- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) +- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); +- +- return dma_addr; +-} +- +-/* +- * dma_addr is the kernel virtual address of the bounce buffer to unmap. +- */ +-static void +-do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +-{ +- unsigned long flags; +- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; +- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; +- phys_addr_t phys = io_tlb_orig_addr[index]; +- +- /* +- * First, sync the memory before unmapping the entry +- */ +- if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) +- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); +- +- /* +- * Return the buffer to the free list by setting the corresponding +- * entries to indicate the number of contigous entries available. +- * While returning the entries to the free list, we merge the entries +- * with slots below and above the pool being returned. +- */ +- spin_lock_irqsave(&io_tlb_lock, flags); +- { +- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? 
+- io_tlb_list[index + nslots] : 0); +- /* +- * Step 1: return the slots to the free list, merging the +- * slots with superceeding slots +- */ +- for (i = index + nslots - 1; i >= index; i--) +- io_tlb_list[i] = ++count; +- /* +- * Step 2: merge the returned slots with the preceding slots, +- * if available (non zero) +- */ +- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) +- io_tlb_list[i] = ++count; +- } +- spin_unlock_irqrestore(&io_tlb_lock, flags); +-} +- +-static void +-sync_single(struct device *hwdev, char *dma_addr, size_t size, +- int dir, int target) +-{ +- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; +- phys_addr_t phys = io_tlb_orig_addr[index]; +- +- phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); +- +- switch (target) { +- case SYNC_FOR_CPU: +- if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) +- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); +- else +- BUG_ON(dir != DMA_TO_DEVICE); +- break; +- case SYNC_FOR_DEVICE: +- if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) +- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); +- else +- BUG_ON(dir != DMA_FROM_DEVICE); +- break; +- default: +- BUG(); +- } +-} +- + void * + swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +@@ -512,12 +21,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, + void *ret; + int order = get_order(size); + u64 dma_mask = DMA_BIT_MASK(32); ++ unsigned long start_dma_addr; + + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = hwdev->coherent_dma_mask; + + ret = (void *)__get_free_pages(flags, order); +- if (ret && swiotlb_virt_to_bus(hwdev, ret) + size > dma_mask) { ++ if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) { + /* + * The allocated memory isn't reachable by the device. + */ +@@ -527,10 +37,12 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, + if (!ret) { + /* + * We are either out of memory or the device can't DMA +- * to GFP_DMA memory; fall back on map_single(), which ++ * to GFP_DMA memory; fall back on do_map_single(), which + * will grab memory from the lowest available address range. 
+ */ +- ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); ++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); ++ ret = do_map_single(hwdev, 0, start_dma_addr, size, ++ DMA_FROM_DEVICE); + if (!ret) + return NULL; + } +@@ -539,12 +51,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dev_addr = swiotlb_virt_to_bus(hwdev, ret); + + /* Confirm address can be DMA'd by device */ +- if (dev_addr + size > dma_mask) { +- printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", ++ if (dev_addr + size - 1 > dma_mask) { ++ dev_err(hwdev, "DMA: hwdev DMA mask = 0x%016Lx, " \ ++ "dev_addr = 0x%016Lx\n", + (unsigned long long)dma_mask, + (unsigned long long)dev_addr); + +- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ ++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */ + do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + return NULL; + } +@@ -563,35 +76,11 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, + if (!is_swiotlb_buffer(paddr)) + free_pages((unsigned long)vaddr, get_order(size)); + else +- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ ++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */ + do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + } + EXPORT_SYMBOL(swiotlb_free_coherent); + +-static void +-swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +-{ +- /* +- * Ran out of IOMMU space for this operation. This is very bad. +- * Unfortunately the drivers cannot handle this operation properly. +- * unless they check for dma_mapping_error (most don't) +- * When the mapping is small enough return a static buffer to limit +- * the damage, or panic when the transfer is too big. +- */ +- printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at " +- "device %s\n", size, dev ? dev_name(dev) : "?"); +- +- if (size <= io_tlb_overflow || !do_panic) +- return; +- +- if (dir == DMA_BIDIRECTIONAL) +- panic("DMA: Random memory could be DMA accessed\n"); +- if (dir == DMA_FROM_DEVICE) +- panic("DMA: Random memory could be DMA written\n"); +- if (dir == DMA_TO_DEVICE) +- panic("DMA: Random memory could be DMA read\n"); +-} +- + /* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. +@@ -604,6 +93,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + enum dma_data_direction dir, + struct dma_attrs *attrs) + { ++ unsigned long start_dma_addr; + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = phys_to_dma(dev, phys); + void *map; +@@ -620,7 +110,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + /* + * Oh well, have to allocate and map a bounce buffer. 
+ */ +- map = map_single(dev, phys, size, dir); ++ start_dma_addr = swiotlb_virt_to_bus(dev, io_tlb_start); ++ map = do_map_single(dev, phys, start_dma_addr, size, dir); + if (!map) { + swiotlb_full(dev, size, dir, 1); + map = io_tlb_overflow_buffer; +@@ -632,7 +123,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + * Ensure that the address returned is DMA'ble + */ + if (!dma_capable(dev, dev_addr, size)) +- panic("map_single: bounce buffer is not DMA'ble"); ++ panic("DMA: swiotlb_map_single: bounce buffer is not DMA'ble"); + + return dev_addr; + } +@@ -697,7 +188,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { +- sync_single(hwdev, phys_to_virt(paddr), size, dir, target); ++ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target); + return; + } + +@@ -774,19 +265,22 @@ int + swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, struct dma_attrs *attrs) + { ++ unsigned long start_dma_addr; + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + ++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); + + if (swiotlb_force || + !dma_capable(hwdev, dev_addr, sg->length)) { +- void *map = map_single(hwdev, sg_phys(sg), +- sg->length, dir); ++ void *map = do_map_single(hwdev, sg_phys(sg), ++ start_dma_addr, ++ sg->length, dir); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ +@@ -819,7 +313,8 @@ EXPORT_SYMBOL(swiotlb_map_sg); + */ + void + swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, +- int nelems, enum dma_data_direction dir, struct dma_attrs *attrs) ++ int nelems, enum dma_data_direction dir, ++ struct dma_attrs *attrs) + { + struct scatterlist *sg; + int i; +diff --git a/mm/bootmem.c b/mm/bootmem.c +index 555d5d2..d1dc23c 100644 +--- a/mm/bootmem.c ++++ b/mm/bootmem.c +@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) + return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); + } + ++/* ++ * free_bootmem_late - free bootmem pages directly to page allocator ++ * @addr: starting address of the range ++ * @size: size of the range in bytes ++ * ++ * This is only useful when the bootmem allocator has already been torn ++ * down, but we are still initializing the system. Pages are given directly ++ * to the page allocator, no bootmem metadata is updated because it is gone. ++ */ ++void __init free_bootmem_late(unsigned long addr, unsigned long size) ++{ ++ unsigned long cursor, end; ++ ++ kmemleak_free_part(__va(addr), size); ++ ++ cursor = PFN_UP(addr); ++ end = PFN_DOWN(addr + size); ++ ++ for (; cursor < end; cursor++) { ++ __free_pages_bootmem(pfn_to_page(cursor), 0); ++ totalram_pages++; ++ } ++} ++ + static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) + { + int aligned; +diff --git a/mm/memory.c b/mm/memory.c +index 4e59455..b2de7c9 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -553,6 +553,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + if (is_zero_pfn(pfn)) + return NULL; + check_pfn: ++ ++#if defined(CONFIG_XEN) && defined(CONFIG_X86) ++ /* XEN: Covers user-space grant mappings (even of local pages). 
*/ ++ if (unlikely(vma->vm_flags & VM_FOREIGN)) ++ return NULL; ++#endif ++ + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; +@@ -839,8 +846,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + page->index > details->last_index)) + continue; + } +- ptent = ptep_get_and_clear_full(mm, addr, pte, +- tlb->fullmm); ++ if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte)) ++ ptent = vma->vm_ops->zap_pte(vma, addr, pte, ++ tlb->fullmm); ++ else ++ ptent = ptep_get_and_clear_full(mm, addr, pte, ++ tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); + if (unlikely(!page)) + continue; +@@ -1100,6 +1111,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, + tlb_finish_mmu(tlb, address, end); + return end; + } ++EXPORT_SYMBOL_GPL(zap_page_range); + + /** + * zap_vma_ptes - remove ptes mapping the vma +@@ -1296,6 +1308,29 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + continue; + } + ++#ifdef CONFIG_XEN ++ if (vma && (vma->vm_flags & VM_FOREIGN)) { ++ struct vm_foreign_map *foreign_map = ++ vma->vm_private_data; ++ struct page **map = foreign_map->map; ++ int offset = (start - vma->vm_start) >> PAGE_SHIFT; ++ if (map[offset] != NULL) { ++ if (pages) { ++ struct page *page = map[offset]; ++ ++ pages[i] = page; ++ get_page(page); ++ } ++ if (vmas) ++ vmas[i] = vma; ++ i++; ++ start += PAGE_SIZE; ++ nr_pages--; ++ continue; ++ } ++ } ++#endif ++ + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) +@@ -1771,6 +1806,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + ++#if CONFIG_XEN ++ vma->vm_mm->context.has_foreign_mappings = 1; ++#endif ++ + err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); + if (err) { + /* +diff --git a/mm/mmap.c b/mm/mmap.c +index ae19746..9c39fc2 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1785,6 +1785,12 @@ static void unmap_region(struct mm_struct *mm, + tlb_finish_mmu(tlb, start, end); + } + ++static inline void unmap_vma(struct vm_area_struct *vma) ++{ ++ if (unlikely(vma->vm_ops && vma->vm_ops->unmap)) ++ vma->vm_ops->unmap(vma); ++} ++ + /* + * Create a list of vma's touched by the unmap, removing them from the mm's + * vma list as we go.. +@@ -1800,6 +1806,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, + insertion_point = (prev ? 
&prev->vm_next : &mm->mmap); + do { + rb_erase(&vma->vm_rb, &mm->mm_rb); ++ unmap_vma(vma); + mm->map_count--; + tail_vma = vma; + vma = vma->vm_next; +@@ -2076,7 +2083,7 @@ EXPORT_SYMBOL(do_brk); + void exit_mmap(struct mm_struct *mm) + { + struct mmu_gather *tlb; +- struct vm_area_struct *vma; ++ struct vm_area_struct *vma, *vma_tmp; + unsigned long nr_accounted = 0; + unsigned long end; + +@@ -2098,6 +2105,9 @@ void exit_mmap(struct mm_struct *mm) + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + ++ for (vma_tmp = mm->mmap; vma_tmp; vma_tmp = vma_tmp->vm_next) ++ unmap_vma(vma_tmp); ++ + lru_add_drain(); + flush_cache_mm(mm); + tlb = tlb_gather_mmu(mm, 1); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 36992b6..bc1b6e9 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -593,6 +593,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) + if (bad) + return; + ++#ifdef CONFIG_XEN ++ if (PageForeign(page)) { ++ PageForeignDestructor(page, order); ++ return; ++ } ++#endif ++ + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page),PAGE_SIZE<mapping = NULL; + if (free_pages_check(page)) diff --git a/debian/patches/series/10-extra b/debian/patches/series/10-extra new file mode 100644 index 000000000..4db8c1dc0 --- /dev/null +++ b/debian/patches/series/10-extra @@ -0,0 +1,2 @@ ++ features/all/xen/pvops.patch featureset=xen ++ features/all/xen/pvops-updates.patch featureset=xen
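
For readers following the swiotlb pieces of the patch, here is a minimal user-space model of the free-list bookkeeping that do_map_single()/do_unmap_single() implement above: each free slot stores how many contiguous free slots follow it within an IO_TLB_SEGSIZE segment, allocation zeroes the claimed slots and shrinks the run in front of them, and freeing merges with the succeeding and preceding runs. NSLABS, SEGSIZE, list[], alloc_slots() and free_slots() are illustrative stand-ins, not the kernel's symbols, and the stride/boundary handling and locking are deliberately left out.

#include <stdio.h>

#define NSLABS   16   /* total bounce-buffer slots (io_tlb_nslabs) */
#define SEGSIZE  8    /* allocations never cross a segment (IO_TLB_SEGSIZE) */

/* list[i] = number of contiguous free slots starting at i, within a segment */
static unsigned int list[NSLABS];

static void init(void)
{
	for (int i = 0; i < NSLABS; i++)
		list[i] = SEGSIZE - (i % SEGSIZE);
}

/* Find nslots contiguous free slots; mark them used (0). Returns index or -1. */
static int alloc_slots(int nslots)
{
	for (int index = 0; index + nslots <= NSLABS; index++) {
		if (list[index] >= (unsigned)nslots) {
			int count = 0;

			for (int i = index; i < index + nslots; i++)
				list[i] = 0;
			/* shrink the free run that precedes the allocation */
			for (int i = index - 1;
			     i >= 0 && (i % SEGSIZE) != SEGSIZE - 1 && list[i];
			     i--)
				list[i] = ++count;
			return index;
		}
	}
	return -1;
}

/* Free nslots starting at index, merging with succeeding and preceding runs. */
static void free_slots(int index, int nslots)
{
	int count = (index + nslots < NSLABS &&
		     (index + nslots) % SEGSIZE != 0) ?
			list[index + nslots] : 0;

	for (int i = index + nslots - 1; i >= index; i--)
		list[i] = ++count;
	for (int i = index - 1;
	     i >= 0 && (i % SEGSIZE) != SEGSIZE - 1 && list[i]; i--)
		list[i] = ++count;
}

int main(void)
{
	init();
	int a = alloc_slots(3);
	int b = alloc_slots(2);
	free_slots(a, 3);
	free_slots(b, 2);
	for (int i = 0; i < NSLABS; i++)
		printf("%u ", list[i]);	/* back to the initial pattern */
	printf("\n");
	return 0;
}

Keeping the per-slot counts up to date is what lets the allocator decide "does a run of nslots start here?" with a single array read, at the cost of the two merge loops on free.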
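
A second sketch, modelling the check xen_swiotlb_map_page() performs before deciding whether to bounce: a buffer that crosses a page boundary is only safe to hand to the device if the underlying machine frames are contiguous. The p2m[] table and pfn_to_mfn() below are toy stand-ins for the real physical-to-machine translation; PAGE_SHIFT and friends are local constants, not the kernel's.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* toy physical-to-machine table: pfn -> mfn */
static const unsigned long p2m[] = { 100, 101, 105, 106, 107 };

static unsigned long pfn_to_mfn(unsigned long pfn)
{
	return p2m[pfn];
}

/* 1 if all pages spanned by (pfn, offset, len) are machine-contiguous */
static int pages_physically_contiguous(unsigned long pfn,
				       unsigned long offset, size_t len)
{
	unsigned long next_mfn = pfn_to_mfn(pfn);
	int nr_pages = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;

	for (int i = 1; i < nr_pages; i++)
		if (pfn_to_mfn(++pfn) != ++next_mfn)
			return 0;
	return 1;
}

/* 1 if the buffer crosses into a machine frame that is not contiguous */
static int range_straddles_page_boundary(unsigned long paddr, size_t size)
{
	unsigned long pfn = paddr >> PAGE_SHIFT;
	unsigned long offset = paddr & ~PAGE_MASK;

	if (offset + size <= PAGE_SIZE)
		return 0;	/* fits in one page: always safe */
	return !pages_physically_contiguous(pfn, offset, size);
}

int main(void)
{
	/* pfns 0-1 map to mfns 100-101 (contiguous), pfns 1-2 do not */
	printf("%d\n", range_straddles_page_boundary(0 * PAGE_SIZE + 0x800,
						     PAGE_SIZE));	/* 0 */
	printf("%d\n", range_straddles_page_boundary(1 * PAGE_SIZE + 0x800,
						     PAGE_SIZE));	/* 1 */
	return 0;
}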
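
Finally, a sketch of the retry pattern used by xen_swiotlb_fixup() above: try to make each IO_TLB_SEGSIZE-sized chunk of the bounce pool machine-contiguous below 2^dma_bits, and widen the address restriction only when the hypervisor cannot satisfy it. try_make_contiguous() is a stand-in for xen_create_contiguous_region(), which this sketch does not implement; the threshold of 30 bits is invented purely for the demonstration.

#include <stdio.h>

#define MAX_DMA_BITS 32

/* pretend the hypervisor can only satisfy requests of 30 bits or wider */
static int try_make_contiguous(unsigned long chunk, int dma_bits)
{
	(void)chunk;
	return dma_bits >= 30 ? 0 : -1;	/* 0 on success, like the real call */
}

static int fixup(unsigned long nchunks, int dma_bits)
{
	for (unsigned long i = 0; i < nchunks; i++) {
		int rc;

		do {
			rc = try_make_contiguous(i, dma_bits);
		} while (rc && ++dma_bits <= MAX_DMA_BITS);
		if (rc)
			return rc;	/* give up: the caller panics */
	}
	return 0;
}

int main(void)
{
	printf("fixup: %d\n", fixup(4, 28));	/* succeeds after widening to 30 */
	return 0;
}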