diff --git a/debian/bin/abicheck.py b/debian/bin/abicheck.py index 3dea267cb..c554c9d53 100755 --- a/debian/bin/abicheck.py +++ b/debian/bin/abicheck.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import fnmatch import sys sys.path.append('debian/lib/python') @@ -91,11 +92,21 @@ class checker(object): return ret def _ignore(self, add, change, remove): - config = self.config.merge('abi', self.arch, self.featureset, self.flavour) - ignores = config.get('ignore-changes', None) - if ignores is None: - return set() - return set(ignores.split()) + all = set(add.keys() + change.keys() + remove.keys()) + # TODO: let config merge this lists + configs = [] + configs.append(self.config.get(('abi', self.arch, self.featureset, self.flavour), {})) + configs.append(self.config.get(('abi', self.arch, None, self.flavour), {})) + configs.append(self.config.get(('abi', self.arch, self.featureset), {})) + configs.append(self.config.get(('abi', self.arch), {})) + configs.append(self.config.get(('abi',), {})) + ignores = set() + for config in configs: + ignores.update(config.get('ignore-changes', [])) + filtered = set() + for m in ignores: + filtered.update(fnmatch.filter(all, m)) + return filtered if __name__ == '__main__': sys.exit(checker(*sys.argv[1:])(sys.stdout)) diff --git a/debian/bin/gencontrol.py b/debian/bin/gencontrol.py index 2ebb9c3d2..4f4858ba9 100755 --- a/debian/bin/gencontrol.py +++ b/debian/bin/gencontrol.py @@ -144,10 +144,10 @@ class Gencontrol(Base): image_fields['Conflicts'].append(PackageRelationGroup([a])) image_fields['Depends'].append(l_depends) - if 'desc-parts' in config_entry_image: + desc_parts = self.config.get_merge('image', arch, featureset, flavour, 'desc-parts') + if desc_parts: desc = image_fields['Description'] - parts = config_entry_image['desc-parts'] - for part in parts: + for part in desc_parts[::-1]: desc.append(config_entry_image['desc-long-part-' + part]) desc.append_short(config_entry_image.get('desc-short-part-' + part, '')) @@ -164,9 +164,12 @@ 
class Gencontrol(Base): if config_entry_xen.get('dom0-support', True): p = self.process_packages(self.templates['control.xen-linux-system'], vars) l = PackageRelationGroup() - for version in config_entry_xen['versions']: - l.append("xen-hypervisor-%s-%s" % (version, config_entry_xen['flavour'])) - makeflags['XEN_VERSIONS'] = ' '.join(['%s-%s' % (i, config_entry_xen['flavour']) for i in config_entry_xen['versions']]) + xen_versions = [] + for xen_flavour in config_entry_xen['flavours']: + for version in config_entry_xen['versions']: + l.append("xen-hypervisor-%s-%s" % (version, xen_flavour)) + xen_versions.append('%s-%s' % (version, xen_flavour)) + makeflags['XEN_VERSIONS'] = ' '.join(xen_versions) p[0]['Depends'].append(l) packages_dummy.extend(p) else: diff --git a/debian/changelog b/debian/changelog index 8a3e92903..e5fcb7de4 100644 --- a/debian/changelog +++ b/debian/changelog @@ -37,7 +37,601 @@ linux-2.6 (2.6.28-1~experimental.1) UNRELEASED; urgency=low -- dann frazier Fri, 29 Aug 2008 17:02:18 -0600 -linux-2.6 (2.6.26-1~experimental.1) UNRELEASED; urgency=low +linux-2.6 (2.6.26-12) unstable; urgency=high + + [ Ian Campbell ] + * xen: fix ACPI processor throttling for when processor id is -1. (closes: #502849) + + [ dann frazier ] + * Make sendmsg() block during UNIX garbage collection (CVE-2008-5300) + * Fix race conditions between inotify removal and umount (CVE-2008-5182) + * Fix DoS when calling svc_listen twice on the same socket while reading + /proc/net/atm/*vc (CVE-2008-5079) + + [ Bastian Blank ] + * [openvz, vserver] Fix descriptions. + * [sparc] Enable Sun Logical Domains support. (closes: #501684) + * Fix coexistence of pata_marvell and ahci. (closes: #507432) + * [sparc] Support Intergraph graphics chips. (closes: #508108) + + -- Bastian Blank Mon, 15 Dec 2008 12:57:18 +0100 + +linux-2.6 (2.6.26-11) unstable; urgency=low + + [ Bastian Blank ] + * [sparc] Reintroduce dummy PCI host controller to workaround broken X.org. 
+ * [sparc] Fix size checks in PCI maps. + * Add stable release 2.6.26.8: + - netfilter: restore lost ifdef guarding defrag exception + - netfilter: snmp nat leaks memory in case of failure + - netfilter: xt_iprange: fix range inversion match + - ACPI: dock: avoid check _STA method + - ACPI: video: fix brightness allocation + - sparc64: Fix race in arch/sparc64/kernel/trampoline.S + - math-emu: Fix signalling of underflow and inexact while packing result. + - tcpv6: fix option space offsets with md5 + - net: Fix netdev_run_todo dead-lock + - scx200_i2c: Add missing class parameter + - DVB: s5h1411: Power down s5h1411 when not in use + - DVB: s5h1411: Perform s5h1411 soft reset after tuning + - DVB: s5h1411: bugfix: Setting serial or parallel mode could destroy bits + - V4L: pvrusb2: Keep MPEG PTSs from drifting away + - ACPI: Always report a sync event after a lid state change + - ALSA: use correct lock in snd_ctl_dev_disconnect() + - file caps: always start with clear bprm->caps_* + - libertas: fix buffer overrun + - net: Fix recursive descent in __scm_destroy(). + - SCSI: qla2xxx: Skip FDMI registration on ISP21xx/22xx parts. + (Closes: #502552) + - edac cell: fix incorrect edac_mode + - ext[234]: Avoid printk floods in the face of directory corruption + (CVE-2008-3528) + - gpiolib: fix oops in gpio_get_value_cansleep() + * Override ABI changes. + * [xen] Update description. (closes: #505961) + * Revert parts of 2.6.26.6 to fix resume breakage. (closes: #504167) + - clockevents: prevent multiple init/shutdown + - clockevents: broadcast fixup possible waiters + + [ dann frazier ] + * Fix buffer overflow in hfsplus (CVE-2008-4933) + * Fix BUG() in hfsplus (CVE-2008-4934) + * Fix stack corruption in hfs (CVE-2008-5025) + * Fix oops in tvaudio when controlling bass/treble (CVE-2008-5033) + + [ Martin Michlmayr ] + * [arm/iop32x, arm/ixp4xx, arm/orion5x] Enable support for more partition + tables, including MAC_PARTITION (requested by Benoît Knecht). 
+ * leds-pca9532: Fix memory leak and properly handle errors (Sven Wegener) + * leds-pca9532: Move i2c work to a workqueue (Riku Voipio). (closes: + #506116) + + -- Bastian Blank Wed, 26 Nov 2008 11:43:48 +0100 + +linux-2.6 (2.6.26-10) unstable; urgency=low + + [ dann frazier ] + * sctp: Fix possible kernel panic in sctp_sf_abort_violation (CVE-2008-4618) + + [ Martin Michlmayr ] + * DNS-323: add support for revision B1 machines (Matthew Palmer). + * ext3/ext4: Add support for non-native signed/unsigned htree hash + algorithms (Theodore Ts'o). (closes: #493957) + * [arm/ixp4xx] Enable USB_ACM (closes: #504723). + + [ Bastian Blank ] + * agp: Fix stolen memory counting on Intel G4X. (closes: #502606) + * Add stable release 2.6.26.7: + - security: avoid calling a NULL function pointer in drivers/video/tvaudio.c + - DVB: au0828: add support for another USB id for Hauppauge HVR950Q + - drm/i915: fix ioremap of a user address for non-root (CVE-2008-3831) + - ACPI: Ignore _BQC object when registering backlight device + - hwmon: (it87) Prevent power-off on Shuttle SN68PT + - Check mapped ranges on sysfs resource files + - x86: avoid dereferencing beyond stack + THREAD_SIZE + - PCI: disable ASPM on pre-1.1 PCIe devices + - PCI: disable ASPM per ACPI FADT setting + - V4L/DVB (9053): fix buffer overflow in uvc-video + - V4L/DVB (8617): uvcvideo: don't use stack-based buffers for USB transfers. + - V4L/DVB (8498): uvcvideo: Return sensible min and max values when querying + a boolean control. 
+ - V4L: zr36067: Fix RGBR pixel format + - V4L: bttv: Prevent NULL pointer dereference in radio_open + - libata: fix EH action overwriting in ata_eh_reset() + - libata: always do follow-up SRST if hardreset returned -EAGAIN + - fbcon_set_all_vcs: fix kernel crash when switching the rotated consoles + - modules: fix module "notes" kobject leak + - b43legacy: Fix failure in rate-adjustment mechanism + - CIFS: make sure we have the right resume info before calling CIFSFindNext + - sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq + - tty: Termios locking - sort out real_tty confusions and lock reads + - x86, early_ioremap: fix fencepost error + - x86: improve UP kernel when CPU-hotplug and SMP is enabled + - x86: Reserve FIRST_DEVICE_VECTOR in used_vectors bitmap. + * [xen] Remove pte file workaround. + + [ Ian Campbell ] + * [xen] Disable usage of PAT. (closes: #503821) + + -- Bastian Blank Sat, 08 Nov 2008 10:50:58 +0100 + +linux-2.6 (2.6.26-9) unstable; urgency=low + + [ Bastian Blank ] + * Add stable release 2.6.26.6: + - mm owner: fix race between swapoff and exit + - rtc: fix kernel panic on second use of SIGIO nofitication + - fbcon: fix monochrome color value calculation + - ALSA: snd-powermac: HP detection for 1st iMac G3 SL + - ALSA: snd-powermac: mixers for PowerMac G4 AGP + - sparc64: Fix missing devices due to PCI bridge test in + of_create_pci_dev(). + - sparc64: Fix disappearing PCI devices on e3500. + - sparc64: Fix OOPS in psycho_pcierr_intr_other(). + - sparc64: Fix interrupt register calculations on Psycho and Sabre. + - sparc64: Fix PCI error interrupt registry on PSYCHO. + - udp: Fix rcv socket locking + - sctp: Fix oops when INIT-ACK indicates that peer doesn't support AUTH + (CVE-2008-4576) + - sctp: do not enable peer features if we can't do them. + - ipsec: Fix pskb_expand_head corruption in xfrm_state_check_space + - netlink: fix overrun in attribute iteration + - niu: panic on reset + - ipv6: Fix OOPS in ip6_dst_lookup_tail(). 
+ - XFRM,IPv6: initialize ip6_dst_blackhole_ops.kmem_cachep + - af_key: Free dumping state on socket close + - pcmcia: Fix broken abuse of dev->driver_data + - clockevents: remove WARN_ON which was used to gather information + - ntp: fix calculation of the next jiffie to trigger RTC sync + - x86: HPET: read back compare register before reading counter + - x86: HPET fix moronic 32/64bit thinko + - clockevents: broadcast fixup possible waiters + - HPET: make minimum reprogramming delta useful + - clockevents: prevent endless loop lockup + - clockevents: prevent multiple init/shutdown + - clockevents: enforce reprogram in oneshot setup + - clockevents: prevent endless loop in periodic broadcast handler + - clockevents: prevent clockevent event_handler ending up handler_noop + - x86: fix memmap=exactmap boot argument + - x86: add io delay quirk for Presario F700 + - ACPI: Avoid bogus EC timeout when EC is in Polling mode + - x86: fix SMP alternatives: use mutex instead of spinlock, text_poke is + sleepable + - rtc: fix deadlock + - mm: dirty page tracking race fix + - x86-64: fix overlap of modules and fixmap areas + - x86: PAT proper tracking of set_memory_uc and friends + - x86: fix oprofile + hibernation badness + - x86: fdiv bug detection fix + - rt2x00: Use ieee80211_hw->workqueue again + - x86: Fix 27-rc crash on vsmp due to paravirt during module load + - sg: disable interrupts inside sg_copy_buffer + - ocfs2: Increment the reference count of an already-active stack. 
+ - APIC routing fix + - sched: fix process time monotonicity + - block: submit_bh() inadvertently discards barrier flag on a sync write + - x64, fpu: fix possible FPU leakage in error conditions + - x86-64: Clean up save/restore_i387() usage + - KVM: SVM: fix guest global tlb flushes with NPT + - KVM: SVM: fix random segfaults with NPT enabled + - ALSA: remove unneeded power_mutex lock in snd_pcm_drop + - ALSA: fix locking in snd_pcm_open*() and snd_rawmidi_open*() + - ALSA: oxygen: fix distorted output on AK4396-based cards + - ALSA: hda - Fix model for Dell Inspiron 1525 + - SCSI: qla2xxx: Defer enablement of RISC interrupts until ISP + initialization completes. + - USB: fix hcd interrupt disabling + - smb.h: do not include linux/time.h in userspace + - pxa2xx_spi: fix build breakage + - pxa2xx_spi: chipselect bugfixes + - pxa2xx_spi: dma bugfixes + - mm: mark the correct zone as full when scanning zonelists + - async_tx: fix the bug in async_tx_run_dependencies + - drivers/mmc/card/block.c: fix refcount leak in mmc_block_open() + - ixgbe: initialize interrupt throttle rate + - i2c-dev: Return correct error code on class_create() failure + - x86-32: AMD c1e force timer broadcast late + * [x86] Update patch to detect not properly announced cmos RTC devices. + * [xen] Overtake hvc console by default. + + [ maximilian attems ] + * [openvz] ip: NULL pointer dereferrence in tcp_v(4|6)_send_ack + (closes: #500472) + * [openvz] unset NF_CONNTRACK_IPV6 for now until abi bump. + + [ Stephen R. Marenka ] + * [m68k] add patches to fix atari ethernec per Michael Schmitz: + atari-ethernec-IRQF_SHARED.diff and atari-ethernec-fixes.diff. + * [m68k] add mac-esp-fix-for-quadras-with-two-esp-chips.diff to fix macs + with dual scsi busses and a problem with xorg, per Finn Thain. + * [m68k] add atari-atari_keyb_init-operator-precedence.diff per + Michael Schmitz. + * [m68k] more mac patches, per Finn Thain. 
+ + [ Martin Michlmayr ] + * [arm/ixp4xx] Enable USB_ATM and USB_SPEEDTOUCH (closes: #502182). + * [arm/iop32x, arm/orion5x] Likewise. + * DNS-323: read MAC address from flash (Matthew Palmer). + + [ dann frazier ] + * Restrict access to the DRM_I915_HWS_ADDR ioctl (CVE-2008-3831) + * Don't allow splicing to files opened with O_APPEND (CVE-2008-4554) + + -- Bastian Blank Sat, 18 Oct 2008 12:14:22 +0200 + +linux-2.6 (2.6.26-8) unstable; urgency=medium + + [ dann frazier ] + * [x86] Fix broken LDT access in VMI (CVE-2008-4410) + * ata: Fix off-by-one-error that causes errors when reading a + block on the LBA28-LBA48 boundary + * [s390] prevent ptrace padding area read/write in 31-bit mode + (CVE-2008-1514) + + [ Bastian Blank ] + * Fix generation of i386 Xen image information. + * [i386] Restrict the usage of long NOPs. (closes: #464962) + * Fix access to uninitialized user keyring. (closes: #500279) + * [x86] Fix detection of non-PNP RTC devices. (closes: #499230) + + -- Bastian Blank Thu, 09 Oct 2008 12:07:21 +0200 + +linux-2.6 (2.6.26-7) unstable; urgency=low + + [ Bastian Blank ] + * [xen] Add SuSE Xen patch. (closes: #495895) + * Only register notifiers in braille console if used, fixes Insert key. + (closes: #494374) + * Fix ACPI EC GPE storm detection. (closes: #494546) + * Disable useless support for ISP1760 USB host controller. + (closes: #498304) + * rt61pci: Add a sleep after firmware upload. (closes: #498828) + + [ Stephen R. Marenka ] + * [m68k] Set CONFIG_ATARI_ETHERNEC=m for atari, since it only works + in modular form. + * [m68k] Enable CONFIG_ADB_PMU68K=y for mac. + * [m68k] Add atari-aranym-nf-wrappers.diff patch to fix atari LBD + problems, set CONFIG_LBD=y for atari. + + [ Martin Michlmayr ] + * [arm/orion5x] Enable CONFIG_ATALK (requested by Ben Schwarz). + * [arm/versatile] Enable CONFIG_VFP. (closes: #499463) + * ath5k: Fix bad udelay calls on AR5210 code (Nick Kossifidis). + * [arm] No longer disable ATH5K. 
+ + [ dann frazier ] + * Add missing capability checks in sbni_ioctl (CVE-2008-3525) + + -- Bastian Blank Wed, 01 Oct 2008 09:02:30 +0200 + +linux-2.6 (2.6.26-6) unstable; urgency=low + + [ maximilian attems ] + * [openvz] Enable checkpointing. (closes: #497292) + + [ Bastian Blank ] + * Allow forced module loading again. (closes: #494144) + * Set IEEE 802.11 (wireless) regulatory domain default to EU. + (closes: #497971) + * [i386] Enable IDE ACPI support. Override ABI changes. (closes: #470528) + * [i386/686-bigmem] Promote to generic subarch. (closes: #476120) + + [ Martin Michlmayr ] + * Fix dead 21041 ethernet after ifconfig down (Thomas Bogendoerfer). + + [ dann frazier ] + * [hppa] Enable the FPU before using it, fixes booting on A500s + with our CONFIG_PRINTK_TIME=y setting. (closes: #499458) + + -- Bastian Blank Wed, 24 Sep 2008 12:06:47 +0200 + +linux-2.6 (2.6.26-5) unstable; urgency=low + + [ Martin Michlmayr ] + * Backport power-off method for Kurobox Pro. + * [arm/versatile] Really enable CONFIG_RTC_DRV_PL031 (closes: #484432). + + [ Stephen R. Marenka ] + * [m68k] Set CONFIG_LBD=n for atari, since it conflicts with nfblock. + + [ Bastian Blank ] + * Reenable SiS SATA support. (closes: #496603) + * [amd64,i386] Disable new-style SiS PATA support. + * Add stable release 2.6.26.4: + - sata_mv: don't issue two DMA commands concurrently + - KVM: MMU: Fix torn shadow pte + - x86: work around MTRR mask setting, v2 + - nfsd: fix buffer overrun decoding NFSv4 acl (CVE-2008-3915) + - sunrpc: fix possible overrun on read of /proc/sys/sunrpc/transports + (CVE-2008-3911) + - r8169: balance pci_map / pci_unmap pair + - tg3: Fix firmware event timeouts + - crypto: authenc - Avoid using clobbered request pointer + - sparc64: Fix cmdline_memory_size handling bugs. + - sparc64: Fix overshoot in nid_range(). + - ipsec: Fix deadlock in xfrm_state management. (closes: #497796) + - sctp: fix random memory dereference with SCTP_HMAC_IDENT option. 
+ - sctp: correct bounds check in sctp_setsockopt_auth_key + - sch_prio: Fix nla_parse_nested_compat() regression + - sctp: add verification checks to SCTP_AUTH_KEY option + - sctp: fix potential panics in the SCTP-AUTH API. + - udp: Drop socket lock for encapsulated packets + - pkt_sched: Fix actions referencing + - pkt_sched: Fix return value corruption in HTB and TBF. + - netns: Add network namespace argument to rt6_fill_node() and + ipv6_dev_get_saddr() + - ipv6: Fix OOPS, ip -f inet6 route get fec0::1, linux-2.6.26, + ip6_route_output, rt6_fill_node+0x175 (CVE-2008-3686) + - AX.25: Fix sysctl registration if !CONFIG_AX25_DAMA_SLAVE + - mm: make setup_zone_migrate_reserve() aware of overlapping nodes + - 8250: improve workaround for UARTs that don't re-assert THRE correctly + - rtc_time_to_tm: fix signed/unsigned arithmetic + - drivers/char/random.c: fix a race which can lead to a bogus BUG() + - cifs: fix O_APPEND on directio mounts + - atl1: disable TSO by default + - forcedeth: fix checksum flag + - bio: fix bio_copy_kern() handling of bio->bv_len + - bio: fix __bio_copy_iov() handling of bio->bv_len + - ALSA: oxygen: prevent muting of nonexistent AC97 controls + - S390 dasd: fix data size for PSF/PRSSD command + - x86: fix "kernel won't boot on a Cyrix MediaGXm (Geode)" + - x86: work around MTRR mask setting + - USB: cdc-acm: don't unlock acm->mutex on error path + - binfmt_misc: fix false -ENOEXEC when coupled with other binary handlers + - fbdefio: add set_page_dirty handler to deferred IO FB + - eeepc-laptop: fix use after free + - PCI: fix reference leak in pci_get_dev_by_id() + - cramfs: fix named-pipe handling + * Override ABI changes. + * [hppa] Disable new-style RTC support. Override ABI changes. + + [ maximilian attems ] + * openvz: Add upstream fixes up to 24cebf40278cb071ff8b. 
(closes: #497528) + + -- Bastian Blank Wed, 10 Sep 2008 12:55:16 +0200 + +linux-2.6 (2.6.26-4) unstable; urgency=low + + [ maximilian attems ] + * x86: Reset ACPI_PROCFS_POWER for Lenny as buggy apps depend on it. + (closes: #495541) + * x86: ACPI: Fix thermal shutdowns + * openvz: Add upstream fixes up to 0f14912e3d2251aff. (closes: #494384) + * Add stable release 2.6.26.3: + - USB: fix interface unregistration logic + - usb-storage: unusual_devs entries for iRiver T10 and Datafab CF+SM reader + - usb-serial: don't release unregistered minors + - usb-storage: revert DMA-alignment change for Wireless USB + - usb-storage: automatically recognize bad residues + - USB: ftdi_sio: Add USB Product Id for ELV HS485 + - qla2xxx: Set an rport's dev_loss_tmo value in a consistent manner. + - dccp: change L/R must have at least one byte in the dccpsf_val field + (CVE-2008-3276) + - KVM: Avoid instruction emulation when event delivery is pending + - cs5520: add enablebits checking + - acer-wmi: Fix wireless and bluetooth on early AMW0 v2 laptops + - USB: usb-storage: quirk around v1.11 firmware on Nikon D4 + - radeonfb: fix accel engine hangs + - radeon: misc corrections + - sparc64: Fix global reg snapshotting on self-cpu. + - sparc64: Do not clobber %g7 in setcontext() trap. + - sparc64: Fix end-of-stack checking in save_stack_trace(). + - sparc64: Fix recursion in stack overflow detection handling. + - sparc64: Make global reg dumping even more useful. + - sparc64: Implement IRQ stacks. + - sparc64: Handle stack trace attempts before irqstacks are setup. 
+ - PCI: Limit VPD length for Broadcom 5708S + - ide: it821x in pass-through mode segfaults in 2.6.26-stable + - syncookies: Make sure ECN is disabled + - USB: ftdi_sio: add support for Luminance Stellaris Evaluation/Development + Kits + - i2c: Fix NULL pointer dereference in i2c_new_probed_device + - SCSI: hptiop: add more PCI device IDs + - SCSI: ses: fix VPD inquiry overrun + - SCSI: scsi_transport_spi: fix oops in revalidate + - CIFS: Fix compiler warning on 64-bit + - x86: fix spin_is_contended() + - matrox maven: fix a broken error path + - i2c: Let users select algorithm drivers manually again + - CIFS: properly account for new user= field in SPNEGO upcall string + allocation + - x86: fix setup code crashes on my old 486 box + - KVM: ia64: Fix irq disabling leak in error handling code + - mlock() fix return values + - rtl8187: Fix lockups due to concurrent access to config routine + - KVM: task switch: segment base is linear address + - KVM: task switch: use seg regs provided by subarch instead of reading + from GDT + - KVM: task switch: translate guest segment limit to virt-extension byte + granular field + - r8169: avoid thrashing PCI conf space above RTL_GIGA_MAC_VER_06 + - sparc64: FUTEX_OP_ANDN fix + - posix-timers: do_schedule_next_timer: fix the setting of ->si_overrun + - posix-timers: fix posix_timer_event() vs dequeue_signal() race + - vt8623fb: fix kernel oops + - ide-cd: fix endianity for the error message in cdrom_read_capacity + - qla2xxx: Add dev_loss_tmo_callbk/terminate_rport_io callback support. + - random32: seeding improvement + - CIFS: mount of IPC$ breaks with iget patch + - CIFS: if get root inode fails during mount, cleanup tree connection + - crypto: padlock - fix VIA PadLock instruction usage with + irq_ts_save/restore() + - ipvs: Fix possible deadlock in estimator code + - SCSI: block: Fix miscalculation of sg_io timeout in CDROM_SEND_PACKET + handler. 
+ - ALSA: asoc: restrict sample rate and size in Freescale MPC8610 sound + drivers + - ALSA: ASoC: fix SNDCTL_DSP_SYNC support in Freescale 8610 sound drivers + - USB: pl2023: Remove USB id (4348:5523) handled by ch341 + - relay: fix "full buffer with exactly full last subbuffer" accounting + problem + - ipv6: Fix ip6_xmit to send fragments if ipfragok is true + - x86: amd opteron TOM2 mask val fix + + [ dann frazier ] + * [ia64] Fix boot-time hang w/ PRINTK_TIME by ensuring that cpu0 can access + per-cpu vars in early boot + * delay calls to sched_clock() until after sched_clock_init() to prevent + inaccurate printk timings on ia64 and presumably other architectures + + [ Ian Campbell ] + * [xen] import upstream fix to fb-defio driver used by Xen framebuffer. + + [ Bastian Blank ] + * [powerpc] Enable proper RTC support. (closes: #484693) + + [ Martin Michlmayr ] + * Add Marvell Orion fixes: + - sata_mv: add the Gen IIE flag to the SoC devices. + - sata_mv: don't avoid clearing interrupt status on SoC host adapters + + [ dann frazier ] + * Fix overflow condition in sctp_setsockopt_auth_key (CVE-2008-3526) + * Fix panics that may occur if SCTP AUTH is disabled (CVE-2008-3792) + * [x86] Fix memory leak in the copy_user routine + (CVE-2008-0598, closes: #490910) + + -- Bastian Blank Thu, 28 Aug 2008 08:46:42 +0200 + +linux-2.6 (2.6.26-3) unstable; urgency=low + + [ Bastian Blank ] + * Disable Emagic Audiowerk 2 soundcard support. The PCI IDs clashes with + many DVB cards. + * Update VServer patch to 2.3.0.35. + * [armel/versatile] Override ABI changes. + * [i386/686-bigmem] Add VServer image. + + [ Aurelien Jarno ] + * [armel/versatile] Disable CONFIG_NO_HZ, CONFIG_HIGH_RES_TIMERS for + dynticks. (closes: #494842) + + [ Martin Michlmayr ] + * Fix PCIe on the Kurobox Pro (Lennert Buytenhek). 
+ * Fix regressions caused by the "use software GSO for SG+CSUM capable + netdevices" patch: + - loopback: Enable TSO (Herbert Xu) + - net: Preserve netfilter attributes in skb_gso_segment using + __copy_skb_header (Herbert Xu) + + [ dann frazier ] + * [amd64] Fix typo in TOM2 mask value, preventing a hang on some opteron + systems. (closes: #494365) + + -- Bastian Blank Mon, 18 Aug 2008 15:34:38 +0200 + +linux-2.6 (2.6.26-2) unstable; urgency=low + + [ Bastian Blank ] + * [powerpc] Install arch/powerpc/lib/crtsavres.o into the headers, it is + used during module linking. + * Add stable release 2.6.26.1: + - Fix off-by-one error in iov_iter_advance() + - ath5k: don't enable MSI, we cannot handle it yet + - b43legacy: Release mutex in error handling code + - cpufreq acpi: only call _PPC after cpufreq ACPI init funcs got called already + - VFS: increase pseudo-filesystem block size to PAGE_SIZE + - markers: fix markers read barrier for multiple probes + - tmpfs: fix kernel BUG in shmem_delete_inode + - mpc52xx_psc_spi: fix block transfer + - ixgbe: remove device ID for unsupported device + - UML - Fix boot crash + - eCryptfs: use page_alloc not kmalloc to get a page of memory + - x86: fix kernel_physical_mapping_init() for large x86 systems + - DVB: cx23885: SRAM changes for the 885 and 887 silicon parts + - DVB: cx23885: Reallocated the sram to avoid concurrent VIDB/C issues + - DVB: cx23885: DVB Transport cards using DVB port VIDB/TS1 did not stream + - DVB: cx23885: Ensure PAD_CTRL is always reset to a sensible default + - V4L: cx23885: Bugfix for concurrent use of /dev/video0 and /dev/video1 + - V4L: saa7134: Copy tuner data earlier to avoid overwriting manual tuner type + - V4L: uvcvideo: Add support for Medion Akoya Mini E1210 integrated webcam + - V4L: uvcvideo: Make input device support optional + - V4L: uvcvideo: Don't free URB buffers on suspend + - V4L: uvcvideo: Use GFP_NOIO when allocating memory during resume + - V4L: uvcvideo: Fix a buffer overflow in 
format descriptor parsing + - DVB: dib0700: add support for Hauppauge Nova-TD Stick 52009 + - V4L: cx18: Upgrade to newer firmware & update documentation + - ALSA: trident - pause s/pdif output + - myri10ge: do not use mgp->max_intr_slots before loading the firmware + - myri10ge: do not forget to setup the single slice pointers + - iop-adma: fix platform driver hotplug/coldplug + - sparc64: Do not define BIO_VMERGE_BOUNDARY. + - sparc64: Fix cpufreq notifier registry. + - sparc64: Fix lockdep issues in LDC protocol layer. + - tcp: Clear probes_out more aggressively in tcp_ack(). + - ARM: fix fls() for 64-bit arguments + - vmlinux.lds: move __attribute__((__cold__)) functions back into final .text section + - rtc-at91rm9200: avoid spurious irqs + - ide-cd: fix oops when using growisofs + - x86: fix crash due to missing debugctlmsr on AMD K6-3 + - cpusets: fix wrong domain attr updates + - proc: fix /proc/*/pagemap some more + - Fix build on COMPAT platforms when CONFIG_EPOLL is disabled + - markers: fix duplicate modpost entry + - x86, suspend, acpi: enter Big Real Mode + - USB: fix usb serial pm counter decrement for disconnected interfaces + - x86 reboot quirks: add Dell Precision WorkStation T5400 + - Fix typos from signal_32/64.h merge + - rcu: fix rcu_try_flip_waitack_needed() to prevent grace-period stall + - Patch Upstream: x86 ptrace: fix PTRACE_GETFPXREGS error + - KVM: MMU: Fix potential race setting upper shadow ptes on nonpae hosts + - KVM: MMU: nuke shadowed pgtable pages and ptes on memslot destruction + - KVM: x86 emulator: Fix HLT instruction + - KVM: VMX: Add ept_sync_context in flush_tlb + - KVM: mmu_shrink: kvm_mmu_zap_page requires slots_lock to be held + - KVM: SVM: fix suspend/resume support + - KVM: VMX: Fix a wrong usage of vmcs_config + - isofs: fix minor filesystem corruption + - quota: fix possible infinite loop in quota code + - hdlcdrv: Fix CRC calculation. 
+ - ipv6: __KERNEL__ ifdef struct ipv6_devconf + - ipv6: use timer pending + - udplite: Protection against coverage value wrap-around + - pxamci: trivial fix of DMA alignment register bit clearing + * [sparc] Install asm-sparc headers again. + * Force RTC on by default and set clock on startup. Override ABI changes. + * [i386, amd64] Make the CMOS RTC support builtin. (closes: #493567) + * Add stable release 2.6.26.2: + - sound: ensure device number is valid in snd_seq_oss_synth_make_info + - Ath5k: kill tasklets on shutdown + - Ath5k: fix memory corruption + - vfs: fix lookup on deleted directory + - ALSA: emu10k1 - Fix inverted Analog/Digital mixer switch on Audigy2 + - ALSA: hda - Add missing Thinkpad Z60m support + - ALSA: hda - Fix DMA position inaccuracy + - ALSA: hda - Fix wrong volumes in AD1988 auto-probe mode + - Add compat handler for PTRACE_GETSIGINFO + - Bluetooth: Signal user-space for HIDP and BNEP socket errors + - Input: i8042 - add Acer Aspire 1360 to nomux blacklist + - Input: i8042 - add Gericom Bellagio to nomux blacklist + - Input: i8042 - add Intel D845PESV to nopnp list + - jbd: fix race between free buffer and commit transaction + - NFS: Ensure we zap only the access and acl caches when setting new acls + - SCSI: ch: fix ch_remove oops + - linear: correct disk numbering error check + - netfilter: xt_time: fix time's time_mt()'s use of do_div() + - Kprobe smoke test lockdep warning + - Close race in md_probe + - x86: io delay - add checking for NULL early param + - x86: idle process - add checking for NULL early param + - SCSI: bsg: fix bsg_mutex hang with device removal + - netfilter: nf_nat_sip: c= is optional for session + - romfs_readpage: don't report errors for pages beyond i_size + - ftrace: remove unneeded documentation + + [ Martin Michlmayr ] + * METH: fix MAC address setup (Thomas Bogendoerfer) + * Export the reset button of the QNAP TS-409. 
+ * net: use software GSO for SG+CSUM capable netdevices (Lennert Buytenhek) + + [ dann frazier ] + * device_create interface changed between 2.6.26 and 2.6.27; adjust hpilo + backport appropriately. Fixes a NULL pointer dereference in ilo_probe(). + + -- Bastian Blank Fri, 08 Aug 2008 08:09:00 +0200 + +linux-2.6 (2.6.26-1) unstable; urgency=low * New upstream release see http://kernelnewbies.org/Linux_2_6_26 - UDF 2.50 support. (closes: #480910) @@ -66,6 +660,7 @@ linux-2.6 (2.6.26-1~experimental.1) UNRELEASED; urgency=low - alsa snd-hda Dell Inspiron fix (closes: #490649) - ipw2200: queue direct scans (closes: #487721) - better gcc-4.3 support (closes: #492301) + - iwl3945 monitor mode. (closes: #482387) [ maximilian attems ] * topconfig set CRYPTO_CTS, SND_PCSP, SND_AW2, IWL4965_LEDS, IWL3945_LEDS, @@ -90,7 +685,6 @@ linux-2.6 (2.6.26-1~experimental.1) UNRELEASED; urgency=low * topconfig: Enable HYSDN, no longer broken on smp. * Add request_firmware patch for keyspan. (closes: #448900) * [x86]: Enable dma engine. (closes: #473331) - * iwl3945: Add monitor mode patch. (closes: #482387) * [ppc64]: Enable IBMEBUS and EHEA. (closes: #484888) * topconfig: Enable PROFILING across all flavours. (closes: #484885) * 486: enable OLPC support thanks Andres Salomon for merge. @@ -194,7 +788,7 @@ linux-2.6 (2.6.26-1~experimental.1) UNRELEASED; urgency=low * [arm/versatile] Switch scsi/ext3/smc91x to modules now that we have proper d-i support. Remove options defined in toplevel config file. - -- Christian T. 
Steigies Thu, 12 Jun 2008 15:31:54 +0200 + -- Bastian Blank Wed, 30 Jul 2008 10:17:29 +0200 linux-2.6 (2.6.25-7) unstable; urgency=high diff --git a/debian/config/alpha/vserver/defines b/debian/config/alpha/vserver/defines index c184059be..2d358a44b 100644 --- a/debian/config/alpha/vserver/defines +++ b/debian/config/alpha/vserver/defines @@ -1,9 +1,6 @@ [base] flavours: alpha -[image] -recommends: util-vserver - [alpha_base] class: Alpha diff --git a/debian/config/amd64/config b/debian/config/amd64/config index 6302b64e4..f83faeb3a 100644 --- a/debian/config/amd64/config +++ b/debian/config/amd64/config @@ -118,7 +118,7 @@ CONFIG_CRYPTO_TWOFISH_X86_64=m ## CONFIG_ACPI=y CONFIG_ACPI_PROCFS=y -# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_PROCFS_POWER=y CONFIG_ACPI_SYSFS_POWER=y CONFIG_ACPI_PROC_EVENT=y CONFIG_ACPI_AC=m @@ -141,6 +141,11 @@ CONFIG_ACPI_CONTAINER=m CONFIG_ACPI_HOTPLUG_MEMORY=m CONFIG_ACPI_SBS=m +## +## file: drivers/ata/Kconfig +## +# CONFIG_PATA_SIS is not set + ## ## file: drivers/atm/Kconfig ## @@ -260,9 +265,6 @@ CONFIG_PRINTER=m CONFIG_PPDEV=m CONFIG_HVC_XEN=y CONFIG_NVRAM=m -CONFIG_RTC=y -CONFIG_GEN_RTC=m -CONFIG_GEN_RTC_X=y CONFIG_DTLK=m CONFIG_R3964=m CONFIG_APPLICOM=m @@ -270,7 +272,6 @@ CONFIG_MWAVE=m CONFIG_RAW_DRIVER=m CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y -CONFIG_HPET_RTC_IRQ=y CONFIG_HPET_MMAP=y CONFIG_HANGCHECK_TIMER=m @@ -1143,6 +1144,11 @@ CONFIG_POWER_SUPPLY=m CONFIG_PDA_POWER=m CONFIG_BATTERY_DS2760=m +## +## file: drivers/rtc/Kconfig +## +CONFIG_RTC_DRV_CMOS=y + ## ## file: drivers/scsi/Kconfig ## diff --git a/debian/config/amd64/defines b/debian/config/amd64/defines index eb28a5a34..b818d99de 100644 --- a/debian/config/amd64/defines +++ b/debian/config/amd64/defines @@ -7,9 +7,6 @@ flavours: amd64 kernel-arch: x86 -[xen_base] -enabled: false - [image] suggests: grub | lilo diff --git a/debian/config/amd64/vserver/defines b/debian/config/amd64/vserver/defines index 749e14e9c..f18d9a372 100644 --- 
a/debian/config/amd64/vserver/defines +++ b/debian/config/amd64/vserver/defines @@ -2,6 +2,3 @@ flavours: amd64 -[image] -recommends: util-vserver - diff --git a/debian/config/amd64/xen/config b/debian/config/amd64/xen/config index 0223e0fd1..8ce377fee 100644 --- a/debian/config/amd64/xen/config +++ b/debian/config/amd64/xen/config @@ -1,15 +1 @@ CONFIG_X86_64_XEN=y - -## options needed for initial pvops fedora domU support -# CONFIG_SMP is not set -# CONFIG_NUMA is not set -# CONFIG_NEED_MULTIPLE_NODES is not set - -## x86_64 breaks with a different CONFIG_PHYSICAL_START, currently -CONFIG_PHYSICAL_START=0x200000 - -## x86_64 breaks with CONFIG_SPARSEMEM_VMEMMAP, currently -# CONFIG_SPARSEMEM_VMEMMAP is not set - -## 32-bit emulation isn't ready yet -# CONFIG_IA32_EMULATION is not set diff --git a/debian/config/amd64/xen/defines b/debian/config/amd64/xen/defines index 3dd39296a..a97430813 100644 --- a/debian/config/amd64/xen/defines +++ b/debian/config/amd64/xen/defines @@ -8,5 +8,5 @@ suggests: grub type: plain-xen [xen] -flavour: amd64 +flavours: amd64 diff --git a/debian/config/arm/config.iop32x b/debian/config/arm/config.iop32x index bd5cc7156..bec952a99 100644 --- a/debian/config/arm/config.iop32x +++ b/debian/config/arm/config.iop32x @@ -645,6 +645,12 @@ CONFIG_SPI=y ## CONFIG_USB=m +## +## file: drivers/usb/atm/Kconfig +## +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m + ## ## file: drivers/usb/class/Kconfig ## @@ -716,6 +722,31 @@ CONFIG_BINFMT_ELF=y CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m +## +## file: fs/partitions/Kconfig +## +CONFIG_ACORN_PARTITION=y +# CONFIG_ACORN_PARTITION_CUMANA is not set +# CONFIG_ACORN_PARTITION_EESOX is not set +CONFIG_ACORN_PARTITION_ICS=y +# CONFIG_ACORN_PARTITION_ADFS is not set +# CONFIG_ACORN_PARTITION_POWERTEC is not set +CONFIG_ACORN_PARTITION_RISCIX=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y 
+CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_EFI_PARTITION=y + ## ## file: init/Kconfig ## diff --git a/debian/config/arm/config.ixp4xx b/debian/config/arm/config.ixp4xx index 2637dc5a4..8e1cd7396 100644 --- a/debian/config/arm/config.ixp4xx +++ b/debian/config/arm/config.ixp4xx @@ -703,9 +703,16 @@ CONFIG_SPI_BITBANG=m ## CONFIG_USB=m +## +## file: drivers/usb/atm/Kconfig +## +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m + ## ## file: drivers/usb/class/Kconfig ## +CONFIG_USB_ACM=m CONFIG_USB_PRINTER=m ## @@ -774,6 +781,31 @@ CONFIG_BINFMT_ELF=y CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m +## +## file: fs/partitions/Kconfig +## +CONFIG_ACORN_PARTITION=y +# CONFIG_ACORN_PARTITION_CUMANA is not set +# CONFIG_ACORN_PARTITION_EESOX is not set +CONFIG_ACORN_PARTITION_ICS=y +# CONFIG_ACORN_PARTITION_ADFS is not set +# CONFIG_ACORN_PARTITION_POWERTEC is not set +CONFIG_ACORN_PARTITION_RISCIX=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_EFI_PARTITION=y + ## ## file: init/Kconfig ## diff --git a/debian/config/arm/config.orion5x b/debian/config/arm/config.orion5x index 7e528ee56..18ded1e8a 100644 --- a/debian/config/arm/config.orion5x +++ b/debian/config/arm/config.orion5x @@ -140,11 +140,30 @@ CONFIG_SATA_MV=m # CONFIG_PATA_PLATFORM is not set # CONFIG_PATA_SCH is not set +## +## file: drivers/bluetooth/Kconfig +## +CONFIG_BT_HCIUSB=m +CONFIG_BT_HCIUSB_SCO=y +# CONFIG_BT_HCIUART is not set +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIVHCI=m + ## ## file: drivers/char/drm/Kconfig ## # 
CONFIG_DRM is not set +## +## file: drivers/dma/Kconfig +## +CONFIG_DMADEVICES=y +CONFIG_MV_XOR=y +CONFIG_DMA_ENGINE=y +# CONFIG_NET_DMA is not set + ## ## file: drivers/i2c/Kconfig ## @@ -304,11 +323,25 @@ CONFIG_MV643XX_ETH=m # CONFIG_NETDEV_10000 is not set CONFIG_NETCONSOLE=m +## +## file: drivers/net/appletalk/Kconfig +## +CONFIG_ATALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y + ## ## file: drivers/net/arcnet/Kconfig ## # CONFIG_ARCNET is not set +## +## file: drivers/net/wireless/Kconfig +## +CONFIG_WLAN_PRE80211=y +CONFIG_WLAN_80211=y + ## ## file: drivers/pci/Kconfig ## @@ -411,6 +444,12 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=2 CONFIG_USB_SUPPORT=y CONFIG_USB=m +## +## file: drivers/usb/atm/Kconfig +## +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m + ## ## file: drivers/usb/class/Kconfig ## @@ -474,6 +513,44 @@ CONFIG_USB_STORAGE_JUMPSHOT=y ## # CONFIG_DISPLAY_SUPPORT is not set +## +## file: drivers/watchdog/Kconfig +## +CONFIG_WATCHDOG=y +CONFIG_ORION5X_WATCHDOG=m + +## +## file: fs/Kconfig.binfmt +## +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_AOUT=m +CONFIG_BINFMT_MISC=m + +## +## file: fs/partitions/Kconfig +## +CONFIG_ACORN_PARTITION=y +# CONFIG_ACORN_PARTITION_CUMANA is not set +# CONFIG_ACORN_PARTITION_EESOX is not set +CONFIG_ACORN_PARTITION_ICS=y +# CONFIG_ACORN_PARTITION_ADFS is not set +# CONFIG_ACORN_PARTITION_POWERTEC is not set +CONFIG_ACORN_PARTITION_RISCIX=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_EFI_PARTITION=y + ## ## file: kernel/power/Kconfig ## @@ -494,6 +571,11 @@ CONFIG_FLATMEM_MANUAL=y ## end choice # CONFIG_RESOURCES_64BIT is not set +## +## file: net/atm/Kconfig +## +CONFIG_ATM=m + ## ## file: net/ax25/Kconfig ## @@ 
-503,50 +585,13 @@ CONFIG_AX25=m CONFIG_NETROM=m CONFIG_ROSE=m -CONFIG_DMADEVICES=y -CONFIG_MV_XOR=y -CONFIG_DMA_ENGINE=y -# CONFIG_NET_DMA is not set - -## -## file: fs/Kconfig.binfmt -## -CONFIG_BINFMT_ELF=y -CONFIG_BINFMT_AOUT=m -CONFIG_BINFMT_MISC=m - -## -## file: net/x25/Kconfig -## -CONFIG_X25=m - -## -## file: drivers/net/wireless/Kconfig -## -CONFIG_WLAN_PRE80211=y -CONFIG_WLAN_80211=y - -## -## file: net/atm/Kconfig -## -CONFIG_ATM=m - ## ## file: net/wanrouter/Kconfig ## CONFIG_WAN_ROUTER=m ## -## file: drivers/bluetooth/Kconfig +## file: net/x25/Kconfig ## -CONFIG_BT_HCIUSB=m -CONFIG_BT_HCIUSB_SCO=y -# CONFIG_BT_HCIUART is not set -CONFIG_BT_HCIBCM203X=m -CONFIG_BT_HCIBPA10X=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIVHCI=m - -CONFIG_WATCHDOG=y -CONFIG_ORION5X_WATCHDOG=m +CONFIG_X25=m diff --git a/debian/config/arm/config.versatile b/debian/config/arm/config.versatile index bf06d401e..4235d046b 100644 --- a/debian/config/arm/config.versatile +++ b/debian/config/arm/config.versatile @@ -341,6 +341,11 @@ CONFIG_PCI_LEGACY=y ## # CONFIG_PCCARD is not set +## +## file: drivers/rtc/Kconfig +## +CONFIG_RTC_DRV_PL031=y + ## ## file: drivers/scsi/Kconfig ## @@ -469,6 +474,13 @@ CONFIG_LOGO_LINUX_CLUT224=y ## # CONFIG_PM is not set +## +## file: kernel/time/Kconfig +## +# CONFIG_TICK_ONESHOT is not set +# CONFIG_NO_HZ is not set +# CONFIG_HIGH_RES_TIMERS is not set + ## ## file: lib/Kconfig.debug ## diff --git a/debian/config/config b/debian/config/config index 27d8a85a0..ad90f5569 100644 --- a/debian/config/config +++ b/debian/config/config @@ -98,7 +98,7 @@ CONFIG_SATA_QSTOR=m CONFIG_SATA_PROMISE=m CONFIG_SATA_SX4=m CONFIG_SATA_SIL=m -# CONFIG_SATA_SIS is not set +CONFIG_SATA_SIS=m CONFIG_SATA_ULI=m CONFIG_SATA_VIA=m CONFIG_SATA_VITESSE=m @@ -783,7 +783,6 @@ CONFIG_VIDEO_HEXIUM_ORION=m CONFIG_VIDEO_HEXIUM_GEMINI=m CONFIG_VIDEO_CAFE_CCIC=m CONFIG_V4L_USB_DRIVERS=y -CONFIG_USB_VIDEO_CLASS=m CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y CONFIG_USB_GSPCA=m 
CONFIG_VIDEO_OVCAMCHIP=m @@ -884,6 +883,11 @@ CONFIG_USB_QUICKCAM_MESSENGER=m ## CONFIG_VIDEO_USBVISION=m +## +## file: drivers/media/video/uvc/Kconfig +## +CONFIG_USB_VIDEO_CLASS=m + ## ## file: drivers/media/video/zc0301/Kconfig ## @@ -1379,7 +1383,9 @@ CONFIG_YENTA_TOSHIBA=y ## ## file: drivers/rtc/Kconfig ## -CONFIG_RTC_CLASS=m +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" CONFIG_RTC_INTF_SYSFS=y CONFIG_RTC_INTF_PROC=y CONFIG_RTC_INTF_DEV=y @@ -1593,7 +1599,7 @@ CONFIG_USB_DEVICE_CLASS=y CONFIG_USB_C67X00_HCD=m CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_ISP116X_HCD=m -CONFIG_USB_ISP1760_HCD=m +# CONFIG_USB_ISP1760_HCD is not set # CONFIG_USB_ISP1760_PCI is not set CONFIG_USB_U132_HCD=m CONFIG_USB_R8A66597_HCD=m @@ -2044,7 +2050,7 @@ CONFIG_SLAB=y CONFIG_PROFILING=y # CONFIG_MARKERS is not set CONFIG_MODULES=y -# CONFIG_MODULE_FORCE_LOAD is not set +CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y @@ -2657,7 +2663,7 @@ CONFIG_SND_PORTMAN2X4=m CONFIG_SND_PCI=y CONFIG_SND_AD1889=m CONFIG_SND_ALS300=m -CONFIG_SND_AW2=m +# CONFIG_SND_AW2 is not set CONFIG_SND_OXYGEN=m CONFIG_SND_CS5530=m CONFIG_SND_DARLA20=m diff --git a/debian/config/defines b/debian/config/defines index 84e42bbd6..d8088f3f5 100644 --- a/debian/config/defines +++ b/debian/config/defines @@ -29,14 +29,18 @@ enabled: false [featureset-vserver_base] enabled: false +[featureset-xen_base] +enabled: false + [image] +desc-long-part-xen: This kernel also runs on a Xen hypervisor. + It supports only unprivileged (domU) operation.
initramfs-generators: initramfs-tools yaird initramfs-fallback type: plain [xen] -dom0-support: false versions: - 3.1-1 + 3.2-1 [commands-image-initramfs-generators] initramfs-tools: mkinitramfs-kpkg diff --git a/debian/config/featureset-openvz/config b/debian/config/featureset-openvz/config index 2c3c83c0a..16445fc0a 100644 --- a/debian/config/featureset-openvz/config +++ b/debian/config/featureset-openvz/config @@ -6,7 +6,7 @@ CONFIG_VE_ETHDEV=m CONFIG_VZ_DEV=m CONFIG_VE_IPTABLES=y CONFIG_VZ_WDOG=m -# CONFIG_VZ_CHECKPOINT is not set +CONFIG_VZ_CHECKPOINT=m CONFIG_SIM_FS=m CONFIG_VZ_QUOTA=m # CONFIG_VZ_QUOTA_UNLOAD is not set @@ -19,3 +19,6 @@ CONFIG_BC_IO_SCHED=y CONFIG_BC_SWAP_ACCOUNTING=y CONFIG_BC_PROC=y # CONFIG_BC_DEBUG is not set + +# buggy +# CONFIG_NF_CONNTRACK_IPV6 is not set diff --git a/debian/config/featureset-vserver/config b/debian/config/featureset-vserver/config index 17e8e4b23..98eb26308 100644 --- a/debian/config/featureset-vserver/config +++ b/debian/config/featureset-vserver/config @@ -1,8 +1,19 @@ +## +## file: drivers/block/Kconfig +## +CONFIG_BLK_DEV_VROOT=y + +## +## file: kernel/vserver/Kconfig +## +CONFIG_VSERVER_AUTO_LBACK=y +# CONFIG_VSERVER_AUTO_SINGLE is not set CONFIG_VSERVER_COWBL=y # CONFIG_VSERVER_VTIME is not set +# CONFIG_VSERVER_DEVICE is not set CONFIG_VSERVER_PROC_SECURE=y CONFIG_VSERVER_HARDCPU=y -# CONFIG_VSERVER_IDLETIME is not set +CONFIG_VSERVER_IDLETIME=y CONFIG_VSERVER_IDLELIMIT=y # CONFIG_TAGGING_NONE is not set # CONFIG_TAGGING_UID16 is not set @@ -11,5 +22,11 @@ CONFIG_TAGGING_ID24=y # CONFIG_TAGGING_INTERN is not set # CONFIG_TAG_NFSD is not set CONFIG_VSERVER_PRIVACY=y +CONFIG_VSERVER_CONTEXTS=512 +CONFIG_VSERVER_WARN=y # CONFIG_VSERVER_DEBUG is not set -CONFIG_BLK_DEV_VROOT=y + +## +## file: net/ipv6/Kconfig +## +CONFIG_IPV6=y diff --git a/debian/config/featureset-vserver/defines b/debian/config/featureset-vserver/defines new file mode 100644 index 000000000..08d396600 --- /dev/null +++ 
b/debian/config/featureset-vserver/defines @@ -0,0 +1,5 @@ +[image] +desc-long-part-vserver: This kernel includes support for Linux-VServer virtualization. +desc-parts: vserver +desc-short-part-vserver: Linux-VServer support +recommends: util-vserver diff --git a/debian/config/featureset-xen/config b/debian/config/featureset-xen/config index 8d7ac72df..23b7d2cbd 100644 --- a/debian/config/featureset-xen/config +++ b/debian/config/featureset-xen/config @@ -1,47 +1,20 @@ CONFIG_XEN=y -CONFIG_XENCTRL=y +CONFIG_XEN_PRIVILEGED_GUEST=y +CONFIG_XEN_BACKEND=y +CONFIG_XEN_BLKDEV_BACKEND=y +CONFIG_XEN_BLKDEV_TAP=m CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_HVC_XEN=y +CONFIG_XEN_NETDEV_BACKEND=y CONFIG_XEN_NETDEV_FRONTEND=y - -# CONFIG_VMI is not set -CONFIG_PM=y -# CONFIG_PM_DEBUG is not set -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_SLEEP=y -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -# CONFIG_HIBERNATION is not set -# CONFIG_APM is not set -# CONFIG_CPU_FREQ is not set -# CONFIG_PCI is not set -# CONFIG_ISA is not set -# CONFIG_EISA is not set -# CONFIG_SCx200 is not set -# CONFIG_PCCARD is not set -# CONFIG_PARPORT is not set -# CONFIG_BLK_DEV_FD is not set -# CONFIG_PARIDE is not set -# CONFIG_CDROM_PKTCDVD is not set -# CONFIG_IDE is not set -# CONFIG_SOUND is not set -# CONFIG_NL80211 is not set -# CONFIG_MAC80211 is not set -# CONFIG_RFKILL is not set -# CONFIG_ISDN is not set -# CONFIG_INPUT is not set -# CONFIG_SERIO is not set -# CONFIG_I2C is not set -# CONFIG_W1 is not set -# CONFIG_HWMON is not set -# CONFIG_VIDEO_DEV is not set -# CONFIG_DVB_CORE is not set -# CONFIG_DAB is not set -# CONFIG_MTD is not set -# CONFIG_SPI is not set -# CONFIG_USB_SUPPORT is not set -# CONFIG_MMC is not set -# CONFIG_WLAN_PRE80211 is not set -# CONFIG_WLAN_80211 is not set -# CONFIG_ATA is not set -# CONFIG_WAN is not set +CONFIG_XEN_NETDEV_LOOPBACK=m +CONFIG_XEN_PCIDEV_BACKEND=y +CONFIG_XEN_PCIDEV_BACKEND_VPCI=y +CONFIG_XEN_PCIDEV_FRONTEND=y +# CONFIG_XEN_DISABLE_SERIAL is not set 
+CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_SCSI_FRONTEND=m +# CONFIG_XEN_COMPAT_030002_AND_LATER is not set +# CONFIG_XEN_COMPAT_030004_AND_LATER is not set +CONFIG_XEN_COMPAT_030100_AND_LATER=y +# CONFIG_XEN_COMPAT_LATEST_ONLY is not set diff --git a/debian/config/featureset-xen/defines b/debian/config/featureset-xen/defines new file mode 100644 index 000000000..7365cbf26 --- /dev/null +++ b/debian/config/featureset-xen/defines @@ -0,0 +1,5 @@ +[image] +desc-long-part-xenold: This kernel only runs on a Xen hypervisor. + It supports both privileged (dom0) and unprivileged (domU) operation. +desc-parts: xenold +desc-short-part-xenold: oldstyle Xen support diff --git a/debian/config/hppa/config b/debian/config/hppa/config index e7fb68a3a..d033adb68 100644 --- a/debian/config/hppa/config +++ b/debian/config/hppa/config @@ -438,6 +438,11 @@ CONFIG_I82365=m ## # CONFIG_PNP is not set +## +## file: drivers/rtc/Kconfig +## +# CONFIG_RTC_CLASS is not set + ## ## file: drivers/scsi/Kconfig ## diff --git a/debian/config/i386/config b/debian/config/i386/config index 1eabf24bb..4baad236f 100644 --- a/debian/config/i386/config +++ b/debian/config/i386/config @@ -176,7 +176,7 @@ CONFIG_CRYPTO_TWOFISH_586=m ## CONFIG_ACPI=y CONFIG_ACPI_PROCFS=y -# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_PROCFS_POWER=y CONFIG_ACPI_SYSFS_POWER=y CONFIG_ACPI_PROC_EVENT=y CONFIG_ACPI_AC=m @@ -203,6 +203,7 @@ CONFIG_ACPI_SBS=m ## # CONFIG_PATA_CS5535 is not set # CONFIG_PATA_CS5536 is not set +# CONFIG_PATA_SIS is not set ## ## file: drivers/atm/Kconfig @@ -330,9 +331,6 @@ CONFIG_PRINTER=m CONFIG_PPDEV=m CONFIG_HVC_XEN=y CONFIG_NVRAM=m -CONFIG_RTC=m -CONFIG_GEN_RTC=m -CONFIG_GEN_RTC_X=y CONFIG_DTLK=m CONFIG_R3964=m CONFIG_APPLICOM=m @@ -343,7 +341,6 @@ CONFIG_CS5535_GPIO=m CONFIG_RAW_DRIVER=m CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y -CONFIG_HPET_RTC_IRQ=y CONFIG_HPET_MMAP=y CONFIG_HANGCHECK_TIMER=m @@ -565,7 +562,7 @@ CONFIG_BLK_DEV_IDECS=m CONFIG_BLK_DEV_IDECD=m CONFIG_BLK_DEV_IDETAPE=m
CONFIG_BLK_DEV_IDEFLOPPY=m -# CONFIG_BLK_DEV_IDEACPI is not set +CONFIG_BLK_DEV_IDEACPI=y CONFIG_IDE_GENERIC=m CONFIG_BLK_DEV_CMD640=y # CONFIG_BLK_DEV_CMD640_ENHANCED is not set @@ -1354,6 +1351,11 @@ CONFIG_POWER_SUPPLY=m CONFIG_PDA_POWER=m CONFIG_BATTERY_DS2760=m +## +## file: drivers/rtc/Kconfig +## +CONFIG_RTC_DRV_CMOS=y + ## ## file: drivers/scsi/Kconfig ## diff --git a/debian/config/i386/config.686-bigmem b/debian/config/i386/config.686-bigmem index 60da0f13b..c1b2292ca 100644 --- a/debian/config/i386/config.686-bigmem +++ b/debian/config/i386/config.686-bigmem @@ -2,6 +2,19 @@ ## file: arch/x86/Kconfig ## CONFIG_SMP=y +## choice: Subarchitecture Type +# CONFIG_X86_PC is not set +# CONFIG_X86_ELAN is not set +# CONFIG_X86_VOYAGER is not set +# CONFIG_X86_NUMAQ is not set +# CONFIG_X86_SUMMIT is not set +# CONFIG_X86_BIGSMP is not set +# CONFIG_X86_VISWS is not set +CONFIG_X86_GENERICARCH=y +# CONFIG_X86_ES7000 is not set +# CONFIG_X86_RDC321X is not set +# CONFIG_X86_VSMP is not set +## end choice CONFIG_NR_CPUS=32 CONFIG_X86_MCE=y CONFIG_X86_MCE_NONFATAL=m diff --git a/debian/config/i386/defines b/debian/config/i386/defines index 7e2e29385..99fa8512f 100644 --- a/debian/config/i386/defines +++ b/debian/config/i386/defines @@ -29,6 +29,7 @@ class: PPro/Celeron/PII/PIII/P4 longclass: Pentium Pro/Celeron/Pentium II/Pentium III/Pentium 4 with 4-64G RAM [686-bigmem_image] +desc-parts: xen recommends: libc6-i686 [amd64_base] diff --git a/debian/config/i386/openvz/defines b/debian/config/i386/openvz/defines index 03892b46a..fc159f646 100644 --- a/debian/config/i386/openvz/defines +++ b/debian/config/i386/openvz/defines @@ -5,4 +5,5 @@ flavours: [686_image] configs: i386/config.686-bigmem +desc-parts: xen diff --git a/debian/config/i386/vserver/defines b/debian/config/i386/vserver/defines index de4b248df..c27b9b6fa 100644 --- a/debian/config/i386/vserver/defines +++ b/debian/config/i386/vserver/defines @@ -1,6 +1,7 @@ [base] flavours: 686 + 686-bigmem [image] 
recommends: util-vserver, libc6-i686 diff --git a/debian/config/i386/xen/config b/debian/config/i386/xen/config new file mode 100644 index 000000000..87dbe22ee --- /dev/null +++ b/debian/config/i386/xen/config @@ -0,0 +1,2 @@ +# CONFIG_X86_GENERICARCH is not set +CONFIG_X86_XEN=y diff --git a/debian/config/i386/xen/defines b/debian/config/i386/xen/defines index 5b11adac0..8e66961c6 100644 --- a/debian/config/i386/xen/defines +++ b/debian/config/i386/xen/defines @@ -8,7 +8,9 @@ suggests: grub type: plain-xen [xen] -flavour: i386-pae +flavours: + i386 + amd64 [686_base] class: i686 @@ -16,6 +18,7 @@ longclass: i686 and compatible [686_image] configs: - i386/config.686-bigmem + i386/config.686-bigmem + i386/xen/config recommends: libc6-xen diff --git a/debian/config/ia64/defines b/debian/config/ia64/defines index b4199555e..7e19331bb 100644 --- a/debian/config/ia64/defines +++ b/debian/config/ia64/defines @@ -1,6 +1,6 @@ [base] featuresets: -# vserver + vserver flavours: itanium mckinley diff --git a/debian/config/ia64/vserver/defines b/debian/config/ia64/vserver/defines index 692763f2b..51c333be5 100644 --- a/debian/config/ia64/vserver/defines +++ b/debian/config/ia64/vserver/defines @@ -3,12 +3,3 @@ flavours: itanium mckinley -[image] -recommends: util-vserver - -[itanium_image] -configs: ia64/config.itanium - -[mckinley_image] -configs: ia64/config.mckinley - diff --git a/debian/config/powerpc/config b/debian/config/powerpc/config index dc3c2e45c..371be9106 100644 --- a/debian/config/powerpc/config +++ b/debian/config/powerpc/config @@ -130,8 +130,6 @@ CONFIG_VT_CONSOLE=y CONFIG_PRINTER=m CONFIG_HVC_RTAS=y CONFIG_NVRAM=y -CONFIG_GEN_RTC=y -CONFIG_GEN_RTC_X=y CONFIG_DTLK=m CONFIG_APPLICOM=m @@ -751,6 +749,11 @@ CONFIG_PD6729=m CONFIG_I82092=m CONFIG_TCIC=m +## +## file: drivers/rtc/Kconfig +## +CONFIG_RTC_DRV_PPC=y + ## ## file: drivers/scsi/Kconfig ## diff --git a/debian/config/powerpc/config.powerpc64 b/debian/config/powerpc/config.powerpc64 index 
71536c55f..b15ddd303 100644 --- a/debian/config/powerpc/config.powerpc64 +++ b/debian/config/powerpc/config.powerpc64 @@ -101,8 +101,6 @@ CONFIG_SCANLOG=m ## CONFIG_HVC_CONSOLE=y CONFIG_HVCS=m -# CONFIG_RTC is not set -# CONFIG_GEN_RTC is not set CONFIG_HANGCHECK_TIMER=m ## @@ -156,11 +154,6 @@ CONFIG_IBM_NEW_EMAC_RX_SKB_HEADROOM=0 ## # CONFIG_HOTPLUG_PCI is not set -## -## file: drivers/rtc/Kconfig -## -CONFIG_RTC_DRV_PPC=m - ## ## file: drivers/scsi/Kconfig ## diff --git a/debian/config/powerpc/vserver/defines b/debian/config/powerpc/vserver/defines index f655a17eb..7e466df66 100644 --- a/debian/config/powerpc/vserver/defines +++ b/debian/config/powerpc/vserver/defines @@ -3,9 +3,6 @@ flavours: powerpc powerpc64 -[image] -recommends: util-vserver - [powerpc_image] configs: powerpc/config.powerpc-smp diff --git a/debian/config/s390/defines b/debian/config/s390/defines index d0bda04ed..6cb2f1c6d 100644 --- a/debian/config/s390/defines +++ b/debian/config/s390/defines @@ -1,6 +1,6 @@ [base] featuresets: -# vserver + vserver flavours: s390 s390-tape @@ -10,13 +10,15 @@ kernel-arch: s390 [image] desc-long-part-reader: This kernel has support to IPL (boot) from a VM reader or DASD device. desc-long-part-tape: This kernel has support to IPL (boot) from a tape. 
-desc-parts: reader desc-short-part-tape: IPL from tape suggests: s390-tools [s390_base] class: IBM S/390 +[s390_image] +desc-parts: reader + [s390-tape_base] class: IBM S/390 modules: false @@ -30,3 +32,6 @@ type: plain-s390-tape [s390x_base] class: IBM zSeries +[s390x_image] +desc-parts: reader + diff --git a/debian/config/sparc/config b/debian/config/sparc/config index 50ce00bf3..49a48d2c7 100644 --- a/debian/config/sparc/config +++ b/debian/config/sparc/config @@ -2,6 +2,7 @@ ## file: arch/sparc64/Kconfig ## CONFIG_SECCOMP=y +CONFIG_SUN_LDOMS=y CONFIG_PCI=y CONFIG_SUN_OPENPROMFS=m @@ -13,6 +14,7 @@ CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_CISS_DA is not set # CONFIG_BLK_DEV_DAC960 is not set # CONFIG_BLK_DEV_UMEM is not set +CONFIG_SUNVDC=m ## ## file: drivers/char/Kconfig @@ -125,6 +127,7 @@ CONFIG_SUNLANCE=m CONFIG_HAPPYMEAL=m CONFIG_SUNBMAC=m CONFIG_SUNQE=m +CONFIG_SUNVNET=m # CONFIG_HP100 is not set # CONFIG_HAMACHI is not set # CONFIG_R8169 is not set diff --git a/debian/config/sparc/defines b/debian/config/sparc/defines index 05e3b9f4d..9a4695303 100644 --- a/debian/config/sparc/defines +++ b/debian/config/sparc/defines @@ -1,10 +1,11 @@ [base] featuresets: -# vserver + vserver flavours: sparc64 sparc64-smp kernel-arch: sparc64 +kernel-header-dirs: sparc sparc64 [image] suggests: silo, fdutils diff --git a/debian/lib/python/debian_linux/config.py b/debian/lib/python/debian_linux/config.py index 6858ee85c..899915c2e 100644 --- a/debian/lib/python/debian_linux/config.py +++ b/debian/lib/python/debian_linux/config.py @@ -26,6 +26,35 @@ class SchemaItemList(object): return [j.strip() for j in re.split(self.type, i)] class ConfigCore(dict): + def get_merge(self, section, arch, featureset, flavour, key, default=None): + temp = [] + + if arch and featureset and flavour: + temp.append(self.get((section, arch, featureset, flavour), {}).get(key)) + temp.append(self.get((section, arch, None, flavour), {}).get(key)) + if arch and featureset: + 
temp.append(self.get((section, arch, featureset), {}).get(key)) + if arch: + temp.append(self.get((section, arch), {}).get(key)) + if featureset: + temp.append(self.get((section, None, featureset), {}).get(key)) + temp.append(self.get((section,), {}).get(key)) + + ret = [] + + for i in temp: + if i is None: + continue + elif isinstance(i, (list, tuple)): + ret.extend(i) + elif ret: + # TODO + return ret + else: + return i + + return ret or default + def merge(self, section, arch = None, featureset = None, flavour = None): ret = {} ret.update(self.get((section,), {})) @@ -73,6 +102,9 @@ class ConfigCoreHierarchy(ConfigCore): config_name = "defines" schemas = { + 'abi': { + 'ignore-changes': SchemaItemList(), + }, 'base': { 'arches': SchemaItemList(), 'enabled': SchemaItemBoolean(), @@ -90,6 +122,7 @@ class ConfigCoreHierarchy(ConfigCore): }, 'xen': { 'dom0-support': SchemaItemBoolean(), + 'flavours': SchemaItemList(), 'versions': SchemaItemList(), } } diff --git a/debian/lib/python/debian_linux/patches.py b/debian/lib/python/debian_linux/patches.py index 31a360a7e..e20716bdb 100644 --- a/debian/lib/python/debian_linux/patches.py +++ b/debian/lib/python/debian_linux/patches.py @@ -183,7 +183,7 @@ class PatchSeriesList(list): l = self else: l = self[::-1] - for i in self: + for i in l: if reverse: print "--> Try to unapply %s." % i.name else: diff --git a/debian/patches/bugfix/all/ext3-add-support-for-non-native-signed-unsigned-htr.patch b/debian/patches/bugfix/all/ext3-add-support-for-non-native-signed-unsigned-htr.patch new file mode 100644 index 000000000..98a2d00b4 --- /dev/null +++ b/debian/patches/bugfix/all/ext3-add-support-for-non-native-signed-unsigned-htr.patch @@ -0,0 +1,230 @@ +From: Theodore Ts'o +Date: Mon, 20 Oct 2008 23:16:21 -0400 +Subject: [PATCH] ext3: Add support for non-native signed/unsigned htree hash algorithms + +The original ext3 hash algorithms assumed that variables of type char +were signed, as God and K&R intended. 
Unfortunately, this assumption +is not true on some architectures. Userspace support for marking +filesystems with non-native signed/unsigned chars was added two years +ago, but the kernel-side support was never added (until now). + +Signed-off-by: "Theodore Ts'o" + +diff -urN a/fs/ext3/hash.c b/fs/ext3/hash.c +--- a/fs/ext3/hash.c 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext3/hash.c 2008-10-28 19:44:22.000000000 +0000 +@@ -35,11 +35,20 @@ + + + /* The old legacy hash */ +-static __u32 dx_hack_hash (const char *name, int len) ++static __u32 dx_hack_hash (const char *name, int len, int unsigned_flag) + { +- __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ const unsigned char *ucp = (const unsigned char *) name; ++ const signed char *scp = (const signed char *) name; ++ int c; ++ + while (len--) { +- __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); ++ if (unsigned_flag) ++ c = (int) *ucp++; ++ else ++ c = (int) *scp++; ++ ++ hash = hash1 + (hash0 ^ (c * 7152373)); + + if (hash & 0x80000000) hash -= 0x7fffffff; + hash1 = hash0; +@@ -48,10 +57,13 @@ + return (hash0 << 1); + } + +-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) ++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num, ++ int unsigned_flag) + { + __u32 pad, val; +- int i; ++ int i, c; ++ const unsigned char *ucp = (const unsigned char *) msg; ++ const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; +@@ -62,7 +74,12 @@ + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; +- val = msg[i] + (val << 8); ++ if (unsigned_flag) ++ c = (int) ucp[i]; ++ else ++ c = (int) scp[i]; ++ ++ val = c + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; +@@ -95,6 +112,7 @@ + const char *p; + int i; + __u32 in[8], buf[4]; ++ int unsigned_flag = 0; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; +@@ 
-113,13 +131,17 @@ + } + + switch (hinfo->hash_version) { ++ case DX_HASH_LEGACY_UNSIGNED: ++ unsigned_flag++; + case DX_HASH_LEGACY: +- hash = dx_hack_hash(name, len); ++ hash = dx_hack_hash(name, len, unsigned_flag); + break; ++ case DX_HASH_HALF_MD4_UNSIGNED: ++ unsigned_flag++; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { +- str2hashbuf(p, len, in, 8); ++ str2hashbuf(p, len, in, 8, unsigned_flag); + half_md4_transform(buf, in); + len -= 32; + p += 32; +@@ -127,10 +149,12 @@ + minor_hash = buf[2]; + hash = buf[1]; + break; ++ case DX_HASH_TEA_UNSIGNED: ++ unsigned_flag++; + case DX_HASH_TEA: + p = name; + while (len > 0) { +- str2hashbuf(p, len, in, 4); ++ str2hashbuf(p, len, in, 4, unsigned_flag); + TEA_transform(buf, in); + len -= 16; + p += 16; +diff -urN a/fs/ext3/namei.c b/fs/ext3/namei.c +--- a/fs/ext3/namei.c 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext3/namei.c 2008-10-28 19:44:22.000000000 +0000 +@@ -369,6 +369,8 @@ + goto fail; + } + hinfo->hash_version = root->info.hash_version; ++ if (hinfo->hash_version <= DX_HASH_TEA) ++ hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; + if (dentry) + ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); +@@ -637,6 +639,9 @@ + dir = dir_file->f_path.dentry->d_inode; + if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { + hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; ++ if (hinfo.hash_version <= DX_HASH_TEA) ++ hinfo.hash_version += ++ EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); +@@ -1413,6 +1418,8 @@ + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; ++ if (hinfo.hash_version <= DX_HASH_TEA) ++ hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; + ext3fs_dirhash(name, namelen, &hinfo); + frame = 
frames; +diff -urN a/fs/ext3/super.c b/fs/ext3/super.c +--- a/fs/ext3/super.c 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext3/super.c 2008-10-28 19:44:22.000000000 +0000 +@@ -1712,6 +1712,21 @@ + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; ++ i = le32_to_cpu(es->s_flags); ++ if (i & EXT2_FLAGS_UNSIGNED_HASH) ++ sbi->s_hash_unsigned = 3; ++ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { ++ char c; ++ ++ c = (char) 255; ++ if (((int) c) == -1) { ++ es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); ++ } else { ++ es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); ++ sbi->s_hash_unsigned = 3; ++ } ++ sb->s_dirt = 1; ++ } + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR +diff -urN a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h +--- a/include/linux/ext3_fs.h 2008-10-28 19:43:42.000000000 +0000 ++++ b/include/linux/ext3_fs.h 2008-10-28 19:44:22.000000000 +0000 +@@ -354,6 +354,13 @@ + #define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ + + /* ++ * Misc. 
filesystem flags ++ */ ++#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ ++#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ ++#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ ++ ++/* + * Mount flags + */ + #define EXT3_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +@@ -487,7 +494,23 @@ + __u16 s_reserved_word_pad; + __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ +- __u32 s_reserved[190]; /* Padding to the end of the block */ ++ __le32 s_mkfs_time; /* When the filesystem was created */ ++ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ ++ /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ ++/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ ++ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ ++ __le32 s_free_blocks_count_hi; /* Free blocks count */ ++ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ ++ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ ++ __le32 s_flags; /* Miscellaneous flags */ ++ __le16 s_raid_stride; /* RAID stride */ ++ __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ ++ __le64 s_mmp_block; /* Block for multi-mount protection */ ++ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ ++ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ ++ __u8 s_reserved_char_pad2; ++ __le16 s_reserved_pad; ++ __u32 s_reserved[162]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -692,6 +715,9 @@ + #define DX_HASH_LEGACY 0 + #define DX_HASH_HALF_MD4 1 + #define DX_HASH_TEA 2 ++#define DX_HASH_LEGACY_UNSIGNED 3 ++#define DX_HASH_HALF_MD4_UNSIGNED 4 ++#define DX_HASH_TEA_UNSIGNED 5 + + #ifdef __KERNEL__ + +diff -urN a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h +--- a/include/linux/ext3_fs_sb.h 2008-10-28 19:43:42.000000000 +0000 ++++ b/include/linux/ext3_fs_sb.h 2008-10-28 19:44:22.000000000 +0000 +@@ -57,6 +57,7 @@ + u32 s_next_generation; 
+ u32 s_hash_seed[4]; + int s_def_hash_version; ++ int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; diff --git a/debian/patches/bugfix/all/ext4-add-support-for-non-native-signed-unsigned-htr.patch b/debian/patches/bugfix/all/ext4-add-support-for-non-native-signed-unsigned-htr.patch new file mode 100644 index 000000000..d4c41695d --- /dev/null +++ b/debian/patches/bugfix/all/ext4-add-support-for-non-native-signed-unsigned-htr.patch @@ -0,0 +1,191 @@ +From: Theodore Ts'o +Date: Mon, 20 Oct 2008 22:57:37 -0400 +Subject: [PATCH] ext4: Add support for non-native signed/unsigned htree hash algorithms + +The original ext3 hash algorithms assumed that variables of type char +were signed, as God and K&R intended. Unfortunately, this assumption +is not true on some architectures. Userspace support for marking +filesystems with non-native signed/unsigned chars was added two years +ago, but the kernel-side support was never added (until now). 
+ +Signed-off-by: "Theodore Ts'o" + +diff -urN a/fs/ext4/ext4.h b/fs/ext4/ext4.h +--- a/fs/ext4/ext4.h 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext4/ext4.h 2008-10-28 19:48:22.000000000 +0000 +@@ -872,6 +872,9 @@ + #define DX_HASH_LEGACY 0 + #define DX_HASH_HALF_MD4 1 + #define DX_HASH_TEA 2 ++#define DX_HASH_LEGACY_UNSIGNED 3 ++#define DX_HASH_HALF_MD4_UNSIGNED 4 ++#define DX_HASH_TEA_UNSIGNED 5 + + #ifdef __KERNEL__ + +diff -urN a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h +--- a/fs/ext4/ext4_sb.h 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext4/ext4_sb.h 2008-10-28 19:48:22.000000000 +0000 +@@ -56,6 +56,7 @@ + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; ++ int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; +diff -urN a/fs/ext4/hash.c b/fs/ext4/hash.c +--- a/fs/ext4/hash.c 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext4/hash.c 2008-10-28 19:59:19.000000000 +0000 +@@ -35,11 +35,20 @@ + + + /* The old legacy hash */ +-static __u32 dx_hack_hash (const char *name, int len) ++static __u32 dx_hack_hash(const char *name, int len, int unsigned_flag) + { +- __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ const unsigned char *ucp = (const unsigned char *) name; ++ const signed char *scp = (const signed char *) name; ++ int c; ++ + while (len--) { +- __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); ++ if (unsigned_flag) ++ c = (int) *ucp++; ++ else ++ c = (int) *scp++; ++ ++ hash = hash1 + (hash0 ^ (c * 7152373)); + + if (hash & 0x80000000) hash -= 0x7fffffff; + hash1 = hash0; +@@ -48,10 +57,13 @@ + return (hash0 << 1); + } + +-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) ++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num, ++ int unsigned_flag) + { + __u32 pad, val; +- int i; ++ int i, c; ++ 
const unsigned char *ucp = (const unsigned char *) msg; ++ const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; +@@ -62,7 +74,12 @@ + for (i=0; i < len; i++) { + if ((i % 4) == 0) + val = pad; +- val = msg[i] + (val << 8); ++ if (unsigned_flag) ++ c = (int) ucp[i]; ++ else ++ c = (int) scp[i]; ++ ++ val = c + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; +@@ -95,6 +112,7 @@ + const char *p; + int i; + __u32 in[8], buf[4]; ++ int unsigned_flag = 0; + + /* Initialize the default seed for the hash checksum functions */ + buf[0] = 0x67452301; +@@ -113,13 +131,17 @@ + } + + switch (hinfo->hash_version) { ++ case DX_HASH_LEGACY_UNSIGNED: ++ unsigned_flag++; + case DX_HASH_LEGACY: +- hash = dx_hack_hash(name, len); ++ hash = dx_hack_hash(name, len, unsigned_flag); + break; ++ case DX_HASH_HALF_MD4_UNSIGNED: ++ unsigned_flag++; + case DX_HASH_HALF_MD4: + p = name; + while (len > 0) { +- str2hashbuf(p, len, in, 8); ++ str2hashbuf(p, len, in, 8, unsigned_flag); + half_md4_transform(buf, in); + len -= 32; + p += 32; +@@ -127,10 +149,12 @@ + minor_hash = buf[2]; + hash = buf[1]; + break; ++ case DX_HASH_TEA_UNSIGNED: ++ unsigned_flag++; + case DX_HASH_TEA: + p = name; + while (len > 0) { +- str2hashbuf(p, len, in, 4); ++ str2hashbuf(p, len, in, 4, unsigned_flag); + TEA_transform(buf, in); + len -= 16; + p += 16; +diff -urN a/fs/ext4/namei.c b/fs/ext4/namei.c +--- a/fs/ext4/namei.c 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext4/namei.c 2008-10-28 19:48:22.000000000 +0000 +@@ -361,6 +361,8 @@ + goto fail; + } + hinfo->hash_version = root->info.hash_version; ++ if (hinfo->hash_version <= DX_HASH_TEA) ++ hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (dentry) + ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); +@@ -639,6 +641,9 @@ + dir = dir_file->f_path.dentry->d_inode; + if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; ++ if (hinfo.hash_version <= DX_HASH_TEA) ++ hinfo.hash_version += ++ EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, + start_hash, start_minor_hash); +@@ -1415,6 +1420,8 @@ + + /* Initialize as for dx_probe */ + hinfo.hash_version = root->info.hash_version; ++ if (hinfo.hash_version <= DX_HASH_TEA) ++ hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(name, namelen, &hinfo); + frame = frames; +diff -urN a/fs/ext4/super.c b/fs/ext4/super.c +--- a/fs/ext4/super.c 2008-10-28 19:43:40.000000000 +0000 ++++ b/fs/ext4/super.c 2008-10-28 19:48:22.000000000 +0000 +@@ -2071,6 +2071,21 @@ + for (i=0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + sbi->s_def_hash_version = es->s_def_hash_version; ++ i = le32_to_cpu(es->s_flags); ++ if (i & EXT2_FLAGS_UNSIGNED_HASH) ++ sbi->s_hash_unsigned = 3; ++ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { ++ char c; ++ ++ c = (char) 255; ++ if (((int) c) == -1) { ++ es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); ++ } else { ++ es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); ++ sbi->s_hash_unsigned = 3; ++ } ++ sb->s_dirt = 1; ++ } + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR diff --git a/debian/patches/bugfix/all/sata_mv-clear_irq.patch b/debian/patches/bugfix/all/sata_mv-clear_irq.patch new file mode 100644 index 000000000..1c42166f1 --- /dev/null +++ b/debian/patches/bugfix/all/sata_mv-clear_irq.patch @@ -0,0 +1,61 @@ +From: Lennert Buytenhek +Subject: [PATCH,RFC] sata_mv: don't avoid clearing interrupt status on SoC host adapters +Date: Sun, 24 Aug 2008 05:04:29 +0200 +To: Saeed Bishara , linux-ide@vger.kernel.org +Cc: Mark Lord , Jeff Garzik + +For some reason, sata_mv doesn't clear interrupt status during init +when it's running on an SoC host 
adapter. If the bootloader has +touched the SATA controller before starting Linux, Linux can end up +enabling the SATA interrupt with events pending, which will cause the +interrupt to be marked as spurious and then be disabled, which then +breaks all further accesses to the controller. + +This patch makes the SoC path clear interrupt status on init like in +the non-SoC case. + +Signed-off-by: Lennert Buytenhek +--- + drivers/ata/sata_mv.c | 21 ++++++++++----------- + 1 files changed, 10 insertions(+), 11 deletions(-) + +diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c +index ad169ff..e829a3a 100644 +--- a/drivers/ata/sata_mv.c ++++ b/drivers/ata/sata_mv.c +@@ -3131,19 +3131,18 @@ static int mv_init_host(struct ata_host *host, unsigned int board_idx) + writelfl(0, hc_mmio + HC_IRQ_CAUSE_OFS); + } + +- if (!IS_SOC(hpriv)) { +- /* Clear any currently outstanding host interrupt conditions */ +- writelfl(0, mmio + hpriv->irq_cause_ofs); ++ /* Clear any currently outstanding host interrupt conditions */ ++ writelfl(0, mmio + hpriv->irq_cause_ofs); + +- /* and unmask interrupt generation for host regs */ +- writelfl(hpriv->unmask_all_irqs, mmio + hpriv->irq_mask_ofs); ++ /* and unmask interrupt generation for host regs */ ++ writelfl(hpriv->unmask_all_irqs, mmio + hpriv->irq_mask_ofs); ++ ++ /* ++ * enable only global host interrupts for now. ++ * The per-port interrupts get done later as ports are set up. ++ */ ++ mv_set_main_irq_mask(host, 0, PCI_ERR); + +- /* +- * enable only global host interrupts for now. +- * The per-port interrupts get done later as ports are set up. 
+- */ +- mv_set_main_irq_mask(host, 0, PCI_ERR); +- } + done: + return rc; + } +-- +1.5.6.4 +-- +To unsubscribe from this list: send the line "unsubscribe linux-ide" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html diff --git a/debian/patches/bugfix/all/stable/gen-patch b/debian/patches/bugfix/all/stable/gen-patch index 203838780..040f02fe9 100755 --- a/debian/patches/bugfix/all/stable/gen-patch +++ b/debian/patches/bugfix/all/stable/gen-patch @@ -1,6 +1,6 @@ #!/usr/bin/env python -import os.path, re, sys +import os.path, re, sys, textwrap class Version(object): _rules = ur"^(\d+\.\d+\.\d+)\.(\d+)$" @@ -45,6 +45,8 @@ class GenPatch(object): if not line: continue hash, log = line.split(' ', 1) + log = textwrap.wrap(log, 74) + log = '\n '.join(log) out.write(" - %s\n" % log) if f.close() is not None: raise RuntimeError diff --git a/debian/patches/bugfix/all/wireless-regulatory-default-EU.patch b/debian/patches/bugfix/all/wireless-regulatory-default-EU.patch new file mode 100644 index 000000000..98660230e --- /dev/null +++ b/debian/patches/bugfix/all/wireless-regulatory-default-EU.patch @@ -0,0 +1,13 @@ +diff --git a/net/wireless/reg.c b/net/wireless/reg.c +index 855bff4..5969480 100644 +--- a/net/wireless/reg.c ++++ b/net/wireless/reg.c +@@ -29,7 +29,7 @@ + #include + #include "core.h" + +-static char *ieee80211_regdom = "US"; ++static char *ieee80211_regdom = "EU"; + module_param(ieee80211_regdom, charp, 0444); + MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); + diff --git a/debian/patches/bugfix/arm/kurobox_ignore_pci.patch b/debian/patches/bugfix/arm/kurobox_ignore_pci.patch new file mode 100644 index 000000000..bb0b2639f --- /dev/null +++ b/debian/patches/bugfix/arm/kurobox_ignore_pci.patch @@ -0,0 +1,64 @@ +diff --git a/arch/arm/mach-orion5x/common.h b/arch/arm/mach-orion5x/common.h +index bd0f05d..c3ca3b1 100644 +--- a/arch/arm/mach-orion5x/common.h ++++ 
b/arch/arm/mach-orion5x/common.h +@@ -32,6 +32,7 @@ struct pci_sys_data; + struct pci_bus; + + void orion5x_pcie_id(u32 *dev, u32 *rev); ++void orion5x_pci_disable(void); + int orion5x_pci_sys_setup(int nr, struct pci_sys_data *sys); + struct pci_bus *orion5x_pci_sys_scan_bus(int nr, struct pci_sys_data *sys); + int orion5x_pci_map_irq(struct pci_dev *dev, u8 slot, u8 pin); +diff --git a/arch/arm/mach-orion5x/kurobox_pro-setup.c b/arch/arm/mach-orion5x/kurobox_pro-setup.c +index f5074b8..142148f 100644 +--- a/arch/arm/mach-orion5x/kurobox_pro-setup.c ++++ b/arch/arm/mach-orion5x/kurobox_pro-setup.c +@@ -147,8 +147,10 @@ static struct hw_pci kurobox_pro_pci __initdata = { + + static int __init kurobox_pro_pci_init(void) + { +- if (machine_is_kurobox_pro()) ++ if (machine_is_kurobox_pro()) { ++ orion5x_pci_disable(); + pci_common_init(&kurobox_pro_pci); ++ } + + return 0; + } +diff --git a/arch/arm/mach-orion5x/pci.c b/arch/arm/mach-orion5x/pci.c +index 9d5d39f..a3285da 100644 +--- a/arch/arm/mach-orion5x/pci.c ++++ b/arch/arm/mach-orion5x/pci.c +@@ -522,6 +522,13 @@ static void __devinit rc_pci_fixup(struct pci_dev *dev) + } + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL, PCI_ANY_ID, rc_pci_fixup); + ++static int orion5x_pci_disabled __initdata; ++ ++void __init orion5x_pci_disable(void) ++{ ++ orion5x_pci_disabled = 1; ++} ++ + int __init orion5x_pci_sys_setup(int nr, struct pci_sys_data *sys) + { + int ret = 0; +@@ -529,7 +536,7 @@ int __init orion5x_pci_sys_setup(int nr, struct pci_sys_data *sys) + if (nr == 0) { + orion_pcie_set_local_bus_nr(PCIE_BASE, sys->busnr); + ret = pcie_setup(sys); +- } else if (nr == 1) { ++ } else if (nr == 1 && !orion5x_pci_disabled) { + orion5x_pci_set_bus_nr(sys->busnr); + ret = pci_setup(sys); + } +@@ -543,7 +550,7 @@ struct pci_bus __init *orion5x_pci_sys_scan_bus(int nr, struct pci_sys_data *sys + + if (nr == 0) { + bus = pci_scan_bus(sys->busnr, &pcie_ops, sys); +- } else if (nr == 1) { ++ } else if (nr == 1 && 
!orion5x_pci_disabled) { + bus = pci_scan_bus(sys->busnr, &pci_ops, sys); + } else { + bus = NULL; diff --git a/debian/patches/bugfix/arm/kurobox_powerdown.patch b/debian/patches/bugfix/arm/kurobox_powerdown.patch new file mode 100644 index 000000000..84ca1c4de --- /dev/null +++ b/debian/patches/bugfix/arm/kurobox_powerdown.patch @@ -0,0 +1,181 @@ +2.6.26 backport of +http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=a0087f2fcf5cb4e16502f5334168fbe304af318b + +[ARM] Orion: implement power-off method for Kurobox Pro +Sylver Bruneau [Wed, 30 Apr 2008 06:14:58 +0000 (08:14 +0200)] +This patch implements the communication with the microcontroller on the +Kurobox Pro and Linkstation Pro/Live boards. This is allowing to send +the commands needed to power-off the board correctly. + +Signed-off-by: Sylver Bruneau +Acked-by: Russell King +Signed-off-by: Lennert Buytenhek + +diff --git a/arch/arm/mach-orion5x/kurobox_pro-setup.c b/arch/arm/mach-orion5x/kurobox_pro-setup.c +index f5074b8..7570c8b 100644 +--- a/arch/arm/mach-orion5x/kurobox_pro-setup.c ++++ b/arch/arm/mach-orion5x/kurobox_pro-setup.c +@@ -13,10 +13,12 @@ + #include + #include + #include ++#include + #include + #include + #include + #include ++#include + #include + #include + #include +@@ -179,6 +181,140 @@ static struct mv_sata_platform_data kurobox_pro_sata_data = { + }; + + /***************************************************************************** ++ * Kurobox Pro specific power off method via UART1-attached microcontroller ++ ****************************************************************************/ ++ ++#define UART1_REG(x) (UART1_VIRT_BASE + ((UART_##x) << 2)) ++ ++static int kurobox_pro_miconread(unsigned char *buf, int count) ++{ ++ int i; ++ int timeout; ++ ++ for (i = 0; i < count; i++) { ++ timeout = 10; ++ ++ while (!(readl(UART1_REG(LSR)) & UART_LSR_DR)) { ++ if (--timeout == 0) ++ break; ++ udelay(1000); ++ } ++ ++ if (timeout == 0) ++ break; ++ buf[i] = 
readl(UART1_REG(RX)); ++ } ++ ++ /* return read bytes */ ++ return i; ++} ++ ++static int kurobox_pro_miconwrite(const unsigned char *buf, int count) ++{ ++ int i = 0; ++ ++ while (count--) { ++ while (!(readl(UART1_REG(LSR)) & UART_LSR_THRE)) ++ barrier(); ++ writel(buf[i++], UART1_REG(TX)); ++ } ++ ++ return 0; ++} ++ ++static int kurobox_pro_miconsend(const unsigned char *data, int count) ++{ ++ int i; ++ unsigned char checksum = 0; ++ unsigned char recv_buf[40]; ++ unsigned char send_buf[40]; ++ unsigned char correct_ack[3]; ++ int retry = 2; ++ ++ /* Generate checksum */ ++ for (i = 0; i < count; i++) ++ checksum -= data[i]; ++ ++ do { ++ /* Send data */ ++ kurobox_pro_miconwrite(data, count); ++ ++ /* send checksum */ ++ kurobox_pro_miconwrite(&checksum, 1); ++ ++ if (kurobox_pro_miconread(recv_buf, sizeof(recv_buf)) <= 3) { ++ printk(KERN_ERR ">%s: receive failed.\n", __func__); ++ ++ /* send preamble to clear the receive buffer */ ++ memset(&send_buf, 0xff, sizeof(send_buf)); ++ kurobox_pro_miconwrite(send_buf, sizeof(send_buf)); ++ ++ /* make dummy reads */ ++ mdelay(100); ++ kurobox_pro_miconread(recv_buf, sizeof(recv_buf)); ++ } else { ++ /* Generate expected ack */ ++ correct_ack[0] = 0x01; ++ correct_ack[1] = data[1]; ++ correct_ack[2] = 0x00; ++ ++ /* checksum Check */ ++ if ((recv_buf[0] + recv_buf[1] + recv_buf[2] + ++ recv_buf[3]) & 0xFF) { ++ printk(KERN_ERR ">%s: Checksum Error : " ++ "Received data[%02x, %02x, %02x, %02x]" ++ "\n", __func__, recv_buf[0], ++ recv_buf[1], recv_buf[2], recv_buf[3]); ++ } else { ++ /* Check Received Data */ ++ if (correct_ack[0] == recv_buf[0] && ++ correct_ack[1] == recv_buf[1] && ++ correct_ack[2] == recv_buf[2]) { ++ /* Interval for next command */ ++ mdelay(10); ++ ++ /* Receive ACK */ ++ return 0; ++ } ++ } ++ /* Received NAK or illegal Data */ ++ printk(KERN_ERR ">%s: Error : NAK or Illegal Data " ++ "Received\n", __func__); ++ } ++ } while (retry--); ++ ++ /* Interval for next command */ ++ mdelay(10); ++ ++ 
return -1; ++} ++ ++static void kurobox_pro_power_off(void) ++{ ++ const unsigned char watchdogkill[] = {0x01, 0x35, 0x00}; ++ const unsigned char shutdownwait[] = {0x00, 0x0c}; ++ const unsigned char poweroff[] = {0x00, 0x06}; ++ /* 38400 baud divisor */ ++ const unsigned divisor = ((ORION5X_TCLK + (8 * 38400)) / (16 * 38400)); ++ ++ pr_info("%s: triggering power-off...\n", __func__); ++ ++ /* hijack uart1 and reset into sane state (38400,8n1,even parity) */ ++ writel(0x83, UART1_REG(LCR)); ++ writel(divisor & 0xff, UART1_REG(DLL)); ++ writel((divisor >> 8) & 0xff, UART1_REG(DLM)); ++ writel(0x1b, UART1_REG(LCR)); ++ writel(0x00, UART1_REG(IER)); ++ writel(0x07, UART1_REG(FCR)); ++ writel(0x00, UART1_REG(MCR)); ++ ++ /* Send the commands to shutdown the Kurobox Pro */ ++ kurobox_pro_miconsend(watchdogkill, sizeof(watchdogkill)) ; ++ kurobox_pro_miconsend(shutdownwait, sizeof(shutdownwait)) ; ++ kurobox_pro_miconsend(poweroff, sizeof(poweroff)); ++} ++ ++/***************************************************************************** + * General Setup + ****************************************************************************/ + +@@ -229,6 +365,9 @@ static void __init kurobox_pro_init(void) + i2c_register_board_info(0, &kurobox_pro_i2c_rtc, 1); + orion5x_eth_init(&kurobox_pro_eth_data); + orion5x_sata_init(&kurobox_pro_sata_data); ++ ++ /* register Kurobox Pro specific power-off method */ ++ pm_power_off = kurobox_pro_power_off; + } + + #ifdef CONFIG_MACH_KUROBOX_PRO diff --git a/debian/patches/bugfix/parisc/add-lockless-fw-funcs.patch b/debian/patches/bugfix/parisc/add-lockless-fw-funcs.patch new file mode 100644 index 000000000..588fa6af3 --- /dev/null +++ b/debian/patches/bugfix/parisc/add-lockless-fw-funcs.patch @@ -0,0 +1,135 @@ +From: Kyle McMartin +Date: Tue, 29 Jul 2008 04:09:22 +0000 (-0400) +Subject: parisc: add pdc_coproc_cfg_unlocked and set_firmware_width_unlocked +X-Git-Url: 
http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fkyle%2Fparisc-2.6.git;a=commitdiff_plain;h=8735c10cff8486d58e03ee692ae9720f10cf9806 + +parisc: add pdc_coproc_cfg_unlocked and set_firmware_width_unlocked + +These functions are called only when bringing up the monarch cpu, +so it is safe to call them without taking the pdc spinlock. In the +future, this may become relevant for lockdep, since these functions were +taking spinlocks before start_kernel called the lockdep initializers. +--- + +Adjusted to apply to Debian's 2.6.26 by dann frazier + +diff -urpN linux-source-2.6.26.orig/arch/parisc/kernel/firmware.c linux-source-2.6.26/arch/parisc/kernel/firmware.c +--- linux-source-2.6.26.orig/arch/parisc/kernel/firmware.c 2008-07-13 15:51:29.000000000 -0600 ++++ linux-source-2.6.26/arch/parisc/kernel/firmware.c 2008-09-18 16:28:07.000000000 -0600 +@@ -150,26 +150,40 @@ static void convert_to_wide(unsigned lon + #endif + } + ++#ifdef CONFIG_64BIT ++void __init set_firmware_width_unlocked(void) ++{ ++ int ret; ++ ++ ret = mem_pdc_call(PDC_MODEL, PDC_MODEL_CAPABILITIES, ++ __pa(pdc_result), 0); ++ convert_to_wide(pdc_result); ++ if (pdc_result[0] != NARROW_FIRMWARE) ++ parisc_narrow_firmware = 0; ++} ++ + /** + * set_firmware_width - Determine if the firmware is wide or narrow. + * +- * This function must be called before any pdc_* function that uses the convert_to_wide +- * function. ++ * This function must be called before any pdc_* function that uses the ++ * convert_to_wide function. 
+ */ + void __init set_firmware_width(void) + { +-#ifdef CONFIG_64BIT +- int retval; + unsigned long flags; ++ spin_lock_irqsave(&pdc_lock, flags); ++ set_firmware_width_unlocked(); ++ spin_unlock_irqrestore(&pdc_lock, flags); ++} ++#else ++void __init set_firmware_width_unlocked(void) { ++ return; ++} + +- spin_lock_irqsave(&pdc_lock, flags); +- retval = mem_pdc_call(PDC_MODEL, PDC_MODEL_CAPABILITIES, __pa(pdc_result), 0); +- convert_to_wide(pdc_result); +- if(pdc_result[0] != NARROW_FIRMWARE) +- parisc_narrow_firmware = 0; +- spin_unlock_irqrestore(&pdc_lock, flags); +-#endif ++void __init set_firmware_width(void) { ++ return; + } ++#endif /*CONFIG_64BIT*/ + + /** + * pdc_emergency_unlock - Unlock the linux pdc lock +@@ -288,6 +302,20 @@ int pdc_chassis_warn(unsigned long *warn + return retval; + } + ++int __init pdc_coproc_cfg_unlocked(struct pdc_coproc_cfg *pdc_coproc_info) ++{ ++ int ret; ++ ++ ret = mem_pdc_call(PDC_COPROC, PDC_COPROC_CFG, __pa(pdc_result)); ++ convert_to_wide(pdc_result); ++ pdc_coproc_info->ccr_functional = pdc_result[0]; ++ pdc_coproc_info->ccr_present = pdc_result[1]; ++ pdc_coproc_info->revision = pdc_result[17]; ++ pdc_coproc_info->model = pdc_result[18]; ++ ++ return ret; ++} ++ + /** + * pdc_coproc_cfg - To identify coprocessors attached to the processor. + * @pdc_coproc_info: Return buffer address. 
+@@ -297,19 +325,14 @@ int pdc_chassis_warn(unsigned long *warn + */ + int __init pdc_coproc_cfg(struct pdc_coproc_cfg *pdc_coproc_info) + { +- int retval; ++ int ret; + unsigned long flags; + +- spin_lock_irqsave(&pdc_lock, flags); +- retval = mem_pdc_call(PDC_COPROC, PDC_COPROC_CFG, __pa(pdc_result)); +- convert_to_wide(pdc_result); +- pdc_coproc_info->ccr_functional = pdc_result[0]; +- pdc_coproc_info->ccr_present = pdc_result[1]; +- pdc_coproc_info->revision = pdc_result[17]; +- pdc_coproc_info->model = pdc_result[18]; +- spin_unlock_irqrestore(&pdc_lock, flags); ++ spin_lock_irqsave(&pdc_lock, flags); ++ ret = pdc_coproc_cfg_unlocked(pdc_coproc_info); ++ spin_unlock_irqrestore(&pdc_lock, flags); + +- return retval; ++ return ret; + } + + /** +diff -urpN linux-source-2.6.26.orig/include/asm-parisc/pdc.h linux-source-2.6.26/include/asm-parisc/pdc.h +--- linux-source-2.6.26.orig/include/asm-parisc/pdc.h 2008-07-13 15:51:29.000000000 -0600 ++++ linux-source-2.6.26/include/asm-parisc/pdc.h 2008-09-18 16:28:07.000000000 -0600 +@@ -600,6 +600,7 @@ int pdc_chassis_info(struct pdc_chassis_ + int pdc_chassis_disp(unsigned long disp); + int pdc_chassis_warn(unsigned long *warn); + int pdc_coproc_cfg(struct pdc_coproc_cfg *pdc_coproc_info); ++int pdc_coproc_cfg_unlocked(struct pdc_coproc_cfg *pdc_coproc_info); + int pdc_iodc_read(unsigned long *actcnt, unsigned long hpa, unsigned int index, + void *iodc_data, unsigned int iodc_data_size); + int pdc_system_map_find_mods(struct pdc_system_map_mod_info *pdc_mod_info, +@@ -638,6 +639,7 @@ int pdc_mem_mem_table(struct pdc_memory_ + #endif + + void set_firmware_width(void); ++void set_firmware_width_unlocked(void); + int pdc_do_firm_test_reset(unsigned long ftc_bitmap); + int pdc_do_reset(void); + int pdc_soft_power_info(unsigned long *power_reg); diff --git a/debian/patches/bugfix/parisc/hijack-jump-to-start_kernel.patch b/debian/patches/bugfix/parisc/hijack-jump-to-start_kernel.patch new file mode 100644 index 
000000000..1450dbb9b --- /dev/null +++ b/debian/patches/bugfix/parisc/hijack-jump-to-start_kernel.patch @@ -0,0 +1,68 @@ +From: Kyle McMartin +Date: Tue, 29 Jul 2008 04:11:13 +0000 (-0400) +Subject: parisc: hijack jump to start_kernel +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Fkyle%2Fparisc-2.6.git;a=commitdiff_plain;h=1a189c07f9f65305442d6e99efac9ac44f86bc04 + +parisc: hijack jump to start_kernel + +Bang in our own start_parisc call, which initializes the PDC +width, and turns on the FPU. + +Previously, if CONFIG_PRINTK_TIME was on, we'd attempt to use +the FPU before we had enabled it, resulting in a difficult +to diagnose panic. + +This patch causes init_per_cpu to redundantly set these for +cpu0, but this is harmless. +--- + +diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S +index a84e31e..0e3d9f9 100644 +--- a/arch/parisc/kernel/head.S ++++ b/arch/parisc/kernel/head.S +@@ -121,7 +121,7 @@ $pgt_fill_loop: + copy %r0,%r2 + + /* And the RFI Target address too */ +- load32 start_kernel,%r11 ++ load32 start_parisc,%r11 + + /* And the initial task pointer */ + load32 init_thread_union,%r6 +diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c +index 39e7c5a..a59b71e 100644 +--- a/arch/parisc/kernel/setup.c ++++ b/arch/parisc/kernel/setup.c +@@ -368,6 +368,31 @@ static int __init parisc_init(void) + + return 0; + } +- + arch_initcall(parisc_init); + ++void start_parisc(void) ++{ ++ extern void start_kernel(void); ++ ++ int ret, cpunum; ++ struct pdc_coproc_cfg coproc_cfg; ++ ++ cpunum = smp_processor_id(); ++ ++ set_firmware_width_unlocked(); ++ ++ ret = pdc_coproc_cfg_unlocked(&coproc_cfg); ++ if (ret >= 0 && coproc_cfg.ccr_functional) { ++ mtctl(coproc_cfg.ccr_functional, 10); ++ ++ cpu_data[cpunum].fp_rev = coproc_cfg.revision; ++ cpu_data[cpunum].fp_model = coproc_cfg.model; ++ ++ asm volatile ("fstd %fr0,8(%sp)"); ++ } else { ++ panic("must have an fpu to boot linux"); ++ } ++ ++ start_kernel(); ++ // not reached 
++} diff --git a/debian/patches/bugfix/powerpc/lpar-console.patch b/debian/patches/bugfix/powerpc/lpar-console.patch index 159c80643..4383163d6 100644 --- a/debian/patches/bugfix/powerpc/lpar-console.patch +++ b/debian/patches/bugfix/powerpc/lpar-console.patch @@ -1,40 +1,39 @@ diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c -index 9235c46..626290d 100644 +index 9235c46..8a20452 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c -@@ -57,6 +57,7 @@ extern void pSeries_find_serial_port(void); +@@ -204,6 +204,7 @@ void __init find_udbg_vterm(void) + struct device_node *stdout_node; + const u32 *termno; + const char *name; ++ int add_console; + /* find the boot console from /chosen/stdout */ + if (!of_chosen) +@@ -219,6 +220,8 @@ void __init find_udbg_vterm(void) + printk(KERN_WARNING "stdout node missing 'name' property!\n"); + goto out; + } ++ /* The user has requested a console so this is already set up. 
*/ ++ add_console = !strstr(cmd_line, "console="); - int vtermno; /* virtual terminal# for udbg */ -+static char *console_name; - - #define __ALIGNED__ __attribute__((__aligned__(sizeof(long)))) - static void udbg_hvsi_putc(char c) -@@ -232,18 +233,24 @@ void __init find_udbg_vterm(void) + /* Check if it's a virtual terminal */ + if (strncmp(name, "vty", 3) != 0) +@@ -232,13 +235,15 @@ void __init find_udbg_vterm(void) udbg_putc = udbg_putcLP; udbg_getc = udbg_getcLP; udbg_getc_poll = udbg_getc_pollLP; - add_preferred_console("hvc", termno[0] & 0xff, NULL); -+ console_name = "hvc"; ++ if (add_console) ++ add_preferred_console("hvc", termno[0] & 0xff, NULL); } else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) { -- vtermno = termno[0]; + vtermno = termno[0]; udbg_putc = udbg_hvsi_putc; udbg_getc = udbg_hvsi_getc; udbg_getc_poll = udbg_hvsi_getc_poll; - add_preferred_console("hvsi", termno[0] & 0xff, NULL); -+ console_name = "hvsi"; ++ if (add_console) ++ add_preferred_console("hvsi", termno[0] & 0xff, NULL); } out: of_node_put(stdout_node); - } - -+static void __init enable_vterm(void) -+{ -+ if (console_name) -+ add_preferred_console(console_name, vtermno, NULL); -+} -+arch_initcall(enable_vterm); -+ - void vpa_init(int cpu) - { - int hwcpu = get_hard_smp_processor_id(cpu); diff --git a/debian/patches/bugfix/powerpc/mv643xx-hotplug-support.patch b/debian/patches/bugfix/powerpc/mv643xx-hotplug-support.patch deleted file mode 100644 index 6f714e27b..000000000 --- a/debian/patches/bugfix/powerpc/mv643xx-hotplug-support.patch +++ /dev/null @@ -1,35 +0,0 @@ -# -# Hotplug support patch for mv643xx_eth driver used on pegasos machines. -# Author: Nicolas Det -# Upstream status: In the process of being submitted, may need a bit of -# cleanup in order to not break embedded arches using this controller, but -# should not be a worry for debian. 
-# ---- linux/drivers/net/mv643xx_eth.c.orig 2006-01-01 17:22:07.000000000 +0000 -+++ linux/drivers/net/mv643xx_eth.c 2006-01-01 17:23:56.000000000 +0000 -@@ -41,6 +41,8 @@ - #include - #include - -+#include -+ - #include - #include - #include -@@ -1635,6 +1637,15 @@ - " and Dale Farnsworth"); - MODULE_DESCRIPTION("Ethernet driver for Marvell MV643XX"); - -+static struct pci_device_id pci_marvell_mv64360[] = { -+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, PCI_DEVICE_ID_MARVELL_MV64360) }, -+ { PCI_DEVICE(PCI_VENDOR_ID_MARVELL, PCI_DEVICE_ID_MARVELL_MV64460) }, -+ { } -+}; -+ -+MODULE_DEVICE_TABLE(pci, pci_marvell_mv64360); -+ -+ - /* - * The second part is the low level driver of the gigE ethernet ports. - */ - diff --git a/debian/patches/bugfix/powerpc/serial.patch b/debian/patches/bugfix/powerpc/serial.patch deleted file mode 100644 index d59796c9a..000000000 --- a/debian/patches/bugfix/powerpc/serial.patch +++ /dev/null @@ -1,50 +0,0 @@ -#! /bin/sh -e -## -## All lines beginning with `## DP:' are a description of the patch. -## DP: Description: Disables legacy serial driver on powermacs. -## DP: Patch author: Sven Luther -## DP: Patch author: adapted from the SuSE kernel tree. -## DP: Forward ported to 2.6.17 by Mark Hymers -## DP: Upstream status: workaround hack waiting for a clean legacy device solution. 
- -diff -aurN a/drivers/serial/8250.c b/drivers/serial/8250.c ---- a/drivers/serial/8250.c 2005-06-17 15:48:29.000000000 -0400 -+++ b/drivers/serial/8250.c 2005-06-18 12:05:39.000000000 -0400 -@@ -45,6 +45,10 @@ - #include - #include - -+#ifdef CONFIG_PPC_PMAC -+#include -+#endif -+ - #include "8250.h" - - /* -@@ -2307,6 +2312,12 @@ - - static int __init serial8250_console_init(void) - { -+#ifdef CONFIG_PPC_PMAC -+ if(machine_is(powermac)) { -+ printk("%s: nothing to do on PowerMac\n",__FUNCTION__); -+ return -ENODEV; -+ } -+#endif - serial8250_isa_init_ports(); - register_console(&serial8250_console); - return 0; -@@ -2617,6 +2628,13 @@ - { - int ret, i; - -+#ifdef CONFIG_PPC_PMAC -+ if(machine_is(powermac)) { -+ printk("%s: nothing to do on PowerMac\n",__FUNCTION__); -+ return -ENODEV; -+ } -+#endif -+ - if (nr_uarts > UART_NR) - nr_uarts = UART_NR; - diff --git a/debian/patches/bugfix/s390/prevent-ptrace-padding-area-read-write-in-31-bit-mode.patch b/debian/patches/bugfix/s390/prevent-ptrace-padding-area-read-write-in-31-bit-mode.patch new file mode 100644 index 000000000..eda78d4ef --- /dev/null +++ b/debian/patches/bugfix/s390/prevent-ptrace-padding-area-read-write-in-31-bit-mode.patch @@ -0,0 +1,113 @@ +From 3d6e48f43340343d97839eadb1ab7b6a3ea98797 Mon Sep 17 00:00:00 2001 +From: Jarod Wilson +Date: Tue, 9 Sep 2008 12:38:56 +0200 +Subject: S390: CVE-2008-1514: prevent ptrace padding area read/write in 31-bit mode + +From: Jarod Wilson + +commit 3d6e48f43340343d97839eadb1ab7b6a3ea98797 upstream + +When running a 31-bit ptrace, on either an s390 or s390x kernel, +reads and writes into a padding area in struct user_regs_struct32 +will result in a kernel panic. + +This is also known as CVE-2008-1514. 
+ +Test case available here: +http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/user-area-padding.c?cvsroot=systemtap + +Steps to reproduce: +1) wget the above +2) gcc -o user-area-padding-31bit user-area-padding.c -Wall -ggdb2 -D_GNU_SOURCE -m31 +3) ./user-area-padding-31bit + + +Test status +----------- +Without patch, both s390 and s390x kernels panic. With patch, the test case, +as well as the gdb testsuite, pass without incident, padding area reads +returning zero, writes ignored. + +Nb: original version returned -EINVAL on write attempts, which broke the +gdb test and made the test case slightly unhappy, Jan Kratochvil suggested +the change to return 0 on write attempts. + +Signed-off-by: Jarod Wilson +Tested-by: Jan Kratochvil +Signed-off-by: Martin Schwidefsky +Cc: Moritz Muehlenhoff +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/kernel/compat_ptrace.h | 1 + + arch/s390/kernel/ptrace.c | 28 ++++++++++++++++++++++++++++ + 2 files changed, 29 insertions(+) + +--- a/arch/s390/kernel/compat_ptrace.h ++++ b/arch/s390/kernel/compat_ptrace.h +@@ -42,6 +42,7 @@ struct user_regs_struct32 + u32 gprs[NUM_GPRS]; + u32 acrs[NUM_ACRS]; + u32 orig_gpr2; ++ /* nb: there's a 4-byte hole here */ + s390_fp_regs fp_regs; + /* + * These per registers are in here so that gdb can modify them +--- a/arch/s390/kernel/ptrace.c ++++ b/arch/s390/kernel/ptrace.c +@@ -177,6 +177,13 @@ peek_user(struct task_struct *child, add + */ + tmp = (addr_t) task_pt_regs(child)->orig_gpr2; + ++ } else if (addr < (addr_t) &dummy->regs.fp_regs) { ++ /* ++ * prevent reads of padding hole between ++ * orig_gpr2 and fp_regs on s390. ++ */ ++ tmp = 0; ++ + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + /* + * floating point regs. 
are stored in the thread structure +@@ -268,6 +275,13 @@ poke_user(struct task_struct *child, add + */ + task_pt_regs(child)->orig_gpr2 = data; + ++ } else if (addr < (addr_t) &dummy->regs.fp_regs) { ++ /* ++ * prevent writes of padding hole between ++ * orig_gpr2 and fp_regs on s390. ++ */ ++ return 0; ++ + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { + /* + * floating point regs. are stored in the thread structure +@@ -409,6 +423,13 @@ peek_user_emu31(struct task_struct *chil + */ + tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); + ++ } else if (addr < (addr_t) &dummy32->regs.fp_regs) { ++ /* ++ * prevent reads of padding hole between ++ * orig_gpr2 and fp_regs on s390. ++ */ ++ tmp = 0; ++ + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + /* + * floating point regs. are stored in the thread structure +@@ -488,6 +509,13 @@ poke_user_emu31(struct task_struct *chil + */ + *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; + ++ } else if (addr < (addr_t) &dummy32->regs.fp_regs) { ++ /* ++ * prevent writess of padding hole between ++ * orig_gpr2 and fp_regs on s390. ++ */ ++ return 0; ++ + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { + /* + * floating point regs. 
are stored in the thread structure diff --git a/debian/patches/bugfix/x86/nonpnp-rtc-device-1.patch b/debian/patches/bugfix/x86/nonpnp-rtc-device-1.patch new file mode 100644 index 000000000..2c7993706 --- /dev/null +++ b/debian/patches/bugfix/x86/nonpnp-rtc-device-1.patch @@ -0,0 +1,148 @@ +From: Bjorn Helgaas +Date: Tue, 14 Oct 2008 23:01:03 +0000 (-0600) +Subject: x86: register a platform RTC device if PNP doesn't describe it +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=758a7f7bb86b520aadc484f23da85e547b3bf3d8 + +x86: register a platform RTC device if PNP doesn't describe it + +Most if not all x86 platforms have an RTC device, but sometimes the RTC +is not exposed as a PNP0b00/PNP0b01/PNP0b02 device in PNPBIOS or ACPI: + + http://bugzilla.kernel.org/show_bug.cgi?id=11580 + https://bugzilla.redhat.com/show_bug.cgi?id=451188 + +It's best if we can discover the RTC via PNP because then we know +which flavor of device it is, where it lives, and which IRQ it uses. + +But if we can't, we should register a platform device using the +compiled-in RTC_PORT/RTC_IRQ resource assumptions. + +Signed-off-by: Bjorn Helgaas +Acked-by: Rafael J. 
Wysocki +Acked-by: David Brownell +Reported-by: Rik Theys +Reported-by: shr_msn@yahoo.com.tw +Signed-off-by: Linus Torvalds +--- + +diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c +index 05191bb..0a23b57 100644 +--- a/arch/x86/kernel/rtc.c ++++ b/arch/x86/kernel/rtc.c +@@ -223,11 +223,25 @@ static struct platform_device rtc_device = { + static __init int add_rtc_cmos(void) + { + #ifdef CONFIG_PNP +- if (!pnp_platform_devices) +- platform_device_register(&rtc_device); +-#else ++ static const char *ids[] __initconst = ++ { "PNP0b00", "PNP0b01", "PNP0b02", }; ++ struct pnp_dev *dev; ++ struct pnp_id *id; ++ int i; ++ ++ pnp_for_each_dev(dev) { ++ for (id = dev->id; id; id = id->next) { ++ for (i = 0; i < ARRAY_SIZE(ids); i++) { ++ if (compare_pnp_id(id, ids[i]) != 0) ++ return 0; ++ } ++ } ++ } ++#endif ++ + platform_device_register(&rtc_device); +-#endif /* CONFIG_PNP */ ++ dev_info(&rtc_device.dev, ++ "registered platform RTC device (no PNP device found)\n"); + return 0; + } + device_initcall(add_rtc_cmos); + +From: Bjorn Helgaas +Date: Tue, 14 Oct 2008 23:01:59 +0000 (-0600) +Subject: rtc-cmos: look for PNP RTC first, then for platform RTC +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=72f22b1eb6ca5e4676a632a04d40d46cb61d4562 + +rtc-cmos: look for PNP RTC first, then for platform RTC + +We shouldn't rely on "pnp_platform_devices" to tell us whether there +is a PNP RTC device. + +I introduced "pnp_platform_devices", but I think it was a mistake. +All it tells us is whether we found any PNPBIOS or PNPACPI devices. +Many machines have some PNP devices, but do not describe the RTC +via PNP. On those machines, we need to do the platform driver probe +to find the RTC. + +We should just register the PNP driver and see whether it claims anything. +If we don't find a PNP RTC, fall back to the platform driver probe. 
+ +This (in conjunction with the arch/x86/kernel/rtc.c patch to add +a platform RTC device when PNP doesn't have one) should resolve +these issues: + + http://bugzilla.kernel.org/show_bug.cgi?id=11580 + https://bugzilla.redhat.com/show_bug.cgi?id=451188 + +Signed-off-by: Bjorn Helgaas +Acked-by: Rafael J. Wysocki +Acked-by: David Brownell +Reported-by: Rik Theys +Reported-by: shr_msn@yahoo.com.tw +Signed-off-by: Linus Torvalds +--- + +diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c +index 6778f82..963ad0b 100644 +--- a/drivers/rtc/rtc-cmos.c ++++ b/drivers/rtc/rtc-cmos.c +@@ -1120,29 +1120,32 @@ static struct platform_driver cmos_platform_driver = { + + static int __init cmos_init(void) + { ++ int retval = 0; ++ + #ifdef CONFIG_PNP +- if (pnp_platform_devices) +- return pnp_register_driver(&cmos_pnp_driver); +- else +- return platform_driver_probe(&cmos_platform_driver, +- cmos_platform_probe); +-#else +- return platform_driver_probe(&cmos_platform_driver, +- cmos_platform_probe); +-#endif /* CONFIG_PNP */ ++ pnp_register_driver(&cmos_pnp_driver); ++#endif ++ ++ if (!cmos_rtc.dev) ++ retval = platform_driver_probe(&cmos_platform_driver, ++ cmos_platform_probe); ++ ++ if (retval == 0) ++ return 0; ++ ++#ifdef CONFIG_PNP ++ pnp_unregister_driver(&cmos_pnp_driver); ++#endif ++ return retval; + } + module_init(cmos_init); + + static void __exit cmos_exit(void) + { + #ifdef CONFIG_PNP +- if (pnp_platform_devices) +- pnp_unregister_driver(&cmos_pnp_driver); +- else +- platform_driver_unregister(&cmos_platform_driver); +-#else ++ pnp_unregister_driver(&cmos_pnp_driver); ++#endif + platform_driver_unregister(&cmos_platform_driver); +-#endif /* CONFIG_PNP */ + } + module_exit(cmos_exit); + diff --git a/debian/patches/bugfix/x86/nonpnp-rtc-device.patch b/debian/patches/bugfix/x86/nonpnp-rtc-device.patch new file mode 100644 index 000000000..9f982778d --- /dev/null +++ b/debian/patches/bugfix/x86/nonpnp-rtc-device.patch @@ -0,0 +1,129 @@ +From: David Brownell + 
+A bugzilla entry (http://bugzilla.kernel.org/show_bug.cgi?id=11580) +reports that PNPACPI tables in some HP servers don't list RTC devices. + +Work around that on x86 (ignore ia64, the other user of ACPI) by +having ACPI glue check for that case, and if necessary then setting +up a platform device and having rtc_cmos use it. + +Signed-off-by: David Brownell +--- + arch/x86/kernel/rtc.c | 13 ++++++++++++- + drivers/acpi/glue.c | 14 +++++++++++++- + drivers/pnp/core.c | 7 +++++++ + drivers/rtc/rtc-cmos.c | 4 ++-- + include/asm-x86/mc146818rtc.h | 5 +++++ + include/linux/pnp.h | 1 + + 6 files changed, 40 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/rtc.c ++++ b/arch/x86/kernel/rtc.c +@@ -220,10 +220,21 @@ static struct platform_device rtc_device + .num_resources = ARRAY_SIZE(rtc_resources), + }; + ++#ifdef CONFIG_PNP ++/* PNPACPI tables sometimes omit the RTC, or are ignored */ ++struct device *__init add_nonpnp_rtc_cmos(void) ++{ ++ if (!rtc_device.dev.bus) ++ platform_device_register(&rtc_device); ++ return &rtc_device.dev; ++} ++#endif ++ + static __init int add_rtc_cmos(void) + { + #ifdef CONFIG_PNP +- if (!pnp_platform_devices) ++ /* sometimes pnpacpi=off */ ++ if (!pnp_platform_devices && !rtc_device.dev.bus) + platform_device_register(&rtc_device); + #else + platform_device_register(&rtc_device); +--- a/drivers/acpi/glue.c ++++ b/drivers/acpi/glue.c +@@ -338,7 +338,19 @@ static int __init pnp_match(struct devic + + static struct device *__init get_rtc_dev(void) + { +- return bus_find_device(&pnp_bus_type, NULL, NULL, pnp_match); ++ struct device *rtc; ++ ++ /* return RTC from PNPACPI tables */ ++ rtc = bus_find_device(&pnp_bus_type, NULL, NULL, pnp_match); ++ ++#ifdef ARCH_PNP_RTC_WORKAROUND ++ /* cope with buggy PNPACPI tables; seen on some HP DL3x0 servers */ ++ if (!rtc) { ++ pnp_rtc_missing = true; ++ rtc = add_nonpnp_rtc_cmos(); ++ } ++#endif ++ return rtc; + } + + static int __init acpi_rtc_init(void) +--- a/drivers/pnp/core.c ++++ 
b/drivers/pnp/core.c +@@ -25,10 +25,17 @@ DEFINE_SPINLOCK(pnp_lock); + * ACPI or PNPBIOS should tell us about all platform devices, so we can + * skip some blind probes. ISAPNP typically enumerates only plug-in ISA + * devices, not built-in things like COM ports. ++ * ++ * Sometimes ACPI tables omit critical devices, assigning them to grab-bag ++ * nodes and forcing drivers to use a platform_device node or (yeech!) use ++ * poke-at-the-hardware algorithms for device discovery. + */ + int pnp_platform_devices; + EXPORT_SYMBOL(pnp_platform_devices); + ++bool pnp_rtc_missing; ++EXPORT_SYMBOL(pnp_rtc_missing); ++ + void *pnp_alloc(long size) + { + void *result; +--- a/drivers/rtc/rtc-cmos.c ++++ b/drivers/rtc/rtc-cmos.c +@@ -1137,7 +1137,7 @@ static struct platform_driver cmos_platf + static int __init cmos_init(void) + { + #ifdef CONFIG_PNP +- if (pnp_platform_devices) ++ if (pnp_platform_devices && !pnp_rtc_missing) + return pnp_register_driver(&cmos_pnp_driver); + else + return platform_driver_probe(&cmos_platform_driver, +@@ -1152,7 +1152,7 @@ module_init(cmos_init); + static void __exit cmos_exit(void) + { + #ifdef CONFIG_PNP +- if (pnp_platform_devices) ++ if (pnp_platform_devices && !pnp_rtc_missing) + pnp_unregister_driver(&cmos_pnp_driver); + else + platform_driver_unregister(&cmos_platform_driver); +--- a/include/asm-x86/mc146818rtc.h ++++ b/include/asm-x86/mc146818rtc.h +@@ -101,4 +101,9 @@ extern unsigned long mach_get_cmos_time( + + #define RTC_IRQ 8 + ++#ifdef CONFIG_PNP ++#define ARCH_PNP_RTC_WORKAROUND ++extern struct device *add_nonpnp_rtc_cmos(void); ++#endif ++ + #endif /* _ASM_MC146818RTC_H */ +--- a/include/linux/pnp.h ++++ b/include/linux/pnp.h +@@ -420,6 +420,7 @@ int pnp_device_attach(struct pnp_dev *pn + void pnp_device_detach(struct pnp_dev *pnp_dev); + extern struct list_head pnp_global; + extern int pnp_platform_devices; ++extern bool pnp_rtc_missing; + + /* multidevice card support */ + struct pnp_dev *pnp_request_card_device(struct 
pnp_card_link *clink, diff --git a/debian/patches/debian/drivers-ata-pata_sis-postpone-pata.patch b/debian/patches/debian/drivers-ata-pata_sis-postpone-pata.patch new file mode 100644 index 000000000..4ad71ff53 --- /dev/null +++ b/debian/patches/debian/drivers-ata-pata_sis-postpone-pata.patch @@ -0,0 +1,69 @@ +diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig +index ae84949..f17c19b 100644 +--- a/drivers/ata/Kconfig ++++ b/drivers/ata/Kconfig +@@ -172,7 +172,7 @@ config SATA_SIL + config SATA_SIS + tristate "SiS 964/965/966/180 SATA support" + depends on PCI +- select PATA_SIS ++ select PATA_SIS_STUB + help + This option enables support for SiS Serial ATA on + SiS 964/965/966/180 and Parallel ATA on SiS 180. +@@ -618,9 +618,13 @@ config PATA_SIL680 + + If unsure, say N. + ++config PATA_SIS_STUB ++ tristate ++ + config PATA_SIS + tristate "SiS PATA support" + depends on PCI ++ select PATA_SIS_STUB + help + This option enables support for SiS PATA controllers + +diff --git a/drivers/ata/Makefile b/drivers/ata/Makefile +index 674965f..c4bce57 100644 +--- a/drivers/ata/Makefile ++++ b/drivers/ata/Makefile +@@ -63,7 +63,7 @@ obj-$(CONFIG_PATA_SIL680) += pata_sil680.o + obj-$(CONFIG_PATA_VIA) += pata_via.o + obj-$(CONFIG_PATA_WINBOND) += pata_sl82c105.o + obj-$(CONFIG_PATA_WINBOND_VLB) += pata_winbond.o +-obj-$(CONFIG_PATA_SIS) += pata_sis.o ++obj-$(CONFIG_PATA_SIS_STUB) += pata_sis.o + obj-$(CONFIG_PATA_TRIFLEX) += pata_triflex.o + obj-$(CONFIG_PATA_IXP4XX_CF) += pata_ixp4xx_cf.o + obj-$(CONFIG_PATA_SCC) += pata_scc.o +diff --git a/drivers/ata/pata_sis.c b/drivers/ata/pata_sis.c +index 26345d7..2c3e3ba 100644 +--- a/drivers/ata/pata_sis.c ++++ b/drivers/ata/pata_sis.c +@@ -826,13 +826,16 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) + } + + static const struct pci_device_id sis_pci_tbl[] = { ++#ifdef CONFIG_PATA_SIS + { PCI_VDEVICE(SI, 0x5513), }, /* SiS 5513 */ + { PCI_VDEVICE(SI, 0x5518), }, /* SiS 5518 */ + { PCI_VDEVICE(SI, 
0x1180), }, /* SiS 1180 */ ++#endif + + { } + }; + ++#ifdef CONFIG_PATA_SIS + static struct pci_driver sis_pci_driver = { + .name = DRV_NAME, + .id_table = sis_pci_tbl, +@@ -856,6 +859,7 @@ static void __exit sis_exit(void) + + module_init(sis_init); + module_exit(sis_exit); ++#endif + + MODULE_AUTHOR("Alan Cox"); + MODULE_DESCRIPTION("SCSI low-level driver for SiS ATA"); diff --git a/debian/patches/debian/scripts-kconfig-reportoldconfig.patch b/debian/patches/debian/scripts-kconfig-reportoldconfig.patch index 91bbbbe94..007af77c4 100644 --- a/debian/patches/debian/scripts-kconfig-reportoldconfig.patch +++ b/debian/patches/debian/scripts-kconfig-reportoldconfig.patch @@ -2,15 +2,9 @@ diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 32e8c5a..7e14c56 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile -@@ -2,7 +2,7 @@ - # Kernel configuration targets - # These targets are used from top-level makefile - +@@ -5,1 +5,1 @@ -PHONY += oldconfig xconfig gconfig menuconfig config silentoldconfig update-po-config +PHONY += oldconfig xconfig gconfig menuconfig config reportoldconfig silentoldconfig updateoldconfig update-po-config - - Kconfig := arch/$(SRCARCH)/Kconfig - @@ -21,9 +21,15 @@ config: $(obj)/conf oldconfig: $(obj)/conf $< -o $(Kconfig) diff --git a/debian/patches/features/all/vserver/bindmount-dev.patch b/debian/patches/features/all/vserver/bindmount-dev.patch index b43f54100..2cb270691 100644 --- a/debian/patches/features/all/vserver/bindmount-dev.patch +++ b/debian/patches/features/all/vserver/bindmount-dev.patch @@ -7,19 +7,19 @@ + if (!capable(CAP_SYS_ADMIN) && (old_nd.path.mnt->mnt_flags & MNT_NODEV)) + mnt_flags |= MNT_NODEV; + - mnt->mnt_flags = mnt_flags; - if (flags & MS_TAGID) { - mnt->mnt_tag = tag; + err = graft_tree(mnt, &nd->path); + if (err) { + LIST_HEAD(umount_list); @@ -1030,6 +1033,9 @@ static int do_remount(struct nameidata * - if (nd->dentry != nd->mnt->mnt_root) + if (nd->path.dentry != 
nd->path.mnt->mnt_root) return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + mnt_flags |= MNT_NODEV; + down_write(&sb->s_umount); - err = do_remount_sb(sb, flags, data, 0); - if (!err) + if (flags & MS_BIND) + err = change_mount_flags(nd->path.mnt, flags); @@ -1138,6 +1144,9 @@ static int do_new_mount(struct nameidata if (!vx_capable(CAP_SYS_ADMIN, VXC_SECURE_MOUNT)) return -EPERM; @@ -31,8 +31,8 @@ if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -1489,8 +1498,6 @@ long do_mount(char *dev_name, char *dir_ - if (flags & MS_RELATIME) - mnt_flags |= MNT_RELATIME; + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; - if (!capable(CAP_SYS_ADMIN)) - mnt_flags |= MNT_NODEV; diff --git a/debian/patches/features/all/vserver/gen-patch b/debian/patches/features/all/vserver/gen-patch index a99dccb16..88a2cf83e 100755 --- a/debian/patches/features/all/vserver/gen-patch +++ b/debian/patches/features/all/vserver/gen-patch @@ -11,4 +11,4 @@ version=$(filterdiff -p 1 -i Makefile "$patch" | grep "+EXTRAVERSION" | sed -e ' file="$(dirname $0)/$version.patch" -filterdiff -p 1 --strip 1 --addprefix=a/ -x Makefile "$patch" | grep -v "^diff" > "$file" +filterdiff -p 1 --strip 1 --addprefix=a/ -x Makefile -x include/linux/Kbuild -x include/linux/vserver/Kbuild "$patch" | grep -v "^diff" > "$file" diff --git a/debian/patches/features/all/vserver/vs2.3.0.35-update.patch b/debian/patches/features/all/vserver/vs2.3.0.35-update.patch new file mode 100644 index 000000000..9c4f3be82 --- /dev/null +++ b/debian/patches/features/all/vserver/vs2.3.0.35-update.patch @@ -0,0 +1,48 @@ +diff --git a/include/asm-s390/tlb.h b/include/asm-s390/tlb.h +index 3d8a96d..656a647 100644 +--- a/include/asm-s390/tlb.h ++++ b/include/asm-s390/tlb.h +@@ -27,6 +27,7 @@ + #include + #include + #include ++#include + + #ifndef CONFIG_SMP + #define TLB_NR_PTRS 1 +diff --git a/include/linux/vserver/debug.h b/include/linux/vserver/debug.h +index 2c989b5..c43bbb9 100644 +--- a/include/linux/vserver/debug.h ++++ 
b/include/linux/vserver/debug.h +@@ -11,9 +11,6 @@ + #define VXF_DEV "%p[%lu,%d:%d]" + + +-#define __FUNC__ __func__ +- +- + #define vxd_path(p) \ + ({ static char _buffer[PATH_MAX]; \ + d_path(p, _buffer, sizeof(_buffer)); }) +diff --git a/include/linux/vserver/pid.h b/include/linux/vserver/pid.h +index eb0b9cb..c76a6ee 100644 +--- a/include/linux/vserver/pid.h ++++ b/include/linux/vserver/pid.h +@@ -4,7 +4,7 @@ + /* pid faking stuff */ + + #define vx_info_map_pid(v, p) \ +- __vx_info_map_pid((v), (p), __FUNC__, __FILE__, __LINE__) ++ __vx_info_map_pid((v), (p), __func__, __FILE__, __LINE__) + #define vx_info_map_tgid(v,p) vx_info_map_pid(v,p) + #define vx_map_pid(p) vx_info_map_pid(current->vx_info, p) + #define vx_map_tgid(p) vx_map_pid(p) +@@ -27,7 +27,7 @@ static inline int __vx_info_map_pid(struct vx_info *vxi, int pid, + } + + #define vx_info_rmap_pid(v, p) \ +- __vx_info_rmap_pid((v), (p), __FUNC__, __FILE__, __LINE__) ++ __vx_info_rmap_pid((v), (p), __func__, __FILE__, __LINE__) + #define vx_rmap_pid(p) vx_info_rmap_pid(current->vx_info, p) + #define vx_rmap_tgid(p) vx_rmap_pid(p) + diff --git a/debian/patches/features/all/vserver/vs2.3.0.34.11.patch b/debian/patches/features/all/vserver/vs2.3.0.35.patch similarity index 83% rename from debian/patches/features/all/vserver/vs2.3.0.34.11.patch rename to debian/patches/features/all/vserver/vs2.3.0.35.patch index 2602c827f..89985304f 100644 --- a/debian/patches/features/all/vserver/vs2.3.0.34.11.patch +++ b/debian/patches/features/all/vserver/vs2.3.0.35.patch @@ -1,5 +1,5 @@ --- a/arch/alpha/Kconfig 2008-04-17 12:05:26.000000000 -0400 -+++ a/arch/alpha/Kconfig 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/alpha/Kconfig 2008-07-16 22:41:36.000000000 -0400 @@ -671,6 +671,8 @@ config DUMMY_CONSOLE depends on VGA_HOSE default y @@ -10,7 +10,7 @@ source "crypto/Kconfig" --- a/arch/alpha/kernel/entry.S 2008-04-17 11:31:21.000000000 -0400 -+++ a/arch/alpha/kernel/entry.S 2008-04-21 11:09:01.000000000 -0400 ++++ 
a/arch/alpha/kernel/entry.S 2008-07-16 22:41:36.000000000 -0400 @@ -872,24 +872,15 @@ sys_getxgid: .globl sys_getxpid .ent sys_getxpid @@ -43,8 +43,8 @@ ret .end sys_getxpid ---- a/arch/alpha/kernel/osf_sys.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/arch/alpha/kernel/osf_sys.c 2008-05-21 14:30:40.000000000 -0400 +--- a/arch/alpha/kernel/osf_sys.c 2008-07-14 17:22:23.000000000 -0400 ++++ a/arch/alpha/kernel/osf_sys.c 2008-07-16 22:41:36.000000000 -0400 @@ -883,7 +883,7 @@ osf_gettimeofday(struct timeval32 __user { if (tv) { @@ -55,7 +55,7 @@ return -EFAULT; } --- a/arch/alpha/kernel/ptrace.c 2008-04-17 11:31:21.000000000 -0400 -+++ a/arch/alpha/kernel/ptrace.c 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/alpha/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -15,6 +15,7 @@ #include #include @@ -64,43 +64,8 @@ #include #include ---- a/arch/alpha/kernel/semaphore.c 2008-04-17 11:31:21.000000000 -0400 -+++ a/arch/alpha/kernel/semaphore.c 2008-04-19 15:14:51.000000000 -0400 -@@ -68,8 +68,8 @@ __down_failed(struct semaphore *sem) - DECLARE_WAITQUEUE(wait, tsk); - - #ifdef CONFIG_DEBUG_SEMAPHORE -- printk("%s(%d): down failed(%p)\n", -- tsk->comm, task_pid_nr(tsk), sem); -+ printk("%s(%d:#%u): down failed(%p)\n", -+ tsk->comm, task_pid_nr(tsk), tsk->xid, sem); - #endif - - tsk->state = TASK_UNINTERRUPTIBLE; -@@ -97,8 +97,8 @@ __down_failed(struct semaphore *sem) - wake_up(&sem->wait); - - #ifdef CONFIG_DEBUG_SEMAPHORE -- printk("%s(%d): down acquired(%p)\n", -- tsk->comm, task_pid_nr(tsk), sem); -+ printk("%s(%d:#%u): down acquired(%p)\n", -+ tsk->comm, task_pid_nr(tsk), tsk->xid, sem); - #endif - } - -@@ -110,8 +110,8 @@ __down_failed_interruptible(struct semap - long ret = 0; - - #ifdef CONFIG_DEBUG_SEMAPHORE -- printk("%s(%d): down failed(%p)\n", -- tsk->comm, task_pid_nr(tsk), sem); -+ printk("%s(%d:#%u): down failed(%p)\n", -+ tsk->comm, task_pid_nr(tsk), tsk->xid, sem); - #endif - - tsk->state = TASK_INTERRUPTIBLE; --- a/arch/alpha/kernel/systbls.S 
2008-04-17 12:05:26.000000000 -0400 -+++ a/arch/alpha/kernel/systbls.S 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/alpha/kernel/systbls.S 2008-07-16 22:41:36.000000000 -0400 @@ -446,7 +446,7 @@ sys_call_table: .quad sys_stat64 /* 425 */ .quad sys_lstat64 @@ -110,9 +75,9 @@ .quad sys_ni_syscall /* sys_mbind */ .quad sys_ni_syscall /* sys_get_mempolicy */ .quad sys_ni_syscall /* sys_set_mempolicy */ ---- a/arch/alpha/kernel/traps.c 2008-04-17 11:31:21.000000000 -0400 -+++ a/arch/alpha/kernel/traps.c 2008-04-19 15:14:51.000000000 -0400 -@@ -182,7 +182,8 @@ die_if_kernel(char * str, struct pt_regs +--- a/arch/alpha/kernel/traps.c 2008-07-14 17:22:23.000000000 -0400 ++++ a/arch/alpha/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 +@@ -183,7 +183,8 @@ die_if_kernel(char * str, struct pt_regs #ifdef CONFIG_SMP printk("CPU %d ", hard_smp_processor_id()); #endif @@ -123,7 +88,7 @@ add_taint(TAINT_DIE); dik_show_trace((unsigned long *)(regs+1)); --- a/arch/alpha/mm/fault.c 2008-04-17 11:31:21.000000000 -0400 -+++ a/arch/alpha/mm/fault.c 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/alpha/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 @@ -193,8 +193,8 @@ do_page_fault(unsigned long address, uns down_read(&mm->mmap_sem); goto survive; @@ -135,9 +100,9 @@ if (!user_mode(regs)) goto no_context; do_group_exit(SIGKILL); ---- a/arch/arm/Kconfig 2008-04-17 12:05:26.000000000 -0400 -+++ a/arch/arm/Kconfig 2008-04-19 15:14:51.000000000 -0400 -@@ -1180,6 +1180,8 @@ source "fs/Kconfig" +--- a/arch/arm/Kconfig 2008-07-14 17:22:23.000000000 -0400 ++++ a/arch/arm/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -1187,6 +1187,8 @@ source "fs/Kconfig" source "arch/arm/Kconfig.debug" @@ -146,8 +111,8 @@ source "security/Kconfig" source "crypto/Kconfig" ---- a/arch/arm/kernel/calls.S 2008-04-17 12:05:26.000000000 -0400 -+++ a/arch/arm/kernel/calls.S 2008-04-19 15:14:51.000000000 -0400 +--- a/arch/arm/kernel/calls.S 2008-07-14 17:22:23.000000000 -0400 ++++ a/arch/arm/kernel/calls.S 
2008-07-16 22:41:36.000000000 -0400 @@ -322,7 +322,7 @@ /* 310 */ CALL(sys_request_key) CALL(sys_keyctl) @@ -158,7 +123,7 @@ /* 315 */ CALL(sys_ioprio_get) CALL(sys_inotify_init) --- a/arch/arm/kernel/process.c 2008-04-17 12:05:26.000000000 -0400 -+++ a/arch/arm/kernel/process.c 2008-04-21 11:09:01.000000000 -0400 ++++ a/arch/arm/kernel/process.c 2008-07-16 22:41:36.000000000 -0400 @@ -264,7 +264,8 @@ void __show_regs(struct pt_regs *regs) void show_regs(struct pt_regs * regs) { @@ -170,7 +135,7 @@ __backtrace(); } --- a/arch/arm/kernel/traps.c 2008-04-17 12:05:26.000000000 -0400 -+++ a/arch/arm/kernel/traps.c 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/arm/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 @@ -214,8 +214,8 @@ static void __die(const char *str, int e str, err, ++die_counter); print_modules(); @@ -183,7 +148,7 @@ if (!user_mode(regs) || in_interrupt()) { dump_mem("Stack: ", regs->ARM_sp, --- a/arch/arm/mm/fault.c 2008-04-17 12:05:27.000000000 -0400 -+++ a/arch/arm/mm/fault.c 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/arm/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 @@ -292,7 +292,8 @@ do_page_fault(unsigned long addr, unsign * happened to us that made us unable to handle * the page fault gracefully. 
@@ -195,7 +160,7 @@ return 0; } --- a/arch/cris/Kconfig 2008-04-17 12:05:27.000000000 -0400 -+++ a/arch/cris/Kconfig 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/cris/Kconfig 2008-07-16 22:41:36.000000000 -0400 @@ -679,6 +679,8 @@ source "drivers/usb/Kconfig" source "arch/cris/Kconfig.debug" @@ -206,7 +171,7 @@ source "crypto/Kconfig" --- a/arch/frv/kernel/kernel_thread.S 2007-02-04 13:44:54.000000000 -0500 -+++ a/arch/frv/kernel/kernel_thread.S 2008-04-21 11:09:01.000000000 -0400 ++++ a/arch/frv/kernel/kernel_thread.S 2008-07-16 22:41:36.000000000 -0400 @@ -37,7 +37,7 @@ kernel_thread: # start by forking the current process, but with shared VM @@ -217,7 +182,7 @@ setlo #0xe4e4,gr9 setlos.p #0,gr10 ; third syscall arg [parent_tidptr] --- a/arch/h8300/Kconfig 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/h8300/Kconfig 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/h8300/Kconfig 2008-07-16 22:41:36.000000000 -0400 @@ -233,6 +233,8 @@ source "fs/Kconfig" source "arch/h8300/Kconfig.debug" @@ -228,7 +193,7 @@ source "crypto/Kconfig" --- a/arch/ia64/ia32/ia32_entry.S 2008-04-17 10:37:14.000000000 -0400 -+++ a/arch/ia64/ia32/ia32_entry.S 2008-04-19 15:14:51.000000000 -0400 ++++ a/arch/ia64/ia32/ia32_entry.S 2008-07-16 22:41:36.000000000 -0400 @@ -446,7 +446,7 @@ ia32_syscall_table: data8 sys_tgkill /* 270 */ data8 compat_sys_utimes @@ -238,9 +203,9 @@ data8 sys_ni_syscall data8 sys_ni_syscall /* 275 */ data8 sys_ni_syscall ---- a/arch/ia64/ia32/sys_ia32.c 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/ia64/ia32/sys_ia32.c 2008-04-19 15:14:51.000000000 -0400 -@@ -1177,7 +1177,7 @@ sys32_gettimeofday (struct compat_timeva +--- a/arch/ia64/ia32/sys_ia32.c 2008-07-14 17:22:26.000000000 -0400 ++++ a/arch/ia64/ia32/sys_ia32.c 2008-07-16 22:41:36.000000000 -0400 +@@ -1178,7 +1178,7 @@ sys32_gettimeofday (struct compat_timeva { if (tv) { struct timeval ktv; @@ -249,9 +214,9 @@ if (put_tv32(tv, &ktv)) return -EFAULT; } ---- a/arch/ia64/Kconfig 2008-04-17 12:05:28.000000000 
-0400 -+++ a/arch/ia64/Kconfig 2008-04-19 15:14:51.000000000 -0400 -@@ -615,6 +615,8 @@ source "arch/ia64/hp/sim/Kconfig" +--- a/arch/ia64/Kconfig 2008-07-14 17:22:26.000000000 -0400 ++++ a/arch/ia64/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -638,6 +638,8 @@ source "arch/ia64/hp/sim/Kconfig" source "arch/ia64/Kconfig.debug" @@ -260,9 +225,9 @@ source "security/Kconfig" source "crypto/Kconfig" ---- a/arch/ia64/kernel/entry.S 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/ia64/kernel/entry.S 2008-04-19 15:14:51.000000000 -0400 -@@ -1547,7 +1547,7 @@ sys_call_table: +--- a/arch/ia64/kernel/entry.S 2008-07-14 17:22:26.000000000 -0400 ++++ a/arch/ia64/kernel/entry.S 2008-07-16 22:41:36.000000000 -0400 +@@ -1619,7 +1619,7 @@ sys_call_table: data8 sys_mq_notify data8 sys_mq_getsetattr data8 sys_kexec_load @@ -271,8 +236,8 @@ data8 sys_waitid // 1270 data8 sys_add_key data8 sys_request_key ---- a/arch/ia64/kernel/perfmon.c 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/ia64/kernel/perfmon.c 2008-04-19 15:14:51.000000000 -0400 +--- a/arch/ia64/kernel/perfmon.c 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/ia64/kernel/perfmon.c 2008-07-16 22:41:36.000000000 -0400 @@ -40,6 +40,7 @@ #include #include @@ -281,7 +246,7 @@ #include #include -@@ -2374,7 +2375,7 @@ pfm_smpl_buffer_alloc(struct task_struct +@@ -2376,7 +2377,7 @@ pfm_smpl_buffer_alloc(struct task_struct */ insert_vm_struct(mm, vma); @@ -290,8 +255,8 @@ vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, vma_pages(vma)); up_write(&task->mm->mmap_sem); ---- a/arch/ia64/kernel/process.c 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/ia64/kernel/process.c 2008-04-21 11:09:01.000000000 -0400 +--- a/arch/ia64/kernel/process.c 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/ia64/kernel/process.c 2008-07-16 22:41:36.000000000 -0400 @@ -105,8 +105,8 @@ show_regs (struct pt_regs *regs) unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; @@ -303,18 +268,18 @@ printk("psr : %016lx ifs : %016lx ip : [<%016lx>] 
%s (%s)\n", regs->cr_ipsr, regs->cr_ifs, ip, print_tainted(), init_utsname()->release); ---- a/arch/ia64/kernel/ptrace.c 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/ia64/kernel/ptrace.c 2008-04-19 15:14:51.000000000 -0400 -@@ -17,6 +17,7 @@ - #include - #include +--- a/arch/ia64/kernel/ptrace.c 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/ia64/kernel/ptrace.c 2008-07-17 21:01:48.000000000 -0400 +@@ -22,6 +22,7 @@ #include + #include + #include +#include #include #include --- a/arch/ia64/kernel/traps.c 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/ia64/kernel/traps.c 2008-04-21 10:33:04.000000000 -0400 ++++ a/arch/ia64/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 @@ -60,8 +60,9 @@ die (const char *str, struct pt_regs *re put_cpu(); @@ -340,7 +305,7 @@ } } --- a/arch/ia64/mm/fault.c 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/ia64/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/ia64/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 @@ -10,6 +10,7 @@ #include #include @@ -350,7 +315,7 @@ #include #include --- a/arch/m32r/kernel/traps.c 2008-04-17 11:31:23.000000000 -0400 -+++ a/arch/m32r/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/m32r/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 @@ -195,8 +195,9 @@ static void show_registers(struct pt_reg } else { printk("SPI: %08lx\n", sp); @@ -363,9 +328,9 @@ /* * When in-kernel, we also print out the stack and code at the ---- a/arch/m68k/Kconfig 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/m68k/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -674,6 +674,8 @@ source "fs/Kconfig" +--- a/arch/m68k/Kconfig 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/m68k/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -667,6 +667,8 @@ source "fs/Kconfig" source "arch/m68k/Kconfig.debug" @@ -375,7 +340,7 @@ source "crypto/Kconfig" --- a/arch/m68k/kernel/ptrace.c 2008-04-17 11:31:23.000000000 -0400 -+++ a/arch/m68k/kernel/ptrace.c 2008-04-19 15:14:52.000000000 -0400 ++++ 
a/arch/m68k/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -18,6 +18,7 @@ #include #include @@ -393,9 +358,9 @@ return ret; out_eio: ---- a/arch/m68k/kernel/traps.c 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/m68k/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 -@@ -898,8 +898,8 @@ void show_registers(struct pt_regs *regs +--- a/arch/m68k/kernel/traps.c 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/m68k/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 +@@ -909,8 +909,8 @@ void show_registers(struct pt_regs *regs printk("d4: %08lx d5: %08lx a0: %08lx a1: %08lx\n", regs->d4, regs->d5, regs->a0, regs->a1); @@ -406,9 +371,9 @@ addr = (unsigned long)&fp->un; printk("Frame format=%X ", regs->format); switch (regs->format) { ---- a/arch/m68knommu/Kconfig 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/m68knommu/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -722,6 +722,8 @@ source "fs/Kconfig" +--- a/arch/m68knommu/Kconfig 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/m68knommu/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -725,6 +725,8 @@ source "fs/Kconfig" source "arch/m68knommu/Kconfig.debug" @@ -417,9 +382,9 @@ source "security/Kconfig" source "crypto/Kconfig" ---- a/arch/m68knommu/kernel/traps.c 2008-04-17 10:37:14.000000000 -0400 -+++ a/arch/m68knommu/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 -@@ -78,8 +78,9 @@ void die_if_kernel(char *str, struct pt_ +--- a/arch/m68knommu/kernel/traps.c 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/m68knommu/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 +@@ -79,8 +79,9 @@ void die_if_kernel(char *str, struct pt_ printk(KERN_EMERG "d4: %08lx d5: %08lx a0: %08lx a1: %08lx\n", fp->d4, fp->d5, fp->a0, fp->a1); @@ -431,9 +396,9 @@ show_stack(NULL, (unsigned long *)(fp + 1)); add_taint(TAINT_DIE); do_exit(SIGSEGV); ---- a/arch/mips/Kconfig 2008-04-17 12:05:28.000000000 -0400 -+++ a/arch/mips/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -2099,6 +2099,8 @@ source "fs/Kconfig" +--- 
a/arch/mips/Kconfig 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/mips/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -2131,6 +2131,8 @@ source "fs/Kconfig" source "arch/mips/Kconfig.debug" @@ -443,7 +408,7 @@ source "crypto/Kconfig" --- a/arch/mips/kernel/linux32.c 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/mips/kernel/linux32.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/mips/kernel/linux32.c 2008-07-16 22:41:36.000000000 -0400 @@ -209,7 +209,7 @@ sys32_gettimeofday(struct compat_timeval { if (tv) { @@ -454,7 +419,7 @@ return -EFAULT; } --- a/arch/mips/kernel/ptrace.c 2008-04-17 11:31:23.000000000 -0400 -+++ a/arch/mips/kernel/ptrace.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/mips/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -25,6 +25,7 @@ #include #include @@ -474,7 +439,7 @@ /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ --- a/arch/mips/kernel/scall32-o32.S 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/mips/kernel/scall32-o32.S 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/mips/kernel/scall32-o32.S 2008-07-16 22:41:36.000000000 -0400 @@ -619,7 +619,7 @@ einval: li v0, -EINVAL sys sys_mq_timedreceive 5 sys sys_mq_notify 2 /* 4275 */ @@ -485,7 +450,7 @@ sys sys_ni_syscall 0 /* available, was setaltroot */ sys sys_add_key 5 /* 4280 */ --- a/arch/mips/kernel/scall64-64.S 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/mips/kernel/scall64-64.S 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/mips/kernel/scall64-64.S 2008-07-16 22:41:36.000000000 -0400 @@ -434,7 +434,7 @@ sys_call_table: PTR sys_mq_timedreceive PTR sys_mq_notify @@ -496,7 +461,7 @@ PTR sys_ni_syscall /* available, was setaltroot */ PTR sys_add_key --- a/arch/mips/kernel/scall64-n32.S 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/mips/kernel/scall64-n32.S 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/mips/kernel/scall64-n32.S 2008-07-16 22:41:36.000000000 -0400 @@ -360,7 +360,7 @@ 
EXPORT(sysn32_call_table) PTR compat_sys_mq_timedreceive PTR compat_sys_mq_notify @@ -507,7 +472,7 @@ PTR sys_ni_syscall /* available, was setaltroot */ PTR sys_add_key --- a/arch/mips/kernel/scall64-o32.S 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/mips/kernel/scall64-o32.S 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/mips/kernel/scall64-o32.S 2008-07-16 22:41:36.000000000 -0400 @@ -482,7 +482,7 @@ sys_call_table: PTR compat_sys_mq_timedreceive PTR compat_sys_mq_notify /* 4275 */ @@ -517,22 +482,24 @@ PTR sys32_waitid PTR sys_ni_syscall /* available, was setaltroot */ PTR sys_add_key /* 4280 */ ---- a/arch/mips/kernel/traps.c 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/mips/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 -@@ -313,8 +313,9 @@ void show_registers(const struct pt_regs - { +--- a/arch/mips/kernel/traps.c 2008-07-14 17:22:27.000000000 -0400 ++++ a/arch/mips/kernel/traps.c 2008-07-17 21:08:35.000000000 -0400 +@@ -324,9 +324,10 @@ void show_registers(const struct pt_regs + __show_regs(regs); print_modules(); -- printk("Process %s (pid: %d, threadinfo=%p, task=%p)\n", -- current->comm, task_pid_nr(current), current_thread_info(), current); -+ printk("Process %s (pid: %d:#%u, threadinfo=%p, task=%p)\n", +- printk("Process %s (pid: %d, threadinfo=%p, task=%p, tls=%0*lx)\n", +- current->comm, current->pid, current_thread_info(), current, +- field, current_thread_info()->tp_value); ++ printk("Process %s (pid: %d:#%u, threadinfo=%p, task=%p, tls=%0*lx)\n", + current->comm, task_pid_nr(current), current->xid, -+ current_thread_info(), current); - show_stacktrace(current, regs); - show_code((unsigned int __user *) regs->cp0_epc); - printk("\n"); ++ current_thread_info(), current, ++ field, current_thread_info()->tp_value); + if (cpu_has_userlocal) { + unsigned long tls; + --- a/arch/mips/mm/fault.c 2008-04-17 11:31:24.000000000 -0400 -+++ a/arch/mips/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/mips/mm/fault.c 2008-07-16 
22:41:36.000000000 -0400 @@ -178,7 +178,8 @@ out_of_memory: down_read(&mm->mmap_sem); goto survive; @@ -544,7 +511,7 @@ do_group_exit(SIGKILL); goto no_context; --- a/arch/parisc/Kconfig 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/parisc/Kconfig 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/parisc/Kconfig 2008-07-16 22:41:36.000000000 -0400 @@ -278,6 +278,8 @@ source "fs/Kconfig" source "arch/parisc/Kconfig.debug" @@ -555,7 +522,7 @@ source "crypto/Kconfig" --- a/arch/parisc/kernel/syscall_table.S 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/parisc/kernel/syscall_table.S 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/parisc/kernel/syscall_table.S 2008-07-16 22:41:36.000000000 -0400 @@ -361,7 +361,7 @@ ENTRY_COMP(mbind) /* 260 */ ENTRY_COMP(get_mempolicy) @@ -565,9 +532,9 @@ ENTRY_SAME(add_key) ENTRY_SAME(request_key) /* 265 */ ENTRY_SAME(keyctl) ---- a/arch/parisc/kernel/sys_parisc32.c 2008-04-17 11:31:24.000000000 -0400 -+++ a/arch/parisc/kernel/sys_parisc32.c 2008-04-19 15:14:52.000000000 -0400 -@@ -204,11 +204,11 @@ static inline long get_ts32(struct times +--- a/arch/parisc/kernel/sys_parisc32.c 2008-07-14 17:22:28.000000000 -0400 ++++ a/arch/parisc/kernel/sys_parisc32.c 2008-07-16 22:41:36.000000000 -0400 +@@ -203,11 +203,11 @@ static inline long get_ts32(struct times asmlinkage int sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { @@ -581,8 +548,8 @@ if (put_compat_timeval(tv, &ktv)) return -EFAULT; } ---- a/arch/parisc/kernel/traps.c 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/parisc/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/parisc/kernel/traps.c 2008-07-14 17:22:28.000000000 -0400 ++++ a/arch/parisc/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 @@ -237,8 +237,9 @@ void die_if_kernel(char *str, struct pt_ if (err == 0) return; /* STFU */ @@ -607,7 +574,7 @@ /* Wot's wrong wif bein' racy? 
*/ if (current->thread.flags & PARISC_KERNEL_DEATH) { --- a/arch/parisc/mm/fault.c 2008-04-17 11:31:24.000000000 -0400 -+++ a/arch/parisc/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/parisc/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 @@ -210,8 +210,9 @@ bad_area: #ifdef PRINT_USER_FAULTS @@ -630,9 +597,9 @@ if (user_mode(regs)) do_group_exit(SIGKILL); goto no_context; ---- a/arch/powerpc/Kconfig 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/powerpc/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -706,6 +706,8 @@ source "lib/Kconfig" +--- a/arch/powerpc/Kconfig 2008-07-14 17:22:28.000000000 -0400 ++++ a/arch/powerpc/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -800,6 +800,8 @@ source "lib/Kconfig" source "arch/powerpc/Kconfig.debug" @@ -641,8 +608,8 @@ source "security/Kconfig" config KEYS_COMPAT ---- a/arch/powerpc/kernel/irq.c 2008-04-17 12:05:29.000000000 -0400 -+++ a/arch/powerpc/kernel/irq.c 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/powerpc/kernel/irq.c 2008-07-14 17:22:28.000000000 -0400 ++++ a/arch/powerpc/kernel/irq.c 2008-07-16 22:41:36.000000000 -0400 @@ -53,6 +53,7 @@ #include #include @@ -651,8 +618,8 @@ #include #include ---- a/arch/powerpc/kernel/process.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/powerpc/kernel/process.c 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/powerpc/kernel/process.c 2008-07-14 17:22:28.000000000 -0400 ++++ a/arch/powerpc/kernel/process.c 2008-07-16 22:41:36.000000000 -0400 @@ -464,8 +464,9 @@ void show_regs(struct pt_regs * regs) #else printk("DAR: "REG", DSISR: "REG"\n", regs->dar, regs->dsisr); @@ -665,9 +632,9 @@ #ifdef CONFIG_SMP printk(" CPU: %d", raw_smp_processor_id()); ---- a/arch/powerpc/kernel/sys_ppc32.c 2008-04-17 11:31:24.000000000 -0400 -+++ a/arch/powerpc/kernel/sys_ppc32.c 2008-04-19 15:14:52.000000000 -0400 -@@ -205,7 +205,7 @@ asmlinkage long compat_sys_gettimeofday( +--- a/arch/powerpc/kernel/sys_ppc32.c 2008-07-14 17:22:28.000000000 -0400 ++++ 
a/arch/powerpc/kernel/sys_ppc32.c 2008-07-16 22:41:36.000000000 -0400 +@@ -204,7 +204,7 @@ asmlinkage long compat_sys_gettimeofday( { if (tv) { struct timeval ktv; @@ -677,7 +644,7 @@ return -EFAULT; } --- a/arch/powerpc/kernel/traps.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/powerpc/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/powerpc/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 @@ -941,8 +941,9 @@ void nonrecoverable_exception(struct pt_ void trace_syscall(struct pt_regs *regs) @@ -690,18 +657,18 @@ regs->ccr&0x10000000?"Error=":"", regs->gpr[3], print_tainted()); } ---- a/arch/powerpc/kernel/vdso.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/powerpc/kernel/vdso.c 2008-04-19 15:14:52.000000000 -0400 -@@ -21,6 +21,7 @@ - #include +--- a/arch/powerpc/kernel/vdso.c 2008-07-14 17:22:28.000000000 -0400 ++++ a/arch/powerpc/kernel/vdso.c 2008-07-17 21:09:13.000000000 -0400 +@@ -22,6 +22,7 @@ #include #include + #include +#include #include #include --- a/arch/powerpc/mm/fault.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/powerpc/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/powerpc/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 @@ -378,7 +378,8 @@ out_of_memory: down_read(&mm->mmap_sem); goto survive; @@ -712,9 +679,9 @@ if (user_mode(regs)) do_group_exit(SIGKILL); return SIGKILL; ---- a/arch/ppc/Kconfig 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/ppc/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -1261,6 +1261,8 @@ source "lib/Kconfig" +--- a/arch/ppc/Kconfig 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/ppc/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -1181,6 +1181,8 @@ source "lib/Kconfig" source "arch/ppc/Kconfig.debug" @@ -724,7 +691,7 @@ source "crypto/Kconfig" --- a/arch/ppc/kernel/traps.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/ppc/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/ppc/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 @@ -669,8 +669,9 @@ void 
nonrecoverable_exception(struct pt_ void trace_syscall(struct pt_regs *regs) @@ -738,7 +705,7 @@ } --- a/arch/ppc/mm/fault.c 2008-04-17 11:31:25.000000000 -0400 -+++ a/arch/ppc/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/ppc/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 @@ -295,7 +295,8 @@ out_of_memory: down_read(&mm->mmap_sem); goto survive; @@ -749,9 +716,9 @@ if (user_mode(regs)) do_group_exit(SIGKILL); return SIGKILL; ---- a/arch/s390/Kconfig 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/s390/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -544,6 +544,8 @@ source "fs/Kconfig" +--- a/arch/s390/Kconfig 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/s390/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -562,6 +562,8 @@ source "fs/Kconfig" source "arch/s390/Kconfig.debug" @@ -760,9 +727,9 @@ source "security/Kconfig" source "crypto/Kconfig" ---- a/arch/s390/kernel/compat_linux.c 2008-04-17 11:31:25.000000000 -0400 -+++ a/arch/s390/kernel/compat_linux.c 2008-04-19 15:14:52.000000000 -0400 -@@ -567,7 +567,7 @@ asmlinkage long sys32_gettimeofday(struc +--- a/arch/s390/kernel/compat_linux.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/s390/kernel/compat_linux.c 2008-07-16 22:41:36.000000000 -0400 +@@ -566,7 +566,7 @@ asmlinkage long sys32_gettimeofday(struc { if (tv) { struct timeval ktv; @@ -771,23 +738,8 @@ if (put_tv32(tv, &ktv)) return -EFAULT; } ---- a/arch/s390/kernel/process.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/s390/kernel/process.c 2008-04-21 11:09:01.000000000 -0400 -@@ -194,9 +194,9 @@ void show_regs(struct pt_regs *regs) - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); -- printk("Process %s (pid: %d, task: %p, ksp: %p)\n", -- current->comm, current->pid, current, -- (void *) current->thread.ksp); -+ printk("Process %s (pid: %d[#%u], task: %p, ksp: %p)\n", -+ current->comm, current->pid, current->xid, -+ (void *) current, (void *) current->thread.ksp); - 
show_registers(regs); - /* Show stack backtrace if pt_regs is from kernel mode */ - if (!(regs->psw.mask & PSW_MASK_PSTATE)) ---- a/arch/s390/kernel/ptrace.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/s390/kernel/ptrace.c 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/s390/kernel/ptrace.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/s390/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -33,6 +33,7 @@ #include #include @@ -796,22 +748,8 @@ #include #include -@@ -710,7 +711,13 @@ sys_ptrace(long request, long pid, long - goto out; - } - -+ if (!vx_check(vx_task_xid(child), VS_WATCH_P | VS_IDENT)) { -+ ret = -EPERM; -+ goto out_tsk; -+ } -+ - ret = do_ptrace(child, request, addr, data); -+out_tsk: - put_task_struct(child); - out: - unlock_kernel(); --- a/arch/s390/kernel/syscalls.S 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/s390/kernel/syscalls.S 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/s390/kernel/syscalls.S 2008-07-16 22:41:36.000000000 -0400 @@ -271,7 +271,7 @@ SYSCALL(sys_clock_settime,sys_clock_sett SYSCALL(sys_clock_gettime,sys_clock_gettime,sys32_clock_gettime_wrapper) /* 260 */ SYSCALL(sys_clock_getres,sys_clock_getres,sys32_clock_getres_wrapper) @@ -821,9 +759,9 @@ SYSCALL(s390_fadvise64_64,sys_ni_syscall,sys32_fadvise64_64_wrapper) SYSCALL(sys_statfs64,sys_statfs64,compat_sys_statfs64_wrapper) SYSCALL(sys_fstatfs64,sys_fstatfs64,compat_sys_fstatfs64_wrapper) ---- a/arch/s390/mm/fault.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/s390/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 -@@ -217,7 +217,8 @@ static int do_out_of_memory(struct pt_re +--- a/arch/s390/mm/fault.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/s390/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 +@@ -216,7 +216,8 @@ static int do_out_of_memory(struct pt_re down_read(&mm->mmap_sem); return 1; } @@ -833,9 +771,9 @@ if (regs->psw.mask & PSW_MASK_PSTATE) do_group_exit(SIGKILL); do_no_context(regs, error_code, address); ---- a/arch/sh/Kconfig 2008-04-17 
12:05:30.000000000 -0400 -+++ a/arch/sh/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -913,6 +913,8 @@ source "fs/Kconfig" +--- a/arch/sh/Kconfig 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sh/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -927,6 +927,8 @@ source "fs/Kconfig" source "arch/sh/Kconfig.debug" @@ -844,8 +782,8 @@ source "security/Kconfig" source "crypto/Kconfig" ---- a/arch/sh/kernel/irq.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sh/kernel/irq.c 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/sh/kernel/irq.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sh/kernel/irq.c 2008-07-16 22:41:36.000000000 -0400 @@ -11,6 +11,7 @@ #include #include @@ -855,7 +793,7 @@ #include #include --- a/arch/sh/kernel/vsyscall/vsyscall.c 2008-04-17 10:37:14.000000000 -0400 -+++ a/arch/sh/kernel/vsyscall/vsyscall.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/sh/kernel/vsyscall/vsyscall.c 2008-07-16 22:41:36.000000000 -0400 @@ -19,6 +19,7 @@ #include #include @@ -864,9 +802,9 @@ /* * Should the kernel map a VDSO page into processes and pass its ---- a/arch/sparc/Kconfig 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -330,6 +330,8 @@ source "fs/Kconfig" +--- a/arch/sparc/Kconfig 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -318,6 +318,8 @@ source "fs/Kconfig" source "arch/sparc/Kconfig.debug" @@ -875,8 +813,8 @@ source "security/Kconfig" source "crypto/Kconfig" ---- a/arch/sparc/kernel/ptrace.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/arch/sparc/kernel/ptrace.c 2008-05-21 14:30:40.000000000 -0400 +--- a/arch/sparc/kernel/ptrace.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,7 @@ #include #include @@ -885,19 +823,8 @@ #include #include -@@ -270,6 +271,10 @@ static int fpregs32_set(struct task_stru - 33 * sizeof(u32), - 34 * sizeof(u32)); - } -+ if 
(!vx_check(vx_task_xid(child), VS_WATCH_P | VS_IDENT)) { -+ pt_error_return(regs, ESRCH); -+ goto out_tsk; -+ } - - if (!ret) - ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, ---- a/arch/sparc/kernel/systbls.S 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc/kernel/systbls.S 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/sparc/kernel/systbls.S 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc/kernel/systbls.S 2008-07-16 22:41:36.000000000 -0400 @@ -70,7 +70,7 @@ sys_call_table: /*250*/ .long sparc_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl /*255*/ .long sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep @@ -907,8 +834,8 @@ /*270*/ .long sys_io_submit, sys_io_cancel, sys_io_getevents, sys_mq_open, sys_mq_unlink /*275*/ .long sys_mq_timedsend, sys_mq_timedreceive, sys_mq_notify, sys_mq_getsetattr, sys_waitid /*280*/ .long sys_tee, sys_add_key, sys_request_key, sys_keyctl, sys_openat ---- a/arch/sparc/kernel/traps.c 2008-04-17 11:31:25.000000000 -0400 -+++ a/arch/sparc/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/sparc/kernel/traps.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 @@ -99,7 +99,8 @@ void die_if_kernel(char *str, struct pt_ " /_| \\__/ |_\\\n" " \\__U_/\n"); @@ -919,9 +846,9 @@ show_regs(regs); add_taint(TAINT_DIE); ---- a/arch/sparc/mm/fault.c 2008-04-17 11:31:25.000000000 -0400 -+++ a/arch/sparc/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 -@@ -367,7 +367,8 @@ no_context: +--- a/arch/sparc/mm/fault.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 +@@ -318,7 +318,8 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); @@ -931,9 +858,9 @@ if (from_user) do_group_exit(SIGKILL); goto no_context; ---- a/arch/sparc64/Kconfig 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc64/Kconfig 2008-04-19 15:14:52.000000000 -0400 -@@ -471,6 
+471,8 @@ source "fs/Kconfig" +--- a/arch/sparc64/Kconfig 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc64/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -407,6 +407,8 @@ source "fs/Kconfig" source "arch/sparc64/Kconfig.debug" @@ -942,18 +869,8 @@ source "security/Kconfig" source "crypto/Kconfig" ---- a/arch/sparc64/kernel/binfmt_aout32.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc64/kernel/binfmt_aout32.c 2008-04-19 15:14:52.000000000 -0400 -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - - #include - #include ---- a/arch/sparc64/kernel/ptrace.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/arch/sparc64/kernel/ptrace.c 2008-05-21 14:30:40.000000000 -0400 +--- a/arch/sparc64/kernel/ptrace.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc64/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -25,6 +25,7 @@ #include #include @@ -962,20 +879,9 @@ #include #include -@@ -222,6 +223,10 @@ static int genregs64_get(struct task_str - 16 * sizeof(u64), - 32 * sizeof(u64)); - } -+ if (!vx_check(vx_task_xid(child), VS_WATCH_P | VS_IDENT)) { -+ pt_error_return(regs, ESRCH); -+ goto out_tsk; -+ } - - if (!ret) { - /* TSTATE, TPC, TNPC */ ---- a/arch/sparc64/kernel/sys_sparc32.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/arch/sparc64/kernel/sys_sparc32.c 2008-05-21 14:30:40.000000000 -0400 -@@ -722,7 +722,7 @@ asmlinkage long sys32_gettimeofday(struc +--- a/arch/sparc64/kernel/sys_sparc32.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc64/kernel/sys_sparc32.c 2008-07-16 22:41:36.000000000 -0400 +@@ -707,7 +707,7 @@ asmlinkage long sys32_gettimeofday(struc { if (tv) { struct timeval ktv; @@ -984,8 +890,8 @@ if (put_tv32(tv, &ktv)) return -EFAULT; } ---- a/arch/sparc64/kernel/systbls.S 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc64/kernel/systbls.S 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/sparc64/kernel/systbls.S 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc64/kernel/systbls.S 2008-07-16 22:41:36.000000000 -0400 
@@ -71,7 +71,7 @@ sys_call_table32: /*250*/ .word sys32_mremap, sys32_sysctl, sys32_getsid, sys_fdatasync, sys32_nfsservctl .word sys32_sync_file_range, compat_sys_clock_settime, compat_sys_clock_gettime, compat_sys_clock_getres, sys32_clock_nanosleep @@ -1004,9 +910,9 @@ /*270*/ .word sys_io_submit, sys_io_cancel, sys_io_getevents, sys_mq_open, sys_mq_unlink .word sys_mq_timedsend, sys_mq_timedreceive, sys_mq_notify, sys_mq_getsetattr, sys_waitid /*280*/ .word sys_tee, sys_add_key, sys_request_key, sys_keyctl, sys_openat ---- a/arch/sparc64/kernel/traps.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc64/kernel/traps.c 2008-04-19 15:14:52.000000000 -0400 -@@ -2183,7 +2183,8 @@ void die_if_kernel(char *str, struct pt_ +--- a/arch/sparc64/kernel/traps.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc64/kernel/traps.c 2008-07-16 22:41:36.000000000 -0400 +@@ -2196,7 +2196,8 @@ void die_if_kernel(char *str, struct pt_ " /_| \\__/ |_\\\n" " \\__U_/\n"); @@ -1016,8 +922,8 @@ notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV); __asm__ __volatile__("flushw"); __show_regs(regs); ---- a/arch/sparc64/mm/fault.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc64/mm/fault.c 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/sparc64/mm/fault.c 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/sparc64/mm/fault.c 2008-07-16 22:41:36.000000000 -0400 @@ -453,7 +453,8 @@ handle_kernel_fault: out_of_memory: insn = get_fault_insn(regs, insn); @@ -1028,28 +934,8 @@ if (!(regs->tstate & TSTATE_PRIV)) do_group_exit(SIGKILL); goto handle_kernel_fault; ---- a/arch/sparc64/solaris/fs.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/sparc64/solaris/fs.c 2008-04-19 15:14:52.000000000 -0400 -@@ -368,7 +368,7 @@ static int report_statvfs(struct vfsmoun - int j = strlen (p); - - if (j > 15) j = 15; -- if (IS_RDONLY(inode)) i = 1; -+ if (IS_RDONLY(inode) || MNT_IS_RDONLY(mnt)) i = 1; - if (mnt->mnt_flags & MNT_NOSUID) i |= 2; - if (!sysv_valid_dev(inode->i_sb->s_dev)) - return 
-EOVERFLOW; -@@ -404,7 +404,7 @@ static int report_statvfs64(struct vfsmo - int j = strlen (p); - - if (j > 15) j = 15; -- if (IS_RDONLY(inode)) i = 1; -+ if (IS_RDONLY(inode) || MNT_IS_RDONLY(mnt)) i = 1; - if (mnt->mnt_flags & MNT_NOSUID) i |= 2; - if (!sysv_valid_dev(inode->i_sb->s_dev)) - return -EOVERFLOW; ---- a/arch/um/Kconfig 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/um/Kconfig 2008-04-19 15:14:52.000000000 -0400 +--- a/arch/um/Kconfig 2008-07-14 17:22:29.000000000 -0400 ++++ a/arch/um/Kconfig 2008-07-16 22:41:36.000000000 -0400 @@ -245,6 +245,8 @@ source "drivers/connector/Kconfig" source "fs/Kconfig" @@ -1060,7 +946,7 @@ source "crypto/Kconfig" --- a/arch/um/kernel/trap.c 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/um/kernel/trap.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/um/kernel/trap.c 2008-07-16 22:41:36.000000000 -0400 @@ -215,7 +215,8 @@ unsigned long segv(struct faultinfo fi, current->thread.arch.faultinfo = fi; force_sig_info(SIGBUS, &si, current); @@ -1072,7 +958,7 @@ } else { BUG_ON(err != -EFAULT); --- a/arch/v850/Kconfig 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/v850/Kconfig 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/v850/Kconfig 2008-07-16 22:41:36.000000000 -0400 @@ -344,6 +344,8 @@ source "drivers/usb/Kconfig" source "arch/v850/Kconfig.debug" @@ -1083,7 +969,7 @@ source "crypto/Kconfig" --- a/arch/v850/kernel/ptrace.c 2008-04-17 10:37:14.000000000 -0400 -+++ a/arch/v850/kernel/ptrace.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/v850/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -23,6 +23,7 @@ #include #include @@ -1102,9 +988,9 @@ switch (request) { unsigned long val; ---- a/arch/x86/ia32/ia32entry.S 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/x86/ia32/ia32entry.S 2008-04-22 20:07:49.000000000 -0400 -@@ -673,7 +673,7 @@ ia32_sys_call_table: +--- a/arch/x86/ia32/ia32entry.S 2008-07-14 17:22:30.000000000 -0400 ++++ a/arch/x86/ia32/ia32entry.S 2008-07-16 22:41:36.000000000 -0400 +@@ -677,7 +677,7 
@@ ia32_sys_call_table: .quad sys_tgkill /* 270 */ .quad compat_sys_utimes .quad sys32_fadvise64_64 @@ -1113,9 +999,9 @@ .quad sys_mbind .quad compat_sys_get_mempolicy /* 275 */ .quad sys_set_mempolicy ---- a/arch/x86/Kconfig 2008-05-21 14:30:05.000000000 -0400 -+++ a/arch/x86/Kconfig 2008-05-21 14:30:40.000000000 -0400 -@@ -1623,6 +1623,8 @@ source "fs/Kconfig" +--- a/arch/x86/Kconfig 2008-07-14 17:22:30.000000000 -0400 ++++ a/arch/x86/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -1722,6 +1722,8 @@ source "fs/Kconfig" source "arch/x86/Kconfig.debug" @@ -1125,7 +1011,7 @@ source "crypto/Kconfig" --- a/arch/x86/kernel/syscall_table_32.S 2008-04-17 12:05:30.000000000 -0400 -+++ a/arch/x86/kernel/syscall_table_32.S 2008-04-19 15:14:52.000000000 -0400 ++++ a/arch/x86/kernel/syscall_table_32.S 2008-07-16 22:41:36.000000000 -0400 @@ -272,7 +272,7 @@ ENTRY(sys_call_table) .long sys_tgkill /* 270 */ .long sys_utimes @@ -1136,7 +1022,7 @@ .long sys_get_mempolicy .long sys_set_mempolicy --- a/Documentation/vserver/debug.txt 1969-12-31 19:00:00.000000000 -0500 -+++ a/Documentation/vserver/debug.txt 2008-04-19 15:14:51.000000000 -0400 ++++ a/Documentation/vserver/debug.txt 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,154 @@ + +debug_cvirt: @@ -1293,7 +1179,7 @@ + "vx_acc_pages[%5d,%s,%2d]: %5d += %5d" + "vx_pages_avail[%5d,%s,%2d]: %5ld > %5d + %5d" --- a/drivers/block/Kconfig 2008-04-17 12:05:31.000000000 -0400 -+++ a/drivers/block/Kconfig 2008-04-19 15:14:52.000000000 -0400 ++++ a/drivers/block/Kconfig 2008-07-16 22:41:36.000000000 -0400 @@ -263,6 +263,13 @@ config BLK_DEV_CRYPTOLOOP instead, which can be configured to be on-disk compatible with the cryptoloop device. 
@@ -1308,8 +1194,8 @@ config BLK_DEV_NBD tristate "Network block device support" depends on NET ---- a/drivers/block/loop.c 2008-04-17 12:05:32.000000000 -0400 -+++ a/drivers/block/loop.c 2008-04-19 15:14:52.000000000 -0400 +--- a/drivers/block/loop.c 2008-07-14 17:22:32.000000000 -0400 ++++ a/drivers/block/loop.c 2008-07-16 22:41:36.000000000 -0400 @@ -76,6 +76,7 @@ #include #include @@ -1318,7 +1204,7 @@ #include -@@ -789,6 +790,7 @@ static int loop_set_fd(struct loop_devic +@@ -794,6 +795,7 @@ static int loop_set_fd(struct loop_devic lo->lo_blocksize = lo_blocksize; lo->lo_device = bdev; lo->lo_flags = lo_flags; @@ -1326,7 +1212,7 @@ lo->lo_backing_file = file; lo->transfer = transfer_none; lo->ioctl = NULL; -@@ -908,6 +910,7 @@ static int loop_clr_fd(struct loop_devic +@@ -915,6 +917,7 @@ static int loop_clr_fd(struct loop_devic lo->lo_encrypt_key_size = 0; lo->lo_flags = 0; lo->lo_thread = NULL; @@ -1334,7 +1220,7 @@ memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); -@@ -929,7 +932,7 @@ loop_set_status(struct loop_device *lo, +@@ -938,7 +941,7 @@ loop_set_status(struct loop_device *lo, struct loop_func_table *xfer; if (lo->lo_encrypt_key_size && lo->lo_key_owner != current->uid && @@ -1343,7 +1229,7 @@ return -EPERM; if (lo->lo_state != Lo_bound) return -ENXIO; -@@ -1013,7 +1016,8 @@ loop_get_status(struct loop_device *lo, +@@ -1022,7 +1025,8 @@ loop_get_status(struct loop_device *lo, memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); info->lo_encrypt_type = lo->lo_encryption ? 
lo->lo_encryption->number : 0; @@ -1353,7 +1239,7 @@ info->lo_encrypt_key_size = lo->lo_encrypt_key_size; memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, lo->lo_encrypt_key_size); -@@ -1322,6 +1326,9 @@ static int lo_open(struct inode *inode, +@@ -1331,6 +1335,9 @@ static int lo_open(struct inode *inode, { struct loop_device *lo = inode->i_bdev->bd_disk->private_data; @@ -1364,7 +1250,7 @@ lo->lo_refcnt++; mutex_unlock(&lo->lo_ctl_mutex); --- a/drivers/block/Makefile 2008-04-17 12:05:31.000000000 -0400 -+++ a/drivers/block/Makefile 2008-04-19 15:14:52.000000000 -0400 ++++ a/drivers/block/Makefile 2008-07-16 22:41:36.000000000 -0400 @@ -29,5 +29,6 @@ obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o @@ -1373,8 +1259,8 @@ obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o --- a/drivers/block/vroot.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/drivers/block/vroot.c 2008-04-19 15:14:52.000000000 -0400 -@@ -0,0 +1,280 @@ ++++ a/drivers/block/vroot.c 2008-08-11 23:21:07.000000000 -0400 +@@ -0,0 +1,283 @@ +/* + * linux/drivers/block/vroot.c + * @@ -1586,6 +1472,9 @@ + disks[i] = alloc_disk(1); + if (!disks[i]) + goto out_mem3; ++ disks[i]->queue = blk_alloc_queue(GFP_KERNEL); ++ if (!disks[i]->queue) ++ goto out_mem3; + } + + for (i = 0; i < max_vroot; i++) { @@ -1655,8 +1544,8 @@ + +#endif + ---- a/drivers/char/sysrq.c 2008-04-17 11:31:27.000000000 -0400 -+++ a/drivers/char/sysrq.c 2008-04-19 15:14:52.000000000 -0400 +--- a/drivers/char/sysrq.c 2008-07-14 17:22:33.000000000 -0400 ++++ a/drivers/char/sysrq.c 2008-07-16 22:41:36.000000000 -0400 @@ -37,6 +37,7 @@ #include #include @@ -1665,7 +1554,7 @@ #include #include -@@ -310,6 +311,21 @@ static struct sysrq_key_op sysrq_unrt_op +@@ -351,6 +352,21 @@ static struct sysrq_key_op sysrq_unrt_op .enable_mask = SYSRQ_ENABLE_RTNICE, }; @@ -1687,9 +1576,9 @@ /* Key Operations table and lock */ static DEFINE_SPINLOCK(sysrq_key_table_lock); -@@ -358,7 +374,11 @@ static 
struct sysrq_key_op *sysrq_key_ta - /* x: May be registered on ppc/powerpc for xmon */ +@@ -404,7 +420,11 @@ static struct sysrq_key_op *sysrq_key_ta NULL, /* x */ + /* y: May be registered on sparc64 for global register dump */ NULL, /* y */ - NULL /* z */ +#ifdef CONFIG_VSERVER_DEBUG @@ -1700,7 +1589,7 @@ }; /* key2index calculation, -1 on invalid index */ -@@ -370,6 +390,8 @@ static int sysrq_key_table_key2index(int +@@ -416,6 +436,8 @@ static int sysrq_key_table_key2index(int retval = key - '0'; else if ((key >= 'a') && (key <= 'z')) retval = key + 10 - 'a'; @@ -1709,8 +1598,8 @@ else retval = -1; return retval; ---- a/drivers/char/tty_io.c 2008-04-17 12:05:32.000000000 -0400 -+++ a/drivers/char/tty_io.c 2008-04-19 15:14:52.000000000 -0400 +--- a/drivers/char/tty_io.c 2008-07-14 17:22:33.000000000 -0400 ++++ a/drivers/char/tty_io.c 2008-07-16 22:41:36.000000000 -0400 @@ -105,6 +105,7 @@ #include @@ -1719,7 +1608,7 @@ #undef TTY_DEBUG_HANGUP -@@ -3142,6 +3143,7 @@ static int tiocspgrp(struct tty_struct * +@@ -3230,6 +3231,7 @@ static int tiocspgrp(struct tty_struct * return -ENOTTY; if (get_user(pgrp_nr, p)) return -EFAULT; @@ -1728,7 +1617,7 @@ return -EINVAL; rcu_read_lock(); --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c 2008-04-17 10:37:17.000000000 -0400 -+++ a/drivers/infiniband/hw/ipath/ipath_user_pages.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/drivers/infiniband/hw/ipath/ipath_user_pages.c 2008-07-16 22:41:36.000000000 -0400 @@ -33,6 +33,7 @@ #include @@ -1774,8 +1663,8 @@ up_write(&work->mm->mmap_sem); mmput(work->mm); kfree(work); ---- a/drivers/md/dm.c 2008-04-17 12:05:33.000000000 -0400 -+++ a/drivers/md/dm.c 2008-04-19 15:14:52.000000000 -0400 +--- a/drivers/md/dm.c 2008-07-14 17:22:35.000000000 -0400 ++++ a/drivers/md/dm.c 2008-07-16 22:41:36.000000000 -0400 @@ -22,6 +22,7 @@ #include #include @@ -1792,7 +1681,7 @@ unsigned long flags; -@@ -250,6 +252,7 @@ static void __exit dm_exit(void) +@@ -252,6 +254,7 @@ static void __exit 
dm_exit(void) static int dm_blk_open(struct inode *inode, struct file *file) { struct mapped_device *md; @@ -1800,7 +1689,7 @@ spin_lock(&_minor_lock); -@@ -258,18 +261,19 @@ static int dm_blk_open(struct inode *ino +@@ -260,18 +263,19 @@ static int dm_blk_open(struct inode *ino goto out; if (test_bit(DMF_FREEING, &md->flags) || @@ -1826,7 +1715,7 @@ } static int dm_blk_close(struct inode *inode, struct file *file) -@@ -465,6 +469,14 @@ int dm_set_geometry(struct mapped_device +@@ -467,6 +471,14 @@ int dm_set_geometry(struct mapped_device return 0; } @@ -1849,9 +1738,9 @@ md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) goto bad_queue; ---- a/drivers/md/dm.h 2008-04-17 11:31:28.000000000 -0400 -+++ a/drivers/md/dm.h 2008-04-19 15:14:52.000000000 -0400 -@@ -127,6 +127,8 @@ void dm_put_target_type(struct target_ty +--- a/drivers/md/dm.h 2008-07-14 17:22:35.000000000 -0400 ++++ a/drivers/md/dm.h 2008-07-16 22:41:36.000000000 -0400 +@@ -66,6 +66,8 @@ void dm_put_target_type(struct target_ty int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param); @@ -1861,7 +1750,7 @@ * Useful inlines. 
*---------------------------------------------------------------*/ --- a/drivers/md/dm-ioctl.c 2008-04-17 12:05:33.000000000 -0400 -+++ a/drivers/md/dm-ioctl.c 2008-04-21 10:45:53.000000000 -0400 ++++ a/drivers/md/dm-ioctl.c 2008-07-16 22:41:36.000000000 -0400 @@ -16,6 +16,7 @@ #include #include @@ -1943,17 +1832,17 @@ return -EACCES; if (_IOC_TYPE(command) != DM_IOCTL) ---- a/drivers/net/tun.c 2008-04-17 12:05:36.000000000 -0400 -+++ a/drivers/net/tun.c 2008-04-19 15:39:04.000000000 -0400 -@@ -62,6 +62,7 @@ - #include +--- a/drivers/net/tun.c 2008-07-14 17:22:40.000000000 -0400 ++++ a/drivers/net/tun.c 2008-07-17 21:09:54.000000000 -0400 +@@ -63,6 +63,7 @@ #include #include + #include +#include #include + #include - #include -@@ -86,6 +87,7 @@ struct tun_struct { +@@ -88,6 +89,7 @@ struct tun_struct { int attached; uid_t owner; gid_t group; @@ -1961,7 +1850,7 @@ wait_queue_head_t read_wait; struct sk_buff_head readq; -@@ -465,6 +467,7 @@ static void tun_setup(struct net_device +@@ -486,6 +488,7 @@ static void tun_setup(struct net_device tun->owner = -1; tun->group = -1; @@ -1969,9 +1858,9 @@ dev->open = tun_net_open; dev->hard_start_xmit = tun_net_xmit; -@@ -494,6 +497,9 @@ static int tun_set_iff(struct file *file - - tun = tun_get_by_name(ifr->ifr_name); +@@ -518,6 +521,9 @@ static int tun_set_iff(struct net *net, + tn = net_generic(net, tun_net_id); + tun = tun_get_by_name(tn, ifr->ifr_name); if (tun) { + if (!nx_check(tun->nid, VS_IDENT | VS_HOSTID | VS_ADMIN_P)) + return -EPERM; @@ -1979,7 +1868,7 @@ if (tun->attached) return -EBUSY; -@@ -502,7 +508,7 @@ static int tun_set_iff(struct file *file +@@ -526,7 +532,7 @@ static int tun_set_iff(struct net *net, current->euid != tun->owner) || (tun->group != -1 && current->egid != tun->group)) && @@ -1987,8 +1876,8 @@ + !cap_raised(current->cap_effective, CAP_NET_ADMIN)) return -EPERM; } - else if (__dev_get_by_name(&init_net, ifr->ifr_name)) -@@ -513,7 +519,7 @@ static int tun_set_iff(struct file *file + else if 
(__dev_get_by_name(net, ifr->ifr_name)) +@@ -537,7 +543,7 @@ static int tun_set_iff(struct net *net, err = -EINVAL; @@ -1997,7 +1886,7 @@ return -EPERM; /* Set dev type */ -@@ -656,6 +662,16 @@ static int tun_chr_ioctl(struct inode *i +@@ -688,6 +694,16 @@ static int tun_chr_ioctl(struct inode *i DBG(KERN_INFO "%s: group set to %d\n", tun->dev->name, tun->group); break; @@ -2012,10 +1901,10 @@ + break; + case TUNSETLINK: - /* Only allow setting the type when the interface is down */ - if (tun->dev->flags & IFF_UP) { + { + int ret; --- a/fs/attr.c 2008-04-17 11:31:35.000000000 -0400 -+++ a/fs/attr.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/attr.c 2008-07-16 22:41:36.000000000 -0400 @@ -14,6 +14,9 @@ #include #include @@ -2073,8 +1962,8 @@ error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; if (!error) error = inode_setattr(inode, attr); ---- a/fs/binfmt_aout.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/binfmt_aout.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/binfmt_aout.c 2008-07-14 17:22:48.000000000 -0400 ++++ a/fs/binfmt_aout.c 2008-07-16 22:41:36.000000000 -0400 @@ -24,6 +24,7 @@ #include #include @@ -2083,9 +1972,9 @@ #include #include ---- a/fs/binfmt_elf.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/binfmt_elf.c 2008-04-19 15:14:52.000000000 -0400 -@@ -39,6 +39,7 @@ +--- a/fs/binfmt_elf.c 2008-07-14 17:22:48.000000000 -0400 ++++ a/fs/binfmt_elf.c 2008-07-16 22:41:36.000000000 -0400 +@@ -38,6 +38,7 @@ #include #include #include @@ -2093,8 +1982,8 @@ #include #include #include ---- a/fs/binfmt_flat.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/binfmt_flat.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/binfmt_flat.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/binfmt_flat.c 2008-07-16 22:41:36.000000000 -0400 @@ -35,6 +35,7 @@ #include #include @@ -2103,8 +1992,8 @@ #include #include ---- a/fs/binfmt_som.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/binfmt_som.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/binfmt_som.c 2008-07-14 
17:22:49.000000000 -0400 ++++ a/fs/binfmt_som.c 2008-07-16 22:41:36.000000000 -0400 @@ -28,6 +28,7 @@ #include #include @@ -2113,9 +2002,9 @@ #include #include ---- a/fs/block_dev.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/block_dev.c 2008-04-19 15:14:52.000000000 -0400 -@@ -23,6 +23,7 @@ +--- a/fs/block_dev.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/block_dev.c 2008-07-16 22:41:36.000000000 -0400 +@@ -24,6 +24,7 @@ #include #include #include @@ -2123,7 +2012,7 @@ #include #include "internal.h" -@@ -388,6 +389,7 @@ struct block_device *bdget(dev_t dev) +@@ -389,6 +390,7 @@ struct block_device *bdget(dev_t dev) bdev->bd_invalidated = 0; inode->i_mode = S_IFBLK; inode->i_rdev = dev; @@ -2131,7 +2020,7 @@ inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); -@@ -424,6 +426,11 @@ EXPORT_SYMBOL(bdput); +@@ -425,6 +427,11 @@ EXPORT_SYMBOL(bdput); static struct block_device *bd_acquire(struct inode *inode) { struct block_device *bdev; @@ -2143,7 +2032,7 @@ spin_lock(&bdev_lock); bdev = inode->i_bdev; -@@ -434,7 +441,7 @@ static struct block_device *bd_acquire(s +@@ -435,7 +442,7 @@ static struct block_device *bd_acquire(s } spin_unlock(&bdev_lock); @@ -2152,8 +2041,8 @@ if (bdev) { spin_lock(&bdev_lock); if (!inode->i_bdev) { ---- a/fs/char_dev.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/char_dev.c 2008-04-29 18:44:50.000000000 -0400 +--- a/fs/char_dev.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/char_dev.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,8 @@ #include #include @@ -2163,7 +2052,7 @@ #ifdef CONFIG_KMOD #include -@@ -362,14 +364,21 @@ static int chrdev_open(struct inode *ino +@@ -361,14 +363,21 @@ static int chrdev_open(struct inode *ino struct cdev *p; struct cdev *new = NULL; int ret = 0; @@ -2186,9 +2075,9 @@ if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); ---- a/fs/dcache.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/dcache.c 2008-04-19 17:06:15.000000000 
-0400 -@@ -31,6 +31,7 @@ +--- a/fs/dcache.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/dcache.c 2008-07-16 22:41:36.000000000 -0400 +@@ -32,6 +32,7 @@ #include #include #include @@ -2196,7 +2085,7 @@ #include "internal.h" -@@ -184,6 +185,7 @@ void dput(struct dentry *dentry) +@@ -187,6 +188,7 @@ void dput(struct dentry *dentry) if (!dentry) return; @@ -2204,7 +2093,7 @@ repeat: if (atomic_read(&dentry->d_count) == 1) might_sleep(); -@@ -197,6 +199,8 @@ repeat: +@@ -200,6 +202,8 @@ repeat: return; } @@ -2213,7 +2102,7 @@ /* * AV: ->d_delete() is _NOT_ allowed to block now. */ -@@ -288,6 +292,7 @@ static inline struct dentry * __dget_loc +@@ -291,6 +295,7 @@ static inline struct dentry * __dget_loc { atomic_inc(&dentry->d_count); dentry_lru_remove(dentry); @@ -2221,7 +2110,7 @@ return dentry; } -@@ -885,6 +890,9 @@ struct dentry *d_alloc(struct dentry * p +@@ -888,6 +893,9 @@ struct dentry *d_alloc(struct dentry * p struct dentry *dentry; char *dname; @@ -2231,7 +2120,7 @@ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) return NULL; -@@ -933,6 +941,7 @@ struct dentry *d_alloc(struct dentry * p +@@ -936,6 +944,7 @@ struct dentry *d_alloc(struct dentry * p if (parent) list_add(&dentry->d_u.d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; @@ -2239,7 +2128,7 @@ spin_unlock(&dcache_lock); return dentry; -@@ -1282,6 +1291,7 @@ struct dentry * __d_lookup(struct dentry +@@ -1285,6 +1294,7 @@ struct dentry * __d_lookup(struct dentry if (!d_unhashed(dentry)) { atomic_inc(&dentry->d_count); @@ -2247,20 +2136,20 @@ found = dentry; } spin_unlock(&dentry->d_lock); ---- a/fs/devpts/inode.c 2008-04-17 12:05:39.000000000 -0400 -+++ a/fs/devpts/inode.c 2008-04-21 09:23:34.000000000 -0400 -@@ -17,15 +17,30 @@ - #include - #include +--- a/fs/devpts/inode.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/devpts/inode.c 2008-07-17 17:43:28.000000000 -0400 +@@ -19,15 +19,29 @@ #include + #include + #include +#include #include #include #include #include -+#include - 
+- -#define DEVPTS_SUPER_MAGIC 0x1cd1 ++#include #define DEVPTS_DEFAULT_MODE 0600 @@ -2278,10 +2167,10 @@ + .permission = devpts_permission, +}; + - static struct vfsmount *devpts_mnt; - static struct dentry *devpts_root; - -@@ -106,6 +121,25 @@ static int devpts_show_options(struct se + extern int pty_limit; /* Config limit on Unix98 ptys */ + static DEFINE_IDR(allocated_ptys); + static DEFINE_MUTEX(allocated_ptys_lock); +@@ -112,6 +126,25 @@ static int devpts_show_options(struct se return 0; } @@ -2307,7 +2196,7 @@ static const struct super_operations devpts_sops = { .statfs = simple_statfs, .remount_fs = devpts_remount, -@@ -132,8 +166,10 @@ devpts_fill_super(struct super_block *s, +@@ -138,8 +171,10 @@ devpts_fill_super(struct super_block *s, inode->i_uid = inode->i_gid = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; @@ -2319,7 +2208,7 @@ devpts_root = s->s_root = d_alloc_root(inode); if (s->s_root) -@@ -191,6 +227,9 @@ int devpts_pty_new(struct tty_struct *tt +@@ -232,6 +267,9 @@ int devpts_pty_new(struct tty_struct *tt inode->i_gid = config.setgid ? 
config.gid : current->fsgid; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; init_special_inode(inode, S_IFCHR|config.mode, device); @@ -2329,65 +2218,9 @@ inode->i_private = tty; dentry = get_node(number); ---- a/fs/ecryptfs/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ecryptfs/inode.c 2008-04-19 15:14:52.000000000 -0400 -@@ -400,7 +400,7 @@ static int ecryptfs_link(struct dentry * - dget(lower_new_dentry); - lower_dir_dentry = lock_parent(lower_new_dentry); - rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, -- lower_new_dentry); -+ lower_new_dentry, NULL); - if (rc || !lower_new_dentry->d_inode) - goto out_lock; - rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); -@@ -428,7 +428,7 @@ static int ecryptfs_unlink(struct inode - struct dentry *lower_dir_dentry; - - lower_dir_dentry = lock_parent(lower_dentry); -- rc = vfs_unlink(lower_dir_inode, lower_dentry); -+ rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); - if (rc) { - printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); - goto out_unlock; -@@ -466,7 +466,7 @@ static int ecryptfs_symlink(struct inode - goto out_lock; - } - rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, -- encoded_symname, mode); -+ encoded_symname, mode, NULL); - kfree(encoded_symname); - if (rc || !lower_dentry->d_inode) - goto out_lock; -@@ -491,7 +491,7 @@ static int ecryptfs_mkdir(struct inode * - - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = lock_parent(lower_dentry); -- rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); -+ rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode, NULL); - if (rc || !lower_dentry->d_inode) - goto out; - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); -@@ -517,7 +517,7 @@ static int ecryptfs_rmdir(struct inode * - dget(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - dget(lower_dentry); -- rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); -+ rc = 
vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry, NULL); - dput(lower_dentry); - if (!rc) - d_delete(lower_dentry); -@@ -539,7 +539,7 @@ ecryptfs_mknod(struct inode *dir, struct - - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = lock_parent(lower_dentry); -- rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); -+ rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev, NULL); - if (rc || !lower_dentry->d_inode) - goto out; - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); ---- a/fs/exec.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/exec.c 2008-04-19 15:14:52.000000000 -0400 -@@ -249,7 +249,9 @@ static int __bprm_mm_init(struct linux_b +--- a/fs/exec.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/exec.c 2008-07-16 22:41:36.000000000 -0400 +@@ -254,7 +254,9 @@ static int __bprm_mm_init(struct linux_b goto err; } @@ -2398,7 +2231,7 @@ up_write(&mm->mmap_sem); bprm->p = vma->vm_end - sizeof(void *); -@@ -1452,7 +1454,7 @@ static int format_corename(char *corenam +@@ -1447,7 +1449,7 @@ static int format_corename(char *corenam /* UNIX time of coredump */ case 't': { struct timeval tv; @@ -2407,8 +2240,8 @@ rc = snprintf(out_ptr, out_end - out_ptr, "%lu", tv.tv_sec); if (rc > out_end - out_ptr) ---- a/fs/ext2/balloc.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext2/balloc.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext2/balloc.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext2/balloc.c 2008-07-16 22:41:36.000000000 -0400 @@ -16,6 +16,8 @@ #include #include @@ -2446,7 +2279,7 @@ sbi = EXT2_SB(sb); es = EXT2_SB(sb)->s_es; -@@ -1403,6 +1410,7 @@ allocated: +@@ -1408,6 +1415,7 @@ allocated: *errp = 0; brelse(bitmap_bh); @@ -2454,7 +2287,7 @@ DQUOT_FREE_BLOCK(inode, *count-num); *count = num; return ret_block; -@@ -1413,8 +1421,10 @@ out: +@@ -1418,8 +1426,10 @@ out: /* * Undo the block allocation */ @@ -2467,7 +2300,7 @@ return 0; } --- a/fs/ext2/ext2.h 2008-04-17 12:05:40.000000000 -0400 -+++ 
a/fs/ext2/ext2.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/ext2/ext2.h 2008-07-16 22:41:36.000000000 -0400 @@ -168,6 +168,7 @@ extern const struct file_operations ext2 extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_aops_xip; @@ -2477,15 +2310,15 @@ /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; --- a/fs/ext2/file.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext2/file.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/ext2/file.c 2008-07-16 22:41:36.000000000 -0400 @@ -86,4 +86,5 @@ const struct inode_operations ext2_file_ #endif .setattr = ext2_setattr, .permission = ext2_permission, + .sync_flags = ext2_sync_flags, }; ---- a/fs/ext2/ialloc.c 2008-04-17 11:31:35.000000000 -0400 -+++ a/fs/ext2/ialloc.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext2/ialloc.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext2/ialloc.c 2008-07-16 22:41:36.000000000 -0400 @@ -17,6 +17,8 @@ #include #include @@ -2495,7 +2328,7 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -@@ -125,6 +127,7 @@ void ext2_free_inode (struct inode * ino +@@ -123,6 +125,7 @@ void ext2_free_inode (struct inode * ino ext2_xattr_delete_inode(inode); DQUOT_FREE_INODE(inode); DQUOT_DROP(inode); @@ -2503,7 +2336,7 @@ } es = EXT2_SB(sb)->s_es; -@@ -456,6 +459,11 @@ struct inode *ext2_new_inode(struct inod +@@ -454,6 +457,11 @@ struct inode *ext2_new_inode(struct inod if (!inode) return ERR_PTR(-ENOMEM); @@ -2515,17 +2348,17 @@ ei = EXT2_I(inode); sbi = EXT2_SB(sb); es = sbi->s_es; -@@ -569,7 +577,8 @@ got: +@@ -565,7 +573,8 @@ got: inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; + ei->i_flags = EXT2_I(dir)->i_flags & -+ ~(EXT2_BTREE_FL|EXT2_IUNLINK_FL|EXT2_BARRIER_FL); ++ ~(EXT2_BTREE_FL|EXT2_IXUNLINK_FL|EXT2_BARRIER_FL); if (S_ISLNK(mode)) ei->i_flags &= 
~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL); /* dirsync is only applied to directories */ -@@ -614,12 +623,15 @@ fail_free_drop: +@@ -610,12 +619,15 @@ fail_free_drop: fail_drop: DQUOT_DROP(inode); @@ -2541,8 +2374,8 @@ make_bad_inode(inode); iput(inode); return ERR_PTR(err); ---- a/fs/ext2/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext2/inode.c 2008-04-21 10:14:57.000000000 -0400 +--- a/fs/ext2/inode.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext2/inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -31,6 +31,7 @@ #include #include @@ -2551,7 +2384,7 @@ #include "ext2.h" #include "acl.h" #include "xip.h" -@@ -1011,7 +1012,7 @@ void ext2_truncate(struct inode *inode) +@@ -1010,7 +1011,7 @@ void ext2_truncate(struct inode *inode) return; if (ext2_inode_is_fast_symlink(inode)) return; @@ -2560,20 +2393,19 @@ return; blocksize = inode->i_sb->s_blocksize; -@@ -1149,13 +1150,20 @@ void ext2_set_inode_flags(struct inode * +@@ -1148,38 +1149,72 @@ void ext2_set_inode_flags(struct inode * { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); -+ inode->i_flags &= ~(S_IMMUTABLE | S_IUNLINK | S_BARRIER | ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + ++ + if (flags & EXT2_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; -+ if (flags & EXT2_IUNLINK_FL) -+ inode->i_flags |= S_IUNLINK; -+ if (flags & EXT2_BARRIER_FL) -+ inode->i_flags |= S_BARRIER; ++ if (flags & EXT2_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; + if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; @@ -2584,30 +2416,56 @@ if (flags & EXT2_NOATIME_FL) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) -@@ -1181,6 +1189,30 @@ void ext2_get_inode_flags(struct ext2_in + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT2_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT2_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags 
from i_flags to EXT2_I(inode)->i_flags */ + void ext2_get_inode_flags(struct ext2_inode_info *ei) + { + unsigned int flags = ei->vfs_inode.i_flags; ++ unsigned int vflags = ei->vfs_inode.i_vflags; ++ ++ ei->i_flags &= ~(EXT2_SYNC_FL | EXT2_APPEND_FL | ++ EXT2_IMMUTABLE_FL | EXT2_IXUNLINK_FL | ++ EXT2_NOATIME_FL | EXT2_DIRSYNC_FL | ++ EXT2_BARRIER_FL | EXT2_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ ei->i_flags |= EXT2_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ ei->i_flags |= EXT2_IXUNLINK_FL; + +- ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| +- EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT2_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT2_APPEND_FL; +- if (flags & S_IMMUTABLE) +- ei->i_flags |= EXT2_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT2_NOATIME_FL; + if (flags & S_DIRSYNC) ei->i_flags |= EXT2_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ ei->i_flags |= EXT2_BARRIER_FL; ++ if (vflags & V_COW) ++ ei->i_flags |= EXT2_COW_FL; } +int ext2_sync_flags(struct inode *inode) +{ -+ unsigned int oldflags, newflags; -+ -+ oldflags = EXT2_I(inode)->i_flags; -+ newflags = oldflags & ~(EXT2_IMMUTABLE_FL | -+ EXT2_IUNLINK_FL | EXT2_BARRIER_FL); -+ -+ if (IS_IMMUTABLE(inode)) -+ newflags |= EXT2_IMMUTABLE_FL; -+ if (IS_IUNLINK(inode)) -+ newflags |= EXT2_IUNLINK_FL; -+ if (IS_BARRIER(inode)) -+ newflags |= EXT2_BARRIER_FL; -+ -+ if (oldflags ^ newflags) { -+ EXT2_I(inode)->i_flags = newflags; -+ inode->i_ctime = CURRENT_TIME; -+ mark_inode_dirty(inode); -+ } ++ ext2_get_inode_flags(EXT2_I(inode)); ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); + return 0; +} + @@ -2615,7 +2473,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) { struct ext2_inode_info *ei; -@@ -1188,6 +1220,8 @@ struct inode *ext2_iget (struct super_bl +@@ -1187,6 +1222,8 @@ struct inode *ext2_iget (struct super_bl struct ext2_inode *raw_inode; struct inode *inode; long ret = -EIO; @@ -2624,7 +2482,7 
@@ int n; inode = iget_locked(sb, ino); -@@ -1210,12 +1244,17 @@ struct inode *ext2_iget (struct super_bl +@@ -1209,12 +1246,17 @@ struct inode *ext2_iget (struct super_bl } inode->i_mode = le16_to_cpu(raw_inode->i_mode); @@ -2646,7 +2504,7 @@ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); -@@ -1311,8 +1350,8 @@ static int ext2_update_inode(struct inod +@@ -1310,8 +1352,8 @@ static int ext2_update_inode(struct inod struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; @@ -2657,7 +2515,7 @@ struct buffer_head * bh; struct ext2_inode * raw_inode = ext2_get_inode(sb, ino, &bh); int n; -@@ -1348,6 +1387,9 @@ static int ext2_update_inode(struct inod +@@ -1347,6 +1389,9 @@ static int ext2_update_inode(struct inod raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } @@ -2667,7 +2525,7 @@ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(inode->i_size); raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); -@@ -1434,7 +1476,8 @@ int ext2_setattr(struct dentry *dentry, +@@ -1433,7 +1478,8 @@ int ext2_setattr(struct dentry *dentry, if (error) return error; if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) || @@ -2677,49 +2535,41 @@ error = DQUOT_TRANSFER(inode, iattr) ? 
-EDQUOT : 0; if (error) return error; ---- a/fs/ext2/ioctl.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext2/ioctl.c 2008-04-19 15:14:52.000000000 -0400 -@@ -13,6 +13,7 @@ - #include +--- a/fs/ext2/ioctl.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext2/ioctl.c 2008-07-16 22:41:36.000000000 -0400 +@@ -14,6 +14,7 @@ #include + #include #include +#include #include #include -@@ -34,7 +35,8 @@ long ext2_ioctl(struct file *filp, unsig - case EXT2_IOC_SETFLAGS: { - unsigned int oldflags; +@@ -53,6 +54,11 @@ long ext2_ioctl(struct file *filp, unsig + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT2_DIRSYNC_FL; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) -@@ -60,7 +62,9 @@ long ext2_ioctl(struct file *filp, unsig ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { +@@ -68,7 +74,9 @@ long ext2_ioctl(struct file *filp, unsig * * This test looks nicer. 
Thanks to Pauline Middelink */ - if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { + if ((oldflags & EXT2_IMMUTABLE_FL) || + ((flags ^ oldflags) & (EXT2_APPEND_FL | -+ EXT2_IMMUTABLE_FL | EXT2_IUNLINK_FL))) { ++ EXT2_IMMUTABLE_FL | EXT2_IXUNLINK_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; -@@ -82,7 +86,8 @@ long ext2_ioctl(struct file *filp, unsig - case EXT2_IOC_SETVERSION: - if (!is_owner_or_cap(inode)) - return -EPERM; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - if (get_user(inode->i_generation, (int __user *) arg)) - return -EFAULT; + ret = -EPERM; --- a/fs/ext2/namei.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext2/namei.c 2008-04-21 10:12:49.000000000 -0400 ++++ a/fs/ext2/namei.c 2008-07-16 22:41:36.000000000 -0400 @@ -31,6 +31,7 @@ */ @@ -2750,9 +2600,9 @@ .permission = ext2_permission, + .sync_flags = ext2_sync_flags, }; ---- a/fs/ext2/super.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext2/super.c 2008-04-19 15:14:52.000000000 -0400 -@@ -390,7 +390,8 @@ enum { +--- a/fs/ext2/super.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext2/super.c 2008-07-16 22:41:36.000000000 -0400 +@@ -389,7 +389,8 @@ enum { Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, @@ -2762,7 +2612,7 @@ }; static match_table_t tokens = { -@@ -418,6 +419,9 @@ static match_table_t tokens = { +@@ -417,6 +418,9 @@ static match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_xip, "xip"}, @@ -2772,7 +2622,7 @@ {Opt_grpquota, "grpquota"}, {Opt_ignore, "noquota"}, {Opt_quota, "quota"}, -@@ -488,6 +492,20 @@ static int parse_options (char * options +@@ -487,6 +491,20 @@ static int parse_options (char * options case Opt_nouid32: set_opt (sbi->s_mount_opt, NO_UID32); break; @@ -2793,7 +2643,7 @@ case Opt_nocheck: 
clear_opt (sbi->s_mount_opt, CHECK); break; -@@ -831,6 +849,8 @@ static int ext2_fill_super(struct super_ +@@ -829,6 +847,8 @@ static int ext2_fill_super(struct super_ if (!parse_options ((char *) data, sbi)) goto failed_mount; @@ -2802,7 +2652,7 @@ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); -@@ -1164,6 +1184,13 @@ static int ext2_remount (struct super_bl +@@ -1161,6 +1181,13 @@ static int ext2_remount (struct super_bl goto restore_opts; } @@ -2817,7 +2667,7 @@ ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); --- a/fs/ext2/symlink.c 2008-04-17 10:32:27.000000000 -0400 -+++ a/fs/ext2/symlink.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/ext2/symlink.c 2008-07-16 22:41:36.000000000 -0400 @@ -38,6 +38,7 @@ const struct inode_operations ext2_symli .listxattr = ext2_listxattr, .removexattr = generic_removexattr, @@ -2832,8 +2682,8 @@ #endif + .sync_flags = ext2_sync_flags, }; ---- a/fs/ext2/xattr.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext2/xattr.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext2/xattr.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext2/xattr.c 2008-07-16 22:41:36.000000000 -0400 @@ -60,6 +60,7 @@ #include #include @@ -2855,15 +2705,15 @@ unlock_buffer(new_bh); goto cleanup; } -@@ -735,6 +740,7 @@ ext2_xattr_set2(struct inode *inode, str - le32_to_cpu(HDR(old_bh)->h_refcount) - 1); +@@ -731,6 +736,7 @@ ext2_xattr_set2(struct inode *inode, str + le32_add_cpu(&HDR(old_bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + DLIMIT_FREE_BLOCK(inode, 1); DQUOT_FREE_BLOCK(inode, 1); mark_buffer_dirty(old_bh); ea_bdebug(old_bh, "refcount now=%d", -@@ -799,6 +805,7 @@ ext2_xattr_delete_inode(struct inode *in +@@ -794,6 +800,7 @@ ext2_xattr_delete_inode(struct inode *in mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); @@ -2871,8 +2721,8 @@ DQUOT_FREE_BLOCK(inode, 1); } EXT2_I(inode)->i_file_acl = 0; ---- a/fs/ext3/balloc.c 2008-04-17 
12:05:40.000000000 -0400 -+++ a/fs/ext3/balloc.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext3/balloc.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext3/balloc.c 2008-07-16 22:41:36.000000000 -0400 @@ -19,6 +19,8 @@ #include #include @@ -2962,7 +2812,7 @@ *errp = -ENOSPC; goto out; } -@@ -1710,12 +1731,16 @@ allocated: +@@ -1714,12 +1735,16 @@ allocated: *errp = 0; brelse(bitmap_bh); DQUOT_FREE_BLOCK(inode, *count-num); @@ -2980,7 +2830,7 @@ *errp = fatal; ext3_std_error(sb, fatal); --- a/fs/ext3/file.c 2008-04-17 10:37:23.000000000 -0400 -+++ a/fs/ext3/file.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/ext3/file.c 2008-07-16 22:41:36.000000000 -0400 @@ -134,5 +134,6 @@ const struct inode_operations ext3_file_ .removexattr = generic_removexattr, #endif @@ -2988,8 +2838,8 @@ + .sync_flags = ext3_sync_flags, }; ---- a/fs/ext3/ialloc.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext3/ialloc.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext3/ialloc.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext3/ialloc.c 2008-07-16 22:41:36.000000000 -0400 @@ -23,6 +23,8 @@ #include #include @@ -3026,7 +2876,7 @@ - ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; + ei->i_flags = EXT3_I(dir)->i_flags & -+ ~(EXT3_INDEX_FL|EXT3_IUNLINK_FL|EXT3_BARRIER_FL); ++ ~(EXT3_INDEX_FL|EXT3_IXUNLINK_FL|EXT3_BARRIER_FL); if (S_ISLNK(mode)) ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); /* dirsync only applies to directories */ @@ -3047,8 +2897,8 @@ inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); ---- a/fs/ext3/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext3/inode.c 2008-04-20 13:25:49.000000000 -0400 +--- a/fs/ext3/inode.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext3/inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -36,6 +36,7 @@ #include #include @@ -3057,7 +2907,7 @@ #include "xattr.h" #include "acl.h" -@@ -2300,7 +2301,7 @@ void ext3_truncate(struct inode *inode) +@@ -2302,7 +2303,7 @@ void ext3_truncate(struct inode *inode) return; if 
(ext3_inode_is_fast_symlink(inode)) return; @@ -3066,20 +2916,18 @@ return; /* -@@ -2622,13 +2623,20 @@ void ext3_set_inode_flags(struct inode * +@@ -2611,36 +2612,84 @@ void ext3_set_inode_flags(struct inode * { unsigned int flags = EXT3_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); -+ inode->i_flags &= ~(S_IMMUTABLE | S_IUNLINK | S_BARRIER | ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & EXT3_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; -+ if (flags & EXT3_IUNLINK_FL) -+ inode->i_flags |= S_IUNLINK; -+ if (flags & EXT3_BARRIER_FL) -+ inode->i_flags |= S_BARRIER; ++ if (flags & EXT3_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; + if (flags & EXT3_SYNC_FL) inode->i_flags |= S_SYNC; @@ -3090,53 +2938,77 @@ if (flags & EXT3_NOATIME_FL) inode->i_flags |= S_NOATIME; if (flags & EXT3_DIRSYNC_FL) -@@ -2654,6 +2662,45 @@ void ext3_get_inode_flags(struct ext3_in - ei->i_flags |= EXT3_DIRSYNC_FL; + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT3_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT3_COW_FL) ++ inode->i_vflags |= V_COW; } -+int ext3_sync_flags(struct inode *inode) -+{ -+ unsigned int oldflags, newflags; -+ int err = 0; + /* Propagate flags from i_flags to EXT3_I(inode)->i_flags */ + void ext3_get_inode_flags(struct ext3_inode_info *ei) + { + unsigned int flags = ei->vfs_inode.i_flags; ++ unsigned int vflags = ei->vfs_inode.i_vflags; + -+ oldflags = EXT3_I(inode)->i_flags; -+ newflags = oldflags & ~(EXT3_IMMUTABLE_FL | -+ EXT3_IUNLINK_FL | EXT3_BARRIER_FL); ++ ei->i_flags &= ~(EXT3_SYNC_FL | EXT3_APPEND_FL | ++ EXT3_IMMUTABLE_FL | EXT3_IXUNLINK_FL | ++ EXT3_NOATIME_FL | EXT3_DIRSYNC_FL | ++ EXT3_BARRIER_FL | EXT3_COW_FL); + -+ if (IS_IMMUTABLE(inode)) -+ newflags |= EXT3_IMMUTABLE_FL; -+ if (IS_IUNLINK(inode)) -+ newflags |= EXT3_IUNLINK_FL; -+ if (IS_BARRIER(inode)) -+ newflags |= 
EXT3_BARRIER_FL; ++ if (flags & S_IMMUTABLE) ++ ei->i_flags |= EXT3_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ ei->i_flags |= EXT3_IXUNLINK_FL; + +- ei->i_flags &= ~(EXT3_SYNC_FL|EXT3_APPEND_FL| +- EXT3_IMMUTABLE_FL|EXT3_NOATIME_FL|EXT3_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT3_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT3_APPEND_FL; +- if (flags & S_IMMUTABLE) +- ei->i_flags |= EXT3_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT3_NOATIME_FL; + if (flags & S_DIRSYNC) + ei->i_flags |= EXT3_DIRSYNC_FL; + -+ if (oldflags ^ newflags) { -+ handle_t *handle; -+ struct ext3_iloc iloc; -+ -+ handle = ext3_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ err = ext3_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto flags_err; -+ -+ EXT3_I(inode)->i_flags = newflags; -+ inode->i_ctime = CURRENT_TIME; -+ -+ err = ext3_mark_iloc_dirty(handle, inode, &iloc); -+ flags_err: -+ ext3_journal_stop(handle); -+ } -+ return err; ++ if (vflags & V_BARRIER) ++ ei->i_flags |= EXT3_BARRIER_FL; ++ if (vflags & V_COW) ++ ei->i_flags |= EXT3_COW_FL; +} + ++int ext3_sync_flags(struct inode *inode) ++{ ++ struct ext3_iloc iloc; ++ handle_t *handle; ++ int err; ++ ++ handle = ext3_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ err = ext3_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto flags_err; ++ ++ ext3_get_inode_flags(EXT3_I(inode)); ++ inode->i_ctime = CURRENT_TIME; ++ ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++flags_err: ++ ext3_journal_stop(handle); ++ return err; + } + struct inode *ext3_iget(struct super_block *sb, unsigned long ino) - { - struct ext3_iloc iloc; -@@ -2663,6 +2710,8 @@ struct inode *ext3_iget(struct super_blo +@@ -2652,6 +2701,8 @@ struct inode *ext3_iget(struct super_blo struct inode *inode; long ret; int block; @@ -3145,7 +3017,7 @@ inode = 
iget_locked(sb, ino); if (!inode) -@@ -2683,12 +2732,17 @@ struct inode *ext3_iget(struct super_blo +@@ -2672,12 +2723,17 @@ struct inode *ext3_iget(struct super_blo bh = iloc.bh; raw_inode = ext3_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); @@ -3167,7 +3039,7 @@ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); -@@ -2817,6 +2871,8 @@ static int ext3_do_update_inode(handle_t +@@ -2806,6 +2862,8 @@ static int ext3_do_update_inode(handle_t struct ext3_inode *raw_inode = ext3_raw_inode(iloc); struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; @@ -3176,7 +3048,7 @@ int err = 0, rc, block; /* For fields not not tracking in the in-memory inode, -@@ -2827,29 +2883,32 @@ static int ext3_do_update_inode(handle_t +@@ -2816,29 +2874,32 @@ static int ext3_do_update_inode(handle_t ext3_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); if(!(test_opt(inode->i_sb, NO_UID32))) { @@ -3215,7 +3087,7 @@ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(ei->i_disksize); raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); -@@ -3002,7 +3061,8 @@ int ext3_setattr(struct dentry *dentry, +@@ -2991,7 +3052,8 @@ int ext3_setattr(struct dentry *dentry, return error; if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || @@ -3225,7 +3097,7 @@ handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, -@@ -3024,6 +3084,8 @@ int ext3_setattr(struct dentry *dentry, +@@ -3013,6 +3075,8 @@ int ext3_setattr(struct dentry *dentry, inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; @@ -3234,8 +3106,8 @@ error = ext3_mark_inode_dirty(handle, inode); ext3_journal_stop(handle); } ---- a/fs/ext3/ioctl.c 2008-04-17 11:31:35.000000000 -0400 -+++ a/fs/ext3/ioctl.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext3/ioctl.c 
2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext3/ioctl.c 2008-07-16 22:41:36.000000000 -0400 @@ -8,6 +8,7 @@ */ @@ -3244,7 +3116,7 @@ #include #include #include -@@ -15,6 +16,7 @@ +@@ -16,6 +17,7 @@ #include #include #include @@ -3252,69 +3124,31 @@ #include int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, -@@ -38,7 +40,8 @@ int ext3_ioctl (struct inode * inode, st - unsigned int oldflags; - unsigned int jflag; +@@ -56,6 +58,11 @@ int ext3_ioctl (struct inode * inode, st + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT3_DIRSYNC_FL; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) -@@ -67,7 +70,9 @@ int ext3_ioctl (struct inode * inode, st ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { +@@ -74,7 +81,9 @@ int ext3_ioctl (struct inode * inode, st * * This test looks nicer. 
Thanks to Pauline Middelink */ - if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { + if ((oldflags & EXT3_IMMUTABLE_FL) || + ((flags ^ oldflags) & (EXT3_APPEND_FL | -+ EXT3_IMMUTABLE_FL | EXT3_IUNLINK_FL))) { ++ EXT3_IMMUTABLE_FL | EXT3_IXUNLINK_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; -@@ -129,7 +134,8 @@ flags_err: - - if (!is_owner_or_cap(inode)) - return -EPERM; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; -@@ -183,7 +189,8 @@ flags_err: - if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) - return -ENOTTY; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) -@@ -218,7 +225,8 @@ flags_err: - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) -@@ -239,7 +247,8 @@ flags_err: - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, ---- a/fs/ext3/namei.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext3/namei.c 2008-04-19 15:14:52.000000000 -0400 + err = -EPERM; +--- a/fs/ext3/namei.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext3/namei.c 2008-07-16 22:41:36.000000000 -0400 @@ -36,6 +36,7 @@ #include #include @@ -3323,7 +3157,7 @@ #include "namei.h" #include "xattr.h" -@@ -907,6 +908,7 @@ restart: +@@ -912,6 +913,7 @@ restart: if (bh) ll_rw_block(READ_META, 1, &bh); } @@ -3331,7 +3165,7 @@ } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; -@@ -2417,6 +2419,7 @@ const struct inode_operations ext3_dir_i +@@ 
-2424,6 +2426,7 @@ const struct inode_operations ext3_dir_i .removexattr = generic_removexattr, #endif .permission = ext3_permission, @@ -3339,15 +3173,15 @@ }; const struct inode_operations ext3_special_inode_operations = { -@@ -2428,4 +2431,5 @@ const struct inode_operations ext3_speci +@@ -2435,4 +2438,5 @@ const struct inode_operations ext3_speci .removexattr = generic_removexattr, #endif .permission = ext3_permission, + .sync_flags = ext3_sync_flags, }; ---- a/fs/ext3/super.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext3/super.c 2008-04-20 13:26:55.000000000 -0400 -@@ -756,7 +756,7 @@ enum { +--- a/fs/ext3/super.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext3/super.c 2008-07-17 17:20:52.000000000 -0400 +@@ -757,7 +757,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, @@ -3356,7 +3190,7 @@ }; static match_table_t tokens = { -@@ -807,6 +807,9 @@ static match_table_t tokens = { +@@ -808,6 +808,9 @@ static match_table_t tokens = { {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, {Opt_resize, "resize"}, @@ -3366,7 +3200,7 @@ {Opt_err, NULL}, }; -@@ -899,6 +902,20 @@ static int parse_options (char *options, +@@ -900,6 +903,20 @@ static int parse_options (char *options, case Opt_nouid32: set_opt (sbi->s_mount_opt, NO_UID32); break; @@ -3387,7 +3221,7 @@ case Opt_nocheck: clear_opt (sbi->s_mount_opt, CHECK); break; -@@ -1591,6 +1608,9 @@ static int ext3_fill_super (struct super +@@ -1594,6 +1611,9 @@ static int ext3_fill_super (struct super NULL, 0)) goto failed_mount; @@ -3397,21 +3231,22 @@ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); -@@ -2428,6 +2448,12 @@ static int ext3_remount (struct super_bl - +@@ -2432,6 +2452,13 @@ static int ext3_remount (struct super_bl if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + ext3_abort(sb, __func__, "Abort forced by user"); + + if ((sbi->s_mount_opt & EXT3_MOUNT_TAGGED) && + !(sb->s_flags & MS_TAGGED)) { + printk("EXT3-fs: %s: tagging not permitted on remount.\n", + sb->s_id); + return -EINVAL; + } - ++ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + --- a/fs/ext3/symlink.c 2008-04-17 10:32:27.000000000 -0400 -+++ a/fs/ext3/symlink.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/ext3/symlink.c 2008-07-16 22:41:36.000000000 -0400 @@ -40,6 +40,7 @@ const struct inode_operations ext3_symli .listxattr = ext3_listxattr, .removexattr = generic_removexattr, @@ -3426,8 +3261,8 @@ #endif + .sync_flags = ext3_sync_flags, }; ---- a/fs/ext3/xattr.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext3/xattr.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext3/xattr.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext3/xattr.c 2008-07-16 22:41:36.000000000 -0400 @@ -58,6 +58,7 @@ #include #include @@ -3436,7 +3271,7 @@ #include "xattr.h" #include "acl.h" -@@ -496,6 +497,7 @@ ext3_xattr_release_block(handle_t *handl +@@ -498,6 +499,7 @@ ext3_xattr_release_block(handle_t *handl error = ext3_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; @@ -3444,7 +3279,7 @@ DQUOT_FREE_BLOCK(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); -@@ -769,11 +771,14 @@ inserted: +@@ -771,11 +773,14 @@ inserted: if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { @@ -3469,18 +3304,18 @@ goto cleanup; bad_block: ---- a/fs/ext4/balloc.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/balloc.c 2008-04-21 10:23:33.000000000 -0400 -@@ -19,6 +19,8 @@ - #include +--- a/fs/ext4/balloc.c 2008-07-14 
17:22:49.000000000 -0400 ++++ a/fs/ext4/balloc.c 2008-07-17 20:27:00.000000000 -0400 +@@ -17,6 +17,8 @@ + #include #include #include +#include +#include - + #include "ext4.h" + #include "ext4_jbd2.h" #include "group.h" - /* -@@ -810,8 +812,10 @@ void ext4_free_blocks(handle_t *handle, +@@ -861,8 +863,10 @@ void ext4_free_blocks(handle_t *handle, else ext4_mb_free_blocks(handle, inode, block, count, metadata, &dquot_freed_blocks); @@ -3492,7 +3327,7 @@ return; } -@@ -1551,18 +1555,33 @@ out: +@@ -1602,18 +1606,33 @@ out: * * Check if filesystem has at least 1 free block available for allocation. */ @@ -3532,7 +3367,7 @@ } /** -@@ -1579,7 +1598,7 @@ static int ext4_has_free_blocks(struct e +@@ -1630,7 +1649,7 @@ static int ext4_has_free_blocks(struct e */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { @@ -3541,7 +3376,7 @@ return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); -@@ -1639,6 +1658,8 @@ ext4_fsblk_t ext4_new_blocks_old(handle_ +@@ -1690,6 +1709,8 @@ ext4_fsblk_t ext4_new_blocks_old(handle_ *errp = -EDQUOT; return 0; } @@ -3550,7 +3385,7 @@ sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; -@@ -1655,7 +1676,7 @@ ext4_fsblk_t ext4_new_blocks_old(handle_ +@@ -1706,7 +1727,7 @@ ext4_fsblk_t ext4_new_blocks_old(handle_ if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) my_rsv = &block_i->rsv_window_node; @@ -3559,7 +3394,7 @@ *errp = -ENOSPC; goto out; } -@@ -1841,12 +1862,16 @@ allocated: +@@ -1896,12 +1917,16 @@ allocated: *errp = 0; brelse(bitmap_bh); DQUOT_FREE_BLOCK(inode, *count-num); @@ -3576,8 +3411,40 @@ if (fatal) { *errp = fatal; ext4_std_error(sb, fatal); ---- a/fs/ext4/file.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/file.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ext4/ext4.h 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/ext4.h 2008-08-11 21:57:27.000000000 -0400 +@@ -230,8 +230,12 @@ struct ext4_group_desc + #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file 
*/ + #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ + #define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ ++#define EXT4_IXUNLINK_FL 0x01000000 /* Immutable invert on unlink */ + #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + ++#define EXT4_BARRIER_FL 0x10000000 /* Barrier for chroot() */ ++#define EXT4_COW_FL 0x20000000 /* Copy on Write marker */ ++ + #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ + #define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +@@ -527,6 +531,8 @@ do { \ + #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ + #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ + #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ ++#define EXT4_MOUNT_TAGGED (1<<24) /* Enable Context Tags */ ++ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt +@@ -1029,6 +1035,7 @@ int ext4_get_blocks_handle(handle_t *han + ext4_lblk_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize); ++extern int ext4_sync_flags(struct inode *inode); + + extern struct inode *ext4_iget(struct super_block *, unsigned long); + extern int ext4_write_inode (struct inode *, int); +--- a/fs/ext4/file.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/file.c 2008-07-29 18:35:50.000000000 -0400 @@ -152,5 +152,6 @@ const struct inode_operations ext4_file_ #endif .permission = ext4_permission, @@ -3585,18 +3452,18 @@ + .sync_flags = ext4_sync_flags, }; ---- a/fs/ext4/ialloc.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/ialloc.c 2008-04-21 10:29:28.000000000 -0400 -@@ -24,6 +24,8 @@ +--- a/fs/ext4/ialloc.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/ialloc.c 2008-07-16 22:41:36.000000000 -0400 +@@ -22,6 +22,8 @@ #include #include #include +#include +#include #include - - #include "xattr.h" -@@ 
-186,6 +188,7 @@ void ext4_free_inode (handle_t *handle, + #include "ext4.h" + #include "ext4_jbd2.h" +@@ -185,6 +187,7 @@ void ext4_free_inode (handle_t *handle, ext4_xattr_delete_inode(handle, inode); DQUOT_FREE_INODE(inode); DQUOT_DROP(inode); @@ -3604,7 +3471,7 @@ is_directory = S_ISDIR(inode->i_mode); -@@ -513,6 +516,12 @@ struct inode *ext4_new_inode(handle_t *h +@@ -510,6 +513,12 @@ struct inode *ext4_new_inode(handle_t *h inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -3617,17 +3484,17 @@ ei = EXT4_I(inode); sbi = EXT4_SB(sb); -@@ -707,7 +716,8 @@ got: +@@ -702,7 +711,8 @@ got: * newly created directory and file only if -o extent mount option is * specified */ - ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); + ei->i_flags = EXT4_I(dir)->i_flags & -+ ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL|EXT4_IUNLINK_FL|EXT4_BARRIER_FL); ++ ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL|EXT4_IXUNLINK_FL|EXT4_BARRIER_FL); if (S_ISLNK(mode)) ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); /* dirsync only applies to directories */ -@@ -766,6 +776,8 @@ got: +@@ -762,6 +772,8 @@ got: fail: ext4_std_error(sb, err); out: @@ -3636,7 +3503,7 @@ iput(inode); ret = ERR_PTR(err); really_out: -@@ -777,6 +789,7 @@ fail_free_drop: +@@ -773,6 +785,7 @@ fail_free_drop: fail_drop: DQUOT_DROP(inode); @@ -3644,17 +3511,17 @@ inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); ---- a/fs/ext4/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/inode.c 2008-04-21 10:20:53.000000000 -0400 -@@ -36,6 +36,7 @@ +--- a/fs/ext4/inode.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/inode.c 2008-07-17 20:26:11.000000000 -0400 +@@ -35,6 +35,7 @@ #include #include #include +#include + #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" - -@@ -2342,7 +2343,7 @@ void ext4_truncate(struct inode *inode) +@@ -2354,7 +2355,7 @@ void ext4_truncate(struct inode *inode) return; if (ext4_inode_is_fast_symlink(inode)) return; @@ -3663,20 +3530,18 @@ return; /* -@@ 
-2671,13 +2672,20 @@ void ext4_set_inode_flags(struct inode * +@@ -2669,37 +2670,86 @@ void ext4_set_inode_flags(struct inode * { unsigned int flags = EXT4_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); -+ inode->i_flags &= ~(S_IMMUTABLE | S_IUNLINK | S_BARRIER | ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); + + if (flags & EXT4_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; -+ if (flags & EXT4_IUNLINK_FL) -+ inode->i_flags |= S_IUNLINK; -+ if (flags & EXT4_BARRIER_FL) -+ inode->i_flags |= S_BARRIER; ++ if (flags & EXT4_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; + if (flags & EXT4_SYNC_FL) inode->i_flags |= S_SYNC; @@ -3687,54 +3552,79 @@ if (flags & EXT4_NOATIME_FL) inode->i_flags |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) -@@ -2702,6 +2710,46 @@ void ext4_get_inode_flags(struct ext4_in + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & EXT4_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & EXT4_COW_FL) ++ inode->i_vflags |= V_COW; + } + + /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ + void ext4_get_inode_flags(struct ext4_inode_info *ei) + { + unsigned int flags = ei->vfs_inode.i_flags; ++ unsigned int vflags = ei->vfs_inode.i_vflags; ++ ++ ei->i_flags &= ~(EXT4_SYNC_FL | EXT4_APPEND_FL | ++ EXT4_IMMUTABLE_FL | EXT4_IXUNLINK_FL | ++ EXT4_NOATIME_FL | EXT4_DIRSYNC_FL | ++ EXT4_BARRIER_FL | EXT4_COW_FL); ++ ++ if (flags & S_IMMUTABLE) ++ ei->i_flags |= EXT4_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ ei->i_flags |= EXT4_IXUNLINK_FL; + +- ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| +- EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); + if (flags & S_SYNC) + ei->i_flags |= EXT4_SYNC_FL; + if (flags & S_APPEND) + ei->i_flags |= EXT4_APPEND_FL; +- if (flags & S_IMMUTABLE) +- ei->i_flags |= EXT4_IMMUTABLE_FL; + if (flags & S_NOATIME) + ei->i_flags |= EXT4_NOATIME_FL; if (flags & S_DIRSYNC) ei->i_flags 
|= EXT4_DIRSYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ ei->i_flags |= EXT4_BARRIER_FL; ++ if (vflags & V_COW) ++ ei->i_flags |= EXT4_COW_FL; } + +int ext4_sync_flags(struct inode *inode) +{ -+ unsigned int oldflags, newflags; -+ int err = 0; ++ struct ext4_iloc iloc; ++ handle_t *handle; ++ int err; + -+ oldflags = EXT4_I(inode)->i_flags; -+ newflags = oldflags & ~(EXT4_IMMUTABLE_FL | -+ EXT4_IUNLINK_FL | EXT4_BARRIER_FL); ++ handle = ext4_journal_start(inode, 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ err = ext4_reserve_inode_write(handle, inode, &iloc); ++ if (err) ++ goto flags_err; + -+ if (IS_IMMUTABLE(inode)) -+ newflags |= EXT4_IMMUTABLE_FL; -+ if (IS_IUNLINK(inode)) -+ newflags |= EXT4_IUNLINK_FL; -+ if (IS_BARRIER(inode)) -+ newflags |= EXT4_BARRIER_FL; ++ ext4_get_inode_flags(EXT4_I(inode)); ++ inode->i_ctime = CURRENT_TIME; + -+ if (oldflags ^ newflags) { -+ handle_t *handle; -+ struct ext4_iloc iloc; -+ -+ handle = ext4_journal_start(inode, 1); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ err = ext4_reserve_inode_write(handle, inode, &iloc); -+ if (err) -+ goto flags_err; -+ -+ EXT4_I(inode)->i_flags = newflags; -+ inode->i_ctime = CURRENT_TIME; -+ -+ err = ext4_mark_iloc_dirty(handle, inode, &iloc); -+ flags_err: -+ ext4_journal_stop(handle); -+ } ++ err = ext4_mark_iloc_dirty(handle, inode, &iloc); ++flags_err: ++ ext4_journal_stop(handle); + return err; +} + static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { -@@ -2734,6 +2782,8 @@ struct inode *ext4_iget(struct super_blo +@@ -2732,6 +2782,8 @@ struct inode *ext4_iget(struct super_blo struct inode *inode; long ret; int block; @@ -3743,7 +3633,7 @@ inode = iget_locked(sb, ino); if (!inode) -@@ -2754,12 +2804,17 @@ struct inode *ext4_iget(struct super_blo +@@ -2752,12 +2804,17 @@ struct inode *ext4_iget(struct super_blo bh = iloc.bh; raw_inode = 
ext4_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); @@ -3765,7 +3655,7 @@ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); ei->i_state = 0; -@@ -2933,6 +2988,8 @@ static int ext4_do_update_inode(handle_t +@@ -2931,6 +2988,8 @@ static int ext4_do_update_inode(handle_t struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; @@ -3774,7 +3664,7 @@ int err = 0, rc, block; /* For fields not not tracking in the in-memory inode, -@@ -2943,29 +3000,32 @@ static int ext4_do_update_inode(handle_t +@@ -2941,29 +3000,32 @@ static int ext4_do_update_inode(handle_t ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); if(!(test_opt(inode->i_sb, NO_UID32))) { @@ -3813,7 +3703,7 @@ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); -@@ -3121,7 +3181,8 @@ int ext4_setattr(struct dentry *dentry, +@@ -3120,7 +3182,8 @@ int ext4_setattr(struct dentry *dentry, return error; if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || @@ -3823,7 +3713,7 @@ handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, -@@ -3143,6 +3204,8 @@ int ext4_setattr(struct dentry *dentry, +@@ -3142,6 +3205,8 @@ int ext4_setattr(struct dentry *dentry, inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; @@ -3832,96 +3722,57 @@ error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } ---- a/fs/ext4/ioctl.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/ioctl.c 2008-04-19 15:14:52.000000000 -0400 -@@ -8,6 +8,7 @@ +--- a/fs/ext4/ioctl.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/ioctl.c 2008-07-17 20:58:51.000000000 -0400 +@@ -8,12 +8,14 @@ */ #include +#include #include #include - #include -@@ -15,6 +16,7 @@ #include #include #include + #include +#include #include + #include "ext4_jbd2.h" + #include "ext4.h" +@@ -52,6 +54,11 @@ long ext4_ioctl(struct 
file *filp, unsig + if (!S_ISDIR(inode->i_mode)) + flags &= ~EXT4_DIRSYNC_FL; - int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, -@@ -38,7 +40,8 @@ int ext4_ioctl (struct inode * inode, st - unsigned int oldflags; - unsigned int jflag; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) -@@ -67,7 +70,9 @@ int ext4_ioctl (struct inode * inode, st ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + err = -EPERM; + mutex_lock(&inode->i_mutex); + /* Is it quota file? Do not allow user to mess with it */ +@@ -69,7 +76,9 @@ long ext4_ioctl(struct file *filp, unsig * * This test looks nicer. Thanks to Pauline Middelink */ - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if ((oldflags & EXT4_IMMUTABLE_FL) || + ((flags ^ oldflags) & (EXT4_APPEND_FL | -+ EXT4_IMMUTABLE_FL | EXT4_IUNLINK_FL))) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; -@@ -129,7 +134,8 @@ flags_err: - - if (!is_owner_or_cap(inode)) - return -EPERM; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; -@@ -183,7 +189,8 @@ flags_err: - if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) - return -ENOTTY; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) -@@ -218,7 +225,8 @@ flags_err: - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) -@@ -239,7 +247,8 @@ flags_err: - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) 
|| -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, ---- a/fs/ext4/namei.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/namei.c 2008-04-19 15:14:52.000000000 -0400 -@@ -36,6 +36,7 @@ ++ EXT4_IMMUTABLE_FL | EXT4_IXUNLINK_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } +--- a/fs/ext4/namei.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/namei.c 2008-07-17 20:24:56.000000000 -0400 +@@ -34,6 +34,7 @@ #include #include #include +#include + #include "ext4.h" + #include "ext4_jbd2.h" - #include "namei.h" - #include "xattr.h" -@@ -908,6 +909,7 @@ restart: +@@ -913,6 +914,7 @@ restart: if (bh) ll_rw_block(READ_META, 1, &bh); } @@ -3929,7 +3780,7 @@ } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; -@@ -2448,6 +2450,7 @@ const struct inode_operations ext4_dir_i +@@ -2458,6 +2460,7 @@ const struct inode_operations ext4_dir_i .removexattr = generic_removexattr, #endif .permission = ext4_permission, @@ -3937,15 +3788,15 @@ }; const struct inode_operations ext4_special_inode_operations = { -@@ -2459,4 +2462,5 @@ const struct inode_operations ext4_speci +@@ -2469,4 +2472,5 @@ const struct inode_operations ext4_speci .removexattr = generic_removexattr, #endif .permission = ext4_permission, + .sync_flags = ext4_sync_flags, }; ---- a/fs/ext4/super.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/super.c 2008-04-21 10:31:22.000000000 -0400 -@@ -887,6 +887,7 @@ enum { +--- a/fs/ext4/super.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/super.c 2008-07-17 21:00:22.000000000 -0400 +@@ -895,6 +895,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, Opt_mballoc, Opt_nomballoc, Opt_stripe, @@ -3953,20 +3804,17 @@ }; static match_table_t tokens = { -@@ -944,8 +945,11 @@ static match_table_t tokens = { - {Opt_mballoc, "mballoc"}, +@@ -953,6 +954,9 @@ static match_table_t tokens = { 
{Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, -- {Opt_err, NULL}, {Opt_resize, "resize"}, + {Opt_tag, "tag"}, + {Opt_notag, "notag"}, + {Opt_tagid, "tagid=%u"}, -+ {Opt_err, NULL}, + {Opt_err, NULL}, }; - static ext4_fsblk_t get_sb_block(void **data) -@@ -1037,6 +1041,20 @@ static int parse_options (char *options, +@@ -1045,6 +1049,20 @@ static int parse_options (char *options, case Opt_nouid32: set_opt (sbi->s_mount_opt, NO_UID32); break; @@ -3987,7 +3835,7 @@ case Opt_nocheck: clear_opt (sbi->s_mount_opt, CHECK); break; -@@ -1909,6 +1927,9 @@ static int ext4_fill_super (struct super +@@ -1932,6 +1950,9 @@ static int ext4_fill_super (struct super NULL, 0)) goto failed_mount; @@ -3997,21 +3845,22 @@ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); -@@ -2828,6 +2849,12 @@ static int ext4_remount (struct super_bl - +@@ -2875,6 +2896,13 @@ static int ext4_remount (struct super_bl if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) - ext4_abort(sb, __FUNCTION__, "Abort forced by user"); + ext4_abort(sb, __func__, "Abort forced by user"); + + if ((sbi->s_mount_opt & EXT4_MOUNT_TAGGED) && + !(sb->s_flags & MS_TAGGED)) { + printk("EXT4-fs: %s: tagging not permitted on remount.\n", + sb->s_id); + return -EINVAL; + } - ++ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); ---- a/fs/ext4/symlink.c 2008-04-17 10:32:27.000000000 -0400 -+++ a/fs/ext4/symlink.c 2008-04-19 15:14:52.000000000 -0400 + +--- a/fs/ext4/symlink.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/symlink.c 2008-07-16 22:41:36.000000000 -0400 @@ -40,6 +40,7 @@ const struct inode_operations ext4_symli .listxattr = ext4_listxattr, .removexattr = generic_removexattr, @@ -4026,17 +3875,17 @@ #endif + .sync_flags = ext4_sync_flags, }; ---- a/fs/ext4/xattr.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ext4/xattr.c 2008-04-19 15:14:52.000000000 -0400 -@@ -58,6 +58,7 @@ +--- a/fs/ext4/xattr.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ext4/xattr.c 2008-07-17 20:24:11.000000000 -0400 +@@ -56,6 +56,7 @@ #include #include #include +#include + #include "ext4_jbd2.h" + #include "ext4.h" #include "xattr.h" - #include "acl.h" - -@@ -489,6 +490,7 @@ ext4_xattr_release_block(handle_t *handl +@@ -490,6 +491,7 @@ ext4_xattr_release_block(handle_t *handl error = ext4_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; @@ -4044,7 +3893,7 @@ DQUOT_FREE_BLOCK(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); -@@ -779,11 +781,14 @@ inserted: +@@ -780,11 +782,14 @@ inserted: if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { @@ -4060,7 +3909,7 @@ error = ext4_journal_get_write_access(handle, new_bh); if (error) -@@ -860,6 +865,8 @@ cleanup: +@@ -858,6 +863,8 @@ cleanup: cleanup_dquot: DQUOT_FREE_BLOCK(inode, 1); @@ -4069,9 +3918,9 @@ goto cleanup; bad_block: ---- a/fs/fcntl.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/fcntl.c 2008-04-19 15:14:52.000000000 -0400 -@@ -19,6 +19,7 @@ +--- a/fs/fcntl.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/fcntl.c 2008-07-17 17:22:30.000000000 -0400 +@@ -20,6 +20,7 @@ #include #include #include @@ -4079,7 +3928,7 @@ #include #include -@@ -85,6 +86,8 @@ repeat: +@@ -88,6 +89,8 @@ repeat: error = -EMFILE; if (newfd >= 
current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; @@ -4088,15 +3937,20 @@ error = expand_files(files, newfd); if (error < 0) -@@ -128,6 +131,7 @@ static int dupfd(struct file *file, unsi - else - FD_CLR(fd, fdt->close_on_exec); - spin_unlock(&files->file_lock); -+ vx_openfd_inc(fd); +@@ -118,9 +121,10 @@ out: + static int dupfd(struct file *file, unsigned int start, int cloexec) + { + int fd = locate_fd(start, cloexec); +- if (fd >= 0) ++ if (fd >= 0) { fd_install(fd, file); - } else { - spin_unlock(&files->file_lock); -@@ -180,6 +184,9 @@ asmlinkage long sys_dup2(unsigned int ol +- else ++ vx_openfd_inc(fd); ++ } else + fput(file); + + return fd; +@@ -169,6 +173,9 @@ asmlinkage long sys_dup2(unsigned int ol if (tofree) filp_close(tofree, files); @@ -4106,9 +3960,28 @@ err = newfd; out: return err; ---- a/fs/file_table.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/file_table.c 2008-04-19 15:14:52.000000000 -0400 -@@ -20,6 +20,8 @@ +--- a/fs/file.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/file.c 2008-07-27 14:14:05.000000000 -0400 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + struct fdtable_defer { + spinlock_t lock; +@@ -357,6 +358,8 @@ struct files_struct *dup_fd(struct files + struct file *f = *old_fds++; + if (f) { + get_file(f); ++ /* TODO: sum it first for check and performance */ ++ vx_openfd_inc(open_files - i); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet +--- a/fs/file_table.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/file_table.c 2008-07-16 22:41:36.000000000 -0400 +@@ -21,6 +21,8 @@ #include #include #include @@ -4117,7 +3990,7 @@ #include -@@ -124,6 +126,8 @@ struct file *get_empty_filp(void) +@@ -126,6 +128,8 @@ struct file *get_empty_filp(void) f->f_gid = tsk->fsgid; eventpoll_init_file(f); /* f->f_version: 0 */ @@ -4126,16 +3999,16 @@ return f; over: -@@ -239,6 +243,8 @@ void __fput(struct file *file) - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); +@@ -276,6 +280,8 @@ void 
__fput(struct file *file) + cdev_put(inode->i_cdev); + fops_put(file->f_op); put_pid(file->f_owner.pid); + vx_files_dec(file); + file->f_xid = 0; file_kill(file); - file->f_path.dentry = NULL; - file->f_path.mnt = NULL; -@@ -304,6 +310,8 @@ void put_filp(struct file *file) + if (file->f_mode & FMODE_WRITE) + drop_file_write_access(file); +@@ -343,6 +349,8 @@ void put_filp(struct file *file) { if (atomic_dec_and_test(&file->f_count)) { security_file_free(file); @@ -4144,28 +4017,18 @@ file_kill(file); file_free(file); } ---- a/fs/hfsplus/ioctl.c 2008-04-17 10:37:23.000000000 -0400 -+++ a/fs/hfsplus/ioctl.c 2008-04-19 15:14:52.000000000 -0400 -@@ -16,6 +16,7 @@ - #include +--- a/fs/hfsplus/ioctl.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/hfsplus/ioctl.c 2008-07-16 22:41:36.000000000 -0400 +@@ -17,6 +17,7 @@ + #include #include #include +#include #include #include "hfsplus_fs.h" -@@ -35,7 +36,8 @@ int hfsplus_ioctl(struct inode *inode, s - flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */ - return put_user(flags, (int __user *)arg); - case HFSPLUS_IOC_EXT2_SETFLAGS: { -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) ---- a/fs/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/inode.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/inode.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -124,6 +124,9 @@ static struct inode *alloc_inode(struct struct address_space * const mapping = &inode->i_data; @@ -4193,7 +4056,7 @@ /** * clear_inode - clear an inode * @inode: inode to clear -@@ -1434,9 +1440,11 @@ void init_special_inode(struct inode *in +@@ -1426,9 +1432,11 @@ void init_special_inode(struct inode *in if (S_ISCHR(mode)) { inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; @@ -4205,8 +4068,8 @@ } else if (S_ISFIFO(mode)) inode->i_fop = &def_fifo_fops; else if (S_ISSOCK(mode)) ---- a/fs/ioctl.c 2008-04-17 12:05:40.000000000 
-0400 -+++ a/fs/ioctl.c 2008-04-21 09:25:46.000000000 -0400 +--- a/fs/ioctl.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/ioctl.c 2008-07-16 22:41:36.000000000 -0400 @@ -13,6 +13,9 @@ #include #include @@ -4218,7 +4081,7 @@ #include --- a/fs/ioprio.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ioprio.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/ioprio.c 2008-07-16 22:41:36.000000000 -0400 @@ -26,6 +26,7 @@ #include #include @@ -4246,7 +4109,7 @@ if (tmpio < 0) continue; --- a/fs/jfs/acl.c 2007-02-04 13:44:54.000000000 -0500 -+++ a/fs/jfs/acl.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/acl.c 2008-07-16 22:41:36.000000000 -0400 @@ -232,7 +232,8 @@ int jfs_setattr(struct dentry *dentry, s return rc; @@ -4258,7 +4121,7 @@ return -EDQUOT; } --- a/fs/jfs/file.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/file.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/file.c 2008-07-16 22:41:36.000000000 -0400 @@ -98,6 +98,7 @@ const struct inode_operations jfs_file_i .setattr = jfs_setattr, .permission = jfs_permission, @@ -4268,7 +4131,7 @@ const struct file_operations jfs_file_operations = { --- a/fs/jfs/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/inode.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -22,6 +22,7 @@ #include #include @@ -4285,56 +4148,59 @@ } clear_inode(inode); ---- a/fs/jfs/ioctl.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/ioctl.c 2008-04-21 09:25:22.000000000 -0400 -@@ -10,6 +10,7 @@ - #include +--- a/fs/jfs/ioctl.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/jfs/ioctl.c 2008-07-16 22:41:36.000000000 -0400 +@@ -11,6 +11,7 @@ + #include #include #include +#include #include #include -@@ -66,7 +67,8 @@ long jfs_ioctl(struct file *filp, unsign - case JFS_IOC_SETFLAGS: { - unsigned int oldflags; +@@ -85,6 +86,11 @@ long jfs_ioctl(struct file *filp, unsign + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ 
(filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) -@@ -94,8 +96,8 @@ long jfs_ioctl(struct file *filp, unsign ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } ++ + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; +@@ -102,8 +108,8 @@ long jfs_ioctl(struct file *filp, unsign * the relevant capability. */ if ((oldflags & JFS_IMMUTABLE_FL) || - ((flags ^ oldflags) & - (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { + ((flags ^ oldflags) & (JFS_APPEND_FL | -+ JFS_IMMUTABLE_FL | JFS_IUNLINK_FL))) { ++ JFS_IMMUTABLE_FL | JFS_IXUNLINK_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; --- a/fs/jfs/jfs_dinode.h 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/jfs_dinode.h 2008-04-19 15:14:52.000000000 -0400 -@@ -162,9 +162,12 @@ struct dinode { ++++ a/fs/jfs/jfs_dinode.h 2008-07-16 22:41:36.000000000 -0400 +@@ -161,9 +161,13 @@ struct dinode { + #define JFS_APPEND_FL 0x01000000 /* writes to file may only append */ #define JFS_IMMUTABLE_FL 0x02000000 /* Immutable file */ ++#define JFS_IXUNLINK_FL 0x04000000 /* Immutable invert on unlink */ -#define JFS_FL_USER_VISIBLE 0x03F80000 -+#define JFS_BARRIER_FL 0x04000000 /* Barrier for chroot() */ -+#define JFS_IUNLINK_FL 0x08000000 /* Immutable unlink */ +-#define JFS_FL_USER_MODIFIABLE 0x03F80000 ++#define JFS_BARRIER_FL 0x10000000 /* Barrier for chroot() */ ++#define JFS_COW_FL 0x20000000 /* Copy on Write marker */ + -+#define JFS_FL_USER_VISIBLE 0x0FF80000 - #define JFS_FL_USER_MODIFIABLE 0x03F80000 --#define JFS_FL_INHERIT 0x03C80000 -+#define JFS_FL_INHERIT 0x0BC80000 ++#define JFS_FL_USER_VISIBLE 0x07F80000 ++#define JFS_FL_USER_MODIFIABLE 0x07F80000 + #define JFS_FL_INHERIT 0x03C80000 /* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ - #define JFS_IOC_GETFLAGS _IOR('f', 1, long) --- a/fs/jfs/jfs_dtree.c 2008-04-17 
12:05:40.000000000 -0400 -+++ a/fs/jfs/jfs_dtree.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/jfs_dtree.c 2008-07-16 22:41:36.000000000 -0400 @@ -102,6 +102,7 @@ #include @@ -4447,7 +4313,7 @@ DQUOT_FREE_BLOCK(ip, xlen); --- a/fs/jfs/jfs_extent.c 2008-04-17 10:37:23.000000000 -0400 -+++ a/fs/jfs/jfs_extent.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/jfs_extent.c 2008-07-16 22:41:36.000000000 -0400 @@ -18,6 +18,7 @@ #include @@ -4510,7 +4376,7 @@ goto exit; } --- a/fs/jfs/jfs_filsys.h 2008-04-17 10:37:23.000000000 -0400 -+++ a/fs/jfs/jfs_filsys.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/jfs_filsys.h 2008-07-16 22:41:36.000000000 -0400 @@ -263,6 +263,7 @@ #define JFS_NAME_MAX 255 #define JFS_PATH_MAX BPSIZE @@ -4519,8 +4385,8 @@ /* * file system state (superblock state) ---- a/fs/jfs/jfs_imap.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/jfs_imap.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/jfs/jfs_imap.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/jfs/jfs_imap.c 2008-07-16 22:41:36.000000000 -0400 @@ -45,6 +45,7 @@ #include #include @@ -4529,7 +4395,7 @@ #include "jfs_incore.h" #include "jfs_inode.h" -@@ -3061,6 +3062,8 @@ static int copy_from_dinode(struct dinod +@@ -3058,6 +3059,8 @@ static int copy_from_dinode(struct dinod { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); @@ -4538,7 +4404,7 @@ jfs_ip->fileset = le32_to_cpu(dip->di_fileset); jfs_ip->mode2 = le32_to_cpu(dip->di_mode); -@@ -3081,14 +3084,18 @@ static int copy_from_dinode(struct dinod +@@ -3078,14 +3081,18 @@ static int copy_from_dinode(struct dinod } ip->i_nlink = le32_to_cpu(dip->di_nlink); @@ -4559,7 +4425,7 @@ if (sbi->gid == -1) ip->i_gid = jfs_ip->saved_gid; else { -@@ -3153,14 +3160,12 @@ static void copy_to_dinode(struct dinode +@@ -3150,14 +3157,12 @@ static void copy_to_dinode(struct dinode dip->di_size = cpu_to_le64(ip->i_size); dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); dip->di_nlink = 
cpu_to_le32(ip->i_nlink); @@ -4581,7 +4447,7 @@ /* * mode2 is only needed for storing the higher order bits. --- a/fs/jfs/jfs_inode.c 2008-04-17 10:33:02.000000000 -0400 -+++ a/fs/jfs/jfs_inode.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/jfs_inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -18,6 +18,8 @@ #include @@ -4591,21 +4457,19 @@ #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" -@@ -30,19 +32,47 @@ void jfs_set_inode_flags(struct inode *i +@@ -30,29 +32,46 @@ void jfs_set_inode_flags(struct inode *i { unsigned int flags = JFS_IP(inode)->mode2; - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | - S_NOATIME | S_DIRSYNC | S_SYNC); -+ inode->i_flags &= ~(S_IMMUTABLE | S_IUNLINK | S_BARRIER | ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); if (flags & JFS_IMMUTABLE_FL) inode->i_flags |= S_IMMUTABLE; -+ if (flags & JFS_IUNLINK_FL) -+ inode->i_flags |= S_IUNLINK; -+ if (flags & JFS_BARRIER_FL) -+ inode->i_flags |= S_BARRIER; ++ if (flags & JFS_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; + + if (flags & JFS_SYNC_FL) + inode->i_flags |= S_SYNC; @@ -4617,33 +4481,56 @@ inode->i_flags |= S_DIRSYNC; - if (flags & JFS_SYNC_FL) - inode->i_flags |= S_SYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & JFS_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & JFS_COW_FL) ++ inode->i_vflags |= V_COW; + } + + void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) + { + unsigned int flags = jfs_ip->vfs_inode.i_flags; ++ unsigned int vflags = jfs_ip->vfs_inode.i_vflags; ++ ++ jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_IXUNLINK_FL | ++ JFS_APPEND_FL | JFS_NOATIME_FL | ++ JFS_DIRSYNC_FL | JFS_SYNC_FL | ++ JFS_BARRIER_FL | JFS_COW_FL); + +- jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | +- JFS_DIRSYNC_FL | JFS_SYNC_FL); + if (flags & S_IMMUTABLE) + jfs_ip->mode2 |= JFS_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ jfs_ip->mode2 |= JFS_IXUNLINK_FL; ++ + if (flags 
& S_APPEND) + jfs_ip->mode2 |= JFS_APPEND_FL; + if (flags & S_NOATIME) +@@ -61,6 +80,19 @@ void jfs_get_inode_flags(struct jfs_inod + jfs_ip->mode2 |= JFS_DIRSYNC_FL; + if (flags & S_SYNC) + jfs_ip->mode2 |= JFS_SYNC_FL; ++ ++ if (vflags & V_BARRIER) ++ jfs_ip->mode2 |= JFS_BARRIER_FL; ++ if (vflags & V_COW) ++ jfs_ip->mode2 |= JFS_COW_FL; +} + +int jfs_sync_flags(struct inode *inode) +{ -+ unsigned int oldflags, newflags; -+ -+ oldflags = JFS_IP(inode)->mode2; -+ newflags = oldflags & ~(JFS_IMMUTABLE_FL | -+ JFS_IUNLINK_FL | JFS_BARRIER_FL); -+ -+ if (IS_IMMUTABLE(inode)) -+ newflags |= JFS_IMMUTABLE_FL; -+ if (IS_IUNLINK(inode)) -+ newflags |= JFS_IUNLINK_FL; -+ if (IS_BARRIER(inode)) -+ newflags |= JFS_BARRIER_FL; -+ -+ if (oldflags ^ newflags) { -+ JFS_IP(inode)->mode2 = newflags; -+ inode->i_ctime = CURRENT_TIME; -+ mark_inode_dirty(inode); -+ } ++ jfs_get_inode_flags(JFS_IP(inode)); ++ inode->i_ctime = CURRENT_TIME; ++ mark_inode_dirty(inode); + return 0; } - void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) -@@ -108,10 +138,17 @@ struct inode *ialloc(struct inode *paren + /* +@@ -108,10 +140,17 @@ struct inode *ialloc(struct inode *paren jfs_inode->saved_uid = inode->i_uid; jfs_inode->saved_gid = inode->i_gid; @@ -4662,7 +4549,7 @@ inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; --- a/fs/jfs/jfs_inode.h 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/jfs_inode.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/jfs_inode.h 2008-07-16 22:41:36.000000000 -0400 @@ -39,6 +39,7 @@ extern struct dentry *jfs_fh_to_dentry(s extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); @@ -4671,8 +4558,8 @@ extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations jfs_aops; ---- a/fs/jfs/jfs_xtree.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/jfs_xtree.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/jfs/jfs_xtree.c 2008-07-14 17:22:49.000000000 
-0400 ++++ a/fs/jfs/jfs_xtree.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,7 @@ #include @@ -4702,7 +4589,7 @@ DQUOT_FREE_BLOCK(ip, xlen); } return rc; -@@ -1236,6 +1243,7 @@ xtSplitPage(tid_t tid, struct inode *ip, +@@ -1232,6 +1239,7 @@ xtSplitPage(tid_t tid, struct inode *ip, struct tlock *tlck; struct xtlock *sxtlck = NULL, *rxtlck = NULL; int quota_allocation = 0; @@ -4710,7 +4597,7 @@ smp = split->mp; sp = XT_PAGE(ip, smp); -@@ -1255,6 +1263,13 @@ xtSplitPage(tid_t tid, struct inode *ip, +@@ -1251,6 +1259,13 @@ xtSplitPage(tid_t tid, struct inode *ip, quota_allocation += lengthPXD(pxd); @@ -4724,7 +4611,7 @@ /* * allocate the new right page for the split */ -@@ -1456,6 +1471,9 @@ xtSplitPage(tid_t tid, struct inode *ip, +@@ -1452,6 +1467,9 @@ xtSplitPage(tid_t tid, struct inode *ip, clean_up: @@ -4734,7 +4621,7 @@ /* Rollback quota allocation. */ if (quota_allocation) DQUOT_FREE_BLOCK(ip, quota_allocation); -@@ -1519,6 +1537,12 @@ xtSplitRoot(tid_t tid, +@@ -1515,6 +1533,12 @@ xtSplitRoot(tid_t tid, release_metapage(rmp); return -EDQUOT; } @@ -4747,7 +4634,7 @@ jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); -@@ -3948,6 +3972,8 @@ s64 xtTruncate(tid_t tid, struct inode * +@@ -3938,6 +3962,8 @@ s64 xtTruncate(tid_t tid, struct inode * else ip->i_size = newsize; @@ -4757,7 +4644,7 @@ DQUOT_FREE_BLOCK(ip, nfreed); --- a/fs/jfs/namei.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/namei.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/namei.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,7 @@ #include #include @@ -4783,7 +4670,7 @@ const struct file_operations jfs_dir_operations = { --- a/fs/jfs/super.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/jfs/super.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/super.c 2008-07-16 22:41:36.000000000 -0400 @@ -195,7 +195,8 @@ static void jfs_put_super(struct super_b enum { Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, @@ -4851,7 +4738,7 @@ if (newLVSize) { printk(KERN_ERR "resize 
option for remount only\n"); --- a/fs/jfs/xattr.c 2008-04-17 10:37:23.000000000 -0400 -+++ a/fs/jfs/xattr.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/jfs/xattr.c 2008-07-16 22:41:36.000000000 -0400 @@ -23,6 +23,7 @@ #include #include @@ -4929,8 +4816,8 @@ inode->i_ctime = CURRENT_TIME; ---- a/fs/libfs.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/libfs.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/libfs.c 2008-08-12 01:41:51.000000000 -0400 ++++ a/fs/libfs.c 2008-08-12 01:42:21.000000000 -0400 @@ -125,7 +125,8 @@ static inline unsigned char dt_type(stru * both impossible due to the lock on directory. */ @@ -4969,7 +4856,7 @@ ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { return -EISDIR; -@@ -778,6 +793,7 @@ EXPORT_SYMBOL(dcache_dir_close); +@@ -823,6 +838,7 @@ EXPORT_SYMBOL(dcache_dir_close); EXPORT_SYMBOL(dcache_dir_lseek); EXPORT_SYMBOL(dcache_dir_open); EXPORT_SYMBOL(dcache_readdir); @@ -4977,17 +4864,17 @@ EXPORT_SYMBOL(generic_read_dir); EXPORT_SYMBOL(get_sb_pseudo); EXPORT_SYMBOL(simple_write_begin); ---- a/fs/locks.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/fs/locks.c 2008-05-21 14:30:41.000000000 -0400 -@@ -126,6 +126,8 @@ +--- a/fs/locks.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/locks.c 2008-07-16 22:41:36.000000000 -0400 +@@ -127,6 +127,8 @@ #include #include #include +#include +#include - #include #include + @@ -148,6 +150,8 @@ static struct kmem_cache *filelock_cache /* Allocate an empty lock structure. 
*/ static struct file_lock *locks_alloc_lock(void) @@ -5013,7 +4900,7 @@ } EXPORT_SYMBOL(locks_init_lock); -@@ -246,6 +252,7 @@ void locks_copy_lock(struct file_lock *n +@@ -247,6 +253,7 @@ void locks_copy_lock(struct file_lock *n new->fl_file = fl->fl_file; new->fl_ops = fl->fl_ops; new->fl_lmops = fl->fl_lmops; @@ -5021,7 +4908,7 @@ locks_copy_private(new, fl); } -@@ -284,6 +291,11 @@ static int flock_make_lock(struct file * +@@ -285,6 +292,11 @@ static int flock_make_lock(struct file * fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; @@ -5033,7 +4920,7 @@ *lock = fl; return 0; -@@ -449,6 +461,7 @@ static int lease_init(struct file *filp, +@@ -450,6 +462,7 @@ static int lease_init(struct file *filp, fl->fl_owner = current->files; fl->fl_pid = current->tgid; @@ -5041,7 +4928,7 @@ fl->fl_file = filp; fl->fl_flags = FL_LEASE; -@@ -468,6 +481,11 @@ static struct file_lock *lease_alloc(str +@@ -469,6 +482,11 @@ static struct file_lock *lease_alloc(str if (fl == NULL) return ERR_PTR(error); @@ -5053,15 +4940,15 @@ error = lease_init(filp, type, fl); if (error) { locks_free_lock(fl); -@@ -774,6 +792,7 @@ static int flock_lock_file(struct file * +@@ -769,6 +787,7 @@ static int flock_lock_file(struct file * if (found) - cond_resched(); + cond_resched_bkl(); + new_fl->fl_xid = -1; find_conflict: for_each_lock(inode, before) { struct file_lock *fl = *before; -@@ -792,6 +811,7 @@ find_conflict: +@@ -787,6 +806,7 @@ find_conflict: goto out; locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); @@ -5069,7 +4956,7 @@ new_fl = NULL; error = 0; -@@ -802,7 +822,8 @@ out: +@@ -797,7 +817,8 @@ out: return error; } @@ -5079,7 +4966,7 @@ { struct file_lock *fl; struct file_lock *new_fl = NULL; -@@ -812,6 +833,8 @@ static int __posix_lock_file(struct inod +@@ -807,6 +828,8 @@ static int __posix_lock_file(struct inod struct file_lock **before; int error, added = 0; @@ -5088,7 +4975,7 @@ /* * We may need two file_lock structures for this operation, * 
so we get them in advance to avoid races. -@@ -822,7 +845,11 @@ static int __posix_lock_file(struct inod +@@ -817,7 +840,11 @@ static int __posix_lock_file(struct inod (request->fl_type != F_UNLCK || request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { new_fl = locks_alloc_lock(); @@ -5100,7 +4987,7 @@ } lock_kernel(); -@@ -1021,7 +1048,8 @@ static int __posix_lock_file(struct inod +@@ -1016,7 +1043,8 @@ static int __posix_lock_file(struct inod int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { @@ -5110,7 +4997,7 @@ } EXPORT_SYMBOL(posix_lock_file); -@@ -1111,7 +1139,7 @@ int locks_mandatory_area(int read_write, +@@ -1106,7 +1134,7 @@ int locks_mandatory_area(int read_write, fl.fl_end = offset + count - 1; for (;;) { @@ -5119,7 +5006,7 @@ if (error != -EAGAIN) break; if (!(fl.fl_flags & FL_SLEEP)) -@@ -1425,6 +1453,7 @@ int generic_setlease(struct file *filp, +@@ -1423,6 +1451,7 @@ int generic_setlease(struct file *filp, locks_copy_lock(new_fl, lease); locks_insert_lock(before, new_fl); @@ -5127,7 +5014,7 @@ *flp = new_fl; return 0; -@@ -1756,6 +1785,11 @@ int fcntl_setlk(unsigned int fd, struct +@@ -1753,6 +1782,11 @@ int fcntl_setlk(unsigned int fd, struct if (file_lock == NULL) return -ENOLCK; @@ -5139,7 +5026,7 @@ /* * This might block, so we do it before checking the inode. */ -@@ -1893,6 +1927,11 @@ int fcntl_setlk64(unsigned int fd, struc +@@ -1890,6 +1924,11 @@ int fcntl_setlk64(unsigned int fd, struc if (file_lock == NULL) return -ENOLCK; @@ -5151,7 +5038,7 @@ /* * This might block, so we do it before checking the inode. 
*/ -@@ -2176,8 +2215,11 @@ static int locks_show(struct seq_file *f +@@ -2173,8 +2212,11 @@ static int locks_show(struct seq_file *f lock_get_status(f, fl, (long)f->private, ""); @@ -5164,12 +5051,12 @@ f->private++; return 0; ---- a/fs/namei.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/namei.c 2008-04-22 19:23:22.000000000 -0400 -@@ -30,6 +30,13 @@ - #include +--- a/fs/namei.c 2008-08-12 01:41:51.000000000 -0400 ++++ a/fs/namei.c 2008-08-12 01:42:21.000000000 -0400 +@@ -31,6 +31,13 @@ #include #include + #include +#include +#include +#include @@ -5180,7 +5067,7 @@ #include #include -@@ -225,6 +232,28 @@ int generic_permission(struct inode *ino +@@ -226,6 +233,28 @@ int generic_permission(struct inode *ino return -EACCES; } @@ -5209,15 +5096,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) { int retval, submask; -@@ -239,14 +268,14 @@ int permission(struct inode *inode, int - /* - * Nobody gets write access to a read-only fs. - */ -- if (IS_RDONLY(inode) && -+ if ((IS_RDONLY(inode) || (nd && MNT_IS_RDONLY(nd->path.mnt))) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) - return -EROFS; - +@@ -247,7 +276,7 @@ int permission(struct inode *inode, int /* * Nobody gets write access to an immutable file. */ @@ -5226,7 +5105,7 @@ return -EACCES; } -@@ -261,6 +290,11 @@ int permission(struct inode *inode, int +@@ -262,6 +291,11 @@ int permission(struct inode *inode, int /* Ordinary permission routines do not understand MAY_APPEND. 
*/ submask = mask & ~MAY_APPEND; @@ -5238,7 +5117,7 @@ if (inode->i_op && inode->i_op->permission) { retval = inode->i_op->permission(inode, submask, nd); if (!retval) { -@@ -459,6 +493,8 @@ static int exec_permission_lite(struct i +@@ -464,6 +498,8 @@ static int exec_permission_lite(struct i { umode_t mode = inode->i_mode; @@ -5247,7 +5126,7 @@ if (inode->i_op && inode->i_op->permission) return -EAGAIN; -@@ -789,7 +825,8 @@ static __always_inline void follow_dotdo +@@ -800,7 +836,8 @@ static __always_inline void follow_dotdo if (nd->path.dentry == fs->root.dentry && nd->path.mnt == fs->root.mnt) { read_unlock(&fs->lock); @@ -5257,7 +5136,7 @@ } read_unlock(&fs->lock); spin_lock(&dcache_lock); -@@ -826,16 +863,39 @@ static int do_lookup(struct nameidata *n +@@ -837,16 +874,39 @@ static int do_lookup(struct nameidata *n { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry = __d_lookup(nd->path.dentry, name); @@ -5297,24 +5176,7 @@ need_lookup: dentry = real_lookup(nd->path.dentry, name, nd); -@@ -1464,7 +1524,8 @@ static inline int check_sticky(struct in - * 10. We don't allow removal of NFS sillyrenamed files; it's handled by - * nfs_async_unlink(). 
- */ --static int may_delete(struct inode *dir,struct dentry *victim,int isdir) -+static int may_delete(struct inode *dir, struct dentry *victim, -+ int isdir, struct nameidata *nd) - { - int error; - -@@ -1474,13 +1535,13 @@ static int may_delete(struct inode *dir, - BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(victim->d_name.name, victim, dir); - -- error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); -+ error = permission(dir,MAY_WRITE | MAY_EXEC, nd); - if (error) - return error; +@@ -1499,7 +1559,7 @@ static int may_delete(struct inode *dir, if (IS_APPEND(dir)) return -EPERM; if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| @@ -5323,24 +5185,24 @@ return -EPERM; if (isdir) { if (!S_ISDIR(victim->d_inode->i_mode)) -@@ -1626,6 +1687,14 @@ int may_open(struct nameidata *nd, int a - } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) - return -EROFS; +@@ -1644,6 +1704,14 @@ int may_open(struct nameidata *nd, int a + flag &= ~O_TRUNC; + } +#ifdef CONFIG_VSERVER_COWBL + if (IS_COW(inode) && (flag & FMODE_WRITE)) { + if (IS_COW_LINK(inode)) + return -EMLINK; -+ inode->i_flags &= ~(S_IUNLINK|S_IMMUTABLE); ++ inode->i_flags &= ~(S_IXUNLINK|S_IMMUTABLE); + mark_inode_dirty(inode); + } +#endif error = vfs_permission(nd, acc_mode); if (error) return error; -@@ -1717,6 +1786,11 @@ int open_namei(int dfd, const char *path - struct dentry *dir; - int count = 0; +@@ -1770,6 +1838,11 @@ struct file *do_filp_open(int dfd, const + int will_write; + int flag = open_to_namei_flags(open_flag); +#ifdef CONFIG_VSERVER_COWBL + int rflag = flag; @@ -5350,10 +5212,10 @@ acc_mode = ACC_MODE(flag); /* O_TRUNC implies we need access checks for write permissions */ -@@ -1810,6 +1884,22 @@ do_last: - goto exit; - ok: - error = may_open(nd, acc_mode, flag); +@@ -1893,6 +1966,22 @@ ok: + goto exit; + } + error = may_open(&nd, acc_mode, flag); +#ifdef CONFIG_VSERVER_COWBL + if (error == -EMLINK) { + struct dentry *dentry; @@ -5363,27 +5225,17 @@ + goto 
exit; + } + dput(dentry); -+ release_open_intent(nd); -+ path_put(&nd->path); ++ release_open_intent(&nd); ++ path_put(&nd.path); + flag = rflag; + mode = rmode; + goto restart; + } +#endif - if (error) - goto exit; - return 0; -@@ -1921,16 +2011,25 @@ fail: - } - EXPORT_SYMBOL_GPL(lookup_create); - --int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) -+int vfs_mknod(struct inode *dir, struct dentry *dentry, -+ int mode, dev_t dev, struct nameidata *nd) - { -- int error = may_create(dir, dentry, NULL); -+ int error = may_create(dir, dentry, nd); - + if (error) { + if (will_write) + mnt_drop_write(nd.path.mnt); +@@ -2045,9 +2134,17 @@ int vfs_mknod(struct inode *dir, struct if (error) return error; @@ -5402,131 +5254,7 @@ if (!dir->i_op || !dir->i_op->mknod) return -EPERM; -@@ -1973,11 +2072,12 @@ asmlinkage long sys_mknodat(int dfd, con - error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); - break; - case S_IFCHR: case S_IFBLK: -- error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode, -- new_decode_dev(dev)); -+ error = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, -+ new_decode_dev(dev), &nd); - break; - case S_IFIFO: case S_IFSOCK: -- error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); -+ error = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, -+ 0, &nd); - break; - case S_IFDIR: - error = -EPERM; -@@ -2000,9 +2100,10 @@ asmlinkage long sys_mknod(const char __u - return sys_mknodat(AT_FDCWD, filename, mode, dev); - } - --int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) -+int vfs_mkdir(struct inode *dir, struct dentry *dentry, -+ int mode, struct nameidata *nd) - { -- int error = may_create(dir, dentry, NULL); -+ int error = may_create(dir, dentry, nd); - - if (error) - return error; -@@ -2044,7 +2145,7 @@ asmlinkage long sys_mkdirat(int dfd, con - - if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; -- error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); -+ error = 
vfs_mkdir(nd.path.dentry->d_inode, dentry, mode, &nd); - dput(dentry); - out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); -@@ -2087,9 +2188,10 @@ void dentry_unhash(struct dentry *dentry - spin_unlock(&dcache_lock); - } - --int vfs_rmdir(struct inode *dir, struct dentry *dentry) -+int vfs_rmdir(struct inode *dir, struct dentry *dentry, -+ struct nameidata *nd) - { -- int error = may_delete(dir, dentry, 1); -+ int error = may_delete(dir, dentry, 1, nd); - - if (error) - return error; -@@ -2151,7 +2253,7 @@ static long do_rmdir(int dfd, const char - error = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto exit2; -- error = vfs_rmdir(nd.path.dentry->d_inode, dentry); -+ error = vfs_rmdir(nd.path.dentry->d_inode, dentry, &nd); - dput(dentry); - exit2: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); -@@ -2167,9 +2269,10 @@ asmlinkage long sys_rmdir(const char __u - return do_rmdir(AT_FDCWD, pathname); - } - --int vfs_unlink(struct inode *dir, struct dentry *dentry) -+int vfs_unlink(struct inode *dir, struct dentry *dentry, -+ struct nameidata *nd) - { -- int error = may_delete(dir, dentry, 0); -+ int error = may_delete(dir, dentry, 0, nd); - - if (error) - return error; -@@ -2232,7 +2335,7 @@ static long do_unlinkat(int dfd, const c - inode = dentry->d_inode; - if (inode) - atomic_inc(&inode->i_count); -- error = vfs_unlink(nd.path.dentry->d_inode, dentry); -+ error = vfs_unlink(nd.path.dentry->d_inode, dentry, &nd); - exit2: - dput(dentry); - } -@@ -2267,9 +2370,10 @@ asmlinkage long sys_unlink(const char __ - return do_unlinkat(AT_FDCWD, pathname); - } - --int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) -+int vfs_symlink(struct inode *dir, struct dentry *dentry, -+ const char *oldname, int mode, struct nameidata *nd) - { -- int error = may_create(dir, dentry, NULL); -+ int error = may_create(dir, dentry, nd); - - if (error) - return error; -@@ -2313,7 +2417,8 @@ asmlinkage long sys_symlinkat(const char - if 
(IS_ERR(dentry)) - goto out_unlock; - -- error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO); -+ error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, -+ S_IALLUGO, &nd); - dput(dentry); - out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); -@@ -2330,7 +2435,8 @@ asmlinkage long sys_symlink(const char _ - return sys_symlinkat(oldname, AT_FDCWD, newname); - } - --int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) -+int vfs_link(struct dentry *old_dentry, struct inode *dir, -+ struct dentry *new_dentry, struct nameidata *nd) - { - struct inode *inode = old_dentry->d_inode; - int error; -@@ -2338,7 +2444,7 @@ int vfs_link(struct dentry *old_dentry, - if (!inode) - return -ENOENT; - -- error = may_create(dir, new_dentry, NULL); -+ error = may_create(dir, new_dentry, nd); - if (error) - return error; - -@@ -2348,7 +2454,7 @@ int vfs_link(struct dentry *old_dentry, +@@ -2509,7 +2606,7 @@ int vfs_link(struct dentry *old_dentry, /* * A link to an append-only or immutable file cannot be created. 
*/ @@ -5535,44 +5263,7 @@ return -EPERM; if (!dir->i_op || !dir->i_op->link) return -EPERM; -@@ -2408,7 +2514,8 @@ asmlinkage long sys_linkat(int olddfd, c - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto out_unlock; -- error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry); -+ error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, -+ new_dentry, &nd); - dput(new_dentry); - out_unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); -@@ -2540,14 +2647,14 @@ int vfs_rename(struct inode *old_dir, st - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - -- error = may_delete(old_dir, old_dentry, is_dir); -+ error = may_delete(old_dir, old_dentry, is_dir, NULL); - if (error) - return error; - - if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry, NULL); - else -- error = may_delete(new_dir, new_dentry, is_dir); -+ error = may_delete(new_dir, new_dentry, is_dir, NULL); - if (error) - return error; - -@@ -2625,6 +2732,9 @@ static int do_rename(int olddfd, const c - error = -EINVAL; - if (old_dentry == trap) - goto exit4; -+ error = -EROFS; -+ if (MNT_IS_RDONLY(newnd.path.mnt)) -+ goto exit4; - new_dentry = lookup_hash(&newnd); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) -@@ -2718,6 +2828,214 @@ int vfs_follow_link(struct nameidata *nd +@@ -2889,6 +2986,214 @@ int vfs_follow_link(struct nameidata *nd return __vfs_follow_link(nd, link); } @@ -5750,7 +5441,7 @@ + goto out_redo; + + /* error path cleanup */ -+ vfs_unlink(dir->d_inode, new_path.dentry, &dir_nd); ++ vfs_unlink(dir->d_inode, new_path.dentry); + dput(new_path.dentry); + +out_redo: @@ -5787,12 +5478,12 @@ /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) { ---- a/fs/namespace.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/namespace.c 2008-04-21 17:20:53.000000000 -0400 -@@ -26,6 +26,11 @@ - #include +--- a/fs/namespace.c 2008-07-14 17:22:49.000000000 -0400 ++++ 
a/fs/namespace.c 2008-07-17 18:08:01.000000000 -0400 +@@ -27,6 +27,11 @@ #include #include + #include +#include +#include +#include @@ -5801,7 +5492,7 @@ #include #include #include "pnode.h" -@@ -244,6 +249,7 @@ static struct vfsmount *clone_mnt(struct +@@ -572,6 +577,7 @@ static struct vfsmount *clone_mnt(struct mnt->mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; @@ -5809,7 +5500,7 @@ if (flag & CL_SLAVE) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); -@@ -323,6 +329,31 @@ static inline void mangle(struct seq_fil +@@ -684,6 +690,31 @@ static inline void mangle(struct seq_fil seq_escape(m, s, " \t\n\\"); } @@ -5841,102 +5532,15 @@ /* * Simple .show_options callback for filesystems which don't want to * implement more complex mount option showing. -@@ -388,44 +419,61 @@ static int show_vfsmnt(struct seq_file * - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; - static struct proc_fs_info { -- int flag; -- char *str; -+ int s_flag; -+ int mnt_flag; -+ char *set_str; -+ char *unset_str; - } fs_info[] = { -- { MS_SYNCHRONOUS, ",sync" }, -- { MS_DIRSYNC, ",dirsync" }, -- { MS_MANDLOCK, ",mand" }, -- { 0, NULL } -- }; -- static struct proc_fs_info mnt_info[] = { -- { MNT_NOSUID, ",nosuid" }, -- { MNT_NODEV, ",nodev" }, -- { MNT_NOEXEC, ",noexec" }, -- { MNT_NOATIME, ",noatime" }, -- { MNT_NODIRATIME, ",nodiratime" }, -- { MNT_RELATIME, ",relatime" }, -- { 0, NULL } -+ { MS_RDONLY, MNT_RDONLY, "ro", "rw" }, -+ { MS_SYNCHRONOUS, 0, ",sync", NULL }, -+ { MS_DIRSYNC, 0, ",dirsync", NULL }, -+ { MS_MANDLOCK, 0, ",mand", NULL }, -+ { MS_TAGGED, 0, ",tag", NULL }, -+ { MS_NOATIME, MNT_NOATIME, ",noatime", NULL }, -+ { MS_NODIRATIME, MNT_NODIRATIME, ",nodiratime", NULL }, -+ { MS_RELATIME, MNT_RELATIME, ",relatime", NULL }, -+ { 0, MNT_NOSUID, ",nosuid", NULL }, -+ { 0, MNT_NODEV, ",nodev", NULL }, -+ { 0, MNT_NOEXEC, ",noexec", NULL }, -+ { 0, 0, NULL, NULL } +@@ -756,6 +787,7 @@ static void 
show_sb_opts(struct seq_file + { MS_SYNCHRONOUS, ",sync" }, + { MS_DIRSYNC, ",dirsync" }, + { MS_MANDLOCK, ",mand" }, ++ { MS_TAGGED, ",tag" }, + { 0, NULL } }; -- struct proc_fs_info *fs_infop; -- struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; -+ struct proc_fs_info *p; -+ unsigned long s_flags = mnt->mnt_sb->s_flags; -+ int mnt_flags = mnt->mnt_flags; - -- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); -- seq_putc(m, ' '); -- seq_path(m, &mnt_path, " \t\n\\"); -- seq_putc(m, ' '); -- mangle(m, mnt->mnt_sb->s_type->name); -- if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) { -- seq_putc(m, '.'); -- mangle(m, mnt->mnt_sb->s_subtype); -- } -- seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw"); -- for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { -- if (mnt->mnt_sb->s_flags & fs_infop->flag) -- seq_puts(m, fs_infop->str); -+ if (vx_flags(VXF_HIDE_MOUNT, 0)) -+ return 0; -+ if (!mnt_is_reachable(mnt) && !vx_check(0, VS_WATCH_P)) -+ return 0; -+ -+ if (!vx_check(0, VS_ADMIN|VS_WATCH) && -+ mnt == current->fs->root.mnt) { -+ seq_puts(m, "/dev/root / "); -+ } else { -+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; -+ mangle(m, mnt->mnt_devname ? 
mnt->mnt_devname : "none"); -+ seq_putc(m, ' '); -+ seq_path(m, &mnt_path, " \t\n\\"); -+ seq_putc(m, ' '); -+ -+ if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) { -+ seq_putc(m, '.'); -+ mangle(m, mnt->mnt_sb->s_subtype); -+ } - } -- for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { -- if (mnt->mnt_flags & fs_infop->flag) -- seq_puts(m, fs_infop->str); -+ mangle(m, mnt->mnt_sb->s_type->name); -+ seq_putc(m, ' '); -+ for (p = fs_info; (p->s_flag | p->mnt_flag) ; p++) { -+ if ((s_flags & p->s_flag) || (mnt_flags & p->mnt_flag)) { -+ if (p->set_str) -+ seq_puts(m, p->set_str); -+ } else { -+ if (p->unset_str) -+ seq_puts(m, p->unset_str); -+ } - } -+ if (mnt->mnt_flags & MNT_TAGID) -+ seq_printf(m, ",tag=%d", mnt->mnt_tag); - if (mnt->mnt_sb->s_op->show_options) - err = mnt->mnt_sb->s_op->show_options(m, mnt); - seq_puts(m, " 0 0\n"); -@@ -445,17 +493,27 @@ static int show_vfsstat(struct seq_file + const struct proc_fs_info *fs_infop; +@@ -885,17 +917,27 @@ static int show_vfsstat(struct seq_file struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err = 0; @@ -5974,7 +5578,7 @@ /* file system type */ seq_puts(m, "with fstype "); -@@ -693,7 +751,7 @@ asmlinkage long sys_umount(char __user * +@@ -1134,7 +1176,7 @@ asmlinkage long sys_umount(char __user * goto dput_and_out; retval = -EPERM; @@ -5983,7 +5587,7 @@ goto dput_and_out; retval = do_umount(nd.path.mnt, flags); -@@ -719,7 +777,7 @@ asmlinkage long sys_oldumount(char __use +@@ -1160,7 +1202,7 @@ asmlinkage long sys_oldumount(char __use static int mount_is_safe(struct nameidata *nd) { @@ -5992,7 +5596,7 @@ return 0; return -EPERM; #ifdef notyet -@@ -974,11 +1032,13 @@ static noinline int do_change_type(struc +@@ -1453,11 +1495,13 @@ static noinline int do_change_type(struc * noinline this do_mount helper to save do_mount stack space. 
*/ static noinline int do_loopback(struct nameidata *nd, char *old_name, @@ -6007,20 +5611,7 @@ if (err) return err; if (!old_name || !*old_name) -@@ -1004,6 +1064,12 @@ static noinline int do_loopback(struct n - if (!mnt) - goto out; - -+ mnt->mnt_flags = mnt_flags; -+ if (flags & MS_TAGID) { -+ mnt->mnt_tag = tag; -+ mnt->mnt_flags |= MNT_TAGID; -+ } -+ - err = graft_tree(mnt, nd); - if (err) { - LIST_HEAD(umount_list); -@@ -1012,6 +1078,7 @@ static noinline int do_loopback(struct n +@@ -1491,6 +1535,7 @@ static noinline int do_loopback(struct n spin_unlock(&vfsmount_lock); release_mounts(&umount_list); } @@ -6028,7 +5619,7 @@ out: up_write(&namespace_sem); -@@ -1026,12 +1093,12 @@ out: +@@ -1522,12 +1567,12 @@ static int change_mount_flags(struct vfs * noinline this do_mount helper to save do_mount stack space. */ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, @@ -6043,7 +5634,7 @@ return -EPERM; if (!check_mnt(nd->path.mnt)) -@@ -1069,7 +1136,7 @@ static noinline int do_move_mount(struct +@@ -1568,7 +1613,7 @@ static noinline int do_move_mount(struct struct path parent_path; struct vfsmount *p; int err = 0; @@ -6052,7 +5643,7 @@ return -EPERM; if (!old_name || !*old_name) return -EINVAL; -@@ -1152,7 +1219,7 @@ static noinline int do_new_mount(struct +@@ -1651,7 +1696,7 @@ static noinline int do_new_mount(struct return -EINVAL; /* we need capabilities... 
*/ @@ -6061,7 +5652,7 @@ return -EPERM; mnt = do_kern_mount(type, flags, name, data); -@@ -1397,6 +1464,7 @@ long do_mount(char *dev_name, char *dir_ +@@ -1896,6 +1941,7 @@ long do_mount(char *dev_name, char *dir_ struct nameidata nd; int retval = 0; int mnt_flags = 0; @@ -6069,7 +5660,7 @@ /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) -@@ -1412,7 +1480,17 @@ long do_mount(char *dev_name, char *dir_ +@@ -1911,6 +1957,14 @@ long do_mount(char *dev_name, char *dir_ if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; @@ -6082,21 +5673,18 @@ + } + /* Separate the per-mountpoint flags */ -+ if (flags & MS_RDONLY) -+ mnt_flags |= MNT_RDONLY; if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; - if (flags & MS_NODEV) -@@ -1426,6 +1504,8 @@ long do_mount(char *dev_name, char *dir_ - if (flags & MS_RELATIME) - mnt_flags |= MNT_RELATIME; +@@ -1927,6 +1981,8 @@ long do_mount(char *dev_name, char *dir_ + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; + if (!capable(CAP_SYS_ADMIN)) + mnt_flags |= MNT_NODEV; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); -@@ -1440,9 +1520,9 @@ long do_mount(char *dev_name, char *dir_ +@@ -1942,9 +1998,9 @@ long do_mount(char *dev_name, char *dir_ if (flags & MS_REMOUNT) retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, @@ -6108,7 +5696,7 @@ else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) -@@ -1515,6 +1595,7 @@ static struct mnt_namespace *dup_mnt_ns( +@@ -2017,6 +2073,7 @@ static struct mnt_namespace *dup_mnt_ns( q = next_mnt(q, new_ns->root); } up_write(&namespace_sem); @@ -6116,16 +5704,16 @@ if (rootmnt) mntput(rootmnt); -@@ -1850,5 +1931,6 @@ void __put_mnt_ns(struct mnt_namespace * +@@ -2349,5 +2406,6 @@ void __put_mnt_ns(struct mnt_namespace * spin_unlock(&vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); + atomic_dec(&vs_global_mnt_ns); 
kfree(ns); } ---- a/fs/nfs/client.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfs/client.c 2008-04-19 15:14:52.000000000 -0400 -@@ -589,6 +589,9 @@ static int nfs_init_server_rpcclient(str +--- a/fs/nfs/client.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/nfs/client.c 2008-07-16 22:41:36.000000000 -0400 +@@ -598,6 +598,9 @@ static int nfs_init_server_rpcclient(str if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; @@ -6135,7 +5723,7 @@ return 0; } -@@ -742,6 +745,10 @@ static void nfs_server_set_fsinfo(struct +@@ -763,6 +766,10 @@ static void nfs_server_set_fsinfo(struct server->acdirmin = server->acdirmax = 0; } @@ -6146,8 +5734,8 @@ server->maxfilesize = fsinfo->maxfilesize; /* We're airborne Set socket buffersize */ ---- a/fs/nfs/dir.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfs/dir.c 2008-04-21 16:52:03.000000000 -0400 +--- a/fs/nfs/dir.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/nfs/dir.c 2008-07-16 22:41:36.000000000 -0400 @@ -34,6 +34,7 @@ #include #include @@ -6164,18 +5752,8 @@ no_entry: res = d_materialise_unique(dentry, inode); if (res != NULL) { -@@ -967,7 +969,8 @@ static int is_atomic_open(struct inode * - if (nd->flags & LOOKUP_DIRECTORY) - return 0; - /* Are we trying to write to a read only partition? 
*/ -- if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) -+ if ((IS_RDONLY(dir) || MNT_IS_RDONLY(nd->path.mnt)) && -+ (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) - return 0; - return 1; - } ---- a/fs/nfs/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfs/inode.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/nfs/inode.c 2008-08-12 01:41:51.000000000 -0400 ++++ a/fs/nfs/inode.c 2008-08-12 01:42:21.000000000 -0400 @@ -37,6 +37,7 @@ #include #include @@ -6184,7 +5762,7 @@ #include #include -@@ -316,8 +317,10 @@ nfs_fhget(struct super_block *sb, struct +@@ -314,8 +315,10 @@ nfs_fhget(struct super_block *sb, struct nfsi->change_attr = fattr->change_attr; inode->i_size = nfs_size_to_loff_t(fattr->size); inode->i_nlink = fattr->nlink; @@ -6197,7 +5775,7 @@ if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { /* * report the blocks in 512byte units -@@ -410,6 +413,8 @@ void nfs_setattr_update_inode(struct ino +@@ -408,6 +411,8 @@ void nfs_setattr_update_inode(struct ino inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; @@ -6206,7 +5784,7 @@ spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); -@@ -841,6 +846,9 @@ static int nfs_check_inode_attributes(st +@@ -849,6 +854,9 @@ static int nfs_check_inode_attributes(st struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; @@ -6216,7 +5794,7 @@ /* Has the inode gone and changed behind our back? */ -@@ -865,10 +873,15 @@ static int nfs_check_inode_attributes(st +@@ -873,10 +881,15 @@ static int nfs_check_inode_attributes(st if (cur_size != new_isize && nfsi->npages == 0) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; @@ -6234,7 +5812,7 @@ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? 
*/ -@@ -989,6 +1002,9 @@ static int nfs_update_inode(struct inode +@@ -997,6 +1010,9 @@ static int nfs_update_inode(struct inode loff_t cur_isize, new_isize; unsigned long invalid = 0; unsigned long now = jiffies; @@ -6243,8 +5821,8 @@ + tag_t tag; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", - __FUNCTION__, inode->i_sb->s_id, inode->i_ino, -@@ -1062,15 +1078,21 @@ static int nfs_update_inode(struct inode + __func__, inode->i_sb->s_id, inode->i_ino, +@@ -1070,15 +1086,21 @@ static int nfs_update_inode(struct inode memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); nfsi->change_attr = fattr->change_attr; @@ -6270,8 +5848,8 @@ if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) { /* ---- a/fs/nfs/nfs3xdr.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfs/nfs3xdr.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/nfs/nfs3xdr.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/nfs/nfs3xdr.c 2008-07-16 22:41:36.000000000 -0400 @@ -22,6 +22,7 @@ #include #include @@ -6362,7 +5940,7 @@ *p++ = htonl(MAJOR(args->rdev)); *p++ = htonl(MINOR(args->rdev)); --- a/fs/nfs/nfsroot.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfs/nfsroot.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/nfs/nfsroot.c 2008-07-16 22:41:36.000000000 -0400 @@ -119,12 +119,12 @@ static int mount_port __initdata = 0; / enum { /* Options that take integer arguments */ @@ -6409,8 +5987,8 @@ default: printk(KERN_WARNING "Root-NFS: unknown " "option: %s\n", p); ---- a/fs/nfs/super.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfs/super.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/nfs/super.c 2008-07-14 17:22:49.000000000 -0400 ++++ a/fs/nfs/super.c 2008-07-16 22:41:36.000000000 -0400 @@ -50,6 +50,7 @@ #include #include @@ -6419,7 +5997,7 @@ #include #include -@@ -458,6 +459,7 @@ static void nfs_show_mount_options(struc +@@ -502,6 +503,7 @@ static void nfs_show_mount_options(struc { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, { NFS_MOUNT_UNSHARED, 
",nosharecache", ""}, @@ -6427,17 +6005,17 @@ { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; ---- a/fs/nfsd/auth.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfsd/auth.c 2008-04-19 17:07:41.000000000 -0400 +--- a/fs/nfsd/auth.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/nfsd/auth.c 2008-07-17 17:19:21.000000000 -0400 @@ -10,6 +10,7 @@ #include #include #include +#include + #include "auth.h" int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) - { -@@ -54,19 +55,23 @@ int nfsd_setuser(struct svc_rqst *rqstp, +@@ -55,19 +56,23 @@ int nfsd_setuser(struct svc_rqst *rqstp, get_group_info(cred.cr_group_info); if (cred.cr_uid != (uid_t) -1) @@ -6465,7 +6043,7 @@ cap_drop_nfsd_set(current->cap_effective); } else { --- a/fs/nfsd/nfs3xdr.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfsd/nfs3xdr.c 2008-04-19 17:08:09.000000000 -0400 ++++ a/fs/nfsd/nfs3xdr.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,7 @@ #include #include @@ -6516,37 +6094,8 @@ if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); } else { ---- a/fs/nfsd/nfs4recover.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfsd/nfs4recover.c 2008-04-20 13:24:11.000000000 -0400 -@@ -154,7 +154,7 @@ nfsd4_create_clid_dir(struct nfs4_client - dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); - goto out_put; - } -- status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU); -+ status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU, NULL); - out_put: - dput(dentry); - out_unlock: -@@ -258,7 +258,7 @@ nfsd4_remove_clid_file(struct dentry *di - return -EINVAL; - } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); -- status = vfs_unlink(dir->d_inode, dentry); -+ status = vfs_unlink(dir->d_inode, dentry, NULL); - mutex_unlock(&dir->d_inode->i_mutex); - return status; - } -@@ -273,7 +273,7 @@ nfsd4_clear_clid_dir(struct dentry *dir, - * a kernel from the future.... 
*/ - nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); -- status = vfs_rmdir(dir->d_inode, dentry); -+ status = vfs_rmdir(dir->d_inode, dentry, NULL); - mutex_unlock(&dir->d_inode->i_mutex); - return status; - } ---- a/fs/nfsd/nfs4xdr.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfsd/nfs4xdr.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/nfsd/nfs4xdr.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/nfsd/nfs4xdr.c 2008-07-16 22:41:36.000000000 -0400 @@ -58,6 +58,7 @@ #include #include @@ -6555,7 +6104,7 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR -@@ -1759,14 +1760,18 @@ out_acl: +@@ -1745,14 +1746,18 @@ out_acl: WRITE32(stat.nlink); } if (bmval1 & FATTR4_WORD1_OWNER) { @@ -6577,7 +6126,7 @@ goto out_resource; if (status) --- a/fs/nfsd/nfsxdr.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfsd/nfsxdr.c 2008-04-20 13:23:36.000000000 -0400 ++++ a/fs/nfsd/nfsxdr.c 2008-07-16 22:41:36.000000000 -0400 @@ -15,6 +15,7 @@ #include #include @@ -6626,73 +6175,8 @@ if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { *p++ = htonl(NFS_MAXPATHLEN); ---- a/fs/nfsd/vfs.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/nfsd/vfs.c 2008-04-21 17:24:34.000000000 -0400 -@@ -1258,13 +1258,13 @@ nfsd_create(struct svc_rqst *rqstp, stru - host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); - break; - case S_IFDIR: -- host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); -+ host_err = vfs_mkdir(dirp, dchild, iap->ia_mode, NULL); - break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: -- host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); -+ host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev, NULL); - break; - default: - printk("nfsd: bad file type %o in nfsd_create\n", type); -@@ -1529,11 +1529,13 @@ nfsd_symlink(struct svc_rqst *rqstp, str - else { - strncpy(path_alloced, path, plen); - path_alloced[plen] = 0; -- host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); -+ host_err = 
vfs_symlink(dentry->d_inode, dnew, -+ path_alloced, mode, NULL); - kfree(path_alloced); - } - } else -- host_err = vfs_symlink(dentry->d_inode, dnew, path, mode); -+ host_err = vfs_symlink(dentry->d_inode, dnew, -+ path, mode, NULL); - - if (!host_err) { - if (EX_ISSYNC(fhp->fh_export)) -@@ -1592,7 +1594,7 @@ nfsd_link(struct svc_rqst *rqstp, struct - dold = tfhp->fh_dentry; - dest = dold->d_inode; - -- host_err = vfs_link(dold, dirp, dnew); -+ host_err = vfs_link(dold, dirp, dnew, NULL); - if (!host_err) { - if (EX_ISSYNC(ffhp->fh_export)) { - err = nfserrno(nfsd_sync_dir(ddir)); -@@ -1757,9 +1759,9 @@ nfsd_unlink(struct svc_rqst *rqstp, stru - host_err = -EPERM; - } else - #endif -- host_err = vfs_unlink(dirp, rdentry); -+ host_err = vfs_unlink(dirp, rdentry, NULL); - } else { /* It's RMDIR */ -- host_err = vfs_rmdir(dirp, rdentry); -+ host_err = vfs_rmdir(dirp, rdentry, NULL); - } - - dput(rdentry); -@@ -1876,7 +1878,8 @@ nfsd_permission(struct svc_rqst *rqstp, - */ - if (!(acc & MAY_LOCAL_ACCESS)) - if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { -- if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode)) -+ if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode) -+ || MNT_IS_RDONLY(exp->ex_path.mnt)) - return nfserr_rofs; - if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) - return nfserr_perm; ---- a/fs/ocfs2/dlm/dlmfs.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/dlm/dlmfs.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ocfs2/dlm/dlmfs.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/dlm/dlmfs.c 2008-07-16 22:41:36.000000000 -0400 @@ -43,6 +43,7 @@ #include #include @@ -6717,9 +6201,9 @@ inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ---- a/fs/ocfs2/dlmglue.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/dlmglue.c 2008-04-19 15:14:52.000000000 -0400 -@@ -1665,6 +1665,7 @@ static void __ocfs2_stuff_meta_lvb(struc +--- a/fs/ocfs2/dlmglue.c 2008-07-14 
17:22:50.000000000 -0400 ++++ a/fs/ocfs2/dlmglue.c 2008-07-16 22:41:36.000000000 -0400 +@@ -1769,6 +1769,7 @@ static void __ocfs2_stuff_meta_lvb(struc lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); lvb->lvb_iuid = cpu_to_be32(inode->i_uid); lvb->lvb_igid = cpu_to_be32(inode->i_gid); @@ -6727,7 +6211,7 @@ lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); lvb->lvb_iatime_packed = -@@ -1719,6 +1720,7 @@ static void ocfs2_refresh_inode_from_lvb +@@ -1823,6 +1824,7 @@ static void ocfs2_refresh_inode_from_lvb inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_gid = be32_to_cpu(lvb->lvb_igid); @@ -6735,8 +6219,8 @@ inode->i_mode = be16_to_cpu(lvb->lvb_imode); inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); ocfs2_unpack_timespec(&inode->i_atime, ---- a/fs/ocfs2/dlmglue.h 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/dlmglue.h 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/ocfs2/dlmglue.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/dlmglue.h 2008-07-16 22:41:36.000000000 -0400 @@ -46,7 +46,8 @@ struct ocfs2_meta_lvb { __be16 lvb_inlink; __be32 lvb_iattr; @@ -6747,9 +6231,9 @@ }; /* ocfs2_inode_lock_full() 'arg_flags' flags */ ---- a/fs/ocfs2/file.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/file.c 2008-04-19 15:14:52.000000000 -0400 -@@ -1054,13 +1054,15 @@ int ocfs2_setattr(struct dentry *dentry, +--- a/fs/ocfs2/file.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/file.c 2008-07-16 22:41:36.000000000 -0400 +@@ -1058,13 +1058,15 @@ int ocfs2_setattr(struct dentry *dentry, mlog(0, "uid change: %d\n", attr->ia_uid); if (attr->ia_valid & ATTR_GID) mlog(0, "gid change: %d\n", attr->ia_gid); @@ -6766,7 +6250,7 @@ if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); return 0; -@@ -2229,6 +2231,7 @@ const struct inode_operations ocfs2_file +@@ -2233,6 +2235,7 @@ const struct inode_operations ocfs2_file const struct inode_operations ocfs2_special_file_iops = { 
.setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -6775,7 +6259,7 @@ }; --- a/fs/ocfs2/inode.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/inode.c 2008-04-21 10:16:08.000000000 -0400 ++++ a/fs/ocfs2/inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -28,6 +28,7 @@ #include #include @@ -6792,46 +6276,116 @@ #include "journal.h" #include "namei.h" #include "suballoc.h" -@@ -79,6 +81,10 @@ void ocfs2_set_inode_flags(struct inode +@@ -74,11 +76,13 @@ void ocfs2_set_inode_flags(struct inode + { + unsigned int flags = OCFS2_I(inode)->ip_attr; + +- inode->i_flags &= ~(S_IMMUTABLE | ++ inode->i_flags &= ~(S_IMMUTABLE | S_IXUNLINK | + S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); if (flags & OCFS2_IMMUTABLE_FL) inode->i_flags |= S_IMMUTABLE; -+ if (flags & OCFS2_IUNLINK_FL) -+ inode->i_flags |= S_IUNLINK; -+ if (flags & OCFS2_BARRIER_FL) -+ inode->i_flags |= S_BARRIER; ++ if (flags & OCFS2_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; if (flags & OCFS2_SYNC_FL) inode->i_flags |= S_SYNC; -@@ -109,6 +115,27 @@ void ocfs2_get_inode_flags(struct ocfs2_ - oi->ip_attr |= OCFS2_DIRSYNC_FL; +@@ -88,25 +92,89 @@ void ocfs2_set_inode_flags(struct inode + inode->i_flags |= S_NOATIME; + if (flags & OCFS2_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; ++ ++ inode->i_vflags &= ~(V_BARRIER | V_COW); ++ ++ if (flags & OCFS2_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ if (flags & OCFS2_COW_FL) ++ inode->i_vflags |= V_COW; } -+int ocfs2_sync_flags(struct inode *inode) -+{ -+ unsigned int oldflags, newflags; + /* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ + void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) + { + unsigned int flags = oi->vfs_inode.i_flags; ++ unsigned int vflags = oi->vfs_inode.i_vflags; + -+ oldflags = OCFS2_I(inode)->ip_flags; -+ newflags = oldflags & ~(OCFS2_IMMUTABLE_FL | -+ OCFS2_IUNLINK_FL | OCFS2_BARRIER_FL); ++ oi->ip_attr &= ~(OCFS2_SYNC_FL | OCFS2_APPEND_FL | ++ OCFS2_IMMUTABLE_FL | OCFS2_IXUNLINK_FL | ++ OCFS2_NOATIME_FL | 
OCFS2_DIRSYNC_FL | ++ OCFS2_BARRIER_FL | OCFS2_COW_FL); + -+ if (IS_IMMUTABLE(inode)) -+ newflags |= OCFS2_IMMUTABLE_FL; -+ if (IS_IUNLINK(inode)) -+ newflags |= OCFS2_IUNLINK_FL; -+ if (IS_BARRIER(inode)) -+ newflags |= OCFS2_BARRIER_FL; ++ if (flags & S_IMMUTABLE) ++ oi->ip_attr |= OCFS2_IMMUTABLE_FL; ++ if (flags & S_IXUNLINK) ++ oi->ip_attr |= OCFS2_IXUNLINK_FL; + +- oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| +- OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); + if (flags & S_SYNC) + oi->ip_attr |= OCFS2_SYNC_FL; + if (flags & S_APPEND) + oi->ip_attr |= OCFS2_APPEND_FL; +- if (flags & S_IMMUTABLE) +- oi->ip_attr |= OCFS2_IMMUTABLE_FL; + if (flags & S_NOATIME) + oi->ip_attr |= OCFS2_NOATIME_FL; + if (flags & S_DIRSYNC) + oi->ip_attr |= OCFS2_DIRSYNC_FL; + -+ if (oldflags ^ newflags) -+ return ocfs2_set_inode_attr(inode, -+ newflags, OCFS2_FL_MASK); -+ return 0; ++ if (vflags & V_BARRIER) ++ oi->ip_attr |= OCFS2_BARRIER_FL; ++ if (vflags & V_COW) ++ oi->ip_attr |= OCFS2_COW_FL; +} + ++int ocfs2_sync_flags(struct inode *inode) ++{ ++ struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); ++ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); ++ handle_t *handle = NULL; ++ struct buffer_head *bh = NULL; ++ int status; ++ ++ mutex_lock(&inode->i_mutex); ++ ++ status = ocfs2_inode_lock(inode, &bh, 1); ++ if (status < 0) { ++ mlog_errno(status); ++ goto bail; ++ } ++ ++ status = -EROFS; ++ if (IS_RDONLY(inode)) ++ goto bail_unlock; ++ ++ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); ++ if (IS_ERR(handle)) { ++ status = PTR_ERR(handle); ++ mlog_errno(status); ++ goto bail_unlock; ++ } ++ ++ ocfs2_set_inode_flags(inode); ++ status = ocfs2_mark_inode_dirty(handle, inode, bh); ++ if (status < 0) ++ mlog_errno(status); ++ ++ ocfs2_commit_trans(osb, handle); ++bail_unlock: ++ ocfs2_inode_unlock(inode, 1); ++bail: ++ mutex_unlock(&inode->i_mutex); ++ ++ if (bh) ++ brelse(bh); ++ ++ mlog_exit(status); ++ return status; + } + struct inode 
*ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, - int sysfile_type) - { -@@ -219,6 +246,8 @@ int ocfs2_populate_inode(struct inode *i +@@ -219,6 +287,8 @@ int ocfs2_populate_inode(struct inode *i struct super_block *sb; struct ocfs2_super *osb; int status = -EINVAL; @@ -6840,7 +6394,7 @@ mlog_entry("(0x%p, size:%llu)\n", inode, (unsigned long long)le64_to_cpu(fe->i_size)); -@@ -254,8 +283,12 @@ int ocfs2_populate_inode(struct inode *i +@@ -254,8 +324,12 @@ int ocfs2_populate_inode(struct inode *i inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); inode->i_mode = le16_to_cpu(fe->i_mode); @@ -6855,7 +6409,7 @@ /* Fast symlinks will have i_size but no allocated clusters. */ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) -@@ -1230,8 +1263,11 @@ int ocfs2_mark_inode_dirty(handle_t *han +@@ -1230,8 +1304,11 @@ int ocfs2_mark_inode_dirty(handle_t *han fe->i_size = cpu_to_le64(i_size_read(inode)); fe->i_links_count = cpu_to_le16(inode->i_nlink); @@ -6869,7 +6423,7 @@ fe->i_mode = cpu_to_le16(inode->i_mode); fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); -@@ -1259,16 +1295,25 @@ leave: +@@ -1259,16 +1336,25 @@ leave: void ocfs2_refresh_inode(struct inode *inode, struct ocfs2_dinode *fe) { @@ -6898,7 +6452,7 @@ if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) inode->i_blocks = 0; --- a/fs/ocfs2/inode.h 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/inode.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/ocfs2/inode.h 2008-07-16 22:41:36.000000000 -0400 @@ -143,6 +143,7 @@ int ocfs2_aio_write(struct file *file, s void ocfs2_set_inode_flags(struct inode *inode); @@ -6907,9 +6461,9 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) { ---- a/fs/ocfs2/ioctl.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/ioctl.c 2008-04-19 15:14:52.000000000 -0400 -@@ -41,7 +41,7 @@ static int 
ocfs2_get_inode_attr(struct i +--- a/fs/ocfs2/ioctl.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/ioctl.c 2008-07-16 22:41:36.000000000 -0400 +@@ -42,7 +42,7 @@ static int ocfs2_get_inode_attr(struct i return status; } @@ -6918,8 +6472,20 @@ unsigned mask) { struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode); ---- a/fs/ocfs2/ioctl.h 2008-04-17 10:33:02.000000000 -0400 -+++ a/fs/ocfs2/ioctl.h 2008-04-19 15:14:52.000000000 -0400 +@@ -67,6 +67,11 @@ static int ocfs2_set_inode_attr(struct i + if (!S_ISDIR(inode->i_mode)) + flags &= ~OCFS2_DIRSYNC_FL; + ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ goto bail_unlock; ++ } ++ + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); +--- a/fs/ocfs2/ioctl.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/ioctl.h 2008-07-17 18:10:34.000000000 -0400 @@ -10,6 +10,9 @@ #ifndef OCFS2_IOCTL_H #define OCFS2_IOCTL_H @@ -6927,11 +6493,11 @@ +int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, + unsigned mask); + - int ocfs2_ioctl(struct inode * inode, struct file * filp, - unsigned int cmd, unsigned long arg); + long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); ---- a/fs/ocfs2/namei.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/namei.c 2008-04-19 15:14:52.000000000 -0400 + +--- a/fs/ocfs2/namei.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/namei.c 2008-07-17 18:13:38.000000000 -0400 @@ -40,6 +40,7 @@ #include #include @@ -6953,7 +6519,7 @@ @@ -425,13 +429,19 @@ static int ocfs2_mknod_locked(struct ocf fe->i_blkno = cpu_to_le64(fe_blkno); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); - fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); + fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); - fe->i_uid = cpu_to_le32(current->fsuid); + + tag = dx_current_fstag(osb->sb); @@ -6980,24 +6546,28 @@ + 
.sync_flags = ocfs2_sync_flags, .permission = ocfs2_permission, }; ---- a/fs/ocfs2/ocfs2_fs.h 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/ocfs2_fs.h 2008-04-19 15:14:52.000000000 -0400 -@@ -188,8 +188,12 @@ +--- a/fs/ocfs2/ocfs2_fs.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/ocfs2_fs.h 2008-07-16 22:41:36.000000000 -0400 +@@ -204,9 +204,14 @@ + #define OCFS2_NODUMP_FL (0x00000040) /* do not dump file */ #define OCFS2_NOATIME_FL (0x00000080) /* do not update atime */ #define OCFS2_DIRSYNC_FL (0x00010000) /* dirsync behaviour (directories only) */ ++#define OCFS2_IXUNLINK_FL (0x00020000) /* Immutable invert on unlink */ -+#define OCFS2_BARRIER_FL (0x04000000) /* Barrier for chroot() */ -+#define OCFS2_IUNLINK_FL (0x08000000) /* Immutable unlink */ +-#define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ +-#define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ ++#define OCFS2_BARRIER_FL (0x01000000) /* Barrier for chroot() */ ++#define OCFS2_COW_FL (0x02000000) /* Copy on Write marker */ + - #define OCFS2_FL_VISIBLE (0x000100FF) /* User visible flags */ - #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ -+#define OCFS2_FL_MASK (0x0F0100FF) ++#define OCFS2_FL_VISIBLE (0x000300FF) /* User visible flags */ ++#define OCFS2_FL_MODIFIABLE (0x000300FF) /* User modifiable flags */ ++#define OCFS2_FL_MASK (0x030300FF) /* * Extent record flags (e_node.leaf.flags) ---- a/fs/ocfs2/ocfs2.h 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/ocfs2.h 2008-04-21 10:15:31.000000000 -0400 -@@ -171,6 +171,7 @@ enum ocfs2_mount_options +--- a/fs/ocfs2/ocfs2.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/ocfs2.h 2008-07-16 22:41:36.000000000 -0400 +@@ -172,6 +172,7 @@ enum ocfs2_mount_options OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ @@ -7005,27 +6575,27 @@ }; #define 
OCFS2_OSB_SOFT_RO 0x0001 ---- a/fs/ocfs2/super.c 2008-04-17 12:05:40.000000000 -0400 -+++ a/fs/ocfs2/super.c 2008-04-23 08:31:10.000000000 -0400 +--- a/fs/ocfs2/super.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/ocfs2/super.c 2008-07-17 20:23:09.000000000 -0400 @@ -154,6 +154,7 @@ enum { - Opt_commit, Opt_localalloc, Opt_localflocks, + Opt_stack, + Opt_tag, Opt_notag, Opt_tagid, Opt_err, }; -@@ -172,6 +173,9 @@ static match_table_t tokens = { - {Opt_commit, "commit=%u"}, +@@ -173,6 +174,9 @@ static match_table_t tokens = { {Opt_localalloc, "localalloc=%d"}, {Opt_localflocks, "localflocks"}, + {Opt_stack, "cluster_stack=%s"}, + {Opt_tag, "tag"}, + {Opt_notag, "notag"}, + {Opt_tagid, "tagid=%u"}, {Opt_err, NULL} }; -@@ -391,6 +395,13 @@ static int ocfs2_remount(struct super_bl +@@ -392,6 +396,13 @@ static int ocfs2_remount(struct super_bl goto out; } @@ -7039,7 +6609,7 @@ if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) { ret = -EINVAL; -@@ -691,6 +702,9 @@ static int ocfs2_fill_super(struct super +@@ -725,6 +736,9 @@ static int ocfs2_fill_super(struct super ocfs2_complete_mount_recovery(osb); @@ -7049,9 +6619,9 @@ if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else -@@ -864,6 +878,20 @@ static int ocfs2_parse_options(struct su - if (!is_remount) - mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; +@@ -918,6 +932,20 @@ static int ocfs2_parse_options(struct su + OCFS2_STACK_LABEL_LEN); + mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; break; +#ifndef CONFIG_TAGGING_NONE + case Opt_tag: @@ -7070,9 +6640,9 @@ default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " ---- a/fs/open.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/open.c 2008-04-21 13:51:52.000000000 -0400 -@@ -27,22 +27,31 @@ +--- a/fs/open.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/open.c 2008-07-17 18:02:08.000000000 -0400 +@@ -29,22 +29,31 @@ #include #include #include @@ -7106,69 +6676,19 @@ } return retval; } -@@ 
-249,7 +258,7 @@ static long do_sys_truncate(const char _ - goto dput_and_out; - - error = -EROFS; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || MNT_IS_RDONLY(nd.path.mnt)) - goto dput_and_out; - - error = -EPERM; -@@ -458,7 +467,7 @@ asmlinkage long sys_faccessat(int dfd, c - special_file(nd.path.dentry->d_inode->i_mode)) - goto out_path_release; - -- if(IS_RDONLY(nd.path.dentry->d_inode)) -+ if(IS_RDONLY(nd.path.dentry->d_inode) || MNT_IS_RDONLY(nd.path.mnt)) - res = -EROFS; - - out_path_release: -@@ -568,7 +577,7 @@ asmlinkage long sys_fchmod(unsigned int - audit_inode(NULL, dentry); - - err = -EROFS; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || MNT_IS_RDONLY(file->f_vfsmnt)) - goto out_putf; - err = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -@@ -598,11 +607,11 @@ asmlinkage long sys_fchmodat(int dfd, co +@@ -619,6 +628,11 @@ asmlinkage long sys_fchmodat(int dfd, co error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd); if (error) goto out; -- inode = nd.path.dentry->d_inode; -- -- error = -EROFS; -- if (IS_RDONLY(inode)) + + error = cow_check_and_break(&nd); + if (error) - goto dput_and_out; -+ inode = nd.path.dentry->d_inode; ++ goto dput_and_out; ++ + inode = nd.path.dentry->d_inode; - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -@@ -627,7 +636,8 @@ asmlinkage long sys_chmod(const char __u - return sys_fchmodat(AT_FDCWD, filename, mode); - } - --static int chown_common(struct dentry * dentry, uid_t user, gid_t group) -+static int chown_common(struct dentry *dentry, struct vfsmount *mnt, -+ uid_t user, gid_t group) - { - struct inode * inode; - int error; -@@ -639,7 +649,7 @@ static int chown_common(struct dentry * - goto out; - } - error = -EROFS; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || MNT_IS_RDONLY(mnt)) - goto out; - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -@@ -647,11 +657,11 @@ static int chown_common(struct dentry * + error = mnt_want_write(nd.path.mnt); +@@ -667,11 
+681,11 @@ static int chown_common(struct dentry * newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { newattrs.ia_valid |= ATTR_UID; @@ -7182,55 +6702,46 @@ } if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= -@@ -671,7 +681,11 @@ asmlinkage long sys_chown(const char __u - error = user_path_walk(filename, &nd); +@@ -694,7 +708,11 @@ asmlinkage long sys_chown(const char __u + error = mnt_want_write(nd.path.mnt); if (error) - goto out; + goto out_release; - error = chown_common(nd.path.dentry, user, group); +#ifdef CONFIG_VSERVER_COWBL + error = cow_check_and_break(&nd); + if (!error) +#endif -+ error = chown_common(nd.path.dentry, nd.path.mnt, user, group); ++ error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); + out_release: path_put(&nd.path); - out: - return error; -@@ -691,7 +705,11 @@ asmlinkage long sys_fchownat(int dfd, co - error = __user_walk_fd(dfd, filename, follow, &nd); +@@ -719,7 +737,11 @@ asmlinkage long sys_fchownat(int dfd, co + error = mnt_want_write(nd.path.mnt); if (error) - goto out; + goto out_release; - error = chown_common(nd.path.dentry, user, group); +#ifdef CONFIG_VSERVER_COWBL + error = cow_check_and_break(&nd); + if (!error) +#endif -+ error = chown_common(nd.path.dentry, nd.path.mnt, user, group); ++ error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); + out_release: path_put(&nd.path); - out: - return error; -@@ -705,7 +723,11 @@ asmlinkage long sys_lchown(const char __ - error = user_path_walk_link(filename, &nd); +@@ -738,7 +760,11 @@ asmlinkage long sys_lchown(const char __ + error = mnt_want_write(nd.path.mnt); if (error) - goto out; + goto out_release; - error = chown_common(nd.path.dentry, user, group); +#ifdef CONFIG_VSERVER_COWBL + error = cow_check_and_break(&nd); + if (!error) +#endif -+ error = chown_common(nd.path.dentry, nd.path.mnt, user, group); ++ error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); + out_release: 
path_put(&nd.path); - out: - return error; -@@ -724,7 +746,7 @@ asmlinkage long sys_fchown(unsigned int - - dentry = file->f_path.dentry; - audit_inode(NULL, dentry); -- error = chown_common(dentry, user, group); -+ error = chown_common(dentry, file->f_vfsmnt, user, group); - fput(file); - out: - return error; -@@ -971,6 +993,7 @@ repeat: +@@ -1016,6 +1042,7 @@ repeat: else FD_CLR(fd, fdt->close_on_exec); files->next_fd = fd + 1; @@ -7238,7 +6749,7 @@ #if 1 /* Sanity check */ if (fdt->fd[fd] != NULL) { -@@ -998,6 +1021,7 @@ static void __put_unused_fd(struct files +@@ -1043,6 +1070,7 @@ static void __put_unused_fd(struct files __FD_CLR(fd, fdt->open_fds); if (fd < files->next_fd) files->next_fd = fd; @@ -7246,9 +6757,9 @@ } void put_unused_fd(unsigned int fd) ---- a/fs/proc/array.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/proc/array.c 2008-05-21 14:08:19.000000000 -0400 -@@ -79,6 +79,8 @@ +--- a/fs/proc/array.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/proc/array.c 2008-07-17 17:40:35.000000000 -0400 +@@ -80,6 +80,8 @@ #include #include #include @@ -7257,7 +6768,7 @@ #include #include -@@ -140,8 +142,9 @@ static const char *task_state_array[] = +@@ -141,8 +143,9 @@ static const char *task_state_array[] = "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "T (tracing stop)", /* 8 */ @@ -7269,7 +6780,7 @@ }; static inline const char *get_task_state(struct task_struct *tsk) -@@ -162,6 +165,7 @@ static inline void task_state(struct seq +@@ -163,6 +166,7 @@ static inline void task_state(struct seq struct group_info *group_info; int g; struct fdtable *fdt = NULL; @@ -7277,7 +6788,7 @@ pid_t ppid, tpid; rcu_read_lock(); -@@ -169,6 +173,12 @@ static inline void task_state(struct seq +@@ -170,6 +174,12 @@ static inline void task_state(struct seq task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; tpid = pid_alive(p) && p->ptrace ? 
task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0; @@ -7290,7 +6801,31 @@ seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" -@@ -308,6 +318,45 @@ static inline void task_context_switch_c +@@ -281,7 +291,7 @@ static inline void task_sig(struct seq_f + } + + static void render_cap_t(struct seq_file *m, const char *header, +- kernel_cap_t *a) ++ struct vx_info *vxi, kernel_cap_t *a) + { + unsigned __capi; + +@@ -295,10 +305,10 @@ static void render_cap_t(struct seq_file + + static inline void task_cap(struct seq_file *m, struct task_struct *p) + { +- render_cap_t(m, "CapInh:\t", &p->cap_inheritable); +- render_cap_t(m, "CapPrm:\t", &p->cap_permitted); +- render_cap_t(m, "CapEff:\t", &p->cap_effective); +- render_cap_t(m, "CapBnd:\t", &p->cap_bset); ++ render_cap_t(m, "CapInh:\t", p->vx_info, &p->cap_inheritable); ++ render_cap_t(m, "CapPrm:\t", p->vx_info, &p->cap_permitted); ++ render_cap_t(m, "CapEff:\t", p->vx_info, &p->cap_effective); ++ render_cap_t(m, "CapBnd:\t", p->vx_info, &p->cap_bset); + } + + static inline void task_context_switch_counts(struct seq_file *m, +@@ -310,6 +320,45 @@ static inline void task_context_switch_c p->nivcsw); } @@ -7336,7 +6871,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { -@@ -323,6 +372,7 @@ int proc_pid_status(struct seq_file *m, +@@ -325,6 +374,7 @@ int proc_pid_status(struct seq_file *m, task_sig(m, task); task_cap(m, task); cpuset_task_status_allowed(m, task); @@ -7344,7 +6879,7 @@ #if defined(CONFIG_S390) task_show_regs(m, task); #endif -@@ -494,6 +544,17 @@ static int do_task_stat(struct seq_file +@@ -496,6 +546,17 @@ static int do_task_stat(struct seq_file /* convert nsec -> ticks */ start_time = nsec_to_clock_t(start_time); @@ -7362,9 +6897,9 @@ seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", ---- a/fs/proc/base.c 2008-04-17 
12:05:41.000000000 -0400 -+++ a/fs/proc/base.c 2008-05-21 14:08:19.000000000 -0400 -@@ -76,6 +76,8 @@ +--- a/fs/proc/base.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/proc/base.c 2008-07-16 22:41:36.000000000 -0400 +@@ -77,6 +77,8 @@ #include #include #include @@ -7373,7 +6908,7 @@ #include "internal.h" /* NOTE: -@@ -1290,6 +1292,8 @@ static struct inode *proc_pid_make_inode +@@ -1412,6 +1414,8 @@ static struct inode *proc_pid_make_inode inode->i_uid = task->euid; inode->i_gid = task->egid; } @@ -7382,7 +6917,7 @@ security_task_to_inode(task, inode); out: -@@ -1833,6 +1837,13 @@ static struct dentry *proc_pident_lookup +@@ -1953,6 +1957,13 @@ static struct dentry *proc_pident_lookup if (!task) goto out_no_task; @@ -7396,7 +6931,7 @@ /* * Yes, it does not scale. And it should not. Don't add * new entries into /proc// without very good reasons. -@@ -2220,7 +2231,7 @@ out_iput: +@@ -2340,7 +2351,7 @@ out_iput: static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) { struct dentry *error; @@ -7405,7 +6940,7 @@ const struct pid_entry *p, *last; error = ERR_PTR(-ENOENT); -@@ -2285,6 +2296,9 @@ static int proc_pid_io_accounting(struct +@@ -2405,6 +2416,9 @@ static int proc_pid_io_accounting(struct static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; @@ -7415,7 +6950,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, task), DIR("fd", S_IRUSR|S_IXUSR, fd), -@@ -2335,6 +2349,8 @@ static const struct pid_entry tgid_base_ +@@ -2456,6 +2470,8 @@ static const struct pid_entry tgid_base_ #ifdef CONFIG_CGROUPS REG("cgroup", S_IRUGO, cgroup), #endif @@ -7424,7 +6959,7 @@ INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL -@@ -2350,6 +2366,7 @@ static const struct pid_entry tgid_base_ +@@ -2471,6 +2487,7 @@ static const struct pid_entry tgid_base_ #ifdef CONFIG_TASK_IO_ACCOUNTING INF("io", 
S_IRUGO, pid_io_accounting), #endif @@ -7432,7 +6967,7 @@ }; static int proc_tgid_base_readdir(struct file * filp, -@@ -2547,7 +2564,7 @@ retry: +@@ -2667,7 +2684,7 @@ retry: iter.task = NULL; pid = find_ge_pid(iter.tgid, ns); if (pid) { @@ -7441,7 +6976,7 @@ iter.task = pid_task(pid, PIDTYPE_PID); /* What we to know is if the pid we have find is the * pid of a thread_group_leader. Testing for task -@@ -2577,7 +2594,7 @@ static int proc_pid_fill_cache(struct fi +@@ -2697,7 +2714,7 @@ static int proc_pid_fill_cache(struct fi struct tgid_iter iter) { char name[PROC_NUMBUF]; @@ -7450,7 +6985,7 @@ return proc_fill_cache(filp, dirent, filldir, name, len, proc_pid_instantiate, iter.task, NULL); } -@@ -2586,7 +2603,7 @@ static int proc_pid_fill_cache(struct fi +@@ -2706,7 +2723,7 @@ static int proc_pid_fill_cache(struct fi int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; @@ -7459,7 +6994,7 @@ struct tgid_iter iter; struct pid_namespace *ns; -@@ -2606,6 +2623,8 @@ int proc_pid_readdir(struct file * filp, +@@ -2726,6 +2743,8 @@ int proc_pid_readdir(struct file * filp, iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { filp->f_pos = iter.tgid + TGID_OFFSET; @@ -7468,7 +7003,7 @@ if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { put_task_struct(iter.task); goto out; -@@ -2742,6 +2761,8 @@ static struct dentry *proc_task_lookup(s +@@ -2862,6 +2881,8 @@ static struct dentry *proc_task_lookup(s tid = name_to_int(dentry); if (tid == ~0U) goto out; @@ -7477,8 +7012,8 @@ ns = dentry->d_sb->s_fs_info; rcu_read_lock(); ---- a/fs/proc/generic.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/proc/generic.c 2008-04-20 14:23:26.000000000 -0400 +--- a/fs/proc/generic.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/proc/generic.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,7 @@ #include #include @@ -7487,25 +7022,25 @@ #include #include "internal.h" -@@ -389,6 +390,8 @@ struct dentry 
*proc_lookup_de(struct pro - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; +@@ -386,6 +387,8 @@ struct dentry *proc_lookup_de(struct pro + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; + if (!vx_hide_check(0, de->vx_flags)) + continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + unsigned int ino; -@@ -397,6 +400,8 @@ struct dentry *proc_lookup_de(struct pro - spin_unlock(&proc_subdir_lock); - error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); +@@ -394,6 +397,8 @@ struct dentry *proc_lookup_de(struct pro + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, ino, de); + /* generic proc entries belong to the host */ + inode->i_tag = 0; - goto out_unlock; - } + goto out_unlock; } -@@ -481,6 +486,8 @@ int proc_readdir_de(struct proc_dir_entr + } +@@ -474,6 +479,8 @@ int proc_readdir_de(struct proc_dir_entr /* filldir passes info to user space */ de_get(de); @@ -7514,7 +7049,7 @@ spin_unlock(&proc_subdir_lock); if (filldir(dirent, de->name, de->namelen, filp->f_pos, de->low_ino, de->mode >> 12) < 0) { -@@ -488,6 +495,7 @@ int proc_readdir_de(struct proc_dir_entr +@@ -481,6 +488,7 @@ int proc_readdir_de(struct proc_dir_entr goto out; } spin_lock(&proc_subdir_lock); @@ -7522,7 +7057,7 @@ filp->f_pos++; next = de->next; de_put(de); -@@ -602,6 +610,7 @@ static struct proc_dir_entry *__proc_cre +@@ -595,6 +603,7 @@ static struct proc_dir_entry *__proc_cre ent->nlink = nlink; atomic_set(&ent->count, 1); ent->pde_users = 0; @@ -7530,7 +7065,7 @@ spin_lock_init(&ent->pde_unload_lock); ent->pde_unload_completion = NULL; out: -@@ -624,7 +633,8 @@ struct proc_dir_entry *proc_symlink(cons +@@ -617,7 +626,8 @@ struct proc_dir_entry *proc_symlink(cons kfree(ent->data); kfree(ent); ent = NULL; @@ -7540,28 +7075,28 @@ } else { kfree(ent); ent 
= NULL; ---- a/fs/proc/inode.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/proc/inode.c 2008-04-19 15:14:52.000000000 -0400 -@@ -408,6 +408,8 @@ struct inode *proc_get_inode(struct supe - inode->i_uid = de->uid; - inode->i_gid = de->gid; - } +--- a/fs/proc/inode.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/proc/inode.c 2008-07-16 22:41:36.000000000 -0400 +@@ -405,6 +405,8 @@ struct inode *proc_get_inode(struct supe + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->vx_flags) + PROC_I(inode)->vx_flags = de->vx_flags; - if (de->size) - inode->i_size = de->size; - if (de->nlink) ---- a/fs/proc/internal.h 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/proc/internal.h 2008-05-21 14:08:19.000000000 -0400 + if (de->size) + inode->i_size = de->size; + if (de->nlink) +--- a/fs/proc/internal.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/proc/internal.h 2008-07-16 22:41:36.000000000 -0400 @@ -10,6 +10,7 @@ */ #include +#include + extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL - extern int proc_sys_init(void); -@@ -57,6 +58,9 @@ extern int proc_pid_status(struct seq_fi +@@ -55,6 +56,9 @@ extern int proc_pid_status(struct seq_fi struct pid *pid, struct task_struct *task); extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); @@ -7571,7 +7106,7 @@ extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); extern const struct file_operations proc_maps_operations; -@@ -76,11 +80,16 @@ static inline struct pid *proc_pid(struc +@@ -74,11 +78,16 @@ static inline struct pid *proc_pid(struc return PROC_I(inode)->pid; } @@ -7589,8 +7124,8 @@ static inline int proc_fd(struct inode *inode) { return PROC_I(inode)->fd; ---- a/fs/proc/proc_misc.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/proc/proc_misc.c 2008-04-23 11:56:24.000000000 -0400 +--- a/fs/proc/proc_misc.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/proc/proc_misc.c 2008-07-16 22:41:36.000000000 -0400 @@ -56,6 +56,8 
@@ #include #include "internal.h" @@ -7662,28 +7197,29 @@ cached = 0; get_vmalloc_info(&vmi); ---- a/fs/proc/root.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/proc/root.c 2008-04-19 15:14:52.000000000 -0400 -@@ -23,6 +23,9 @@ +--- a/fs/proc/root.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/proc/root.c 2008-07-17 17:31:52.000000000 -0400 +@@ -22,6 +22,10 @@ + #include "internal.h" - struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; +struct proc_dir_entry *proc_virtual; + +extern void proc_vx_init(void); - ++ static int proc_test_super(struct super_block *sb, void *data) { -@@ -138,6 +141,7 @@ void __init proc_root_init(void) - proc_device_tree_init(); + return sb->s_fs_info == data; +@@ -137,6 +141,7 @@ void __init proc_root_init(void) #endif - proc_bus = proc_mkdir("bus", NULL); -+ proc_vx_init(); + proc_mkdir("bus", NULL); proc_sys_init(); ++ proc_vx_init(); } ---- a/fs/quota.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/quota.c 2008-04-21 09:21:04.000000000 -0400 + static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +--- a/fs/quota.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/quota.c 2008-07-16 22:41:36.000000000 -0400 @@ -18,6 +18,7 @@ #include #include @@ -7692,7 +7228,7 @@ /* Check validity of generic quotactl commands */ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) -@@ -82,11 +83,11 @@ static int generic_quotactl_valid(struct +@@ -81,11 +82,11 @@ static int generic_quotactl_valid(struct if (cmd == Q_GETQUOTA) { if (((type == USRQUOTA && current->euid != id) || (type == GRPQUOTA && !in_egroup_p(id))) && @@ -7706,7 +7242,7 @@ return -EPERM; return 0; -@@ -133,10 +134,10 @@ static int xqm_quotactl_valid(struct sup +@@ -132,10 +133,10 @@ static int xqm_quotactl_valid(struct sup if (cmd == Q_XGETQUOTA) { if (((type == XQM_USRQUOTA && current->euid != id) || (type == XQM_GRPQUOTA && !in_egroup_p(id))) && @@ -7719,7 +7255,7 @@ return -EPERM; } -@@ -329,6 
+330,46 @@ static int do_quotactl(struct super_bloc +@@ -328,6 +329,46 @@ static int do_quotactl(struct super_bloc return 0; } @@ -7766,7 +7302,7 @@ /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon -@@ -346,6 +387,22 @@ static inline struct super_block *quotac +@@ -345,6 +386,22 @@ static inline struct super_block *quotac putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -7789,8 +7325,8 @@ sb = get_super(bdev); bdput(bdev); if (!sb) ---- a/fs/reiserfs/bitmap.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/reiserfs/bitmap.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/reiserfs/bitmap.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/reiserfs/bitmap.c 2008-07-16 22:41:36.000000000 -0400 @@ -13,6 +13,7 @@ #include #include @@ -7884,15 +7420,15 @@ return CARRY_ON; --- a/fs/reiserfs/file.c 2008-04-17 11:31:38.000000000 -0400 -+++ a/fs/reiserfs/file.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/reiserfs/file.c 2008-07-16 22:41:36.000000000 -0400 @@ -306,4 +306,5 @@ const struct inode_operations reiserfs_f .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, + .sync_flags = reiserfs_sync_flags, }; ---- a/fs/reiserfs/inode.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/reiserfs/inode.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/reiserfs/inode.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/reiserfs/inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -18,6 +18,8 @@ #include #include @@ -7902,7 +7438,7 @@ int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); -@@ -52,6 +54,7 @@ void reiserfs_delete_inode(struct inode +@@ -54,6 +56,7 @@ void reiserfs_delete_inode(struct inode * stat data deletion */ if (!err) DQUOT_FREE_INODE(inode); @@ -7910,7 +7446,7 @@ if (journal_end(&th, inode->i_sb, jbegin_count)) goto out; -@@ -1114,6 +1117,8 @@ static void init_inode(struct inode *ino +@@ -1116,6 +1119,8 @@ 
static void init_inode(struct inode *ino struct buffer_head *bh; struct item_head *ih; __u32 rdev; @@ -7919,7 +7455,7 @@ //int version = ITEM_VERSION_1; bh = PATH_PLAST_BUFFER(path); -@@ -1137,12 +1142,13 @@ static void init_inode(struct inode *ino +@@ -1139,12 +1144,13 @@ static void init_inode(struct inode *ino (struct stat_data_v1 *)B_I_PITEM(bh, ih); unsigned long blocks; @@ -7935,7 +7471,7 @@ inode->i_size = sd_v1_size(sd); inode->i_atime.tv_sec = sd_v1_atime(sd); inode->i_mtime.tv_sec = sd_v1_mtime(sd); -@@ -1184,11 +1190,12 @@ static void init_inode(struct inode *ino +@@ -1186,11 +1192,12 @@ static void init_inode(struct inode *ino // (directories and symlinks) struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); @@ -7950,7 +7486,7 @@ inode->i_mtime.tv_sec = sd_v2_mtime(sd); inode->i_atime.tv_sec = sd_v2_atime(sd); inode->i_ctime.tv_sec = sd_v2_ctime(sd); -@@ -1218,6 +1225,10 @@ static void init_inode(struct inode *ino +@@ -1220,6 +1227,10 @@ static void init_inode(struct inode *ino sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); } @@ -7961,7 +7497,7 @@ pathrelse(path); if (S_ISREG(inode->i_mode)) { inode->i_op = &reiserfs_file_inode_operations; -@@ -1240,13 +1251,15 @@ static void init_inode(struct inode *ino +@@ -1242,13 +1253,15 @@ static void init_inode(struct inode *ino static void inode2sd(void *sd, struct inode *inode, loff_t size) { struct stat_data *sd_v2 = (struct stat_data *)sd; @@ -7979,7 +7515,7 @@ set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); -@@ -1769,6 +1782,10 @@ int reiserfs_new_inode(struct reiserfs_t +@@ -1771,6 +1784,10 @@ int reiserfs_new_inode(struct reiserfs_t BUG_ON(!th->t_trans_id); @@ -7990,7 +7526,7 @@ if (DQUOT_ALLOC_INODE(inode)) { err = -EDQUOT; goto out_end_trans; -@@ -1954,6 +1971,9 @@ int reiserfs_new_inode(struct reiserfs_t +@@ -1956,6 +1973,9 @@ int reiserfs_new_inode(struct reiserfs_t DQUOT_FREE_INODE(inode); 
out_end_trans: @@ -8000,37 +7536,75 @@ journal_end(th, th->t_super, th->t_blocks_allocated); /* Drop can be outside and it needs more credits so it's better to have it outside */ DQUOT_DROP(inode); -@@ -2848,6 +2868,14 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, +@@ -2842,14 +2862,19 @@ int reiserfs_commit_write(struct file *f + void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) + { + if (reiserfs_attrs(inode->i_sb)) { +- if (sd_attrs & REISERFS_SYNC_FL) +- inode->i_flags |= S_SYNC; +- else +- inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; -+ if (sd_attrs & REISERFS_IUNLINK_FL) -+ inode->i_flags |= S_IUNLINK; ++ if (sd_attrs & REISERFS_IXUNLINK_FL) ++ inode->i_flags |= S_IXUNLINK; + else -+ inode->i_flags &= ~S_IUNLINK; -+ if (sd_attrs & REISERFS_BARRIER_FL) -+ inode->i_flags |= S_BARRIER; ++ inode->i_flags &= ~S_IXUNLINK; ++ ++ if (sd_attrs & REISERFS_SYNC_FL) ++ inode->i_flags |= S_SYNC; + else -+ inode->i_flags &= ~S_BARRIER; ++ inode->i_flags &= ~S_SYNC; if (sd_attrs & REISERFS_APPEND_FL) inode->i_flags |= S_APPEND; else -@@ -2870,6 +2898,14 @@ void i_attrs_to_sd_attrs(struct inode *i +@@ -2862,6 +2887,15 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, + REISERFS_I(inode)->i_flags |= i_nopack_mask; + else + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; ++ ++ if (sd_attrs & REISERFS_BARRIER_FL) ++ inode->i_vflags |= V_BARRIER; ++ else ++ inode->i_vflags &= ~V_BARRIER; ++ if (sd_attrs & REISERFS_COW_FL) ++ inode->i_vflags |= V_COW; ++ else ++ inode->i_vflags &= ~V_COW; + } + } + +@@ -2872,6 +2906,11 @@ void i_attrs_to_sd_attrs(struct inode *i *sd_attrs |= REISERFS_IMMUTABLE_FL; else *sd_attrs &= ~REISERFS_IMMUTABLE_FL; -+ if (inode->i_flags & S_IUNLINK) -+ *sd_attrs |= REISERFS_IUNLINK_FL; ++ if (inode->i_flags & S_IXUNLINK) ++ *sd_attrs |= REISERFS_IXUNLINK_FL; + else -+ *sd_attrs &= ~REISERFS_IUNLINK_FL; -+ if (inode->i_flags & S_BARRIER) -+ *sd_attrs |= 
REISERFS_BARRIER_FL; -+ else -+ *sd_attrs &= ~REISERFS_BARRIER_FL; ++ *sd_attrs &= ~REISERFS_IXUNLINK_FL; ++ if (inode->i_flags & S_SYNC) *sd_attrs |= REISERFS_SYNC_FL; else -@@ -3049,6 +3085,22 @@ static ssize_t reiserfs_direct_IO(int rw +@@ -2884,6 +2923,15 @@ void i_attrs_to_sd_attrs(struct inode *i + *sd_attrs |= REISERFS_NOTAIL_FL; + else + *sd_attrs &= ~REISERFS_NOTAIL_FL; ++ ++ if (inode->i_vflags & V_BARRIER) ++ *sd_attrs |= REISERFS_BARRIER_FL; ++ else ++ *sd_attrs &= ~REISERFS_BARRIER_FL; ++ if (inode->i_vflags & V_COW) ++ *sd_attrs |= REISERFS_COW_FL; ++ else ++ *sd_attrs &= ~REISERFS_COW_FL; + } + } + +@@ -3051,6 +3099,22 @@ static ssize_t reiserfs_direct_IO(int rw reiserfs_get_blocks_direct_io, NULL); } @@ -8053,7 +7627,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; -@@ -3102,9 +3154,11 @@ int reiserfs_setattr(struct dentry *dent +@@ -3104,9 +3168,11 @@ int reiserfs_setattr(struct dentry *dent } error = inode_change_ok(inode, attr); @@ -8066,7 +7640,7 @@ error = reiserfs_chown_xattrs(inode, attr); if (!error) { -@@ -3134,6 +3188,9 @@ int reiserfs_setattr(struct dentry *dent +@@ -3136,6 +3202,9 @@ int reiserfs_setattr(struct dentry *dent inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; @@ -8076,26 +7650,26 @@ mark_inode_dirty(inode); error = journal_end(&th, inode->i_sb, jbegin_count); ---- a/fs/reiserfs/ioctl.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/fs/reiserfs/ioctl.c 2008-05-21 14:30:41.000000000 -0400 -@@ -6,6 +6,7 @@ - #include +--- a/fs/reiserfs/ioctl.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/reiserfs/ioctl.c 2008-07-17 17:18:29.000000000 -0400 +@@ -7,6 +7,7 @@ + #include #include #include +#include #include #include #include -@@ -22,7 +23,7 @@ +@@ -23,7 +24,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - unsigned int flags; + unsigned int flags, oldflags; + int err = 0; switch 
(cmd) { - case REISERFS_IOC_UNPACK: -@@ -41,12 +42,14 @@ int reiserfs_ioctl(struct inode *inode, +@@ -43,6 +44,7 @@ int reiserfs_ioctl(struct inode *inode, flags = REISERFS_I(inode)->i_attrs; i_attrs_to_sd_attrs(inode, (__u16 *) & flags); @@ -8103,17 +7677,20 @@ return put_user(flags, (int __user *)arg); case REISERFS_IOC_SETFLAGS:{ if (!reiserfs_attrs(inode->i_sb)) - return -ENOTTY; - -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - - if (!is_owner_or_cap(inode)) -@@ -72,6 +75,10 @@ int reiserfs_ioctl(struct inode *inode, - if (result) - return result; +@@ -60,6 +62,10 @@ int reiserfs_ioctl(struct inode *inode, + err = -EFAULT; + goto setflags_out; + } ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -EACCES; ++ } + /* + * Is it quota file? Do not allow user to mess with it + */ +@@ -84,6 +90,10 @@ int reiserfs_ioctl(struct inode *inode, + goto setflags_out; + } } + + oldflags = REISERFS_I(inode)->i_attrs; @@ -8122,18 +7699,8 @@ sd_attrs_to_i_attrs(flags, inode); REISERFS_I(inode)->i_attrs = flags; inode->i_ctime = CURRENT_TIME_SEC; -@@ -83,7 +90,8 @@ int reiserfs_ioctl(struct inode *inode, - case REISERFS_IOC_SETVERSION: - if (!is_owner_or_cap(inode)) - return -EPERM; -- if (IS_RDONLY(inode)) -+ if (IS_RDONLY(inode) || -+ (filp && MNT_IS_RDONLY(filp->f_vfsmnt))) - return -EROFS; - if (get_user(inode->i_generation, (int __user *)arg)) - return -EFAULT; ---- a/fs/reiserfs/namei.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/reiserfs/namei.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/reiserfs/namei.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/reiserfs/namei.c 2008-07-16 22:41:36.000000000 -0400 @@ -17,6 +17,7 @@ #include #include @@ -8181,8 +7748,8 @@ + .sync_flags = reiserfs_sync_flags, }; ---- a/fs/reiserfs/stree.c 2008-04-17 11:31:38.000000000 -0400 -+++ a/fs/reiserfs/stree.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/reiserfs/stree.c 2008-07-14 
17:22:50.000000000 -0400 ++++ a/fs/reiserfs/stree.c 2008-07-16 22:41:36.000000000 -0400 @@ -55,6 +55,7 @@ #include #include @@ -8207,7 +7774,7 @@ } break; } -@@ -1735,6 +1738,7 @@ int reiserfs_cut_from_item(struct reiser +@@ -1734,6 +1737,7 @@ int reiserfs_cut_from_item(struct reiser "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, '?'); #endif @@ -8215,7 +7782,7 @@ DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); return n_ret_value; } -@@ -1976,6 +1980,11 @@ int reiserfs_paste_into_item(struct reis +@@ -1975,6 +1979,11 @@ int reiserfs_paste_into_item(struct reis pathrelse(p_s_search_path); return -EDQUOT; } @@ -8227,7 +7794,7 @@ init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size); #ifdef DISPLACE_NEW_PACKING_LOCALITIES -@@ -2028,6 +2037,7 @@ int reiserfs_paste_into_item(struct reis +@@ -2027,6 +2036,7 @@ int reiserfs_paste_into_item(struct reis n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); #endif @@ -8235,7 +7802,7 @@ DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); return retval; } -@@ -2065,6 +2075,11 @@ int reiserfs_insert_item(struct reiserfs +@@ -2064,6 +2074,11 @@ int reiserfs_insert_item(struct reiserfs pathrelse(p_s_path); return -EDQUOT; } @@ -8247,7 +7814,7 @@ } init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih)); -@@ -2112,7 +2127,9 @@ int reiserfs_insert_item(struct reiserfs +@@ -2111,7 +2126,9 @@ int reiserfs_insert_item(struct reiserfs "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih)); #endif @@ -8258,8 +7825,8 @@ + } return retval; } ---- a/fs/reiserfs/super.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/fs/reiserfs/super.c 2008-05-21 14:30:41.000000000 -0400 +--- a/fs/reiserfs/super.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/reiserfs/super.c 2008-07-16 22:41:36.000000000 -0400 @@ -896,6 +896,14 @@ static int reiserfs_parse_options(struct {"user_xattr",.setmask = 1 
<< REISERFS_UNSUPPORTED_OPT}, {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, @@ -8299,8 +7866,8 @@ rs = SB_DISK_SUPER_BLOCK(s); /* Let's do basic sanity check to verify that underlying device is not smaller than the filesystem. If the check fails then abort and scream, ---- a/fs/reiserfs/xattr.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/reiserfs/xattr.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/reiserfs/xattr.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/reiserfs/xattr.c 2008-07-16 22:41:36.000000000 -0400 @@ -35,6 +35,7 @@ #include #include @@ -8309,17 +7876,8 @@ #include #include #include -@@ -747,7 +748,7 @@ int reiserfs_delete_xattrs(struct inode - if (dir->d_inode->i_nlink <= 2) { - root = get_xa_root(inode->i_sb, XATTR_REPLACE); - reiserfs_write_lock_xattrs(inode->i_sb); -- err = vfs_rmdir(root->d_inode, dir); -+ err = vfs_rmdir(root->d_inode, dir, NULL); - reiserfs_write_unlock_xattrs(inode->i_sb); - dput(root); - } else { --- a/fs/stat.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/stat.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/stat.c 2008-07-16 22:41:36.000000000 -0400 @@ -26,6 +26,7 @@ void generic_fillattr(struct inode *inod stat->nlink = inode->i_nlink; stat->uid = inode->i_uid; @@ -8328,19 +7886,19 @@ stat->rdev = inode->i_rdev; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; ---- a/fs/super.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/super.c 2008-04-19 15:14:52.000000000 -0400 -@@ -37,6 +37,9 @@ - #include +--- a/fs/super.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/super.c 2008-07-17 21:01:21.000000000 -0400 +@@ -38,6 +38,9 @@ #include #include + #include +#include +#include +#include #include + #include "internal.h" - -@@ -859,12 +862,18 @@ struct vfsmount * +@@ -886,12 +889,18 @@ struct vfsmount * vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct vfsmount *mnt; @@ -8359,7 +7917,7 @@ error = -ENOMEM; mnt = alloc_vfsmnt(name); if (!mnt) -@@ -885,7 +894,14 @@ 
vfs_kern_mount(struct file_system_type * +@@ -912,7 +921,14 @@ vfs_kern_mount(struct file_system_type * goto out_free_secdata; BUG_ON(!mnt->mnt_sb); @@ -8375,8 +7933,8 @@ if (error) goto out_sb; ---- a/fs/sysfs/mount.c 2008-04-17 11:31:38.000000000 -0400 -+++ a/fs/sysfs/mount.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/sysfs/mount.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/sysfs/mount.c 2008-07-16 22:41:36.000000000 -0400 @@ -19,8 +19,6 @@ #include "sysfs.h" @@ -8395,9 +7953,9 @@ sb->s_op = &sysfs_ops; sb->s_time_gran = 1; sysfs_sb = sb; ---- a/fs/utimes.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/fs/utimes.c 2008-05-21 14:30:41.000000000 -0400 -@@ -7,6 +7,8 @@ +--- a/fs/utimes.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/utimes.c 2008-07-16 22:41:36.000000000 -0400 +@@ -8,6 +8,8 @@ #include #include #include @@ -8406,7 +7964,7 @@ #include #include -@@ -89,6 +91,9 @@ long do_utimes(int dfd, char __user *fil +@@ -87,6 +89,9 @@ long do_utimes(int dfd, char __user *fil if (error) goto out; @@ -8414,11 +7972,11 @@ + if (error) + goto dput_and_out; dentry = nd.path.dentry; + mnt = nd.path.mnt; } - ---- a/fs/xattr.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/xattr.c 2008-04-21 13:52:50.000000000 -0400 -@@ -17,6 +17,7 @@ +--- a/fs/xattr.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xattr.c 2008-07-16 22:41:36.000000000 -0400 +@@ -18,6 +18,7 @@ #include #include #include @@ -8426,101 +7984,20 @@ #include -@@ -220,7 +221,7 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); - */ - static long - setxattr(struct dentry *d, char __user *name, void __user *value, -- size_t size, int flags) -+ size_t size, int flags, struct vfsmount *mnt) - { - int error; - void *kvalue = NULL; -@@ -247,6 +248,9 @@ setxattr(struct dentry *d, char __user * - } - } - -+ if (MNT_IS_RDONLY(mnt)) -+ return -EROFS; -+ - error = vfs_setxattr(d, kname, kvalue, size, flags); - kfree(kvalue); - return error; -@@ -262,7 +266,7 @@ sys_setxattr(char __user *path, char __u - error = user_path_walk(path, 
&nd); - if (error) - return error; -- error = setxattr(nd.path.dentry, name, value, size, flags); -+ error = setxattr(nd.path.dentry, name, value, size, flags, nd.path.mnt); - path_put(&nd.path); - return error; - } -@@ -277,7 +281,7 @@ sys_lsetxattr(char __user *path, char __ - error = user_path_walk_link(path, &nd); - if (error) - return error; -- error = setxattr(nd.path.dentry, name, value, size, flags); -+ error = setxattr(nd.path.dentry, name, value, size, flags, nd.path.mnt); - path_put(&nd.path); - return error; - } -@@ -295,7 +299,7 @@ sys_fsetxattr(int fd, char __user *name, - return error; - dentry = f->f_path.dentry; - audit_inode(NULL, dentry); -- error = setxattr(dentry, name, value, size, flags); -+ error = setxattr(dentry, name, value, size, flags, f->f_vfsmnt); - fput(f); - return error; - } -@@ -459,7 +463,7 @@ sys_flistxattr(int fd, char __user *list - * Extended attribute REMOVE operations - */ - static long --removexattr(struct dentry *d, char __user *name) -+removexattr(struct dentry *d, char __user *name, struct vfsmount *mnt) - { - int error; - char kname[XATTR_NAME_MAX + 1]; -@@ -470,6 +474,9 @@ removexattr(struct dentry *d, char __use - if (error < 0) - return error; - -+ if (MNT_IS_RDONLY(mnt)) -+ return -EROFS; -+ - return vfs_removexattr(d, kname); - } - -@@ -482,7 +489,7 @@ sys_removexattr(char __user *path, char - error = user_path_walk(path, &nd); - if (error) - return error; -- error = removexattr(nd.path.dentry, name); -+ error = removexattr(nd.path.dentry, name, nd.path.mnt); - path_put(&nd.path); - return error; - } -@@ -496,7 +503,7 @@ sys_lremovexattr(char __user *path, char - error = user_path_walk_link(path, &nd); - if (error) - return error; -- error = removexattr(nd.path.dentry, name); -+ error = removexattr(nd.path.dentry, name, nd.path.mnt); - path_put(&nd.path); - return error; - } -@@ -513,7 +520,7 @@ sys_fremovexattr(int fd, char __user *na - return error; - dentry = f->f_path.dentry; - audit_inode(NULL, dentry); -- 
error = removexattr(dentry, name); -+ error = removexattr(dentry, name, f->f_vfsmnt); - fput(f); - return error; - } ---- a/fs/xfs/linux-2.6/xfs_ioctl.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/xfs/linux-2.6/xfs_ioctl.c 2008-04-19 15:14:52.000000000 -0400 -@@ -1125,7 +1125,8 @@ xfs_merge_ioc_xflags( +--- a/fs/xfs/linux-2.6/xfs_ioctl.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/linux-2.6/xfs_ioctl.c 2008-08-11 23:04:52.000000000 -0400 +@@ -800,6 +800,10 @@ xfs_merge_ioc_xflags( + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; ++ if (flags & FS_IXUNLINK_FL) ++ xflags |= XFS_XFLAG_IXUNLINK; ++ else ++ xflags &= ~XFS_XFLAG_IXUNLINK; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else +@@ -822,12 +826,16 @@ xfs_merge_ioc_xflags( STATIC unsigned int xfs_di2lxflags( @@ -8530,29 +8007,57 @@ { unsigned int flags = 0; -@@ -1139,6 +1140,11 @@ xfs_di2lxflags( + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; ++ if (di_flags & XFS_DIFLAG_IXUNLINK) ++ flags |= FS_IXUNLINK_FL; ++ + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) +@@ -836,6 +844,11 @@ xfs_di2lxflags( flags |= FS_NOATIME_FL; if (di_flags & XFS_DIFLAG_NODUMP) flags |= FS_NODUMP_FL; + -+ if (di_vflags & XFS_DIVFLAG_IUNLINK) -+ flags |= FS_IUNLINK_FL; + if (di_vflags & XFS_DIVFLAG_BARRIER) + flags |= FS_BARRIER_FL; ++ if (di_vflags & XFS_DIVFLAG_COW) ++ flags |= FS_COW_FL; return flags; } -@@ -1219,7 +1225,7 @@ xfs_ioc_xattr( - } +@@ -916,7 +929,8 @@ xfs_ioc_getxflags( + { + unsigned int flags; - case XFS_IOC_GETXFLAGS: { -- flags = xfs_di2lxflags(ip->i_d.di_flags); -+ flags = xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_vflags); - if (copy_to_user(arg, &flags, sizeof(flags))) - error = -EFAULT; - break; ---- a/fs/xfs/linux-2.6/xfs_iops.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/xfs/linux-2.6/xfs_iops.c 2008-04-19 17:01:15.000000000 -0400 +- flags = xfs_di2lxflags(ip->i_d.di_flags); ++ flags = 
xfs_di2lxflags(ip->i_d.di_flags, ip->i_d.di_vflags); ++ flags &= ~(XFS_XFLAG_BARRIER | XFS_XFLAG_COW); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +@@ -1096,10 +1110,18 @@ xfs_ioctl( + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -XFS_ERROR(EACCES); ++ } + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: ++ if (IS_BARRIER(inode)) { ++ vxwprintk_task(1, "messing with the barrier."); ++ return -XFS_ERROR(EACCES); ++ } + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { +--- a/fs/xfs/linux-2.6/xfs_iops.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/linux-2.6/xfs_iops.c 2008-08-12 02:03:50.000000000 -0400 @@ -53,6 +53,7 @@ #include #include @@ -8561,15 +8066,23 @@ /* * Bring the atime in the XFS inode uptodate. -@@ -391,6 +392,7 @@ xfs_vn_lookup( +@@ -389,6 +390,7 @@ xfs_vn_lookup( d_add(dentry, NULL); return NULL; } + dx_propagate_tag(nd, vn_to_inode(cvp)); - return d_splice_alias(vn_to_inode(cvp), dentry); + return d_splice_alias(cip->i_vnode, dentry); } -@@ -655,6 +657,10 @@ xfs_vn_setattr( +@@ -619,6 +621,7 @@ xfs_vn_getattr( + stat->nlink = ip->i_d.di_nlink; + stat->uid = ip->i_d.di_uid; + stat->gid = ip->i_d.di_gid; ++ stat->tag = ip->i_d.di_tag; + stat->ino = ip->i_ino; + #if XFS_BIG_INUMS + stat->ino += mp->m_inoadd; +@@ -668,6 +671,10 @@ xfs_vn_setattr( int flags = 0; int error; @@ -8580,7 +8093,7 @@ if (ia_valid & ATTR_UID) { vattr.va_mask |= XFS_AT_UID; vattr.va_uid = attr->ia_uid; -@@ -663,6 +669,10 @@ xfs_vn_setattr( +@@ -676,6 +683,10 @@ xfs_vn_setattr( vattr.va_mask |= XFS_AT_GID; vattr.va_gid = attr->ia_gid; } @@ -8591,67 +8104,89 @@ if (ia_valid & ATTR_SIZE) { vattr.va_mask |= XFS_AT_SIZE; vattr.va_size = attr->ia_size; -@@ -708,6 +718,42 @@ xfs_vn_truncate( +@@ -728,6 +739,57 @@ 
xfs_vn_truncate( + WARN_ON(error); } - STATIC int -+xfs_vn_sync_flags(struct inode *inode) ++/* Propagate flags from i_flags to XFS_I(inode)->di_flags */ ++STATIC void ++xfs_get_inode_flags(struct inode *inode) +{ -+ unsigned int oldflags, newflags; -+ int flags = 0; -+ int error; -+ bhv_vattr_t vattr; -+ bhv_vnode_t *vp = vn_from_inode(inode); ++ xfs_inode_t *ip = XFS_I(inode); ++ unsigned int flags = inode->i_flags; ++ unsigned int vflags = inode->i_vflags; + -+ memset(&vattr, 0, sizeof vattr); ++ if (flags & S_IMMUTABLE) ++ ip->i_d.di_flags |= XFS_DIFLAG_IMMUTABLE; ++ else ++ ip->i_d.di_flags &= ~XFS_DIFLAG_IMMUTABLE; ++ if (flags & S_IXUNLINK) ++ ip->i_d.di_flags |= XFS_DIFLAG_IXUNLINK; ++ else ++ ip->i_d.di_flags &= ~XFS_DIFLAG_IXUNLINK; + -+ vattr.va_mask = XFS_AT_XFLAGS; -+ error = xfs_getattr(XFS_I(inode), &vattr, 0); -+ -+ if (error) -+ return error; -+ oldflags = vattr.va_xflags; -+ newflags = oldflags & ~(XFS_XFLAG_IMMUTABLE | -+ XFS_XFLAG_IUNLINK | XFS_XFLAG_BARRIER); -+ -+ if (IS_IMMUTABLE(inode)) -+ newflags |= XFS_XFLAG_IMMUTABLE; -+ if (IS_IUNLINK(inode)) -+ newflags |= XFS_XFLAG_IUNLINK; -+ if (IS_BARRIER(inode)) -+ newflags |= XFS_XFLAG_BARRIER; -+ -+ if (oldflags ^ newflags) { -+ vattr.va_xflags = newflags; -+ vattr.va_mask |= XFS_AT_XFLAGS; -+ error = xfs_setattr(XFS_I(inode), &vattr, flags, NULL); -+ } -+ vn_revalidate(vp); -+ return error; ++ if (vflags & V_BARRIER) ++ ip->i_d.di_vflags |= XFS_DIVFLAG_BARRIER; ++ else ++ ip->i_d.di_vflags &= ~XFS_DIVFLAG_BARRIER; ++ if (vflags & V_COW) ++ ip->i_d.di_vflags |= XFS_DIVFLAG_COW; ++ else ++ ip->i_d.di_vflags &= ~XFS_DIVFLAG_COW; +} + +STATIC int ++xfs_vn_sync_flags(struct inode *inode) ++{ ++ xfs_inode_t *ip = XFS_I(inode); ++ struct bhv_vattr *vattr; ++ int error; ++ ++ vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); ++ if (unlikely(!vattr)) ++ return -ENOMEM; ++ ++ xfs_get_inode_flags(inode); ++ ++ vattr->va_mask = XFS_AT_XFLAGS; ++ vattr->va_xflags = xfs_ip2xflags(ip); ++ ++ error = 
-xfs_setattr(ip, vattr, 0, NULL); ++ if (likely(!error)) ++ vn_revalidate(XFS_ITOV(ip)); /* update flags */ ++ ++ kfree(vattr); ++ return error; ++} ++ + STATIC int xfs_vn_setxattr( struct dentry *dentry, - const char *name, -@@ -881,6 +927,8 @@ const struct inode_operations xfs_dir_in +@@ -883,6 +945,7 @@ const struct inode_operations xfs_inode_ + .listxattr = xfs_vn_listxattr, + .removexattr = xfs_vn_removexattr, + .fallocate = xfs_vn_fallocate, ++ .sync_flags = xfs_vn_sync_flags, + }; + + const struct inode_operations xfs_dir_inode_operations = { +@@ -902,6 +965,7 @@ const struct inode_operations xfs_dir_in .getxattr = xfs_vn_getxattr, .listxattr = xfs_vn_listxattr, .removexattr = xfs_vn_removexattr, -+ .sync_flags = xfs_vn_sync_flags, + .sync_flags = xfs_vn_sync_flags, }; const struct inode_operations xfs_symlink_inode_operations = { -@@ -894,4 +942,5 @@ const struct inode_operations xfs_symlin +@@ -915,4 +979,5 @@ const struct inode_operations xfs_symlin .getxattr = xfs_vn_getxattr, .listxattr = xfs_vn_listxattr, .removexattr = xfs_vn_removexattr, + .sync_flags = xfs_vn_sync_flags, }; ---- a/fs/xfs/linux-2.6/xfs_linux.h 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/xfs/linux-2.6/xfs_linux.h 2008-04-19 15:14:52.000000000 -0400 -@@ -128,6 +128,7 @@ +--- a/fs/xfs/linux-2.6/xfs_linux.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/linux-2.6/xfs_linux.h 2008-07-16 22:41:36.000000000 -0400 +@@ -127,6 +127,7 @@ #define current_pid() (current->pid) #define current_fsuid(cred) (current->fsuid) #define current_fsgid(cred) (current->fsgid) @@ -8659,8 +8194,8 @@ #define current_test_flags(f) (current->flags & (f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) ---- a/fs/xfs/linux-2.6/xfs_super.c 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/xfs/linux-2.6/xfs_super.c 2008-04-26 09:51:47.000000000 -0400 +--- a/fs/xfs/linux-2.6/xfs_super.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/linux-2.6/xfs_super.c 2008-07-16 
22:41:36.000000000 -0400 @@ -137,6 +137,9 @@ xfs_args_allocate( #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ @@ -8699,7 +8234,7 @@ switch (inode->i_mode & S_IFMT) { case S_IFBLK: -@@ -612,6 +629,7 @@ xfs_revalidate_inode( +@@ -612,10 +629,16 @@ xfs_revalidate_inode( inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; @@ -8707,23 +8242,33 @@ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else -@@ -628,6 +646,15 @@ xfs_revalidate_inode( + inode->i_flags &= ~S_IMMUTABLE; ++ if (ip->i_d.di_flags & XFS_DIFLAG_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; ++ else ++ inode->i_flags &= ~S_IXUNLINK; ++ + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else +@@ -628,6 +651,16 @@ xfs_revalidate_inode( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; + -+ if (ip->i_d.di_vflags & XFS_DIVFLAG_IUNLINK) -+ inode->i_flags |= S_IUNLINK; -+ else -+ inode->i_flags &= ~S_IUNLINK; + if (ip->i_d.di_vflags & XFS_DIVFLAG_BARRIER) -+ inode->i_flags |= S_BARRIER; ++ inode->i_vflags |= V_BARRIER; + else -+ inode->i_flags &= ~S_BARRIER; ++ inode->i_vflags &= ~V_BARRIER; ++ if (ip->i_d.di_vflags & XFS_DIVFLAG_COW) ++ inode->i_vflags |= V_COW; ++ else ++ inode->i_vflags &= ~V_COW; ++ xfs_iflags_clear(ip, XFS_IMODIFIED); } -@@ -1220,6 +1247,12 @@ xfs_fs_remount( +@@ -1220,6 +1253,12 @@ xfs_fs_remount( int error; error = xfs_parseargs(mp, options, args, 1); @@ -8736,7 +8281,7 @@ if (!error) error = xfs_mntupdate(mp, flags, args); kmem_free(args, sizeof(*args)); -@@ -1336,6 +1369,9 @@ xfs_fs_fill_super( +@@ -1336,6 +1375,9 @@ xfs_fs_fill_super( if (error) goto fail_vfsop; @@ -8747,7 +8292,7 @@ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; --- a/fs/xfs/linux-2.6/xfs_vnode.c 2008-04-17 12:05:41.000000000 -0400 -+++ 
a/fs/xfs/linux-2.6/xfs_vnode.c 2008-04-19 17:04:58.000000000 -0400 ++++ a/fs/xfs/linux-2.6/xfs_vnode.c 2008-07-16 22:41:36.000000000 -0400 @@ -105,6 +105,7 @@ vn_revalidate( inode->i_mode = ip->i_d.di_mode; inode->i_uid = ip->i_d.di_uid; @@ -8756,24 +8301,36 @@ inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; -@@ -115,6 +116,14 @@ vn_revalidate( +@@ -115,6 +116,10 @@ vn_revalidate( inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; -+ if (xflags & XFS_XFLAG_IUNLINK) -+ inode->i_flags |= S_IUNLINK; ++ if (xflags & XFS_XFLAG_IXUNLINK) ++ inode->i_flags |= S_IXUNLINK; + else -+ inode->i_flags &= ~S_IUNLINK; -+ if (xflags & XFS_XFLAG_BARRIER) -+ inode->i_flags |= S_BARRIER; -+ else -+ inode->i_flags &= ~S_BARRIER; ++ inode->i_flags &= ~S_IXUNLINK; if (xflags & XFS_XFLAG_APPEND) inode->i_flags |= S_APPEND; else ---- a/fs/xfs/linux-2.6/xfs_vnode.h 2008-04-17 12:05:41.000000000 -0400 -+++ a/fs/xfs/linux-2.6/xfs_vnode.h 2008-04-19 15:14:52.000000000 -0400 -@@ -99,6 +99,7 @@ typedef struct bhv_vattr { +@@ -127,6 +132,15 @@ vn_revalidate( + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; ++ ++ if (xflags & XFS_XFLAG_BARRIER) ++ inode->i_vflags |= V_BARRIER; ++ else ++ inode->i_vflags &= ~V_BARRIER; ++ if (xflags & XFS_XFLAG_COW) ++ inode->i_vflags |= V_COW; ++ else ++ inode->i_vflags &= ~V_COW; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + xfs_iflags_clear(ip, XFS_IMODIFIED); +--- a/fs/xfs/linux-2.6/xfs_vnode.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/linux-2.6/xfs_vnode.h 2008-07-16 22:41:36.000000000 -0400 +@@ -76,6 +76,7 @@ typedef struct bhv_vattr { xfs_nlink_t va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ @@ -8781,7 +8338,7 @@ xfs_ino_t va_nodeid; /* file id */ xfs_off_t va_size; /* file size in bytes */ u_long va_blocksize; /* blocksize preferred for i/o */ -@@ 
-147,13 +148,15 @@ typedef struct bhv_vattr { +@@ -124,13 +125,15 @@ typedef struct bhv_vattr { #define XFS_AT_PROJID 0x04000000 #define XFS_AT_SIZE_NOPERM 0x08000000 #define XFS_AT_GENCOUNT 0x10000000 @@ -8798,8 +8355,8 @@ #define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ ---- a/fs/xfs/quota/xfs_qm_syscalls.c 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/quota/xfs_qm_syscalls.c 2008-04-19 15:14:52.000000000 -0400 +--- a/fs/xfs/quota/xfs_qm_syscalls.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/quota/xfs_qm_syscalls.c 2008-07-16 22:41:36.000000000 -0400 @@ -17,6 +17,7 @@ */ @@ -8817,16 +8374,16 @@ return XFS_ERROR(EPERM); /* * No file system can have quotas enabled on disk but not in core. -@@ -374,7 +375,7 @@ xfs_qm_scall_trunc_qfiles( - int error; +@@ -383,7 +384,7 @@ xfs_qm_scall_trunc_qfiles( + int error = 0, error2 = 0; xfs_inode_t *qip; - if (!capable(CAP_SYS_ADMIN)) + if (!vx_capable(CAP_SYS_ADMIN, VXC_QUOTA_CTL)) return XFS_ERROR(EPERM); - error = 0; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { -@@ -418,7 +419,7 @@ xfs_qm_scall_quotaon( + qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); +@@ -426,7 +427,7 @@ xfs_qm_scall_quotaon( uint accflags; __int64_t sbflags; @@ -8835,7 +8392,7 @@ return XFS_ERROR(EPERM); flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); -@@ -587,7 +588,7 @@ xfs_qm_scall_setqlim( +@@ -595,7 +596,7 @@ xfs_qm_scall_setqlim( int error; xfs_qcnt_t hard, soft; @@ -8845,16 +8402,16 @@ if ((newlim->d_fieldmask & --- a/fs/xfs/xfs_clnt.h 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/xfs_clnt.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/fs/xfs/xfs_clnt.h 2008-07-16 22:41:36.000000000 -0400 @@ -100,5 +100,6 @@ struct xfs_mount_args { * I/O size in stat(2) */ #define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams * allocator */ -+#define XFSMNT2_TAGGED 0x80000000 /* context tagging */ ++#define XFSMNT2_TAGGED 0x10000000 /* 
context tagging */ #endif /* __XFS_CLNT_H__ */ --- a/fs/xfs/xfs_dinode.h 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/xfs_dinode.h 2008-04-27 10:33:37.000000000 -0400 ++++ a/fs/xfs/xfs_dinode.h 2008-07-16 22:41:36.000000000 -0400 @@ -53,7 +53,9 @@ typedef struct xfs_dinode_core { __be32 di_gid; /* owner's group id */ __be32 di_nlink; /* number of links to file */ @@ -8871,40 +8428,54 @@ #define XFS_DI_U 0x2000000 #define XFS_DI_A 0x4000000 -#define XFS_DI_NUM_BITS 27 -+#define XFS_DI_VFLAGS 0x8000000 -+#define XFS_DI_TAG 0x10000000 ++#define XFS_DI_TAG 0x8000000 ++#define XFS_DI_VFLAGS 0x10000000 +#define XFS_DI_NUM_BITS 29 #define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1) #define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A)) -@@ -223,6 +227,7 @@ typedef enum xfs_dinode_fmt +@@ -223,6 +227,8 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ ++#define XFS_DIFLAG_IXUNLINK_BIT 15 /* Immutable inver on unlink */ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) -@@ -252,4 +257,7 @@ typedef enum xfs_dinode_fmt - XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ - XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) +@@ -238,6 +244,7 @@ typedef enum xfs_dinode_fmt + #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) + #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) + #define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) ++#define XFS_DIFLAG_IXUNLINK (1 << XFS_DIFLAG_IXUNLINK_BIT) -+#define XFS_DIVFLAG_BARRIER 0x01 -+#define XFS_DIVFLAG_IUNLINK 0x02 + #ifdef CONFIG_XFS_RT + #define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +@@ -250,6 
+257,10 @@ typedef enum xfs_dinode_fmt + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ +- XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) ++ XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM | \ ++ XFS_DIFLAG_IXUNLINK) + ++#define XFS_DIVFLAG_BARRIER 0x01 ++#define XFS_DIVFLAG_COW 0x02 + #endif /* __XFS_DINODE_H__ */ --- a/fs/xfs/xfs_fs.h 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/xfs_fs.h 2008-04-19 15:14:52.000000000 -0400 -@@ -67,6 +67,8 @@ struct fsxattr { ++++ a/fs/xfs/xfs_fs.h 2008-07-16 22:41:36.000000000 -0400 +@@ -67,6 +67,9 @@ struct fsxattr { #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ #define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ ++#define XFS_XFLAG_IXUNLINK 0x00008000 /* immutable invert on unlink */ +#define XFS_XFLAG_BARRIER 0x10000000 /* chroot() barrier */ -+#define XFS_XFLAG_IUNLINK 0x20000000 /* immutable unlink */ ++#define XFS_XFLAG_COW 0x20000000 /* copy on write mark */ #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* -@@ -296,7 +298,8 @@ typedef struct xfs_bstat { +@@ -296,7 +299,8 @@ typedef struct xfs_bstat { __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ __u16 bs_projid; /* project id */ @@ -8914,20 +8485,20 @@ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ ---- a/fs/xfs/xfs_ialloc.c 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/xfs_ialloc.c 2008-04-27 10:30:39.000000000 -0400 -@@ -66,6 +66,8 @@ xfs_ialloc_log_di( - offsetof(xfs_dinode_core_t, di_gid), - offsetof(xfs_dinode_core_t, di_nlink), - offsetof(xfs_dinode_core_t, di_projid), +--- a/fs/xfs/xfs_ialloc.c 2008-07-14 
17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_ialloc.c 2008-07-16 22:41:36.000000000 -0400 +@@ -84,6 +84,8 @@ xfs_ialloc_log_di( + offsetof(xfs_dinode_t, di_next_unlinked), + offsetof(xfs_dinode_t, di_u), + offsetof(xfs_dinode_t, di_a), + offsetof(xfs_dinode_core_t, di_tag), + offsetof(xfs_dinode_core_t, di_vflags), - offsetof(xfs_dinode_core_t, di_pad), - offsetof(xfs_dinode_core_t, di_atime), - offsetof(xfs_dinode_core_t, di_mtime), ---- a/fs/xfs/xfs_inode.c 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/xfs_inode.c 2008-04-26 11:06:34.000000000 -0400 -@@ -219,6 +219,7 @@ xfs_inotobp( + sizeof(xfs_dinode_t) + }; + +--- a/fs/xfs/xfs_inode.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_inode.c 2008-07-16 22:41:36.000000000 -0400 +@@ -249,6 +249,7 @@ xfs_inotobp( return 0; } @@ -8935,11 +8506,15 @@ /* * This routine is called to map an inode to the buffer containing -@@ -716,13 +717,21 @@ xfs_dinode_from_disk( +@@ -660,15 +661,25 @@ xfs_iformat_btree( + void + xfs_dinode_from_disk( xfs_icdinode_t *to, - xfs_dinode_core_t *from) +- xfs_dinode_core_t *from) ++ xfs_dinode_core_t *from, ++ int tagged) { -+ uint32_t uid, gid; ++ uint32_t uid, gid, tag; + to->di_magic = be16_to_cpu(from->di_magic); to->di_mode = be16_to_cpu(from->di_mode); @@ -8951,28 +8526,52 @@ + + uid = be32_to_cpu(from->di_uid); + gid = be32_to_cpu(from->di_gid); ++ tag = be16_to_cpu(from->di_tag); + -+ to->di_uid = INOTAG_UID(1, uid, gid); -+ to->di_gid = INOTAG_GID(1, uid, gid); -+ to->di_tag = INOTAG_TAG(1, uid, gid, 0); ++ to->di_uid = INOTAG_UID(tagged, uid, gid); ++ to->di_gid = INOTAG_GID(tagged, uid, gid); ++ to->di_tag = INOTAG_TAG(tagged, uid, gid, tag); + to->di_nlink = be32_to_cpu(from->di_nlink); to->di_projid = be16_to_cpu(from->di_projid); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); -@@ -756,8 +765,10 @@ xfs_dinode_to_disk( +@@ -689,21 +700,26 @@ xfs_dinode_from_disk( + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); 
+ to->di_flags = be16_to_cpu(from->di_flags); ++ to->di_vflags = be16_to_cpu(from->di_vflags); + to->di_gen = be32_to_cpu(from->di_gen); + } + + void + xfs_dinode_to_disk( + xfs_dinode_core_t *to, +- xfs_icdinode_t *from) ++ xfs_icdinode_t *from, ++ int tagged) + { + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); to->di_version = from ->di_version; to->di_format = from->di_format; to->di_onlink = cpu_to_be16(from->di_onlink); - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); + -+ to->di_uid = cpu_to_be32(TAGINO_UID(1, from->di_uid, from->di_tag)); -+ to->di_gid = cpu_to_be32(TAGINO_GID(1, from->di_gid, from->di_tag)); ++ to->di_uid = cpu_to_be32(TAGINO_UID(tagged, from->di_uid, from->di_tag)); ++ to->di_gid = cpu_to_be32(TAGINO_GID(tagged, from->di_gid, from->di_tag)); ++ to->di_tag = cpu_to_be16(TAGINO_TAG(tagged, from->di_tag)); + to->di_nlink = cpu_to_be32(from->di_nlink); to->di_projid = cpu_to_be16(from->di_projid); memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); -@@ -783,7 +794,8 @@ xfs_dinode_to_disk( +@@ -724,12 +740,14 @@ xfs_dinode_to_disk( + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); ++ to->di_vflags = cpu_to_be16(from->di_vflags); + to->di_gen = cpu_to_be32(from->di_gen); + } STATIC uint _xfs_dic2xflags( @@ -8982,19 +8581,28 @@ { uint flags = 0; -@@ -817,7 +829,10 @@ _xfs_dic2xflags( +@@ -740,6 +758,8 @@ _xfs_dic2xflags( + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; ++ if (di_flags & XFS_DIFLAG_IXUNLINK) ++ flags |= XFS_XFLAG_IXUNLINK; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) +@@ -763,7 +783,10 @@ _xfs_dic2xflags( if (di_flags & XFS_DIFLAG_FILESTREAM) flags |= XFS_XFLAG_FILESTREAM; } - -+ if (di_vflags & XFS_DIVFLAG_IUNLINK) -+ flags |= XFS_XFLAG_IUNLINK; + if 
(di_vflags & XFS_DIVFLAG_BARRIER) + flags |= XFS_XFLAG_BARRIER; ++ if (di_vflags & XFS_DIVFLAG_COW) ++ flags |= XFS_XFLAG_COW; return flags; } -@@ -827,7 +842,7 @@ xfs_ip2xflags( +@@ -773,7 +796,7 @@ xfs_ip2xflags( { xfs_icdinode_t *dic = &ip->i_d; @@ -9003,7 +8611,7 @@ (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); } -@@ -837,7 +852,7 @@ xfs_dic2xflags( +@@ -783,7 +806,7 @@ xfs_dic2xflags( { xfs_dinode_core_t *dic = &dip->di_core; @@ -9012,7 +8620,17 @@ (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } -@@ -1138,6 +1153,7 @@ xfs_ialloc( +@@ -878,7 +901,8 @@ xfs_iread( + * Otherwise, just get the truly permanent information. + */ + if (dip->di_core.di_mode) { +- xfs_dinode_from_disk(&ip->i_d, &dip->di_core); ++ xfs_dinode_from_disk(&ip->i_d, &dip->di_core, ++ mp->m_flags & XFS_MOUNT_TAGGED); + error = xfs_iformat(ip, dip); + if (error) { + kmem_zone_free(xfs_inode_zone, ip); +@@ -1084,6 +1108,7 @@ xfs_ialloc( ASSERT(ip->i_d.di_nlink == nlink); ip->i_d.di_uid = current_fsuid(cr); ip->i_d.di_gid = current_fsgid(cr); @@ -9020,9 +8638,57 @@ ip->i_d.di_projid = prid; memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); ---- a/fs/xfs/xfs_inode.h 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/xfs_inode.h 2008-04-19 15:14:52.000000000 -0400 -@@ -187,7 +187,9 @@ typedef struct xfs_icdinode { +@@ -1138,6 +1163,7 @@ xfs_ialloc( + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; ++ ip->i_d.di_vflags = 0; + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: +@@ -1872,8 +1898,8 @@ xfs_iunlink( + agi_ok = + be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); +- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK, +- XFS_RANDOM_IUNLINK))) { ++ if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IXUNLINK, ++ XFS_RANDOM_IXUNLINK))) { + XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi); + xfs_trans_brelse(tp, agibp); + return XFS_ERROR(EFSCORRUPTED); +@@ -1974,8 
+2000,8 @@ xfs_iunlink_remove( + agi_ok = + be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && + XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); +- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE, +- XFS_RANDOM_IUNLINK_REMOVE))) { ++ if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IXUNLINK_REMOVE, ++ XFS_RANDOM_IXUNLINK_REMOVE))) { + XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW, + mp, agi); + xfs_trans_brelse(tp, agibp); +@@ -2307,6 +2333,7 @@ xfs_ifree( + } + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; ++ ip->i_d.di_vflags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_df.if_ext_max = +@@ -3403,7 +3430,8 @@ xfs_iflush_int( + * because if the inode is dirty at all the core must + * be. + */ +- xfs_dinode_to_disk(&dip->di_core, &ip->i_d); ++ xfs_dinode_to_disk(&dip->di_core, &ip->i_d, ++ mp->m_flags & XFS_MOUNT_TAGGED); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) +--- a/fs/xfs/xfs_inode.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_inode.h 2008-07-16 22:41:36.000000000 -0400 +@@ -174,7 +174,9 @@ typedef struct xfs_icdinode { __uint32_t di_gid; /* owner's group id */ __uint32_t di_nlink; /* number of links to file */ __uint16_t di_projid; /* owner's project id */ @@ -9033,9 +8699,21 @@ __uint16_t di_flushiter; /* incremented on flush */ xfs_ictimestamp_t di_atime; /* time last accessed */ xfs_ictimestamp_t di_mtime; /* time last modified */ ---- a/fs/xfs/xfs_itable.c 2008-04-17 12:05:42.000000000 -0400 -+++ a/fs/xfs/xfs_itable.c 2008-04-19 15:14:52.000000000 -0400 -@@ -89,6 +89,7 @@ xfs_bulkstat_one_iget( +@@ -495,9 +497,9 @@ int xfs_ialloc(struct xfs_trans *, xfs_ + xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, + int, struct xfs_buf **, boolean_t *, xfs_inode_t **); + void xfs_dinode_from_disk(struct xfs_icdinode *, +- struct xfs_dinode_core *); ++ struct 
xfs_dinode_core *, int tagged); + void xfs_dinode_to_disk(struct xfs_dinode_core *, +- struct xfs_icdinode *); ++ struct xfs_icdinode *, int tagged); + + uint xfs_ip2xflags(struct xfs_inode *); + uint xfs_dic2xflags(struct xfs_dinode *); +--- a/fs/xfs/xfs_itable.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_itable.c 2008-07-16 22:41:36.000000000 -0400 +@@ -84,6 +84,7 @@ xfs_bulkstat_one_iget( buf->bs_mode = dic->di_mode; buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; @@ -9043,9 +8721,21 @@ buf->bs_size = dic->di_size; vn_atime_to_bstime(vp, &buf->bs_atime); buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; ---- a/fs/xfs/xfs_mount.h 2008-04-17 12:05:43.000000000 -0400 -+++ a/fs/xfs/xfs_mount.h 2008-04-19 15:14:52.000000000 -0400 -@@ -378,6 +378,7 @@ typedef struct xfs_mount { +--- a/fs/xfs/xfs_log_recover.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_log_recover.c 2008-07-16 22:41:36.000000000 -0400 +@@ -2418,7 +2418,8 @@ xlog_recover_do_inode_trans( + + /* The core is in in-core format */ + xfs_dinode_to_disk(&dip->di_core, +- (xfs_icdinode_t *)item->ri_buf[1].i_addr); ++ (xfs_icdinode_t *)item->ri_buf[1].i_addr, ++ mp->m_flags & XFS_MOUNT_TAGGED); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { +--- a/fs/xfs/xfs_mount.h 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_mount.h 2008-07-16 22:41:36.000000000 -0400 +@@ -379,6 +379,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ @@ -9053,9 +8743,9 @@ /* * Default minimum read and write sizes. 
---- a/fs/xfs/xfs_vfsops.c 2008-04-17 12:05:43.000000000 -0400 -+++ a/fs/xfs/xfs_vfsops.c 2008-04-26 09:52:49.000000000 -0400 -@@ -290,6 +290,8 @@ xfs_start_flags( +--- a/fs/xfs/xfs_vfsops.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_vfsops.c 2008-07-16 22:41:36.000000000 -0400 +@@ -287,6 +287,8 @@ xfs_start_flags( if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; @@ -9064,7 +8754,7 @@ /* * no recovery flag requires a read-only mount -@@ -402,7 +404,6 @@ xfs_finish_flags( +@@ -399,7 +401,6 @@ xfs_finish_flags( if (ap->flags & XFSMNT_PQUOTAENF) mp->m_qflags |= XFS_OQUOTA_ENFD; } @@ -9072,25 +8762,17 @@ return 0; } ---- a/fs/xfs/xfs_vnodeops.c 2008-04-17 12:05:43.000000000 -0400 -+++ a/fs/xfs/xfs_vnodeops.c 2008-04-19 15:14:52.000000000 -0400 -@@ -122,6 +122,7 @@ xfs_getattr( - vap->va_mode = ip->i_d.di_mode; - vap->va_uid = ip->i_d.di_uid; - vap->va_gid = ip->i_d.di_gid; -+ vap->va_tag = ip->i_d.di_tag; - vap->va_projid = ip->i_d.di_projid; - - /* -@@ -221,6 +222,7 @@ xfs_setattr( +--- a/fs/xfs/xfs_vnodeops.c 2008-07-14 17:22:50.000000000 -0400 ++++ a/fs/xfs/xfs_vnodeops.c 2008-07-16 22:41:36.000000000 -0400 +@@ -93,6 +93,7 @@ xfs_setattr( uint commit_flags=0; uid_t uid=0, iuid=0; gid_t gid=0, igid=0; + tag_t tag=0, itag=0; int timeflags = 0; xfs_prid_t projid=0, iprojid=0; - int mandlock_before, mandlock_after; -@@ -272,6 +274,7 @@ xfs_setattr( + struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; +@@ -143,6 +144,7 @@ xfs_setattr( (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) { uint qflags = 0; @@ -9098,7 +8780,7 @@ if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) { uid = vap->va_uid; qflags |= XFS_QMOPT_UQUOTA; -@@ -351,6 +354,8 @@ xfs_setattr( +@@ -222,6 +224,8 @@ xfs_setattr( if (mask & (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID| XFS_AT_GID|XFS_AT_PROJID)) { @@ -9107,7 +8789,7 @@ /* * CAP_FOWNER overrides the following restrictions: * -@@ -399,7 +404,7 @@ xfs_setattr( +@@ -270,7 +274,7 @@ xfs_setattr( * and 
can change the group id only to a group of which he * or she is a member. */ @@ -9116,7 +8798,7 @@ /* * These IDs could have changed since we last looked at them. * But, we're assured that if the ownership did change -@@ -407,10 +412,12 @@ xfs_setattr( +@@ -278,10 +282,12 @@ xfs_setattr( * would have changed also. */ iuid = ip->i_d.di_uid; @@ -9131,7 +8813,7 @@ projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid : iprojid; -@@ -438,6 +445,7 @@ xfs_setattr( +@@ -309,6 +315,7 @@ xfs_setattr( if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) || (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { @@ -9139,7 +8821,16 @@ ASSERT(tp); code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, capable(CAP_FOWNER) ? -@@ -686,7 +694,7 @@ xfs_setattr( +@@ -423,7 +430,7 @@ xfs_setattr( + */ + if ((mask & XFS_AT_XFLAGS) && + (ip->i_d.di_flags & +- (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || ++ (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND) || + (vap->va_xflags & + (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) { +@@ -559,7 +566,7 @@ xfs_setattr( * and can change the group id only to a group of which he * or she is a member. */ @@ -9148,7 +8839,7 @@ /* * CAP_FSETID overrides the following restrictions: * -@@ -702,6 +710,9 @@ xfs_setattr( +@@ -575,6 +582,9 @@ xfs_setattr( * Change the ownerships and register quota modifications * in the transaction. 
*/ @@ -9158,7 +8849,7 @@ if (iuid != uid) { if (XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & XFS_AT_UID); -@@ -777,6 +788,7 @@ xfs_setattr( +@@ -650,11 +660,14 @@ xfs_setattr( } if (mask & XFS_AT_XFLAGS) { uint di_flags; @@ -9166,20 +8857,27 @@ /* can't set PREALLOC this way, just preserve it */ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); -@@ -810,6 +822,11 @@ xfs_setattr( + if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; ++ if (vap->va_xflags & XFS_XFLAG_IXUNLINK) ++ di_flags |= XFS_DIFLAG_IXUNLINK; + if (vap->va_xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (vap->va_xflags & XFS_XFLAG_SYNC) +@@ -683,6 +696,11 @@ xfs_setattr( di_flags |= XFS_DIFLAG_EXTSIZE; } ip->i_d.di_flags = di_flags; -+ if (vap->va_xflags & XFS_XFLAG_IUNLINK) -+ di_vflags |= XFS_DIVFLAG_IUNLINK; + if (vap->va_xflags & XFS_XFLAG_BARRIER) + di_vflags |= XFS_DIVFLAG_BARRIER; ++ if (vap->va_xflags & XFS_XFLAG_COW) ++ di_vflags |= XFS_DIVFLAG_COW; + ip->i_d.di_vflags = di_vflags; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); timeflags |= XFS_ICHGTIME_CHG; --- a/include/asm-arm/tlb.h 2008-04-17 12:05:43.000000000 -0400 -+++ a/include/asm-arm/tlb.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/asm-arm/tlb.h 2008-07-16 22:41:36.000000000 -0400 @@ -28,6 +28,7 @@ #else /* !CONFIG_MMU */ @@ -9188,8 +8886,8 @@ /* * TLB handling. 
This allows us to remove pages from the page ---- a/include/asm-blackfin/unistd.h 2008-04-17 12:05:43.000000000 -0400 -+++ a/include/asm-blackfin/unistd.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/asm-blackfin/unistd.h 2008-07-14 17:22:52.000000000 -0400 ++++ a/include/asm-blackfin/unistd.h 2008-07-16 22:41:36.000000000 -0400 @@ -278,7 +278,7 @@ #define __NR_tgkill 271 #define __NR_utimes 272 @@ -9200,7 +8898,7 @@ /* 276 __NR_get_mempolicy */ /* 277 __NR_set_mempolicy */ --- a/include/asm-generic/tlb.h 2008-04-17 12:05:43.000000000 -0400 -+++ a/include/asm-generic/tlb.h 2008-04-19 15:50:15.000000000 -0400 ++++ a/include/asm-generic/tlb.h 2008-07-16 22:41:36.000000000 -0400 @@ -14,6 +14,7 @@ #define _ASM_GENERIC__TLB_H @@ -9209,8 +8907,8 @@ #include #include ---- a/include/asm-ia64/tlb.h 2007-02-04 13:44:54.000000000 -0500 -+++ a/include/asm-ia64/tlb.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/asm-ia64/tlb.h 2008-07-14 17:22:53.000000000 -0400 ++++ a/include/asm-ia64/tlb.h 2008-07-16 22:41:36.000000000 -0400 @@ -40,6 +40,7 @@ #include #include @@ -9220,7 +8918,7 @@ #include #include --- a/include/asm-powerpc/systbl.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/asm-powerpc/systbl.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/asm-powerpc/systbl.h 2008-07-16 22:41:36.000000000 -0400 @@ -260,7 +260,7 @@ COMPAT_SYS_SPU(fstatfs64) SYSX(sys_ni_syscall, ppc_fadvise64_64, ppc_fadvise64_64) PPC_SYS_SPU(rtas) @@ -9231,7 +8929,7 @@ COMPAT_SYS(mbind) COMPAT_SYS(get_mempolicy) --- a/include/asm-powerpc/unistd.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/asm-powerpc/unistd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/asm-powerpc/unistd.h 2008-07-16 22:41:36.000000000 -0400 @@ -275,7 +275,7 @@ #endif #define __NR_rtas 255 @@ -9242,7 +8940,7 @@ #define __NR_mbind 259 #define __NR_get_mempolicy 260 --- a/include/asm-s390/unistd.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/asm-s390/unistd.h 2008-04-19 15:14:52.000000000 -0400 
++++ a/include/asm-s390/unistd.h 2008-07-16 22:41:36.000000000 -0400 @@ -202,7 +202,7 @@ #define __NR_clock_gettime (__NR_timer_create+6) #define __NR_clock_getres (__NR_timer_create+7) @@ -9253,7 +8951,7 @@ #define __NR_fstatfs64 266 #define __NR_remap_file_pages 267 --- a/include/asm-sparc/unistd.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/asm-sparc/unistd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/asm-sparc/unistd.h 2008-07-16 22:41:36.000000000 -0400 @@ -282,7 +282,7 @@ #define __NR_timer_getoverrun 264 #define __NR_timer_delete 265 @@ -9264,7 +8962,7 @@ #define __NR_io_destroy 269 #define __NR_io_submit 270 --- a/include/asm-sparc64/tlb.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/asm-sparc64/tlb.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/asm-sparc64/tlb.h 2008-07-16 22:41:36.000000000 -0400 @@ -3,6 +3,7 @@ #include @@ -9273,8 +8971,8 @@ #include #include #include ---- a/include/asm-sparc64/unistd.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/asm-sparc64/unistd.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/asm-sparc64/unistd.h 2008-07-14 17:22:54.000000000 -0400 ++++ a/include/asm-sparc64/unistd.h 2008-07-16 22:41:36.000000000 -0400 @@ -284,7 +284,7 @@ #define __NR_timer_getoverrun 264 #define __NR_timer_delete 265 @@ -9284,8 +8982,8 @@ #define __NR_io_setup 268 #define __NR_io_destroy 269 #define __NR_io_submit 270 ---- a/include/asm-x86/unistd_64.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/asm-x86/unistd_64.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/asm-x86/unistd_64.h 2008-07-14 17:22:54.000000000 -0400 ++++ a/include/asm-x86/unistd_64.h 2008-07-16 22:41:36.000000000 -0400 @@ -535,7 +535,7 @@ __SYSCALL(__NR_tgkill, sys_tgkill) #define __NR_utimes 235 __SYSCALL(__NR_utimes, sys_utimes) @@ -9295,9 +8993,9 @@ #define __NR_mbind 237 __SYSCALL(__NR_mbind, sys_mbind) #define __NR_set_mempolicy 238 ---- a/include/linux/capability.h 2008-04-17 12:05:44.000000000 -0400 -+++ 
a/include/linux/capability.h 2008-04-22 11:01:12.000000000 -0400 -@@ -260,6 +260,7 @@ typedef struct kernel_cap_struct { +--- a/include/linux/capability.h 2008-07-14 17:22:54.000000000 -0400 ++++ a/include/linux/capability.h 2008-07-16 22:41:36.000000000 -0400 +@@ -274,6 +274,7 @@ typedef struct kernel_cap_struct { arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ /* Allow setting zone reclaim policy */ @@ -9305,7 +9003,7 @@ #define CAP_SYS_ADMIN 21 -@@ -332,7 +333,13 @@ typedef struct kernel_cap_struct { +@@ -346,7 +347,13 @@ typedef struct kernel_cap_struct { #define CAP_MAC_ADMIN 33 @@ -9320,26 +9018,30 @@ #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) ---- a/include/linux/devpts_fs.h 2007-02-04 13:44:54.000000000 -0500 -+++ a/include/linux/devpts_fs.h 2008-04-19 15:14:52.000000000 -0400 -@@ -30,5 +30,4 @@ static inline void devpts_pty_kill(int n +--- a/include/linux/devpts_fs.h 2008-07-14 17:22:54.000000000 -0400 ++++ a/include/linux/devpts_fs.h 2008-07-16 22:41:36.000000000 -0400 +@@ -34,5 +34,4 @@ static inline void devpts_pty_kill(int n #endif - #endif /* _LINUX_DEVPTS_FS_H */ --- a/include/linux/ext2_fs.h 2008-04-17 11:31:39.000000000 -0400 -+++ a/include/linux/ext2_fs.h 2008-04-19 15:14:52.000000000 -0400 -@@ -189,6 +189,8 @@ struct ext2_group_desc ++++ a/include/linux/ext2_fs.h 2008-07-16 22:41:36.000000000 -0400 +@@ -189,8 +189,12 @@ struct ext2_group_desc #define EXT2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ #define EXT2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ #define EXT2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ -+#define EXT2_BARRIER_FL FS_BARRIER_FL /* Barrier for chroot() */ -+#define EXT2_IUNLINK_FL FS_IUNLINK_FL /* Immutable unlink */ ++#define EXT2_IXUNLINK_FL FS_IXUNLINK_FL /* Immutable invert on unlink */ #define EXT2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ ++#define EXT2_BARRIER_FL FS_BARRIER_FL /* Barrier for chroot() */ 
++#define EXT2_COW_FL FS_COW_FL /* Copy on Write marker */ ++ #define EXT2_FL_USER_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ -@@ -247,7 +249,7 @@ struct ext2_inode { + #define EXT2_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ + +@@ -247,7 +251,7 @@ struct ext2_inode { struct { __u8 l_i_frag; /* Fragment number */ __u8 l_i_fsize; /* Fragment size */ @@ -9348,7 +9050,7 @@ __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __u32 l_i_reserved2; -@@ -279,6 +281,7 @@ struct ext2_inode { +@@ -279,6 +283,7 @@ struct ext2_inode { #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high @@ -9356,7 +9058,7 @@ #define i_reserved2 osd2.linux2.l_i_reserved2 #endif -@@ -323,6 +326,7 @@ struct ext2_inode { +@@ -323,6 +328,7 @@ struct ext2_inode { #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ @@ -9365,17 +9067,25 @@ #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt --- a/include/linux/ext3_fs.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/ext3_fs.h 2008-04-19 15:14:52.000000000 -0400 -@@ -173,6 +173,8 @@ struct ext3_group_desc ++++ a/include/linux/ext3_fs.h 2008-07-16 22:41:36.000000000 -0400 +@@ -173,10 +173,14 @@ struct ext3_group_desc #define EXT3_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define EXT3_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT3_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ -+#define EXT3_BARRIER_FL 0x04000000 /* Barrier for chroot() */ -+#define EXT3_IUNLINK_FL 0x08000000 /* Immutable unlink */ ++#define EXT3_IXUNLINK_FL 0x01000000 /* Immutable invert on unlink */ #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ - #define EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ -@@ -292,7 +294,7 @@ struct ext3_inode { +-#define 
EXT3_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +-#define EXT3_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ ++#define EXT3_BARRIER_FL 0x10000000 /* Barrier for chroot() */ ++#define EXT3_COW_FL 0x20000000 /* Copy on Write marker */ ++ ++#define EXT3_FL_USER_VISIBLE 0x0103DFFF /* User visible flags */ ++#define EXT3_FL_USER_MODIFIABLE 0x010380FF /* User modifiable flags */ + + /* + * Inode dynamic state flags +@@ -292,7 +296,7 @@ struct ext3_inode { struct { __u8 l_i_frag; /* Fragment number */ __u8 l_i_fsize; /* Fragment size */ @@ -9384,7 +9094,7 @@ __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __u32 l_i_reserved2; -@@ -326,6 +328,7 @@ struct ext3_inode { +@@ -326,6 +330,7 @@ struct ext3_inode { #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high @@ -9392,7 +9102,7 @@ #define i_reserved2 osd2.linux2.l_i_reserved2 #elif defined(__GNU__) -@@ -380,6 +383,7 @@ struct ext3_inode { +@@ -380,6 +385,7 @@ struct ext3_inode { #define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ @@ -9400,7 +9110,7 @@ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H -@@ -822,6 +826,7 @@ struct buffer_head * ext3_bread (handle_ +@@ -822,6 +828,7 @@ struct buffer_head * ext3_bread (handle_ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, int create, int extend_disksize); @@ -9408,54 +9118,9 @@ extern struct inode *ext3_iget(struct super_block *, unsigned long); extern int ext3_write_inode (struct inode *, int); ---- a/include/linux/ext4_fs.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/ext4_fs.h 2008-04-19 15:43:04.000000000 -0400 -@@ -231,6 +231,8 @@ struct ext4_group_desc - #define 
EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ - #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ - #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ -+#define EXT4_BARRIER_FL 0x04000000 /* Barrier for chroot() */ -+#define EXT4_IUNLINK_FL 0x08000000 /* Immutable unlink */ - #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ - - #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ -@@ -359,7 +361,8 @@ struct ext4_inode { - __le16 l_i_file_acl_high; - __le16 l_i_uid_high; /* these 2 fields */ - __le16 l_i_gid_high; /* were reserved2[0] */ -- __u32 l_i_reserved2; -+ __u16 l_i_tag; /* Context Tag */ -+ __u16 l_i_reserved2; - } linux2; - struct { - __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ -@@ -465,6 +468,7 @@ do { \ - #define i_gid_low i_gid - #define i_uid_high osd2.linux2.l_i_uid_high - #define i_gid_high osd2.linux2.l_i_gid_high -+#define i_raw_tag osd2.linux2.l_i_tag - #define i_reserved2 osd2.linux2.l_i_reserved2 - - #elif defined(__GNU__) -@@ -528,6 +532,7 @@ do { \ - #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ - #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ - #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ -+#define EXT4_MOUNT_TAGGED 0x8000000 /* Enable Context Tags */ - /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H - #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt -@@ -1030,6 +1035,7 @@ int ext4_get_blocks_handle(handle_t *han - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize); -+extern int ext4_sync_flags(struct inode *inode); - - extern struct inode *ext4_iget(struct super_block *, unsigned long); - extern int ext4_write_inode (struct inode *, int); ---- a/include/linux/fs.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/fs.h 2008-04-19 
15:37:28.000000000 -0400 -@@ -125,6 +125,8 @@ extern int dir_notify_enable; +--- a/include/linux/fs.h 2008-07-14 17:22:54.000000000 -0400 ++++ a/include/linux/fs.h 2008-07-16 22:41:36.000000000 -0400 +@@ -126,6 +126,8 @@ extern int dir_notify_enable; #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ @@ -9464,23 +9129,22 @@ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) -@@ -151,6 +153,8 @@ extern int dir_notify_enable; +@@ -152,6 +154,14 @@ extern int dir_notify_enable; #define S_NOCMTIME 128 /* Do not update file c/mtime */ #define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE 512 /* Inode is fs-internal */ -+#define S_BARRIER 1024 /* Barrier for chroot() */ -+#define S_IUNLINK 2048 /* Immutable unlink */ ++#define S_IXUNLINK 1024 /* Immutable Invert on unlink */ ++ ++/* Linux-VServer related Inode flags */ ++ ++#define V_VALID 1 ++#define V_XATTR 2 ++#define V_BARRIER 4 /* Barrier for chroot() */ ++#define V_COW 8 /* Copy on Write */ /* * Note that nosuid etc flags are inode-specific: setting some file-system -@@ -167,25 +171,37 @@ extern int dir_notify_enable; - */ - #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) - --#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) -+#define IS_RDONLY(inode) __IS_FLG(inode, MS_RDONLY) - #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ - ((inode)->i_flags & S_SYNC)) +@@ -174,12 +184,15 @@ extern int dir_notify_enable; #define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) @@ -9493,18 +9157,19 @@ #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) -+#define IS_IUNLINK(inode) 
((inode)->i_flags & S_IUNLINK) -+#define IS_IXORUNLINK(inode) ((IS_IUNLINK(inode) ? S_IMMUTABLE : 0) ^ IS_IMMUTABLE(inode)) ++#define IS_IXUNLINK(inode) ((inode)->i_flags & S_IXUNLINK) ++#define IS_IXORUNLINK(inode) ((IS_IXUNLINK(inode) ? S_IMMUTABLE : 0) ^ IS_IMMUTABLE(inode)) #define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) -+#define IS_BARRIER(inode) (S_ISDIR((inode)->i_mode) && ((inode)->i_flags & S_BARRIER)) #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) - #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) +@@ -187,6 +200,16 @@ extern int dir_notify_enable; #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) ++#define IS_BARRIER(inode) (S_ISDIR((inode)->i_mode) && ((inode)->i_vflags & V_BARRIER)) ++ +#ifdef CONFIG_VSERVER_COWBL -+# define IS_COW(inode) (IS_IUNLINK(inode) && IS_IMMUTABLE(inode)) ++# define IS_COW(inode) (IS_IXUNLINK(inode) && IS_IMMUTABLE(inode)) +# define IS_COW_LINK(inode) (S_ISREG((inode)->i_mode) && ((inode)->i_nlink > 1)) +#else +# define IS_COW(inode) (0) @@ -9514,22 +9179,24 @@ /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. 
*/ -@@ -259,12 +275,13 @@ extern int dir_notify_enable; +@@ -260,11 +283,14 @@ extern int dir_notify_enable; #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define FS_EXTENT_FL 0x00080000 /* Extents */ #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ -+#define FS_BARRIER_FL 0x04000000 /* Barrier for chroot() */ -+#define FS_IUNLINK_FL 0x08000000 /* Immutable unlink */ ++#define FS_IXUNLINK_FL 0x01000000 /* Immutable invert on unlink */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ - #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ - #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +-#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +-#define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ ++#define FS_BARRIER_FL 0x10000000 /* Barrier for chroot() */ ++#define FS_COW_FL 0x20000000 /* Copy on Write marker */ + ++#define FS_FL_USER_VISIBLE 0x0103DFFF /* User visible flags */ ++#define FS_FL_USER_MODIFIABLE 0x010380FF /* User modifiable flags */ -- #define SYNC_FILE_RANGE_WAIT_BEFORE 1 #define SYNC_FILE_RANGE_WRITE 2 - #define SYNC_FILE_RANGE_WAIT_AFTER 4 -@@ -334,6 +351,7 @@ typedef void (dio_iodone_t)(struct kiocb +@@ -334,6 +360,7 @@ typedef void (dio_iodone_t)(struct kiocb #define ATTR_FILE 8192 #define ATTR_KILL_PRIV 16384 #define ATTR_OPEN 32768 /* Truncating from open(O_TRUNC) */ @@ -9537,7 +9204,7 @@ /* * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -349,6 +367,7 @@ struct iattr { +@@ -349,6 +376,7 @@ struct iattr { umode_t ia_mode; uid_t ia_uid; gid_t ia_gid; @@ -9545,17 +9212,17 @@ loff_t ia_size; struct timespec ia_atime; struct timespec ia_mtime; -@@ -362,6 +381,9 @@ struct iattr { +@@ -362,6 +390,9 @@ struct iattr { struct file *ia_file; }; +#define ATTR_FLAG_BARRIER 512 /* Barrier for chroot() */ -+#define ATTR_FLAG_IUNLINK 1024 /* Immutable unlink */ ++#define ATTR_FLAG_IXUNLINK 1024 /* Immutable invert on unlink */ + /* * Includes for diskquotas. */ -@@ -600,7 +622,9 @@ struct inode { +@@ -600,7 +631,9 @@ struct inode { unsigned int i_nlink; uid_t i_uid; gid_t i_gid; @@ -9565,7 +9232,17 @@ u64 i_version; loff_t i_size; #ifdef __NEED_I_SIZE_ORDERED -@@ -735,12 +759,12 @@ static inline void i_size_write(struct i +@@ -648,7 +681,8 @@ struct inode { + unsigned long i_state; + unsigned long dirtied_when; /* jiffies of first dirtying */ + +- unsigned int i_flags; ++ unsigned short i_flags; ++ unsigned short i_vflags; + + atomic_t i_writecount; + #ifdef CONFIG_SECURITY +@@ -735,12 +769,12 @@ static inline void i_size_write(struct i static inline unsigned iminor(const struct inode *inode) { @@ -9580,7 +9257,7 @@ } extern struct block_device *I_BDEV(struct inode *inode); -@@ -795,6 +819,7 @@ struct file { +@@ -798,6 +832,7 @@ struct file { loff_t f_pos; struct fown_struct f_owner; unsigned int f_uid, f_gid; @@ -9588,7 +9265,7 @@ struct file_ra_state f_ra; u64 f_version; -@@ -879,6 +904,7 @@ struct file_lock { +@@ -926,6 +961,7 @@ struct file_lock { unsigned char fl_type; loff_t fl_start; loff_t fl_end; @@ -9596,26 +9273,7 @@ struct fasync_struct * fl_fasync; /* for lease break notifications */ unsigned long fl_break_time; /* for nonblocking lease breaks */ -@@ -1076,12 +1102,12 @@ extern void unlock_super(struct super_bl - */ - extern int vfs_permission(struct nameidata *, int); - extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); --extern int vfs_mkdir(struct inode *, 
struct dentry *, int); --extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); --extern int vfs_symlink(struct inode *, struct dentry *, const char *, int); --extern int vfs_link(struct dentry *, struct inode *, struct dentry *); --extern int vfs_rmdir(struct inode *, struct dentry *); --extern int vfs_unlink(struct inode *, struct dentry *); -+extern int vfs_mkdir(struct inode *, struct dentry *, int, struct nameidata *); -+extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t, struct nameidata *); -+extern int vfs_symlink(struct inode *, struct dentry *, const char *, int, struct nameidata *); -+extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct nameidata *); -+extern int vfs_rmdir(struct inode *, struct dentry *, struct nameidata *); -+extern int vfs_unlink(struct inode *, struct dentry *, struct nameidata *); - extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); - - /* -@@ -1223,6 +1249,7 @@ struct inode_operations { +@@ -1272,6 +1308,7 @@ struct inode_operations { void (*truncate_range)(struct inode *, loff_t, loff_t); long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len); @@ -9623,15 +9281,15 @@ }; struct seq_file; -@@ -1238,6 +1265,7 @@ extern ssize_t vfs_readv(struct file *, +@@ -1287,6 +1324,7 @@ extern ssize_t vfs_readv(struct file *, unsigned long, loff_t *); extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); +ssize_t vfs_sendfile(struct file *, struct file *, loff_t *, size_t, loff_t); - /* - * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called -@@ -1934,6 +1962,7 @@ extern int dcache_dir_open(struct inode + struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); +@@ -1971,6 +2009,7 @@ extern int dcache_dir_open(struct inode extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int 
dcache_readdir(struct file *, void *, filldir_t); @@ -9640,7 +9298,7 @@ extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_link(struct dentry *, struct inode *, struct dentry *); --- a/include/linux/if_tun.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/if_tun.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/if_tun.h 2008-07-16 22:41:36.000000000 -0400 @@ -42,6 +42,7 @@ #define TUNSETOWNER _IOW('T', 204, int) #define TUNSETLINK _IOW('T', 205, int) @@ -9649,9 +9307,9 @@ /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 ---- a/include/linux/init_task.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/init_task.h 2008-04-19 15:14:52.000000000 -0400 -@@ -196,6 +196,10 @@ extern struct group_info init_groups; +--- a/include/linux/init_task.h 2008-07-14 17:22:54.000000000 -0400 ++++ a/include/linux/init_task.h 2008-07-16 22:41:36.000000000 -0400 +@@ -179,6 +179,10 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ @@ -9662,8 +9320,8 @@ } ---- a/include/linux/interrupt.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/interrupt.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/linux/interrupt.h 2008-07-14 17:22:54.000000000 -0400 ++++ a/include/linux/interrupt.h 2008-07-16 22:41:36.000000000 -0400 @@ -8,8 +8,8 @@ #include #include @@ -9675,7 +9333,7 @@ #include #include --- a/include/linux/ipc.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/ipc.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/ipc.h 2008-07-16 22:41:36.000000000 -0400 @@ -93,6 +93,7 @@ struct kern_ipc_perm key_t key; uid_t uid; @@ -9684,17 +9342,8 @@ uid_t cuid; gid_t cgid; mode_t mode; ---- a/include/linux/Kbuild 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/Kbuild 2008-04-19 15:14:52.000000000 -0400 -@@ -354,3 +354,6 @@ unifdef-y += xattr.h - unifdef-y += xfrm.h - - objhdr-y += version.h -+ -+header-y += vserver/ -+ --- a/include/linux/loop.h 2008-04-17 
12:05:44.000000000 -0400 -+++ a/include/linux/loop.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/loop.h 2008-07-16 22:41:36.000000000 -0400 @@ -45,6 +45,7 @@ struct loop_device { struct loop_func_table *lo_encryption; __u32 lo_init[2]; @@ -9704,7 +9353,7 @@ unsigned long arg); --- a/include/linux/magic.h 2008-04-17 11:31:39.000000000 -0400 -+++ a/include/linux/magic.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/magic.h 2008-07-16 22:41:36.000000000 -0400 @@ -3,7 +3,7 @@ #define ADFS_SUPER_MAGIC 0xadf5 @@ -9723,7 +9372,7 @@ #define REISERFS_SUPER_MAGIC 0x52654973 /* used by gcc */ --- a/include/linux/major.h 2008-04-17 10:37:24.000000000 -0400 -+++ a/include/linux/major.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/major.h 2008-07-16 22:41:36.000000000 -0400 @@ -15,6 +15,7 @@ #define HD_MAJOR IDE0_MAJOR #define PTY_SLAVE_MAJOR 3 @@ -9732,9 +9381,9 @@ #define TTYAUX_MAJOR 5 #define LP_MAJOR 6 #define VCS_MAJOR 7 ---- a/include/linux/mm_types.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/mm_types.h 2008-04-19 15:14:52.000000000 -0400 -@@ -201,6 +201,7 @@ struct mm_struct { +--- a/include/linux/mm_types.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/mm_types.h 2008-07-16 22:41:36.000000000 -0400 +@@ -205,6 +205,7 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; @@ -9742,19 +9391,9 @@ /* Swap token stuff */ /* ---- a/include/linux/mount.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/mount.h 2008-04-19 15:45:19.000000000 -0400 -@@ -28,6 +28,9 @@ struct mnt_namespace; - #define MNT_NOATIME 0x08 - #define MNT_NODIRATIME 0x10 - #define MNT_RELATIME 0x20 -+#define MNT_RDONLY 0x40 -+ -+#define MNT_IS_RDONLY(m) ((m) && ((m)->mnt_flags & MNT_RDONLY)) - - #define MNT_SHRINKABLE 0x100 - -@@ -35,6 +38,10 @@ struct mnt_namespace; +--- a/include/linux/mount.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/mount.h 2008-07-16 22:47:42.000000000 -0400 +@@ -37,6 +37,10 
@@ struct mnt_namespace; #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ #define MNT_PNODE_MASK 0x3000 /* propagation flag mask */ @@ -9765,17 +9404,17 @@ struct vfsmount { struct list_head mnt_hash; struct vfsmount *mnt_parent; /* fs we are mounted on */ -@@ -62,6 +69,7 @@ struct vfsmount { - int mnt_expiry_mark; /* true if marked for expiry */ - int mnt_pinned; - int mnt_ghosts; +@@ -71,6 +75,7 @@ struct vfsmount { + * are held, and all mnt_writer[]s on this mount have 0 as their ->count + */ + atomic_t __mnt_writers; + tag_t mnt_tag; /* tagging used for vfsmount */ }; static inline struct vfsmount *mntget(struct vfsmount *mnt) ---- a/include/linux/net.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/net.h 2008-04-19 15:14:52.000000000 -0400 -@@ -65,6 +65,7 @@ typedef enum { +--- a/include/linux/net.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/net.h 2008-07-16 22:41:36.000000000 -0400 +@@ -66,6 +66,7 @@ typedef enum { #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 @@ -9784,7 +9423,7 @@ #ifndef ARCH_HAS_SOCKET_TYPES /** --- a/include/linux/nfs_mount.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/nfs_mount.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/nfs_mount.h 2008-07-16 22:41:36.000000000 -0400 @@ -63,6 +63,7 @@ struct nfs_mount_data { #define NFS_MOUNT_SECFLAVOUR 0x2000 /* 5 */ #define NFS_MOUNT_NORDIRPLUS 0x4000 /* 5 */ @@ -9795,7 +9434,7 @@ #endif --- a/include/linux/nsproxy.h 2008-04-17 11:31:39.000000000 -0400 -+++ a/include/linux/nsproxy.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/nsproxy.h 2008-07-16 22:41:36.000000000 -0400 @@ -3,6 +3,7 @@ #include @@ -9844,8 +9483,8 @@ } #ifdef CONFIG_CGROUP_NS ---- a/include/linux/pid.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/pid.h 2008-04-29 17:56:00.000000000 -0400 +--- a/include/linux/pid.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/pid.h 2008-07-16 22:41:36.000000000 -0400 
@@ -8,7 +8,8 @@ enum pid_type PIDTYPE_PID, PIDTYPE_PGID, @@ -9856,7 +9495,7 @@ }; /* -@@ -142,6 +143,7 @@ static inline pid_t pid_nr(struct pid *p +@@ -144,6 +145,7 @@ static inline pid_t pid_nr(struct pid *p } pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); @@ -9864,8 +9503,8 @@ pid_t pid_vnr(struct pid *pid); #define do_each_pid_task(pid, type, task) \ ---- a/include/linux/proc_fs.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/proc_fs.h 2008-04-19 15:44:54.000000000 -0400 +--- a/include/linux/proc_fs.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/proc_fs.h 2008-07-16 22:41:36.000000000 -0400 @@ -59,6 +59,7 @@ struct proc_dir_entry { nlink_t nlink; uid_t uid; @@ -9874,7 +9513,7 @@ loff_t size; const struct inode_operations *proc_iops; /* -@@ -265,16 +266,23 @@ static inline void kclist_add(struct kco +@@ -274,16 +275,23 @@ static inline void kclist_add(struct kco extern void kclist_add(struct kcore_list *, void *, size_t); #endif @@ -9898,20 +9537,21 @@ int fd; union proc_op op; struct proc_dir_entry *pde; ---- a/include/linux/reiserfs_fs.h 2008-05-21 14:30:05.000000000 -0400 -+++ a/include/linux/reiserfs_fs.h 2008-05-21 14:30:41.000000000 -0400 -@@ -837,6 +837,10 @@ struct stat_data_v1 { +--- a/include/linux/reiserfs_fs.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/reiserfs_fs.h 2008-07-16 22:41:36.000000000 -0400 +@@ -837,6 +837,11 @@ struct stat_data_v1 { #define REISERFS_COMPR_FL FS_COMPR_FL #define REISERFS_NOTAIL_FL FS_NOTAIL_FL +/* unfortunately reiserfs sdattr is only 16 bit */ ++#define REISERFS_IXUNLINK_FL (FS_IXUNLINK_FL >> 16) +#define REISERFS_BARRIER_FL (FS_BARRIER_FL >> 16) -+#define REISERFS_IUNLINK_FL (FS_IUNLINK_FL >> 16) ++#define REISERFS_COW_FL (FS_COW_FL >> 16) + /* persistent flags that file inherits from the parent directory */ #define REISERFS_INHERIT_MASK ( REISERFS_IMMUTABLE_FL | \ REISERFS_SYNC_FL | \ -@@ -846,6 +850,9 @@ struct stat_data_v1 { +@@ -846,6 +851,9 @@ struct stat_data_v1 { 
REISERFS_COMPR_FL | \ REISERFS_NOTAIL_FL ) @@ -9921,7 +9561,7 @@ /* Stat Data on disk (reiserfs version of UFS disk inode minus the address blocks) */ struct stat_data { -@@ -1911,6 +1918,7 @@ static inline void reiserfs_update_sd(st +@@ -1911,6 +1919,7 @@ static inline void reiserfs_update_sd(st void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); @@ -9929,9 +9569,9 @@ /* namei.c */ void set_de_name_and_namelen(struct reiserfs_dir_entry *de); ---- a/include/linux/reiserfs_fs_sb.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/reiserfs_fs_sb.h 2008-04-19 15:14:52.000000000 -0400 -@@ -456,6 +456,7 @@ enum reiserfs_mount_options { +--- a/include/linux/reiserfs_fs_sb.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/reiserfs_fs_sb.h 2008-07-16 22:41:36.000000000 -0400 +@@ -455,6 +455,7 @@ enum reiserfs_mount_options { REISERFS_POSIXACL, REISERFS_BARRIER_NONE, REISERFS_BARRIER_FLUSH, @@ -9939,9 +9579,9 @@ /* Actions on error */ REISERFS_ERROR_PANIC, ---- a/include/linux/sched.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/sched.h 2008-04-21 11:09:01.000000000 -0400 -@@ -73,7 +73,6 @@ struct sched_param { +--- a/include/linux/sched.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/sched.h 2008-07-27 14:07:42.000000000 -0400 +@@ -71,7 +71,6 @@ struct sched_param { #include #include #include @@ -9949,7 +9589,7 @@ #include #include #include -@@ -89,6 +88,7 @@ struct sched_param { +@@ -87,6 +86,7 @@ struct sched_param { #include #include #include @@ -9995,7 +9635,7 @@ #define get_mm_rss(mm) \ (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) #define update_hiwater_rss(mm) do { \ -@@ -1162,6 +1164,14 @@ struct task_struct { +@@ -1194,6 +1196,14 @@ struct task_struct { #endif seccomp_t seccomp; @@ -10010,7 +9650,7 @@ /* Thread group tracking */ u32 parent_exec_id; u32 self_exec_id; -@@ 
-1350,6 +1360,11 @@ struct pid_namespace; +@@ -1382,6 +1392,11 @@ struct pid_namespace; * see also pid_nr() etc in include/linux/pid.h */ @@ -10022,7 +9662,7 @@ static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; -@@ -1359,7 +1374,7 @@ pid_t task_pid_nr_ns(struct task_struct +@@ -1391,7 +1406,7 @@ pid_t task_pid_nr_ns(struct task_struct static inline pid_t task_pid_vnr(struct task_struct *tsk) { @@ -10031,7 +9671,7 @@ } -@@ -1372,7 +1387,7 @@ pid_t task_tgid_nr_ns(struct task_struct +@@ -1404,7 +1419,7 @@ pid_t task_tgid_nr_ns(struct task_struct static inline pid_t task_tgid_vnr(struct task_struct *tsk) { @@ -10040,8 +9680,20 @@ } ---- a/include/linux/shmem_fs.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/shmem_fs.h 2008-04-19 15:14:52.000000000 -0400 +@@ -1707,9 +1722,9 @@ extern struct pid_namespace init_pid_ns; + extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, + struct pid_namespace *ns); + +-static inline struct task_struct *__deprecated find_task_by_pid(pid_t nr) ++static inline struct task_struct *__deprecated find_task_by_pid(pid_t nr, struct pid_namespace *ns) + { +- return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns); ++ return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns); + } + extern struct task_struct *find_task_by_vpid(pid_t nr); + extern struct task_struct *find_task_by_pid_ns(pid_t nr, +--- a/include/linux/shmem_fs.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/shmem_fs.h 2008-07-16 22:41:36.000000000 -0400 @@ -8,6 +8,9 @@ #define SHMEM_NR_DIRECT 16 @@ -10053,7 +9705,7 @@ spinlock_t lock; unsigned long flags; --- a/include/linux/stat.h 2008-04-17 10:33:07.000000000 -0400 -+++ a/include/linux/stat.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/stat.h 2008-07-16 22:41:36.000000000 -0400 @@ -66,6 +66,7 @@ struct kstat { unsigned int nlink; uid_t uid; @@ -10062,18 +9714,18 @@ dev_t rdev; loff_t size; struct timespec atime; ---- a/include/linux/sunrpc/auth.h 
2008-04-17 10:37:24.000000000 -0400 -+++ a/include/linux/sunrpc/auth.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/linux/sunrpc/auth.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/sunrpc/auth.h 2008-07-16 22:41:36.000000000 -0400 @@ -25,6 +25,7 @@ struct auth_cred { uid_t uid; gid_t gid; + tag_t tag; struct group_info *group_info; + unsigned char machine_cred : 1; }; - ---- a/include/linux/sunrpc/clnt.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/sunrpc/clnt.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/linux/sunrpc/clnt.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/sunrpc/clnt.h 2008-07-16 22:41:36.000000000 -0400 @@ -42,7 +42,8 @@ struct rpc_clnt { unsigned int cl_softrtry : 1,/* soft timeouts */ @@ -10084,9 +9736,9 @@ struct rpc_rtt * cl_rtt; /* RTO estimator data */ const struct rpc_timeout *cl_timeout; /* Timeout strategy */ ---- a/include/linux/syscalls.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/syscalls.h 2008-04-19 15:14:52.000000000 -0400 -@@ -294,6 +294,8 @@ asmlinkage long sys_symlink(const char _ +--- a/include/linux/syscalls.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/syscalls.h 2008-07-16 22:41:36.000000000 -0400 +@@ -295,6 +295,8 @@ asmlinkage long sys_symlink(const char _ asmlinkage long sys_unlink(const char __user *pathname); asmlinkage long sys_rename(const char __user *oldname, const char __user *newname); @@ -10095,8 +9747,8 @@ asmlinkage long sys_chmod(const char __user *filename, mode_t mode); asmlinkage long sys_fchmod(unsigned int fd, mode_t mode); ---- a/include/linux/sysctl.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/sysctl.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/linux/sysctl.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/sysctl.h 2008-07-16 22:41:36.000000000 -0400 @@ -70,6 +70,7 @@ enum CTL_ABI=9, /* Binary emulation */ CTL_CPU=10, /* CPU stuff (speed scaling, etc) */ @@ -10113,8 +9765,8 @@ KERN_SPARC_REBOOT=21, /* 
reboot command on Sparc */ KERN_CTLALTDEL=22, /* int: allow ctl-alt-del to reboot */ ---- a/include/linux/sysfs.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/sysfs.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/linux/sysfs.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/sysfs.h 2008-07-16 22:41:36.000000000 -0400 @@ -17,6 +17,8 @@ #include #include @@ -10124,10 +9776,10 @@ struct kobject; struct module; ---- a/include/linux/time.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/time.h 2008-04-19 15:14:52.000000000 -0400 -@@ -183,6 +183,9 @@ static inline void timespec_add_ns(struc - } +--- a/include/linux/time.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/time.h 2008-07-16 22:41:36.000000000 -0400 +@@ -179,6 +179,9 @@ static __always_inline void timespec_add + a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } + @@ -10136,8 +9788,8 @@ #endif /* __KERNEL__ */ #define NFDBITS __NFDBITS ---- a/include/linux/types.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/linux/types.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/linux/types.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/linux/types.h 2008-07-16 22:41:36.000000000 -0400 @@ -36,6 +36,9 @@ typedef __kernel_uid32_t uid_t; typedef __kernel_gid32_t gid_t; typedef __kernel_uid16_t uid16_t; @@ -10149,7 +9801,7 @@ typedef unsigned long uintptr_t; --- a/include/linux/vroot.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vroot.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vroot.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,51 @@ + +/* @@ -10203,7 +9855,7 @@ + +#endif /* _LINUX_VROOT_H */ --- a/include/linux/vs_base.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_base.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_base.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,10 @@ +#ifndef _VS_BASE_H +#define _VS_BASE_H @@ -10216,7 +9868,7 @@ +#warning duplicate inclusion 
+#endif --- a/include/linux/vs_context.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_context.h 2008-04-29 18:42:09.000000000 -0400 ++++ a/include/linux/vs_context.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,227 @@ +#ifndef _VS_CONTEXT_H +#define _VS_CONTEXT_H @@ -10446,8 +10098,8 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_cowbl.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_cowbl.h 2008-04-21 13:49:51.000000000 -0400 -@@ -0,0 +1,44 @@ ++++ a/include/linux/vs_cowbl.h 2008-07-16 22:41:36.000000000 -0400 +@@ -0,0 +1,47 @@ +#ifndef _VS_COWBL_H +#define _VS_COWBL_H + @@ -10461,8 +10113,11 @@ +{ + struct inode *inode = nd->path.dentry->d_inode; + int error = 0; -+ if (IS_RDONLY(inode) || MNT_IS_RDONLY(nd->path.mnt)) ++ ++ /* do we need this check? */ ++ if (IS_RDONLY(inode)) + return -EROFS; ++ + if (IS_COW(inode)) { + if (IS_COW_LINK(inode)) { + struct dentry *new_dentry, *old_dentry = nd->path.dentry; @@ -10481,7 +10136,7 @@ + } else + error = PTR_ERR(new_dentry); + } else { -+ inode->i_flags &= ~(S_IUNLINK | S_IMMUTABLE); ++ inode->i_flags &= ~(S_IXUNLINK | S_IMMUTABLE); + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } @@ -10493,7 +10148,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_cvirt.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_cvirt.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_cvirt.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,50 @@ +#ifndef _VS_CVIRT_H +#define _VS_CVIRT_H @@ -10546,7 +10201,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_device.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_device.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_device.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,45 @@ +#ifndef _VS_DEVICE_H +#define _VS_DEVICE_H @@ -10594,7 +10249,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_dlimit.h 1969-12-31 19:00:00.000000000 -0500 -+++ 
a/include/linux/vs_dlimit.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_dlimit.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,211 @@ +#ifndef _VS_DLIMIT_H +#define _VS_DLIMIT_H @@ -10808,7 +10463,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vserver/base.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/base.h 2008-04-23 20:52:31.000000000 -0400 ++++ a/include/linux/vserver/base.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,157 @@ +#ifndef _VX_BASE_H +#define _VX_BASE_H @@ -10968,7 +10623,7 @@ + +#endif --- a/include/linux/vserver/cacct_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/cacct_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/cacct_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,23 @@ +#ifndef _VX_CACCT_CMD_H +#define _VX_CACCT_CMD_H @@ -10994,7 +10649,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_CACCT_CMD_H */ --- a/include/linux/vserver/cacct_def.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/cacct_def.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/cacct_def.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,43 @@ +#ifndef _VX_CACCT_DEF_H +#define _VX_CACCT_DEF_H @@ -11040,7 +10695,7 @@ + +#endif /* _VX_CACCT_DEF_H */ --- a/include/linux/vserver/cacct.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/cacct.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/cacct.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,15 @@ +#ifndef _VX_CACCT_H +#define _VX_CACCT_H @@ -11058,7 +10713,7 @@ + +#endif /* _VX_CACCT_H */ --- a/include/linux/vserver/cacct_int.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/cacct_int.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/cacct_int.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,21 @@ +#ifndef _VX_CACCT_INT_H +#define _VX_CACCT_INT_H @@ -11082,7 +10737,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_CACCT_INT_H */ 
--- a/include/linux/vserver/check.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/check.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/check.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,89 @@ +#ifndef _VS_CHECK_H +#define _VS_CHECK_H @@ -11174,7 +10829,7 @@ + +#endif --- a/include/linux/vserver/context_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/context_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/context_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,128 @@ +#ifndef _VX_CONTEXT_CMD_H +#define _VX_CONTEXT_CMD_H @@ -11305,7 +10960,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_CONTEXT_CMD_H */ --- a/include/linux/vserver/context.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/context.h 2008-04-22 15:14:28.000000000 -0400 ++++ a/include/linux/vserver/context.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,176 @@ +#ifndef _VX_CONTEXT_H +#define _VX_CONTEXT_H @@ -11484,7 +11139,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_CONTEXT_H */ --- a/include/linux/vserver/cvirt_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/cvirt_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/cvirt_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,53 @@ +#ifndef _VX_CVIRT_CMD_H +#define _VX_CVIRT_CMD_H @@ -11540,7 +11195,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_CVIRT_CMD_H */ --- a/include/linux/vserver/cvirt_def.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/cvirt_def.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/cvirt_def.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,80 @@ +#ifndef _VX_CVIRT_DEF_H +#define _VX_CVIRT_DEF_H @@ -11623,7 +11278,7 @@ + +#endif /* _VX_CVIRT_DEF_H */ --- a/include/linux/vserver/cvirt.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/cvirt.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/cvirt.h 2008-07-16 
22:41:36.000000000 -0400 @@ -0,0 +1,20 @@ +#ifndef _VX_CVIRT_H +#define _VX_CVIRT_H @@ -11646,7 +11301,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_CVIRT_H */ --- a/include/linux/vserver/debug_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/debug_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/debug_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,58 @@ +#ifndef _VX_DEBUG_CMD_H +#define _VX_DEBUG_CMD_H @@ -11707,7 +11362,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_DEBUG_CMD_H */ --- a/include/linux/vserver/debug.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/debug.h 2008-04-21 13:55:14.000000000 -0400 ++++ a/include/linux/vserver/debug.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,130 @@ +#ifndef _VX_DEBUG_H +#define _VX_DEBUG_H @@ -11840,7 +11495,7 @@ + +#endif /* _VX_DEBUG_H */ --- a/include/linux/vserver/device_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/device_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/device_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,44 @@ +#ifndef _VX_DEVICE_CMD_H +#define _VX_DEVICE_CMD_H @@ -11887,7 +11542,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_DEVICE_CMD_H */ --- a/include/linux/vserver/device_def.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/device_def.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/device_def.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,17 @@ +#ifndef _VX_DEVICE_DEF_H +#define _VX_DEVICE_DEF_H @@ -11907,7 +11562,7 @@ + +#endif /* _VX_DEVICE_DEF_H */ --- a/include/linux/vserver/device.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/device.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/device.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,15 @@ +#ifndef _VX_DEVICE_H +#define _VX_DEVICE_H @@ -11925,7 +11580,7 @@ +#warning duplicate inclusion +#endif /* _VX_DEVICE_H */ --- 
a/include/linux/vserver/dlimit_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/dlimit_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/dlimit_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,74 @@ +#ifndef _VX_DLIMIT_CMD_H +#define _VX_DLIMIT_CMD_H @@ -12002,7 +11657,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_DLIMIT_CMD_H */ --- a/include/linux/vserver/dlimit.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/dlimit.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/dlimit.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,54 @@ +#ifndef _VX_DLIMIT_H +#define _VX_DLIMIT_H @@ -12059,7 +11714,7 @@ +#warning duplicate inclusion +#endif /* _VX_DLIMIT_H */ --- a/include/linux/vserver/global.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/global.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/global.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,20 @@ +#ifndef _VX_GLOBAL_H +#define _VX_GLOBAL_H @@ -12082,7 +11737,7 @@ + +#endif /* _VX_GLOBAL_H */ --- a/include/linux/vserver/history.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/history.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/history.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,197 @@ +#ifndef _VX_HISTORY_H +#define _VX_HISTORY_H @@ -12282,7 +11937,7 @@ + +#endif /* _VX_HISTORY_H */ --- a/include/linux/vserver/inode_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/inode_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/inode_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,59 @@ +#ifndef _VX_INODE_CMD_H +#define _VX_INODE_CMD_H @@ -12344,7 +11999,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_INODE_CMD_H */ --- a/include/linux/vserver/inode.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/inode.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/inode.h 2008-07-16 
22:41:36.000000000 -0400 @@ -0,0 +1,38 @@ +#ifndef _VX_INODE_H +#define _VX_INODE_H @@ -12358,7 +12013,7 @@ +#define IATTR_FLAGS 0x00000007 + +#define IATTR_BARRIER 0x00010000 -+#define IATTR_IUNLINK 0x00020000 ++#define IATTR_IXUNLINK 0x00020000 +#define IATTR_IMMUTABLE 0x00040000 + +#ifdef __KERNEL__ @@ -12384,19 +12039,8 @@ +#else /* _VX_INODE_H */ +#warning duplicate inclusion +#endif /* _VX_INODE_H */ ---- a/include/linux/vserver/Kbuild 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/Kbuild 2008-04-19 15:14:52.000000000 -0400 -@@ -0,0 +1,8 @@ -+ -+unifdef-y += context_cmd.h network_cmd.h space_cmd.h \ -+ cacct_cmd.h cvirt_cmd.h limit_cmd.h dlimit_cmd.h \ -+ inode_cmd.h tag_cmd.h sched_cmd.h signal_cmd.h \ -+ debug_cmd.h device_cmd.h -+ -+unifdef-y += switch.h network.h monitor.h inode.h device.h -+ --- a/include/linux/vserver/limit_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/limit_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/limit_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,69 @@ +#ifndef _VX_LIMIT_CMD_H +#define _VX_LIMIT_CMD_H @@ -12468,7 +12112,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_LIMIT_CMD_H */ --- a/include/linux/vserver/limit_def.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/limit_def.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/limit_def.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,47 @@ +#ifndef _VX_LIMIT_DEF_H +#define _VX_LIMIT_DEF_H @@ -12518,7 +12162,7 @@ + +#endif /* _VX_LIMIT_DEF_H */ --- a/include/linux/vserver/limit.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/limit.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/limit.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,70 @@ +#ifndef _VX_LIMIT_H +#define _VX_LIMIT_H @@ -12591,7 +12235,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_LIMIT_H */ --- a/include/linux/vserver/limit_int.h 1969-12-31 19:00:00.000000000 -0500 -+++ 
a/include/linux/vserver/limit_int.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/limit_int.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,198 @@ +#ifndef _VX_LIMIT_INT_H +#define _VX_LIMIT_INT_H @@ -12792,7 +12436,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_LIMIT_INT_H */ --- a/include/linux/vserver/monitor.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/monitor.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/monitor.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,96 @@ +#ifndef _VX_MONITOR_H +#define _VX_MONITOR_H @@ -12891,7 +12535,7 @@ + +#endif /* _VX_MONITOR_H */ --- a/include/linux/vserver/network_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/network_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/network_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,150 @@ +#ifndef _VX_NETWORK_CMD_H +#define _VX_NETWORK_CMD_H @@ -13044,7 +12688,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_CONTEXT_CMD_H */ --- a/include/linux/vserver/network.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/network.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/network.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,146 @@ +#ifndef _VX_NETWORK_H +#define _VX_NETWORK_H @@ -13193,7 +12837,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_NETWORK_H */ --- a/include/linux/vserver/percpu.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/percpu.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/percpu.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,14 @@ +#ifndef _VX_PERCPU_H +#define _VX_PERCPU_H @@ -13210,7 +12854,7 @@ + +#endif /* _VX_PERCPU_H */ --- a/include/linux/vserver/pid.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/pid.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/pid.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,51 @@ +#ifndef _VSERVER_PID_H +#define 
_VSERVER_PID_H @@ -13264,7 +12908,7 @@ + +#endif --- a/include/linux/vserver/sched_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/sched_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/sched_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,108 @@ +#ifndef _VX_SCHED_CMD_H +#define _VX_SCHED_CMD_H @@ -13375,7 +13019,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_SCHED_CMD_H */ --- a/include/linux/vserver/sched_def.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/sched_def.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/sched_def.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,68 @@ +#ifndef _VX_SCHED_DEF_H +#define _VX_SCHED_DEF_H @@ -13446,7 +13090,7 @@ + +#endif /* _VX_SCHED_DEF_H */ --- a/include/linux/vserver/sched.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/sched.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/sched.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,26 @@ +#ifndef _VX_SCHED_H +#define _VX_SCHED_H @@ -13475,7 +13119,7 @@ +#warning duplicate inclusion +#endif /* _VX_SCHED_H */ --- a/include/linux/vserver/signal_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/signal_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/signal_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,43 @@ +#ifndef _VX_SIGNAL_CMD_H +#define _VX_SIGNAL_CMD_H @@ -13521,7 +13165,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_SIGNAL_CMD_H */ --- a/include/linux/vserver/signal.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/signal.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/signal.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,14 @@ +#ifndef _VX_SIGNAL_H +#define _VX_SIGNAL_H @@ -13538,8 +13182,8 @@ +#warning duplicate inclusion +#endif /* _VX_SIGNAL_H */ --- a/include/linux/vserver/space_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/space_cmd.h 
2008-04-19 15:14:52.000000000 -0400 -@@ -0,0 +1,26 @@ ++++ a/include/linux/vserver/space_cmd.h 2008-07-16 22:41:36.000000000 -0400 +@@ -0,0 +1,29 @@ +#ifndef _VX_SPACE_CMD_H +#define _VX_SPACE_CMD_H + @@ -13550,7 +13194,10 @@ +#define VCMD_set_space_v0 VC_CMD(PROCALT, 3, 0) +#define VCMD_set_space VC_CMD(PROCALT, 3, 1) + -+#define VCMD_get_space_mask VC_CMD(PROCALT, 4, 0) ++#define VCMD_get_space_mask_v0 VC_CMD(PROCALT, 4, 0) ++ ++#define VCMD_get_space_mask VC_CMD(VSPACE, 0, 1) ++#define VCMD_get_space_default VC_CMD(VSPACE, 1, 0) + + +struct vcmd_space_mask { @@ -13562,12 +13209,12 @@ + +extern int vc_enter_space(struct vx_info *, void __user *); +extern int vc_set_space(struct vx_info *, void __user *); -+extern int vc_get_space_mask(struct vx_info *, void __user *); ++extern int vc_get_space_mask(void __user *, int); + +#endif /* __KERNEL__ */ +#endif /* _VX_SPACE_CMD_H */ --- a/include/linux/vserver/space.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/space.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/space.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,13 @@ +#ifndef _VX_SPACE_H +#define _VX_SPACE_H @@ -13583,8 +13230,8 @@ +#warning duplicate inclusion +#endif /* _VX_SPACE_H */ --- a/include/linux/vserver/switch.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/switch.h 2008-04-19 15:14:52.000000000 -0400 -@@ -0,0 +1,99 @@ ++++ a/include/linux/vserver/switch.h 2008-07-16 22:41:36.000000000 -0400 +@@ -0,0 +1,100 @@ +#ifndef _VX_SWITCH_H +#define _VX_SWITCH_H + @@ -13624,7 +13271,7 @@ + OTHER |VSTAT | | | | | | |VINFO | | + | 40| 41| 42| 43| 44| 45| | 46| 47| + =======+=======+=======+=======+=======+=======+=======+ +=======+=======+ -+ SPECIAL|EVENT | | | |FLAGS | | | | | ++ SPECIAL|EVENT | | | |FLAGS | | |VSPACE | | + | 48| 49| 50| 51| 52| 53| | 54| 55| + -------+-------+-------+-------+-------+-------+-------+ +-------+-------+ + SPECIAL|DEBUG | | | |RLIMIT |SYSCALL| | |COMPAT | @@ -13662,6 
+13309,7 @@ +#define VC_CAT_EVENT 48 + +#define VC_CAT_FLAGS 52 ++#define VC_CAT_VSPACE 54 +#define VC_CAT_DEBUG 56 +#define VC_CAT_RLIMIT 60 + @@ -13685,7 +13333,7 @@ + +#endif /* _VX_SWITCH_H */ --- a/include/linux/vserver/tag_cmd.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/tag_cmd.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vserver/tag_cmd.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,22 @@ +#ifndef _VX_TAG_CMD_H +#define _VX_TAG_CMD_H @@ -13710,7 +13358,7 @@ +#endif /* __KERNEL__ */ +#endif /* _VX_TAG_CMD_H */ --- a/include/linux/vserver/tag.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vserver/tag.h 2008-04-21 13:53:47.000000000 -0400 ++++ a/include/linux/vserver/tag.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,143 @@ +#ifndef _DX_TAG_H +#define _DX_TAG_H @@ -13856,7 +13504,7 @@ + +#endif /* _DX_TAG_H */ --- a/include/linux/vs_inet6.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_inet6.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_inet6.h 2008-07-29 18:12:38.000000000 -0400 @@ -0,0 +1,229 @@ +#ifndef _VS_INET6_H +#define _VS_INET6_H @@ -14088,7 +13736,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_inet.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_inet.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_inet.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,342 @@ +#ifndef _VS_INET_H +#define _VS_INET_H @@ -14433,7 +14081,7 @@ +// #warning duplicate inclusion +#endif --- a/include/linux/vs_limit.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_limit.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_limit.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,140 @@ +#ifndef _VS_LIMIT_H +#define _VS_LIMIT_H @@ -14576,7 +14224,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_memory.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_memory.h 2008-04-19 15:14:52.000000000 -0400 ++++ 
a/include/linux/vs_memory.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,159 @@ +#ifndef _VS_MEMORY_H +#define _VS_MEMORY_H @@ -14738,7 +14386,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_network.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_network.h 2008-04-29 18:42:49.000000000 -0400 ++++ a/include/linux/vs_network.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,169 @@ +#ifndef _NX_VS_NETWORK_H +#define _NX_VS_NETWORK_H @@ -14910,7 +14558,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_pid.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_pid.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_pid.h 2008-07-27 14:11:13.000000000 -0400 @@ -0,0 +1,95 @@ +#ifndef _VS_PID_H +#define _VS_PID_H @@ -14939,7 +14587,7 @@ + return 1; +} + -+#define find_task_by_real_pid find_task_by_pid ++#define find_task_by_real_pid(pid) find_task_by_pid(pid, &init_pid_ns) + +#if 0 + @@ -15008,7 +14656,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_sched.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_sched.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_sched.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,110 @@ +#ifndef _VS_SCHED_H +#define _VS_SCHED_H @@ -15121,7 +14769,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_socket.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_socket.h 2008-04-23 14:32:00.000000000 -0400 ++++ a/include/linux/vs_socket.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,67 @@ +#ifndef _VS_SOCKET_H +#define _VS_SOCKET_H @@ -15191,7 +14839,7 @@ +#warning duplicate inclusion +#endif --- a/include/linux/vs_tag.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_tag.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_tag.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,43 @@ +#ifndef _VS_TAG_H +#define _VS_TAG_H @@ -15237,7 +14885,7 @@ +#warning duplicate inclusion +#endif --- 
a/include/linux/vs_time.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/include/linux/vs_time.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/linux/vs_time.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,19 @@ +#ifndef _VS_TIME_H +#define _VS_TIME_H @@ -15258,17 +14906,12 @@ +#else +#warning duplicate inclusion +#endif ---- a/include/net/addrconf.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/net/addrconf.h 2008-04-19 15:14:52.000000000 -0400 -@@ -75,10 +75,12 @@ extern struct inet6_ifaddr *ipv6_ge - - extern int ipv6_get_saddr(struct dst_entry *dst, - struct in6_addr *daddr, -- struct in6_addr *saddr); -+ struct in6_addr *saddr, -+ struct nx_info *nxi); +--- a/include/net/addrconf.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/net/addrconf.h 2008-07-29 18:14:29.000000000 -0400 +@@ -83,7 +83,8 @@ extern struct inet6_ifaddr *ipv6_ge extern int ipv6_dev_get_saddr(struct net_device *dev, - struct in6_addr *daddr, + const struct in6_addr *daddr, + unsigned int srcprefs, - struct in6_addr *saddr); + struct in6_addr *saddr, + struct nx_info *nxi); @@ -15276,7 +14919,7 @@ struct in6_addr *addr, unsigned char banned_flags); --- a/include/net/af_unix.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/net/af_unix.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/include/net/af_unix.h 2008-07-16 22:41:36.000000000 -0400 @@ -4,6 +4,7 @@ #include #include @@ -15285,8 +14928,8 @@ #include extern void unix_inflight(struct file *fp); ---- a/include/net/inet_sock.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/net/inet_sock.h 2008-04-19 15:14:52.000000000 -0400 +--- a/include/net/inet_sock.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/net/inet_sock.h 2008-07-16 22:46:02.000000000 -0400 @@ -24,7 +24,7 @@ #include #include @@ -15296,18 +14939,20 @@ /** struct ip_options - IP Options * -@@ -193,9 +193,4 @@ static inline int inet_sk_ehashfn(const +@@ -192,11 +192,6 @@ static inline int inet_sk_ehashfn(const } -static inline int inet_iif(const struct sk_buff 
*skb) -{ -- return ((struct rtable *)skb->dst)->rt_iif; +- return skb->rtable->rt_iif; -} - - #endif /* _INET_SOCK_H */ ---- a/include/net/inet_timewait_sock.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/net/inet_timewait_sock.h 2008-04-19 15:25:34.000000000 -0400 + static inline struct request_sock *inet_reqsk_alloc(struct request_sock_ops *ops) + { + struct request_sock *req = reqsk_alloc(ops); +--- a/include/net/inet_timewait_sock.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/net/inet_timewait_sock.h 2008-07-16 22:41:36.000000000 -0400 @@ -15,15 +15,14 @@ #ifndef _INET_TIMEWAIT_SOCK_ #define _INET_TIMEWAIT_SOCK_ @@ -15337,30 +14982,29 @@ int tw_timeout; volatile unsigned char tw_substate; /* 3 bits hole, try to pack */ ---- a/include/net/route.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/net/route.h 2008-04-21 12:39:35.000000000 -0400 -@@ -34,7 +34,7 @@ - #include - #include - #include --#include -+#include - - #ifndef __KERNEL__ - #warning This file is not supposed to be used outside of kernel. 
-@@ -86,6 +86,11 @@ struct ip_rt_acct +--- a/include/net/route.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/net/route.h 2008-07-16 22:46:26.000000000 -0400 +@@ -28,6 +28,7 @@ + #include + #include + #include ++// #include + #include + #include + #include +@@ -85,6 +86,11 @@ struct ip_rt_acct __u32 i_packets; }; +static inline int inet_iif(const struct sk_buff *skb) +{ -+ return ((struct rtable *)skb->dst)->rt_iif; ++ return skb->rtable->rt_iif; +} + struct rt_cache_stat { unsigned int in_hit; -@@ -136,6 +141,9 @@ static inline void ip_rt_put(struct rtab +@@ -135,6 +141,9 @@ static inline void ip_rt_put(struct rtab dst_release(&rt->u.dst); } @@ -15370,7 +15014,7 @@ #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) extern const __u8 ip_tos2prio[16]; -@@ -145,6 +153,9 @@ static inline char rt_tos2priority(u8 to +@@ -144,6 +153,9 @@ static inline char rt_tos2priority(u8 to return ip_tos2prio[IPTOS_TOS(tos)>>1]; } @@ -15380,10 +15024,10 @@ static inline int ip_route_connect(struct rtable **rp, __be32 dst, __be32 src, u32 tos, int oif, u8 protocol, __be16 sport, __be16 dport, struct sock *sk, -@@ -162,7 +173,21 @@ static inline int ip_route_connect(struc +@@ -161,7 +173,21 @@ static inline int ip_route_connect(struc int err; - struct net *net = sk->sk_net; + struct net *net = sock_net(sk); - if (!dst || !src) { + struct nx_info *nx_info = current->nx_info; + @@ -15403,12 +15047,12 @@ err = __ip_route_output_key(net, rp, &fl); if (err) return err; ---- a/include/net/sock.h 2008-04-17 12:05:44.000000000 -0400 -+++ a/include/net/sock.h 2008-04-19 15:14:52.000000000 -0400 -@@ -123,6 +123,10 @@ struct sock_common { - unsigned int skc_hash; - struct proto *skc_prot; +--- a/include/net/sock.h 2008-07-14 17:22:55.000000000 -0400 ++++ a/include/net/sock.h 2008-07-16 22:47:02.000000000 -0400 +@@ -128,6 +128,10 @@ struct sock_common { + #ifdef CONFIG_NET_NS struct net *skc_net; + #endif + xid_t skc_xid; + struct vx_info *skc_vx_info; + nid_t skc_nid; @@ -15416,7 +15060,7 @@ }; 
/** -@@ -205,6 +209,10 @@ struct sock { +@@ -211,6 +215,10 @@ struct sock { #define sk_hash __sk_common.skc_hash #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net @@ -15427,17 +15071,17 @@ unsigned char sk_shutdown : 2, sk_no_check : 2, sk_userlocks : 4; ---- a/init/main.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/init/main.c 2008-04-21 10:46:10.000000000 -0400 -@@ -58,6 +58,7 @@ - #include +--- a/init/main.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/init/main.c 2008-07-17 21:10:23.000000000 -0400 +@@ -60,6 +60,7 @@ #include #include + #include +#include #include #include -@@ -370,12 +371,14 @@ EXPORT_SYMBOL(__per_cpu_offset); +@@ -393,12 +394,14 @@ EXPORT_SYMBOL(__per_cpu_offset); static void __init setup_per_cpu_areas(void) { @@ -15454,8 +15098,8 @@ ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { ---- a/ipc/mqueue.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/ipc/mqueue.c 2008-04-19 15:14:52.000000000 -0400 +--- a/ipc/mqueue.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/ipc/mqueue.c 2008-07-16 22:41:36.000000000 -0400 @@ -31,6 +31,8 @@ #include #include @@ -15532,18 +15176,9 @@ free_uid(user); } } -@@ -743,7 +756,7 @@ asmlinkage long sys_mq_unlink(const char - if (inode) - atomic_inc(&inode->i_count); - -- err = vfs_unlink(dentry->d_parent->d_inode, dentry); -+ err = vfs_unlink(dentry->d_parent->d_inode, dentry, NULL); - out_err: - dput(dentry); - ---- a/ipc/msg.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/ipc/msg.c 2008-04-21 10:41:47.000000000 -0400 -@@ -37,6 +37,7 @@ +--- a/ipc/msg.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/ipc/msg.c 2008-07-16 22:41:36.000000000 -0400 +@@ -38,6 +38,7 @@ #include #include #include @@ -15551,7 +15186,7 @@ #include #include -@@ -168,6 +169,7 @@ static int newque(struct ipc_namespace * +@@ -190,6 +191,7 @@ static int newque(struct ipc_namespace * msq->q_perm.mode = msgflg & S_IRWXUGO; msq->q_perm.key = key; @@ -15559,8 +15194,8 @@ msq->q_perm.security = NULL; retval = 
security_msg_queue_alloc(msq); ---- a/ipc/namespace.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/ipc/namespace.c 2008-04-21 10:44:58.000000000 -0400 +--- a/ipc/namespace.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/ipc/namespace.c 2008-07-16 22:41:36.000000000 -0400 @@ -9,6 +9,8 @@ #include #include @@ -15570,23 +15205,24 @@ #include "util.h" -@@ -25,6 +27,7 @@ static struct ipc_namespace *clone_ipc_n - shm_init_ns(ns); +@@ -35,6 +37,7 @@ static struct ipc_namespace *clone_ipc_n + register_ipcns_notifier(ns); kref_init(&ns->kref); + atomic_inc(&vs_global_ipc_ns); return ns; } -@@ -82,5 +85,6 @@ void free_ipc_ns(struct kref *kref) +@@ -101,6 +104,7 @@ void free_ipc_ns(struct kref *kref) sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); + atomic_dec(&vs_global_ipc_ns); kfree(ns); - } ---- a/ipc/sem.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/ipc/sem.c 2008-04-21 10:45:22.000000000 -0400 + atomic_dec(&nr_ipc_ns); + +--- a/ipc/sem.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/ipc/sem.c 2008-07-16 22:41:36.000000000 -0400 @@ -83,6 +83,8 @@ #include #include @@ -15596,7 +15232,7 @@ #include #include "util.h" -@@ -252,6 +254,7 @@ static int newary(struct ipc_namespace * +@@ -255,6 +257,7 @@ static int newary(struct ipc_namespace * sma->sem_perm.mode = (semflg & S_IRWXUGO); sma->sem_perm.key = key; @@ -15604,7 +15240,7 @@ sma->sem_perm.security = NULL; retval = security_sem_alloc(sma); -@@ -267,6 +270,9 @@ static int newary(struct ipc_namespace * +@@ -270,6 +273,9 @@ static int newary(struct ipc_namespace * return id; } ns->used_sems += nsems; @@ -15612,10 +15248,10 @@ + vx_semary_inc(sma); + vx_nsems_add(sma, nsems); - sma->sem_perm.id = sem_buildid(id, sma->sem_perm.seq); sma->sem_base = (struct sem *) &sma[1]; ---- a/ipc/shm.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/ipc/shm.c 2008-04-21 10:45:38.000000000 -0400 + /* sma->sem_pending = NULL; */ +--- a/ipc/shm.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/ipc/shm.c 2008-07-16 22:41:36.000000000 -0400 @@ -39,6 +39,8 @@ 
#include #include @@ -15625,7 +15261,7 @@ #include -@@ -202,7 +204,12 @@ static void shm_open(struct vm_area_stru +@@ -183,7 +185,12 @@ static void shm_open(struct vm_area_stru */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { @@ -15639,7 +15275,7 @@ shm_rmid(ns, shp); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) -@@ -212,6 +219,7 @@ static void shm_destroy(struct ipc_names +@@ -193,6 +200,7 @@ static void shm_destroy(struct ipc_names shp->mlock_user); fput (shp->shm_file); security_shm_free(shp); @@ -15647,7 +15283,7 @@ ipc_rcu_putref(shp); } -@@ -383,11 +391,15 @@ static int newseg(struct ipc_namespace * +@@ -362,11 +370,15 @@ static int newseg(struct ipc_namespace * if (ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; @@ -15663,7 +15299,7 @@ shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->mlock_user = NULL; -@@ -441,6 +453,7 @@ static int newseg(struct ipc_namespace * +@@ -419,6 +431,7 @@ static int newseg(struct ipc_namespace * ns->shm_tot += numpages; error = shp->shm_perm.id; shm_unlock(shp); @@ -15671,8 +15307,8 @@ return error; no_id: ---- a/kernel/capability.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/capability.c 2008-04-19 15:14:52.000000000 -0400 +--- a/kernel/capability.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/capability.c 2008-07-16 22:41:36.000000000 -0400 @@ -13,6 +13,7 @@ #include #include @@ -15681,7 +15317,7 @@ #include /* -@@ -171,6 +172,8 @@ static inline int cap_set_pg(int pgrp_nr +@@ -241,6 +242,8 @@ static inline int cap_set_pg(int pgrp_nr pgrp = find_vpid(pgrp_nr); do_each_pid_task(pgrp, PIDTYPE_PGID, g) { @@ -15690,7 +15326,7 @@ target = g; while_each_thread(g, target) { if (!security_capset_check(target, effective, -@@ -335,8 +338,12 @@ int __capable(struct task_struct *t, int +@@ -391,8 +394,12 @@ int __capable(struct task_struct *t, int return 0; } @@ -15703,20 +15339,9 @@ return __capable(current, cap); } EXPORT_SYMBOL(capable); ---- a/kernel/cgroup.c 2008-05-21 
14:30:05.000000000 -0400 -+++ a/kernel/cgroup.c 2008-05-21 14:30:41.000000000 -0400 -@@ -2833,7 +2833,7 @@ int cgroup_clone(struct task_struct *tsk - } - - /* Create the cgroup directory, which also creates the cgroup */ -- ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); -+ ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755, NULL); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { ---- a/kernel/compat.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/compat.c 2008-04-19 15:14:52.000000000 -0400 -@@ -846,7 +846,7 @@ asmlinkage long compat_sys_time(compat_t +--- a/kernel/compat.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/compat.c 2008-07-16 22:41:36.000000000 -0400 +@@ -845,7 +845,7 @@ asmlinkage long compat_sys_time(compat_t compat_time_t i; struct timeval tv; @@ -15725,7 +15350,7 @@ i = tv.tv_sec; if (tloc) { -@@ -870,7 +870,7 @@ asmlinkage long compat_sys_stime(compat_ +@@ -869,7 +869,7 @@ asmlinkage long compat_sys_stime(compat_ if (err) return err; @@ -15734,9 +15359,9 @@ return 0; } ---- a/kernel/exit.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/exit.c 2008-04-19 15:14:52.000000000 -0400 -@@ -44,6 +44,11 @@ +--- a/kernel/exit.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/exit.c 2008-08-13 20:37:36.000000000 -0400 +@@ -45,6 +45,11 @@ #include #include #include @@ -15748,7 +15373,7 @@ #include #include -@@ -468,9 +473,11 @@ static void close_files(struct files_str +@@ -477,9 +482,11 @@ static void close_files(struct files_str filp_close(file, files); cond_resched(); } @@ -15760,7 +15385,15 @@ } } } -@@ -1014,6 +1021,10 @@ NORET_TYPE void do_exit(long code) +@@ -548,6 +555,7 @@ void put_fs_struct(struct fs_struct *fs) + path_put(&fs->pwd); + if (fs->altroot.dentry) + path_put(&fs->altroot); ++ atomic_dec(&vs_global_fs); + kmem_cache_free(fs_cachep, fs); + } + } +@@ -1090,6 +1098,10 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); @@ -15771,9 +15404,9 @@ preempt_disable(); /* causes final 
put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; ---- a/kernel/fork.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/fork.c 2008-04-21 11:09:01.000000000 -0400 -@@ -53,6 +53,11 @@ +--- a/kernel/fork.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/fork.c 2008-07-16 22:41:36.000000000 -0400 +@@ -54,6 +54,11 @@ #include #include #include @@ -15785,7 +15418,7 @@ #include #include -@@ -113,6 +118,8 @@ void free_task(struct task_struct *tsk) +@@ -114,6 +119,8 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); @@ -15794,7 +15427,7 @@ free_task_struct(tsk); } EXPORT_SYMBOL(free_task); -@@ -229,6 +236,8 @@ static int dup_mmap(struct mm_struct *mm +@@ -253,6 +260,8 @@ static int dup_mmap(struct mm_struct *mm mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; @@ -15803,7 +15436,7 @@ cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; -@@ -240,7 +249,7 @@ static int dup_mmap(struct mm_struct *mm +@@ -264,7 +273,7 @@ static int dup_mmap(struct mm_struct *mm if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); @@ -15812,7 +15445,7 @@ vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; -@@ -351,8 +360,8 @@ static struct mm_struct * mm_init(struct +@@ -375,8 +384,8 @@ static struct mm_struct * mm_init(struct : MMF_DUMP_FILTER_DEFAULT; mm->core_waiters = 0; mm->nr_ptes = 0; @@ -15823,7 +15456,7 @@ spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; -@@ -362,6 +371,7 @@ static struct mm_struct * mm_init(struct +@@ -386,6 +395,7 @@ static struct mm_struct * mm_init(struct if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -15831,7 +15464,7 @@ return mm; } -@@ -395,6 +405,7 @@ void __mmdrop(struct mm_struct *mm) +@@ -418,6 +428,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); 
destroy_context(mm); @@ -15839,7 +15472,7 @@ free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); -@@ -511,6 +522,7 @@ static struct mm_struct *dup_mm(struct t +@@ -534,6 +545,7 @@ struct mm_struct *dup_mm(struct task_str goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); @@ -15847,7 +15480,7 @@ /* Initializing for Swap token stuff */ mm->token_priority = 0; -@@ -542,6 +554,7 @@ fail_nocontext: +@@ -567,6 +579,7 @@ fail_nocontext: * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() */ @@ -15855,7 +15488,7 @@ mm_free_pgd(mm); free_mm(mm); return NULL; -@@ -612,6 +625,7 @@ static struct fs_struct *__copy_fs_struc +@@ -637,6 +650,7 @@ static struct fs_struct *__copy_fs_struc fs->altroot.dentry = NULL; } read_unlock(&old->lock); @@ -15863,16 +15496,7 @@ } return fs; } -@@ -730,6 +744,8 @@ static struct files_struct *dup_fd(struc - struct file *f = *old_fds++; - if (f) { - get_file(f); -+ /* TODO: sum it first for check and performance */ -+ vx_openfd_inc(open_files - i); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet -@@ -1011,6 +1027,8 @@ static struct task_struct *copy_process( +@@ -879,6 +893,8 @@ static struct task_struct *copy_process( int retval; struct task_struct *p; int cgroup_callbacks_done = 0; @@ -15881,7 +15505,7 @@ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); -@@ -1045,12 +1063,28 @@ static struct task_struct *copy_process( +@@ -913,12 +929,28 @@ static struct task_struct *copy_process( DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif @@ -15911,7 +15535,7 @@ } atomic_inc(&p->user->__count); -@@ -1352,6 +1386,18 @@ static struct task_struct *copy_process( +@@ -1220,6 +1252,18 @@ static struct task_struct *copy_process( total_forks++; spin_unlock(¤t->sighand->siglock); @@ -15930,7 +15554,7 @@ write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); -@@ -1398,6 +1444,9 @@ 
bad_fork_cleanup_count: +@@ -1266,6 +1310,9 @@ bad_fork_cleanup_count: put_group_info(p->group_info); atomic_dec(&p->user->processes); free_uid(p->user); @@ -15940,38 +15564,37 @@ bad_fork_free: free_task(p); fork_out: ---- a/kernel/kthread.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/kthread.c 2008-04-19 15:14:52.000000000 -0400 +--- a/kernel/kthread.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/kthread.c 2008-07-27 14:11:48.000000000 -0400 @@ -13,6 +13,7 @@ #include #include #include +#include - #include #define KTHREAD_NICE_LEVEL (-5) -@@ -99,7 +100,7 @@ static void create_kthread(struct kthrea + +@@ -98,7 +99,7 @@ static void create_kthread(struct kthrea struct sched_param param = { .sched_priority = 0 }; wait_for_completion(&create->started); read_lock(&tasklist_lock); -- create->result = find_task_by_pid(pid); +- create->result = find_task_by_pid_ns(pid, &init_pid_ns); + create->result = find_task_by_real_pid(pid); read_unlock(&tasklist_lock); /* * root may have changed our (kthreadd's) priority or CPU mask. 
---- a/kernel/Makefile 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/Makefile 2008-04-19 15:14:52.000000000 -0400 -@@ -11,6 +11,8 @@ obj-y = sched.o fork.o exec_domain.o - hrtimer.o rwsem.o nsproxy.o srcu.o \ - notifier.o ksysfs.o pm_qos_params.o +--- a/kernel/Makefile 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/Makefile 2008-07-27 12:40:25.000000000 -0400 +@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + notifier.o ksysfs.o pm_qos_params.o sched_clock.o +obj-y += vserver/ -+ - obj-$(CONFIG_SYSCTL) += sysctl_check.o + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ ---- a/kernel/nsproxy.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/nsproxy.c 2008-04-19 15:14:52.000000000 -0400 +--- a/kernel/nsproxy.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/nsproxy.c 2008-07-16 22:41:36.000000000 -0400 @@ -20,6 +20,8 @@ #include #include @@ -16104,7 +15727,7 @@ if (!old_ns) return 0; -@@ -155,6 +198,9 @@ int copy_namespaces(unsigned long flags, +@@ -167,6 +210,9 @@ int copy_namespaces(unsigned long flags, out: put_nsproxy(old_ns); @@ -16114,15 +15737,18 @@ return err; } -@@ -171,6 +217,7 @@ void free_nsproxy(struct nsproxy *ns) +@@ -182,7 +228,9 @@ void free_nsproxy(struct nsproxy *ns) + put_pid_ns(ns->pid_ns); if (ns->user_ns) put_user_ns(ns->user_ns); - put_net(ns->net_ns); +- put_net(ns->net_ns); ++ if (ns->net_ns) ++ put_net(ns->net_ns); + atomic_dec(&vs_global_nsproxy); kmem_cache_free(nsproxy_cachep, ns); } -@@ -183,6 +230,10 @@ int unshare_nsproxy_namespaces(unsigned +@@ -195,6 +243,10 @@ int unshare_nsproxy_namespaces(unsigned { int err = 0; @@ -16133,8 +15759,8 @@ if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) return 0; ---- a/kernel/pid.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/pid.c 2008-04-29 17:56:00.000000000 -0400 +--- a/kernel/pid.c 2008-07-14 17:22:55.000000000 
-0400 ++++ a/kernel/pid.c 2008-07-16 22:41:36.000000000 -0400 @@ -35,6 +35,8 @@ #include #include @@ -16144,7 +15770,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -@@ -303,7 +305,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); +@@ -304,7 +306,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { @@ -16153,7 +15779,7 @@ } EXPORT_SYMBOL_GPL(find_vpid); -@@ -359,6 +361,9 @@ void transfer_pid(struct task_struct *ol +@@ -370,6 +372,9 @@ void transfer_pid(struct task_struct *ol struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; @@ -16163,7 +15789,7 @@ if (pid) { struct hlist_node *first; first = rcu_dereference(pid->tasks[type].first); -@@ -388,14 +393,14 @@ EXPORT_SYMBOL(find_task_by_pid); +@@ -393,14 +398,14 @@ EXPORT_SYMBOL(find_task_by_pid_type_ns); struct task_struct *find_task_by_vpid(pid_t vnr) { @@ -16180,7 +15806,7 @@ } EXPORT_SYMBOL(find_task_by_pid_ns); -@@ -430,7 +435,7 @@ struct pid *find_get_pid(pid_t nr) +@@ -435,7 +440,7 @@ struct pid *find_get_pid(pid_t nr) return pid; } @@ -16189,7 +15815,7 @@ { struct upid *upid; pid_t nr = 0; -@@ -443,6 +448,11 @@ pid_t pid_nr_ns(struct pid *pid, struct +@@ -448,6 +453,11 @@ pid_t pid_nr_ns(struct pid *pid, struct return nr; } @@ -16201,8 +15827,8 @@ pid_t pid_vnr(struct pid *pid) { return pid_nr_ns(pid, current->nsproxy->pid_ns); ---- a/kernel/pid_namespace.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/pid_namespace.c 2008-04-23 11:52:08.000000000 -0400 +--- a/kernel/pid_namespace.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/pid_namespace.c 2008-07-16 22:41:36.000000000 -0400 @@ -12,6 +12,7 @@ #include #include @@ -16227,9 +15853,9 @@ kmem_cache_free(pid_ns_cachep, ns); } ---- a/kernel/posix-timers.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/posix-timers.c 2008-04-19 15:14:52.000000000 -0400 -@@ -47,6 +47,7 @@ +--- a/kernel/posix-timers.c 2008-07-14 17:22:55.000000000 -0400 ++++ 
a/kernel/posix-timers.c 2008-07-27 12:57:35.000000000 -0400 +@@ -46,6 +46,7 @@ #include #include #include @@ -16237,41 +15863,39 @@ /* * Management arrays for POSIX timers. Timers are kept in slab memory -@@ -299,6 +300,12 @@ void do_schedule_next_timer(struct sigin - - int posix_timer_event(struct k_itimer *timr,int si_private) - { +@@ -298,6 +299,12 @@ int posix_timer_event(struct k_itimer *timr, int si_private) + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ + struct vx_info_save vxis; + struct vx_info *vxi; + int ret; + + vxi = task_get_vx_info(timr->it_process); + enter_vx_info(vxi, &vxis); - memset(&timr->sigq->info, 0, sizeof(siginfo_t)); timr->sigq->info.si_sys_private = si_private; - /* Send signal to the process that owns this timer.*/ -@@ -311,11 +318,11 @@ int posix_timer_event(struct k_itimer *t + + timr->sigq->info.si_signo = timr->it_sigev_signo; +@@ -310,10 +317,11 @@ int posix_timer_event(struct k_itimer *t if (timr->it_sigev_notify & SIGEV_THREAD_ID) { struct task_struct *leader; -- int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, -- timr->it_process); +- int ret = send_sigqueue(timr->sigq, timr->it_process, 0); ++ ++ ret = send_sigqueue(timr->sigq, timr->it_process, 0); -+ ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, -+ timr->it_process); if (likely(ret >= 0)) - return ret; + goto out; timr->it_sigev_notify = SIGEV_SIGNAL; leader = timr->it_process->group_leader; -@@ -323,8 +330,12 @@ int posix_timer_event(struct k_itimer *t +@@ -321,7 +329,11 @@ int posix_timer_event(struct k_itimer *t timr->it_process = leader; } -- return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, -+ ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); +- return send_sigqueue(timr->sigq, timr->it_process, 1); ++ ret = send_sigqueue(timr->sigq, timr->it_process, 1); +out: + leave_vx_info(&vxis); + put_vx_info(vxi); @@ -16279,8 +15903,8 @@ } 
EXPORT_SYMBOL_GPL(posix_timer_event); ---- a/kernel/printk.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/printk.c 2008-04-21 10:59:28.000000000 -0400 +--- a/kernel/printk.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/printk.c 2008-07-16 22:41:36.000000000 -0400 @@ -32,6 +32,7 @@ #include #include @@ -16289,7 +15913,7 @@ #include -@@ -297,18 +298,13 @@ int do_syslog(int type, char __user *buf +@@ -300,18 +301,13 @@ int do_syslog(int type, char __user *buf unsigned i, j, limit, count; int do_clear = 0; char c; @@ -16310,7 +15934,7 @@ error = -EINVAL; if (!buf || len < 0) goto out; -@@ -319,6 +315,16 @@ int do_syslog(int type, char __user *buf +@@ -322,6 +318,16 @@ int do_syslog(int type, char __user *buf error = -EFAULT; goto out; } @@ -16327,7 +15951,7 @@ error = wait_event_interruptible(log_wait, (log_start - log_end)); if (error) -@@ -343,16 +349,6 @@ int do_syslog(int type, char __user *buf +@@ -346,16 +352,6 @@ int do_syslog(int type, char __user *buf do_clear = 1; /* FALL THRU */ case 3: /* Read last kernel messages */ @@ -16344,8 +15968,8 @@ count = len; if (count > log_buf_len) count = log_buf_len; ---- a/kernel/ptrace.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/ptrace.c 2008-04-21 10:50:28.000000000 -0400 +--- a/kernel/ptrace.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/ptrace.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,7 @@ #include #include @@ -16366,7 +15990,7 @@ return security_ptrace(current, task); } -@@ -562,6 +568,10 @@ asmlinkage long sys_ptrace(long request, +@@ -556,6 +562,10 @@ asmlinkage long sys_ptrace(long request, goto out; } @@ -16377,18 +16001,18 @@ if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); /* ---- a/kernel/sched.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/kernel/sched.c 2008-05-21 14:30:41.000000000 -0400 -@@ -66,6 +66,8 @@ - #include - #include - #include +--- a/kernel/sched.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/sched.c 2008-07-17 21:13:30.000000000 -0400 +@@ -70,6 +70,8 
@@ + #include + #include + #include +#include +#include #include #include -@@ -375,6 +377,16 @@ struct root_domain { +@@ -461,6 +463,16 @@ struct root_domain { static struct root_domain def_root_domain; #endif @@ -16405,7 +16029,7 @@ /* * This is the main, per-CPU runqueue data structure. -@@ -1366,6 +1378,7 @@ static void set_load_weight(struct task_ +@@ -1546,6 +1558,7 @@ static void set_load_weight(struct task_ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { @@ -16413,7 +16037,7 @@ sched_info_queued(p); p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; -@@ -1556,6 +1569,9 @@ struct migration_req { +@@ -1736,6 +1749,9 @@ struct migration_req { struct completion done; }; @@ -16423,7 +16047,7 @@ /* * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. -@@ -1565,6 +1581,7 @@ migrate_task(struct task_struct *p, int +@@ -1745,6 +1761,7 @@ migrate_task(struct task_struct *p, int { struct rq *rq = task_rq(p); @@ -16431,7 +16055,7 @@ /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
-@@ -1926,6 +1943,12 @@ static int try_to_wake_up(struct task_st +@@ -2109,6 +2126,12 @@ static int try_to_wake_up(struct task_st /* might preempt at this point */ rq = task_rq_lock(p, &flags); old_state = p->state; @@ -16444,7 +16068,7 @@ if (!(old_state & state)) goto out; if (p->se.on_rq) -@@ -3697,13 +3720,16 @@ unsigned long long task_sched_runtime(st +@@ -3885,13 +3908,16 @@ unsigned long long task_sched_runtime(st void account_user_time(struct task_struct *p, cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; @@ -16462,7 +16086,7 @@ cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); -@@ -3748,6 +3774,7 @@ void account_system_time(struct task_str +@@ -3936,6 +3962,7 @@ void account_system_time(struct task_str cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; @@ -16470,15 +16094,15 @@ struct rq *rq = this_rq(); cputime64_t tmp; -@@ -3755,6 +3782,7 @@ void account_system_time(struct task_str - return account_guest_time(p, cputime); +@@ -3945,6 +3972,7 @@ void account_system_time(struct task_str + } p->stime = cputime_add(p->stime, cputime); + vx_account_system(vxi, cputime, (p == rq->idle)); /* Add system time to cpustat. 
*/ tmp = cputime_to_cputime64(cputime); -@@ -4500,7 +4528,7 @@ asmlinkage long sys_nice(int increment) +@@ -4656,7 +4684,7 @@ asmlinkage long sys_nice(int increment) nice = 19; if (increment < 0 && !can_nice(current, nice)) @@ -16487,18 +16111,19 @@ retval = security_task_setnice(current, nice); if (retval) ---- a/kernel/sched_fair.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/sched_fair.c 2008-04-19 15:14:52.000000000 -0400 -@@ -537,6 +537,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, st +--- a/kernel/sched_fair.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/sched_fair.c 2008-07-17 21:17:35.000000000 -0400 +@@ -637,6 +637,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, st check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); ++ + if (entity_is_task(se)) + vx_activate_task(task_of(se)); - account_entity_enqueue(cfs_rq, se); } -@@ -580,6 +582,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st + static void update_avg(u64 *avg, u64 sample) +@@ -679,6 +682,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, st if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); @@ -16508,7 +16133,7 @@ } --- a/kernel/sched_hard.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/sched_hard.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/sched_hard.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,324 @@ + +#ifdef CONFIG_VSERVER_IDLELIMIT @@ -16835,7 +16460,7 @@ +#endif /* CONFIG_VSERVER_HARDCPU */ + --- a/kernel/sched_mon.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/sched_mon.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/sched_mon.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,200 @@ + +#include @@ -17037,8 +16662,8 @@ + +#endif /* CONFIG_VSERVER_MONITOR */ + ---- a/kernel/signal.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/signal.c 2008-05-15 15:41:03.000000000 -0400 +--- a/kernel/signal.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/signal.c 2008-07-17 21:15:33.000000000 -0400 @@ -26,6 +26,8 @@ #include #include @@ -17048,9 +16673,9 @@ 
#include #include -@@ -530,6 +532,14 @@ static int check_kill_permission(int sig +@@ -573,6 +575,14 @@ static int check_kill_permission(int sig if (!valid_signal(sig)) - return error; + return -EINVAL; + if ((info != SEND_SIG_NOINFO) && + (is_si_special(info) || !SI_FROMUSER(info))) @@ -17060,11 +16685,11 @@ + "check_kill_permission(%d,%p,%p[#%u,%u])", + sig, info, t, vx_task_xid(t), t->pid); + - if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) -@@ -543,6 +553,18 @@ static int check_kill_permission(int sig - return error; + if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) + return 0; + +@@ -597,6 +607,18 @@ static int check_kill_permission(int sig + } } + error = -EPERM; @@ -17082,7 +16707,16 @@ return security_task_kill(t, info, sig, 0); } -@@ -1088,7 +1110,7 @@ int kill_pid_info_as_uid(int sig, struct +@@ -1055,7 +1077,7 @@ int kill_pid_info(int sig, struct siginf + rcu_read_lock(); + retry: + p = pid_task(pid, PIDTYPE_PID); +- if (p) { ++ if (p && vx_check(vx_task_xid(p), VS_WATCH | VS_IDENT)) { + error = group_send_sig_info(sig, info, p); + if (unlikely(error == -ESRCH)) + /* +@@ -1093,7 +1115,7 @@ int kill_pid_info_as_uid(int sig, struct read_lock(&tasklist_lock); p = pid_task(pid, PIDTYPE_PID); @@ -17091,7 +16725,7 @@ ret = -ESRCH; goto out_unlock; } -@@ -1140,7 +1162,9 @@ static int kill_something_info(int sig, +@@ -1145,7 +1167,9 @@ static int kill_something_info(int sig, struct task_struct * p; for_each_process(p) { @@ -17102,8 +16736,8 @@ int err = group_send_sig_info(sig, info, p); ++count; if (err != -EPERM) -@@ -1842,6 +1866,11 @@ relock: - if (is_global_init(current)) +@@ -1809,6 +1833,11 @@ relock: + !signal_group_exit(signal)) continue; + /* virtual init is protected against user signals */ @@ -17114,8 +16748,8 @@ if (sig_kernel_stop(signr)) { /* * The default action is to stop all threads in ---- 
a/kernel/softirq.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/softirq.c 2008-04-19 15:14:52.000000000 -0400 +--- a/kernel/softirq.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/softirq.c 2008-07-16 22:41:36.000000000 -0400 @@ -21,6 +21,7 @@ #include #include @@ -17124,8 +16758,8 @@ #include /* ---- a/kernel/sys.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/sys.c 2008-04-19 15:14:52.000000000 -0400 +--- a/kernel/sys.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/sys.c 2008-07-16 22:41:36.000000000 -0400 @@ -38,6 +38,7 @@ #include #include @@ -17134,7 +16768,7 @@ #include #include -@@ -116,7 +117,10 @@ static int set_one_prio(struct task_stru +@@ -122,7 +123,10 @@ static int set_one_prio(struct task_stru goto out; } if (niceval < task_nice(p) && !can_nice(p, niceval)) { @@ -17146,7 +16780,7 @@ goto out; } no_nice = security_task_setnice(p, niceval); -@@ -164,6 +168,8 @@ asmlinkage long sys_setpriority(int whic +@@ -170,6 +174,8 @@ asmlinkage long sys_setpriority(int whic else pgrp = task_pgrp(current); do_each_pid_task(pgrp, PIDTYPE_PGID, p) { @@ -17155,7 +16789,7 @@ error = set_one_prio(p, niceval, error); } while_each_pid_task(pgrp, PIDTYPE_PGID, p); break; -@@ -224,6 +230,8 @@ asmlinkage long sys_getpriority(int whic +@@ -230,6 +236,8 @@ asmlinkage long sys_getpriority(int whic else pgrp = task_pgrp(current); do_each_pid_task(pgrp, PIDTYPE_PGID, p) { @@ -17164,7 +16798,7 @@ niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; -@@ -353,6 +361,9 @@ void kernel_power_off(void) +@@ -359,6 +367,9 @@ void kernel_power_off(void) machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); @@ -17174,7 +16808,7 @@ /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers -@@ -383,6 +394,9 @@ asmlinkage long sys_reboot(int magic1, i +@@ -389,6 +400,9 @@ asmlinkage long sys_reboot(int magic1, i if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) cmd = 
LINUX_REBOOT_CMD_HALT; @@ -17184,7 +16818,7 @@ lock_kernel(); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: -@@ -1343,7 +1357,7 @@ asmlinkage long sys_sethostname(char __u +@@ -1361,7 +1375,7 @@ asmlinkage long sys_sethostname(char __u int errno; char tmp[__NEW_UTS_LEN]; @@ -17193,7 +16827,7 @@ return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; -@@ -1388,7 +1402,7 @@ asmlinkage long sys_setdomainname(char _ +@@ -1406,7 +1420,7 @@ asmlinkage long sys_setdomainname(char _ int errno; char tmp[__NEW_UTS_LEN]; @@ -17202,7 +16836,7 @@ return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; -@@ -1455,7 +1469,7 @@ asmlinkage long sys_setrlimit(unsigned i +@@ -1473,7 +1487,7 @@ asmlinkage long sys_setrlimit(unsigned i return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && @@ -17211,9 +16845,9 @@ return -EPERM; if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) return -EPERM; ---- a/kernel/sysctl.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/sysctl.c 2008-04-19 15:14:52.000000000 -0400 -@@ -107,6 +107,7 @@ static int ngroups_max = NGROUPS_MAX; +--- a/kernel/sysctl.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/sysctl.c 2008-07-16 22:41:36.000000000 -0400 +@@ -109,6 +109,7 @@ static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif @@ -17221,7 +16855,7 @@ #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -@@ -492,6 +493,15 @@ static struct ctl_table kern_table[] = { +@@ -477,6 +478,15 @@ static struct ctl_table kern_table[] = { .strategy = &sysctl_string, }, #endif @@ -17238,7 +16872,7 @@ { .ctl_name = KERN_SG_BIG_BUFF, --- a/kernel/sysctl_check.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/sysctl_check.c 2008-04-21 13:42:56.000000000 -0400 ++++ a/kernel/sysctl_check.c 2008-07-16 22:41:36.000000000 -0400 @@ -39,6 +39,7 @@ static const struct trans_ctl_table tran { KERN_PANIC, "panic" }, @@ -17278,9 +16912,9 @@ {} }; ---- 
a/kernel/time.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/time.c 2008-04-19 15:14:52.000000000 -0400 -@@ -60,6 +60,7 @@ EXPORT_SYMBOL(sys_tz); +--- a/kernel/time.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/time.c 2008-07-16 22:41:36.000000000 -0400 +@@ -62,6 +62,7 @@ EXPORT_SYMBOL(sys_tz); asmlinkage long sys_time(time_t __user * tloc) { time_t i = get_seconds(); @@ -17288,7 +16922,7 @@ if (tloc) { if (put_user(i,tloc)) -@@ -89,7 +90,7 @@ asmlinkage long sys_stime(time_t __user +@@ -91,7 +92,7 @@ asmlinkage long sys_stime(time_t __user if (err) return err; @@ -17297,7 +16931,7 @@ return 0; } -@@ -100,7 +101,7 @@ asmlinkage long sys_gettimeofday(struct +@@ -102,7 +103,7 @@ asmlinkage long sys_gettimeofday(struct { if (likely(tv != NULL)) { struct timeval ktv; @@ -17306,7 +16940,7 @@ if (copy_to_user(tv, &ktv, sizeof(ktv))) return -EFAULT; } -@@ -175,7 +176,7 @@ int do_sys_settimeofday(struct timespec +@@ -177,7 +178,7 @@ int do_sys_settimeofday(struct timespec /* SMP safe, again the code in arch/foo/time.c should * globally block out interrupts when it runs. 
*/ @@ -17315,7 +16949,7 @@ } return 0; } -@@ -307,7 +308,7 @@ void getnstimeofday(struct timespec *tv) +@@ -309,7 +310,7 @@ void getnstimeofday(struct timespec *tv) { struct timeval x; @@ -17324,8 +16958,8 @@ tv->tv_sec = x.tv_sec; tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; } ---- a/kernel/timer.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/timer.c 2008-04-19 15:14:52.000000000 -0400 +--- a/kernel/timer.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/timer.c 2008-07-16 22:41:36.000000000 -0400 @@ -37,6 +37,10 @@ #include #include @@ -17337,7 +16971,7 @@ #include #include -@@ -955,12 +959,6 @@ asmlinkage unsigned long sys_alarm(unsig +@@ -1089,12 +1093,6 @@ asmlinkage unsigned long sys_alarm(unsig #endif @@ -17350,7 +16984,7 @@ /** * sys_getpid - return the thread group id of the current process -@@ -989,10 +987,23 @@ asmlinkage long sys_getppid(void) +@@ -1123,10 +1121,23 @@ asmlinkage long sys_getppid(void) rcu_read_lock(); pid = task_tgid_vnr(current->real_parent); rcu_read_unlock(); @@ -17375,7 +17009,7 @@ asmlinkage long sys_getuid(void) { /* Only we change this so SMP safe */ -@@ -1160,6 +1171,8 @@ int do_sysinfo(struct sysinfo *info) +@@ -1297,6 +1308,8 @@ int do_sysinfo(struct sysinfo *info) tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; } @@ -17384,9 +17018,9 @@ info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); ---- a/kernel/user.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/user.c 2008-04-23 16:24:56.000000000 -0400 -@@ -219,14 +219,15 @@ static struct kobj_type uids_ktype = { +--- a/kernel/user.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/user.c 2008-08-13 20:16:50.000000000 -0400 +@@ -243,12 +243,15 @@ static struct kobj_type uids_ktype = { }; /* create /sys/kernel/uids//cpu_share file for this user */ @@ -17394,17 +17028,17 @@ +static int uids_user_create(struct user_namespace *ns, struct user_struct *up) { struct kobject *kobj = &up->kobj; - int error; +- int error; ++ int error = 0; memset(kobj, 0, sizeof(struct kobject)); ++ if (ns != &init_user_ns) ++ goto done; ++ kobj->kset = uids_kset; -- error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); -+ error = kobject_init_and_add(kobj, &uids_ktype, NULL, -+ "%p:%d", ns, up->uid); + error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); if (error) { - kobject_put(kobj); - goto done; -@@ -248,7 +249,7 @@ int __init uids_sysfs_init(void) +@@ -272,7 +275,7 @@ int __init uids_sysfs_init(void) if (!uids_kset) return -ENOMEM; @@ -17413,7 +17047,22 @@ } /* work function to remove sysfs directory for a user and free up -@@ -308,7 +309,8 @@ static inline void free_user(struct user +@@ -302,9 +305,11 @@ static void remove_user_sysfs_dir(struct + if (!remove_user) + goto done; + +- kobject_uevent(&up->kobj, KOBJ_REMOVE); +- kobject_del(&up->kobj); +- kobject_put(&up->kobj); ++ if (up->kobj.name) { ++ kobject_uevent(&up->kobj, KOBJ_REMOVE); ++ kobject_del(&up->kobj); ++ kobject_put(&up->kobj); ++ } + + sched_destroy_user(up); + key_put(up->uid_keyring); +@@ -332,7 +337,8 @@ static inline void free_user(struct user #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ int uids_sysfs_init(void) { return 0; } @@ -17423,26 +17072,26 @@ static inline void uids_mutex_lock(void) { } static inline void uids_mutex_unlock(void) { } -@@ 
-399,7 +401,7 @@ struct user_struct * alloc_uid(struct us +@@ -409,7 +415,7 @@ struct user_struct *alloc_uid(struct use if (sched_create_user(new) < 0) - goto out_put_keys; + goto out_free_user; - if (uids_user_create(new)) + if (uids_user_create(ns, new)) goto out_destoy_sched; /* ---- a/kernel/user_namespace.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/kernel/user_namespace.c 2008-04-19 15:14:52.000000000 -0400 -@@ -9,6 +9,7 @@ - #include +--- a/kernel/user_namespace.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/user_namespace.c 2008-07-16 22:41:36.000000000 -0400 +@@ -10,6 +10,7 @@ #include + #include #include +#include /* * Clone a new ns copying an original user ns, setting refcount to 1 -@@ -26,6 +27,7 @@ static struct user_namespace *clone_user +@@ -27,6 +28,7 @@ static struct user_namespace *clone_user return ERR_PTR(-ENOMEM); kref_init(&ns->kref); @@ -17450,24 +17099,25 @@ for (n = 0; n < UIDHASH_SZ; ++n) INIT_HLIST_HEAD(ns->uidhash_table + n); -@@ -71,5 +73,6 @@ void free_user_ns(struct kref *kref) +@@ -72,6 +74,7 @@ void free_user_ns(struct kref *kref) ns = container_of(kref, struct user_namespace, kref); release_uids(ns); + atomic_dec(&vs_global_user_ns); kfree(ns); } ---- a/kernel/utsname.c 2008-04-17 10:37:25.000000000 -0400 -+++ a/kernel/utsname.c 2008-04-19 15:14:52.000000000 -0400 -@@ -14,6 +14,7 @@ - #include + EXPORT_SYMBOL(free_user_ns); +--- a/kernel/utsname.c 2008-07-14 17:22:55.000000000 -0400 ++++ a/kernel/utsname.c 2008-07-17 21:15:55.000000000 -0400 +@@ -15,6 +15,7 @@ #include #include + #include +#include /* * Clone a new ns copying an original utsname, setting refcount to 1 -@@ -32,6 +33,7 @@ static struct uts_namespace *clone_uts_n +@@ -33,6 +34,7 @@ static struct uts_namespace *clone_uts_n memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); up_read(&uts_sem); kref_init(&ns->kref); @@ -17475,7 +17125,7 @@ return ns; } -@@ -62,5 +64,6 @@ void free_uts_ns(struct kref *kref) +@@ -63,5 +65,6 @@ void free_uts_ns(struct kref *kref) 
struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); @@ -17483,7 +17133,7 @@ kfree(ns); } --- a/kernel/vserver/cacct.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/cacct.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/cacct.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,42 @@ +/* + * linux/kernel/vserver/cacct.c @@ -17528,7 +17178,7 @@ +} + --- a/kernel/vserver/cacct_init.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/cacct_init.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/cacct_init.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,25 @@ + + @@ -17556,7 +17206,7 @@ +} + --- a/kernel/vserver/cacct_proc.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/cacct_proc.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/cacct_proc.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,53 @@ +#ifndef _VX_CACCT_PROC_H +#define _VX_CACCT_PROC_H @@ -17612,8 +17262,8 @@ + +#endif /* _VX_CACCT_PROC_H */ --- a/kernel/vserver/context.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/context.c 2008-04-23 22:26:24.000000000 -0400 -@@ -0,0 +1,1005 @@ ++++ a/kernel/vserver/context.c 2008-07-27 14:10:37.000000000 -0400 +@@ -0,0 +1,1010 @@ +/* + * linux/kernel/vserver/context.c + * @@ -17751,14 +17401,15 @@ + +static void __dealloc_vx_info(struct vx_info *vxi) +{ ++ struct vx_info_save vxis; + int cpu; + + vxdprintk(VXD_CBIT(xid, 0), + "dealloc_vx_info(%p)", vxi); + vxh_dealloc_vx_info(vxi); + -+ vxi->vx_id = -1; -+ ++#ifdef CONFIG_VSERVER_WARN ++ enter_vx_info(vxi, &vxis); + vx_info_exit_limit(&vxi->limit); + vx_info_exit_sched(&vxi->sched); + vx_info_exit_cvirt(&vxi->cvirt); @@ -17770,7 +17421,10 @@ + vx_info_exit_cvirt_pc( + &vx_per_cpu(vxi, cvirt_pc, cpu), cpu); + } ++ leave_vx_info(&vxis); ++#endif + ++ vxi->vx_id = -1; + vxi->vx_state |= VXS_RELEASED; + +#ifdef CONFIG_SMP @@ -18116,6 +17770,7 @@ + + +#include ++#include + +static int vx_openfd_task(struct task_struct *tsk) +{ @@ 
-18620,7 +18275,7 @@ +EXPORT_SYMBOL_GPL(free_vx_info); + --- a/kernel/vserver/cvirt.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/cvirt.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/cvirt.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,301 @@ +/* + * linux/kernel/vserver/cvirt.c @@ -18924,7 +18579,7 @@ +#endif + --- a/kernel/vserver/cvirt_init.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/cvirt_init.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/cvirt_init.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,69 @@ + + @@ -18996,7 +18651,7 @@ +} + --- a/kernel/vserver/cvirt_proc.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/cvirt_proc.h 2008-04-21 13:01:29.000000000 -0400 ++++ a/kernel/vserver/cvirt_proc.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,135 @@ +#ifndef _VX_CVIRT_PROC_H +#define _VX_CVIRT_PROC_H @@ -19134,7 +18789,7 @@ + +#endif /* _VX_CVIRT_PROC_H */ --- a/kernel/vserver/debug.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/debug.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/debug.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,32 @@ +/* + * kernel/vserver/debug.c @@ -19169,7 +18824,7 @@ +EXPORT_SYMBOL_GPL(dump_vx_info); + --- a/kernel/vserver/device.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/device.c 2008-04-21 12:35:24.000000000 -0400 ++++ a/kernel/vserver/device.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,443 @@ +/* + * linux/kernel/vserver/device.c @@ -19615,7 +19270,7 @@ + + --- a/kernel/vserver/dlimit.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/dlimit.c 2008-04-21 12:36:09.000000000 -0400 ++++ a/kernel/vserver/dlimit.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,521 @@ +/* + * linux/kernel/vserver/dlimit.c @@ -20139,7 +19794,7 @@ +EXPORT_SYMBOL_GPL(rcu_free_dl_info); + --- a/kernel/vserver/helper.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/helper.c 2008-04-19 15:14:52.000000000 -0400 ++++ 
a/kernel/vserver/helper.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,199 @@ +/* + * linux/kernel/vserver/helper.c @@ -20341,7 +19996,7 @@ +} + --- a/kernel/vserver/history.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/history.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/history.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,258 @@ +/* + * kernel/vserver/history.c @@ -20602,7 +20257,7 @@ +#endif /* CONFIG_COMPAT */ + --- a/kernel/vserver/inet.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/inet.c 2008-05-29 18:56:59.000000000 -0400 ++++ a/kernel/vserver/inet.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,225 @@ + +#include @@ -20830,7 +20485,7 @@ +EXPORT_SYMBOL_GPL(ip_v4_find_src); + --- a/kernel/vserver/init.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/init.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/init.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,45 @@ +/* + * linux/kernel/init.c @@ -20878,7 +20533,7 @@ +module_exit(exit_vserver); + --- a/kernel/vserver/inode.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/inode.c 2008-04-21 16:52:16.000000000 -0400 ++++ a/kernel/vserver/inode.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,409 @@ +/* + * linux/kernel/vserver/inode.c @@ -20916,9 +20571,9 @@ + + *flags = IATTR_TAG + | (IS_BARRIER(in) ? IATTR_BARRIER : 0) -+ | (IS_IUNLINK(in) ? IATTR_IUNLINK : 0) ++ | (IS_IXUNLINK(in) ? IATTR_IXUNLINK : 0) + | (IS_IMMUTABLE(in) ? 
IATTR_IMMUTABLE : 0); -+ *mask = IATTR_IUNLINK | IATTR_IMMUTABLE; ++ *mask = IATTR_IXUNLINK | IATTR_IMMUTABLE; + + if (S_ISDIR(in->i_mode)) + *mask |= IATTR_BARRIER; @@ -21062,24 +20717,24 @@ + entry->vx_flags = iflags; + } + -+ if (*mask & (IATTR_BARRIER | IATTR_IUNLINK | IATTR_IMMUTABLE)) { ++ if (*mask & (IATTR_BARRIER | IATTR_IXUNLINK | IATTR_IMMUTABLE)) { + if (*mask & IATTR_IMMUTABLE) { + if (*flags & IATTR_IMMUTABLE) + in->i_flags |= S_IMMUTABLE; + else + in->i_flags &= ~S_IMMUTABLE; + } -+ if (*mask & IATTR_IUNLINK) { -+ if (*flags & IATTR_IUNLINK) -+ in->i_flags |= S_IUNLINK; ++ if (*mask & IATTR_IXUNLINK) { ++ if (*flags & IATTR_IXUNLINK) ++ in->i_flags |= S_IXUNLINK; + else -+ in->i_flags &= ~S_IUNLINK; ++ in->i_flags &= ~S_IXUNLINK; + } + if (S_ISDIR(in->i_mode) && (*mask & IATTR_BARRIER)) { + if (*flags & IATTR_BARRIER) -+ in->i_flags |= S_BARRIER; ++ in->i_vflags |= V_BARRIER; + else -+ in->i_flags &= ~S_BARRIER; ++ in->i_vflags &= ~V_BARRIER; + } + if (in->i_op && in->i_op->sync_flags) { + error = in->i_op->sync_flags(in); @@ -21290,8 +20945,8 @@ +#endif /* CONFIG_PROPAGATE */ + --- a/kernel/vserver/Kconfig 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/Kconfig 2008-05-21 15:02:48.000000000 -0400 -@@ -0,0 +1,252 @@ ++++ a/kernel/vserver/Kconfig 2008-07-16 22:41:36.000000000 -0400 +@@ -0,0 +1,251 @@ +# +# Linux VServer configuration +# @@ -21534,7 +21189,6 @@ + select NAMESPACES + select UTS_NS + select IPC_NS -+ select PID_NS + select USER_NS + select SYSVIPC + @@ -21545,7 +21199,7 @@ + select SECURITY_CAPABILITIES + --- a/kernel/vserver/limit.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/limit.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/limit.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,319 @@ +/* + * linux/kernel/vserver/limit.c @@ -21867,8 +21521,8 @@ +} + --- a/kernel/vserver/limit_init.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/limit_init.h 2008-04-19 15:14:52.000000000 -0400 -@@ -0,0 
+1,33 @@ ++++ a/kernel/vserver/limit_init.h 2008-07-16 22:41:36.000000000 -0400 +@@ -0,0 +1,31 @@ + + +static inline void vx_info_init_limit(struct _vx_limit *limit) @@ -21887,7 +21541,6 @@ + +static inline void vx_info_exit_limit(struct _vx_limit *limit) +{ -+#ifdef CONFIG_VSERVER_WARN + rlim_t value; + int lim; + @@ -21899,11 +21552,10 @@ + "!!! limit: %p[%s,%d] = %ld on exit.", + limit, vlimit_name[lim], lim, (long)value); + } -+#endif +} + --- a/kernel/vserver/limit_proc.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/limit_proc.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/limit_proc.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,57 @@ +#ifndef _VX_LIMIT_PROC_H +#define _VX_LIMIT_PROC_H @@ -21963,7 +21615,7 @@ + + --- a/kernel/vserver/Makefile 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/Makefile 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/Makefile 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,18 @@ +# +# Makefile for the Linux vserver routines. 
@@ -21984,7 +21636,7 @@ +vserver-$(CONFIG_VSERVER_DEVICE) += device.o + --- a/kernel/vserver/monitor.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/monitor.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/monitor.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,138 @@ +/* + * kernel/vserver/monitor.c @@ -22125,7 +21777,7 @@ +#endif /* CONFIG_COMPAT */ + --- a/kernel/vserver/network.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/network.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/network.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,864 @@ +/* + * linux/kernel/vserver/network.c @@ -22992,7 +22644,7 @@ +EXPORT_SYMBOL_GPL(unhash_nx_info); + --- a/kernel/vserver/proc.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/proc.c 2008-04-21 13:42:34.000000000 -0400 ++++ a/kernel/vserver/proc.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,1086 @@ +/* + * linux/kernel/vserver/proc.c @@ -24081,7 +23733,7 @@ +} + --- a/kernel/vserver/sched.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/sched.c 2008-04-29 18:40:09.000000000 -0400 ++++ a/kernel/vserver/sched.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,413 @@ +/* + * linux/kernel/vserver/sched.c @@ -24497,7 +24149,7 @@ +} + --- a/kernel/vserver/sched_init.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/sched_init.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/sched_init.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,50 @@ + +static inline void vx_info_init_sched(struct _vx_sched *sched) @@ -24550,7 +24202,7 @@ + return; +} --- a/kernel/vserver/sched_proc.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/sched_proc.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/sched_proc.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,57 @@ +#ifndef _VX_SCHED_PROC_H +#define _VX_SCHED_PROC_H @@ -24610,7 +24262,7 @@ + +#endif /* _VX_SCHED_PROC_H */ --- a/kernel/vserver/signal.c 1969-12-31 19:00:00.000000000 -0500 
-+++ a/kernel/vserver/signal.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/signal.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,132 @@ +/* + * linux/kernel/vserver/signal.c @@ -24745,8 +24397,8 @@ +} + --- a/kernel/vserver/space.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/space.c 2008-05-15 15:41:12.000000000 -0400 -@@ -0,0 +1,316 @@ ++++ a/kernel/vserver/space.c 2008-07-16 22:41:36.000000000 -0400 +@@ -0,0 +1,372 @@ +/* + * linux/kernel/vserver/space.c + * @@ -24787,14 +24439,42 @@ +#include +#include + -+const struct vcmd_space_mask space_mask = { -+ .mask = CLONE_NEWNS | ++ ++static const struct vcmd_space_mask space_mask_v0 = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | + CLONE_NEWUTS | + CLONE_NEWIPC | + CLONE_NEWUSER | -+ CLONE_FS ++ 0 +}; + ++static const struct vcmd_space_mask space_mask = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++ CLONE_NEWUTS | ++ CLONE_NEWIPC | ++ CLONE_NEWUSER | ++#ifdef CONFIG_PID_NS ++ CLONE_NEWPID | ++#endif ++#ifdef CONFIG_NET_NS ++ CLONE_NEWNET | ++#endif ++ 0 ++}; ++ ++static const struct vcmd_space_mask default_space_mask = { ++ .mask = CLONE_FS | ++ CLONE_NEWNS | ++ CLONE_NEWUTS | ++ CLONE_NEWIPC | ++ CLONE_NEWUSER | ++#ifdef CONFIG_PID_NS ++// CLONE_NEWPID | ++#endif ++ 0 ++}; + +/* + * build a new nsproxy mix @@ -24809,9 +24489,13 @@ + struct mnt_namespace *old_ns; + struct uts_namespace *old_uts; + struct ipc_namespace *old_ipc; -+ struct pid_namespace *old_pid; + struct user_namespace *old_user; ++#ifdef CONFIG_PID_NS ++ struct pid_namespace *old_pid; ++#endif ++#ifdef CONFIG_NET_NS + struct net *old_net; ++#endif + struct nsproxy *nsproxy; + + nsproxy = copy_nsproxy(old_nsproxy); @@ -24850,6 +24534,7 @@ + } else + old_user = NULL; + ++#ifdef CONFIG_PID_NS + if (mask & CLONE_NEWPID) { + old_pid = nsproxy->pid_ns; + nsproxy->pid_ns = new_nsproxy->pid_ns; @@ -24857,7 +24542,8 @@ + get_pid_ns(nsproxy->pid_ns); + } else + old_pid = NULL; -+ ++#endif ++#ifdef CONFIG_NET_NS + if (mask & CLONE_NEWNET) 
{ + old_net = nsproxy->net_ns; + nsproxy->net_ns = new_nsproxy->net_ns; @@ -24865,19 +24551,23 @@ + get_net(nsproxy->net_ns); + } else + old_net = NULL; -+ ++#endif + if (old_ns) + put_mnt_ns(old_ns); + if (old_uts) + put_uts_ns(old_uts); + if (old_ipc) + put_ipc_ns(old_ipc); -+ if (old_pid) -+ put_pid_ns(old_pid); + if (old_user) + put_user_ns(old_user); ++#ifdef CONFIG_PID_NS ++ if (old_pid) ++ put_pid_ns(old_pid); ++#endif ++#ifdef CONFIG_NET_NS + if (old_net) + put_net(old_net); ++#endif +out: + return nsproxy; +} @@ -24934,6 +24624,9 @@ + struct fs_struct *fs, *fs_cur, *fs_new; + int ret; + ++ vxdprintk(VXD_CBIT(space, 8), "vx_enter_space(%p[#%u],0x%08lx)", ++ vxi, vxi->vx_id, mask); ++ + if (vx_info_flags(vxi, VXF_INFO_PRIVATE, 0)) + return -EACCES; + @@ -24989,9 +24682,12 @@ + struct fs_struct *fs_vxi, *fs_cur, *fs_new; + int ret; + ++ vxdprintk(VXD_CBIT(space, 8), "vx_set_space(%p[#%u],0x%08lx)", ++ vxi, vxi->vx_id, mask); ++#if 0 + if (!mask) -+ mask = space_mask.mask; -+ ++ mask = default_space_mask.mask; ++#endif + if ((mask & space_mask.mask) != mask) + return -EINVAL; + @@ -25056,16 +24752,28 @@ + return vx_set_space(vxi, vc_data.mask); +} + -+int vc_get_space_mask(struct vx_info *vxi, void __user *data) ++int vc_get_space_mask(void __user *data, int type) +{ -+ if (copy_to_user(data, &space_mask, sizeof(space_mask))) ++ const struct vcmd_space_mask *mask; ++ ++ if (type == 0) ++ mask = &space_mask_v0; ++ else if (type == 1) ++ mask = &space_mask; ++ else ++ mask = &default_space_mask; ++ ++ vxdprintk(VXD_CBIT(space, 10), ++ "vc_get_space_mask(%d) = %08llx", type, mask->mask); ++ ++ if (copy_to_user(data, mask, sizeof(struct vcmd_space_mask))) + return -EFAULT; + return 0; +} + --- a/kernel/vserver/switch.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/switch.c 2008-04-29 18:40:18.000000000 -0400 -@@ -0,0 +1,529 @@ ++++ a/kernel/vserver/switch.c 2008-07-16 22:41:36.000000000 -0400 +@@ -0,0 +1,537 @@ +/* + * linux/kernel/vserver/switch.c + * 
@@ -25166,8 +24874,14 @@ + case VCMD_set_space: + return vc_set_space(vxi, data); + ++ case VCMD_get_space_mask_v0: ++ return vc_get_space_mask(data, 0); ++ /* this is version 1 */ + case VCMD_get_space_mask: -+ return vc_get_space_mask(vxi, data); ++ return vc_get_space_mask(data, 1); ++ ++ case VCMD_get_space_default: ++ return vc_get_space_mask(data, -1); + +#ifdef CONFIG_IA32_EMULATION + case VCMD_get_rlimit: @@ -25372,7 +25086,9 @@ + __VCMD(get_version, 0, VCA_NONE, 0); + __VCMD(get_vci, 0, VCA_NONE, 0); + __VCMD(get_rlimit_mask, 0, VCA_NONE, 0); ++ __VCMD(get_space_mask_v0,0, VCA_NONE, 0); + __VCMD(get_space_mask, 0, VCA_NONE, 0); ++ __VCMD(get_space_default,0, VCA_NONE, 0); + + /* info commands */ + __VCMD(task_xid, 2, VCA_NONE, 0); @@ -25596,7 +25312,7 @@ + +#endif /* CONFIG_COMPAT */ --- a/kernel/vserver/sysctl.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/sysctl.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/sysctl.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,244 @@ +/* + * kernel/vserver/sysctl.c @@ -25843,7 +25559,7 @@ +EXPORT_SYMBOL_GPL(vx_debug_misc); + --- a/kernel/vserver/tag.c 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/tag.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/tag.c 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,63 @@ +/* + * linux/kernel/vserver/tag.c @@ -25909,12 +25625,12 @@ + + --- a/kernel/vserver/vci_config.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/kernel/vserver/vci_config.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/kernel/vserver/vci_config.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,81 @@ + +/* interface version */ + -+#define VCI_VERSION 0x00020302 ++#define VCI_VERSION 0x00020303 + + +enum { @@ -25992,18 +25708,18 @@ + 0; +} + ---- a/mm/filemap_xip.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/filemap_xip.c 2008-04-19 15:14:52.000000000 -0400 +--- a/mm/filemap_xip.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/filemap_xip.c 2008-07-16 22:41:36.000000000 -0400 
@@ -14,6 +14,7 @@ #include #include #include +#include #include + #include - /* --- a/mm/fremap.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/fremap.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/mm/fremap.c 2008-07-16 22:41:36.000000000 -0400 @@ -15,6 +15,7 @@ #include #include @@ -26012,8 +25728,8 @@ #include #include ---- a/mm/hugetlb.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/hugetlb.c 2008-04-19 15:14:52.000000000 -0400 +--- a/mm/hugetlb.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/hugetlb.c 2008-07-16 22:41:36.000000000 -0400 @@ -19,6 +19,7 @@ #include @@ -26022,9 +25738,9 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; ---- a/mm/memory.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/memory.c 2008-04-19 15:14:52.000000000 -0400 -@@ -505,6 +505,9 @@ static int copy_pte_range(struct mm_stru +--- a/mm/memory.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/memory.c 2008-07-16 22:41:36.000000000 -0400 +@@ -558,6 +558,9 @@ static int copy_pte_range(struct mm_stru int progress = 0; int rss[2]; @@ -26034,7 +25750,7 @@ again: rss[1] = rss[0] = 0; dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); -@@ -2058,6 +2061,11 @@ static int do_swap_page(struct mm_struct +@@ -2221,6 +2224,11 @@ static int do_swap_page(struct mm_struct goto out; } @@ -26046,7 +25762,7 @@ mark_page_accessed(page); lock_page(page); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); -@@ -2129,6 +2137,8 @@ static int do_anonymous_page(struct mm_s +@@ -2292,6 +2300,8 @@ static int do_anonymous_page(struct mm_s /* Allocate our own private page. 
*/ pte_unmap(page_table); @@ -26055,7 +25771,7 @@ if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); -@@ -2453,6 +2463,7 @@ static inline int handle_pte_fault(struc +@@ -2606,6 +2616,7 @@ static inline int handle_pte_fault(struc { pte_t entry; spinlock_t *ptl; @@ -26063,7 +25779,7 @@ entry = *pte; if (!pte_present(entry)) { -@@ -2480,9 +2491,12 @@ static inline int handle_pte_fault(struc +@@ -2633,9 +2644,12 @@ static inline int handle_pte_fault(struc if (unlikely(!pte_same(*pte, entry))) goto unlock; if (write_access) { @@ -26078,7 +25794,7 @@ entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); -@@ -2500,7 +2514,10 @@ static inline int handle_pte_fault(struc +@@ -2653,7 +2667,10 @@ static inline int handle_pte_fault(struc } unlock: pte_unmap_unlock(pte, ptl); @@ -26091,7 +25807,7 @@ /* --- a/mm/mlock.c 2008-04-17 10:37:25.000000000 -0400 -+++ a/mm/mlock.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/mm/mlock.c 2008-07-16 22:41:36.000000000 -0400 @@ -12,6 +12,7 @@ #include #include @@ -26107,8 +25823,8 @@ - mm->locked_vm -= pages; + vx_vmlocked_sub(mm, pages); out: - if (ret == -ENOMEM) - ret = -EAGAIN; + return ret; + } @@ -134,7 +135,7 @@ static int do_mlock(unsigned long start, asmlinkage long sys_mlock(unsigned long start, size_t len) @@ -26148,12 +25864,12 @@ if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); ---- a/mm/mmap.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/mmap.c 2008-04-19 15:14:52.000000000 -0400 -@@ -1197,10 +1197,10 @@ munmap_back: - kmem_cache_free(vm_area_cachep, vma); - } - out: +--- a/mm/mmap.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/mmap.c 2008-07-16 22:41:36.000000000 -0400 +@@ -1210,10 +1210,10 @@ munmap_back: + if (correct_wcount) + atomic_inc(&inode->i_writecount); + out: - mm->total_vm += len >> PAGE_SHIFT; + vx_vmpages_add(mm, len >> PAGE_SHIFT); vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 
@@ -26163,7 +25879,7 @@ make_pages_present(addr, addr + len); } if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) -@@ -1549,9 +1549,9 @@ static int acct_stack_growth(struct vm_a +@@ -1562,9 +1562,9 @@ static int acct_stack_growth(struct vm_a return -ENOMEM; /* Ok, everything looks good - let it rip */ @@ -26175,7 +25891,7 @@ vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } -@@ -1722,9 +1722,9 @@ static void remove_vma_list(struct mm_st +@@ -1735,9 +1735,9 @@ static void remove_vma_list(struct mm_st do { long nrpages = vma_pages(vma); @@ -26187,7 +25903,7 @@ vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); -@@ -1967,6 +1967,8 @@ unsigned long do_brk(unsigned long addr, +@@ -1983,6 +1983,8 @@ unsigned long do_brk(unsigned long addr, lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -26196,7 +25912,7 @@ } /* -@@ -1993,7 +1995,8 @@ unsigned long do_brk(unsigned long addr, +@@ -2009,7 +2011,8 @@ unsigned long do_brk(unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; @@ -26206,7 +25922,7 @@ return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ -@@ -2018,9 +2021,9 @@ unsigned long do_brk(unsigned long addr, +@@ -2034,9 +2037,9 @@ unsigned long do_brk(unsigned long addr, vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); out: @@ -26218,7 +25934,7 @@ make_pages_present(addr, addr + len); } return addr; -@@ -2049,6 +2052,11 @@ void exit_mmap(struct mm_struct *mm) +@@ -2065,6 +2068,11 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); @@ -26230,7 +25946,7 @@ /* * Walk the list again, actually closing and freeing it, * with preemption enabled, without holding any MM locks. 
-@@ -2088,7 +2096,8 @@ int insert_vm_struct(struct mm_struct * +@@ -2104,7 +2112,8 @@ int insert_vm_struct(struct mm_struct * if (__vma && __vma->vm_start < vma->vm_end) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && @@ -26240,7 +25956,7 @@ return -ENOMEM; vma_link(mm, vma, prev, rb_link, rb_parent); return 0; -@@ -2161,6 +2170,8 @@ int may_expand_vm(struct mm_struct *mm, +@@ -2180,6 +2189,8 @@ int may_expand_vm(struct mm_struct *mm, if (cur + npages > lim) return 0; @@ -26249,7 +25965,7 @@ return 1; } -@@ -2238,7 +2249,6 @@ int install_special_mapping(struct mm_st +@@ -2257,7 +2268,6 @@ int install_special_mapping(struct mm_st return -ENOMEM; } @@ -26259,7 +25975,7 @@ return 0; } --- a/mm/mremap.c 2008-04-17 11:31:40.000000000 -0400 -+++ a/mm/mremap.c 2008-04-19 15:14:52.000000000 -0400 ++++ a/mm/mremap.c 2008-07-16 22:41:36.000000000 -0400 @@ -18,6 +18,7 @@ #include #include @@ -26309,9 +26025,9 @@ make_pages_present(addr + old_len, addr + new_len); } ---- a/mm/nommu.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/nommu.c 2008-04-19 15:14:52.000000000 -0400 -@@ -991,7 +991,7 @@ unsigned long do_mmap_pgoff(struct file +--- a/mm/nommu.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/nommu.c 2008-07-16 22:41:36.000000000 -0400 +@@ -1007,7 +1007,7 @@ unsigned long do_mmap_pgoff(struct file realalloc += kobjsize(vma); askedalloc += sizeof(*vma); @@ -26320,7 +26036,7 @@ add_nommu_vma(vma); -@@ -1117,7 +1117,7 @@ int do_munmap(struct mm_struct *mm, unsi +@@ -1139,7 +1139,7 @@ int do_munmap(struct mm_struct *mm, unsi kfree(vml); update_hiwater_vm(mm); @@ -26329,7 +26045,7 @@ #ifdef DEBUG show_process_blocks(); -@@ -1150,7 +1150,7 @@ void exit_mmap(struct mm_struct * mm) +@@ -1172,7 +1172,7 @@ void exit_mmap(struct mm_struct * mm) printk("Exit_mmap:\n"); #endif @@ -26338,8 +26054,8 @@ while ((tmp = mm->context.vmlist)) { mm->context.vmlist = tmp->next; ---- a/mm/oom_kill.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/oom_kill.c 2008-04-19 16:55:20.000000000 -0400 
+--- a/mm/oom_kill.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/oom_kill.c 2008-07-16 22:41:36.000000000 -0400 @@ -26,6 +26,7 @@ #include #include @@ -26348,7 +26064,7 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; -@@ -73,6 +74,12 @@ unsigned long badness(struct task_struct +@@ -72,6 +73,12 @@ unsigned long badness(struct task_struct points = mm->total_vm; /* @@ -26361,7 +26077,7 @@ * After this unlock we can no longer dereference local variable `mm' */ task_unlock(p); -@@ -162,8 +169,8 @@ unsigned long badness(struct task_struct +@@ -161,8 +168,8 @@ unsigned long badness(struct task_struct } #ifdef DEBUG @@ -26372,7 +26088,7 @@ #endif return points; } -@@ -322,8 +329,8 @@ static void __oom_kill_task(struct task_ +@@ -323,8 +330,8 @@ static void __oom_kill_task(struct task_ } if (verbose) @@ -26383,7 +26099,7 @@ /* * We give our sacrificial lamb high priority and access to -@@ -403,8 +410,8 @@ static int oom_kill_process(struct task_ +@@ -404,8 +411,8 @@ static int oom_kill_process(struct task_ return 0; } @@ -26394,18 +26110,18 @@ /* Try to kill a child first */ list_for_each_entry(c, &p->children, sibling) { ---- a/mm/page_alloc.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/mm/page_alloc.c 2008-05-21 14:30:41.000000000 -0400 -@@ -45,6 +45,8 @@ - #include +--- a/mm/page_alloc.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/page_alloc.c 2008-07-17 16:47:26.000000000 -0400 +@@ -46,6 +46,8 @@ #include #include + #include +#include +#include #include #include -@@ -1762,6 +1764,9 @@ void si_meminfo(struct sysinfo *val) +@@ -1763,6 +1765,9 @@ void si_meminfo(struct sysinfo *val) val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); val->mem_unit = PAGE_SIZE; @@ -26415,7 +26131,7 @@ } EXPORT_SYMBOL(si_meminfo); -@@ -1782,6 +1787,9 @@ void si_meminfo_node(struct sysinfo *val +@@ -1783,6 +1788,9 @@ void si_meminfo_node(struct sysinfo *val val->freehigh = 0; #endif val->mem_unit = PAGE_SIZE; @@ -26425,8 +26141,8 @@ } #endif ---- 
a/mm/rmap.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/rmap.c 2008-04-19 16:53:36.000000000 -0400 +--- a/mm/rmap.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/rmap.c 2008-07-16 22:41:36.000000000 -0400 @@ -49,6 +49,7 @@ #include #include @@ -26435,8 +26151,8 @@ #include ---- a/mm/shmem.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/shmem.c 2008-04-19 15:14:52.000000000 -0400 +--- a/mm/shmem.c 2008-08-12 01:41:51.000000000 -0400 ++++ a/mm/shmem.c 2008-08-12 01:42:21.000000000 -0400 @@ -56,7 +56,6 @@ #include @@ -26445,7 +26161,7 @@ #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) -@@ -1773,7 +1772,7 @@ static int shmem_statfs(struct dentry *d +@@ -1716,7 +1715,7 @@ static int shmem_statfs(struct dentry *d { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -26454,7 +26170,7 @@ buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; spin_lock(&sbinfo->stat_lock); -@@ -2341,7 +2340,7 @@ static int shmem_fill_super(struct super +@@ -2284,7 +2283,7 @@ static int shmem_fill_super(struct super sb->s_maxbytes = SHMEM_MAX_BYTES; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -26463,9 +26179,9 @@ sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_POSIX_ACL ---- a/mm/slab.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/slab.c 2008-04-19 15:14:52.000000000 -0400 -@@ -509,6 +509,8 @@ struct kmem_cache { +--- a/mm/slab.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/slab.c 2008-07-16 22:41:36.000000000 -0400 +@@ -508,6 +508,8 @@ struct kmem_cache { #define STATS_INC_FREEMISS(x) do { } while (0) #endif @@ -26474,7 +26190,7 @@ #if DEBUG /* -@@ -3344,6 +3346,7 @@ retry: +@@ -3346,6 +3348,7 @@ retry: obj = slab_get_obj(cachep, slabp, nodeid); check_slabp(cachep, slabp); @@ -26482,7 +26198,7 @@ l3->free_objects--; /* move slabp to correct slabp list: */ list_del(&slabp->list); -@@ -3416,6 +3419,7 @@ __cache_alloc_node(struct kmem_cache *ca +@@ 
-3418,6 +3421,7 @@ __cache_alloc_node(struct kmem_cache *ca /* ___cache_alloc_node can fall back to other nodes */ ptr = ____cache_alloc_node(cachep, flags, nodeid); out: @@ -26490,7 +26206,7 @@ local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); -@@ -3587,6 +3591,7 @@ static inline void __cache_free(struct k +@@ -3589,6 +3593,7 @@ static inline void __cache_free(struct k check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); @@ -26499,7 +26215,7 @@ /* * Skip calling cache_free_alien() when the platform is not numa. --- a/mm/slab_vs.h 1969-12-31 19:00:00.000000000 -0500 -+++ a/mm/slab_vs.h 2008-04-19 15:14:52.000000000 -0400 ++++ a/mm/slab_vs.h 2008-07-16 22:41:36.000000000 -0400 @@ -0,0 +1,27 @@ + +#include @@ -26528,8 +26244,8 @@ + atomic_sub(cachep->buffer_size, ¤t->vx_info->cacct.slab[what]); +} + ---- a/mm/swapfile.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/mm/swapfile.c 2008-04-19 15:14:52.000000000 -0400 +--- a/mm/swapfile.c 2008-07-14 17:22:56.000000000 -0400 ++++ a/mm/swapfile.c 2008-07-16 22:41:36.000000000 -0400 @@ -32,6 +32,8 @@ #include #include @@ -26539,7 +26255,7 @@ DEFINE_SPINLOCK(swap_lock); unsigned int nr_swapfiles; -@@ -1743,6 +1745,8 @@ void si_swapinfo(struct sysinfo *val) +@@ -1747,6 +1749,8 @@ void si_swapinfo(struct sysinfo *val) val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; spin_unlock(&swap_lock); @@ -26548,17 +26264,17 @@ } /* ---- a/net/core/dev.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/core/dev.c 2008-04-19 15:14:52.000000000 -0400 -@@ -119,6 +119,7 @@ - #include +--- a/net/core/dev.c 2008-07-14 17:22:57.000000000 -0400 ++++ a/net/core/dev.c 2008-07-17 16:46:49.000000000 -0400 +@@ -120,6 +120,7 @@ #include #include + #include +#include #include "net-sysfs.h" -@@ -2336,6 +2337,8 @@ static int dev_ifconf(struct net *net, c +@@ -2366,6 +2367,8 @@ static int dev_ifconf(struct net *net, c total = 0; 
for_each_netdev(net, dev) { @@ -26567,7 +26283,7 @@ for (i = 0; i < NPROTO; i++) { if (gifconf_list[i]) { int done; -@@ -2404,6 +2407,9 @@ static void dev_seq_printf_stats(struct +@@ -2434,6 +2437,9 @@ static void dev_seq_printf_stats(struct { struct net_device_stats *stats = dev->get_stats(dev); @@ -26577,9 +26293,9 @@ seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", dev->name, stats->rx_bytes, stats->rx_packets, ---- a/net/core/rtnetlink.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/net/core/rtnetlink.c 2008-05-21 14:30:41.000000000 -0400 -@@ -674,6 +674,8 @@ static int rtnl_dump_ifinfo(struct sk_bu +--- a/net/core/rtnetlink.c 2008-07-14 17:22:57.000000000 -0400 ++++ a/net/core/rtnetlink.c 2008-07-16 22:41:36.000000000 -0400 +@@ -688,6 +688,8 @@ static int rtnl_dump_ifinfo(struct sk_bu idx = 0; for_each_netdev(net, dev) { @@ -26588,7 +26304,7 @@ if (idx < s_idx) goto cont; if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, -@@ -1207,6 +1209,9 @@ void rtmsg_ifinfo(int type, struct net_d +@@ -1221,6 +1223,9 @@ void rtmsg_ifinfo(int type, struct net_d struct sk_buff *skb; int err = -ENOBUFS; @@ -26598,8 +26314,8 @@ skb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); if (skb == NULL) goto errout; ---- a/net/core/sock.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/core/sock.c 2008-04-23 14:31:31.000000000 -0400 +--- a/net/core/sock.c 2008-07-14 17:22:57.000000000 -0400 ++++ a/net/core/sock.c 2008-07-17 16:45:58.000000000 -0400 @@ -126,6 +126,10 @@ #include @@ -26611,7 +26327,7 @@ #ifdef CONFIG_INET #include -@@ -907,6 +911,8 @@ static struct sock *sk_prot_alloc(struct +@@ -899,6 +903,8 @@ static struct sock *sk_prot_alloc(struct if (!try_module_get(prot->owner)) goto out_free_sec; } @@ -26620,10 +26336,10 @@ return sk; -@@ -984,6 +990,11 @@ void sk_free(struct sock *sk) - __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); +@@ -975,6 +981,11 @@ void sk_free(struct sock *sk) + __func__, atomic_read(&sk->sk_omem_alloc)); - 
put_net(sk->sk_net); + put_net(sock_net(sk)); + vx_sock_dec(sk); + clr_vx_info(&sk->sk_vx_info); + sk->sk_xid = -1; @@ -26632,16 +26348,16 @@ sk_prot_free(sk->sk_prot_creator, sk); } -@@ -999,6 +1010,8 @@ struct sock *sk_clone(const struct sock +@@ -1010,6 +1021,8 @@ struct sock *sk_clone(const struct sock /* SANITY */ - get_net(newsk->sk_net); + get_net(sock_net(newsk)); + sock_vx_init(newsk); + sock_nx_init(newsk); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); -@@ -1045,6 +1058,12 @@ struct sock *sk_clone(const struct sock +@@ -1056,6 +1069,12 @@ struct sock *sk_clone(const struct sock newsk->sk_priority = 0; atomic_set(&newsk->sk_refcnt, 2); @@ -26654,7 +26370,7 @@ /* * Increment the counter in the same struct proto as the master * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that -@@ -1727,6 +1746,11 @@ void sock_init_data(struct socket *sock, +@@ -1740,6 +1759,11 @@ void sock_init_data(struct socket *sock, sk->sk_stamp = ktime_set(-1L, 0); @@ -26666,8 +26382,8 @@ atomic_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } ---- a/net/ipv4/af_inet.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/ipv4/af_inet.c 2008-04-19 16:08:42.000000000 -0400 +--- a/net/ipv4/af_inet.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/af_inet.c 2008-07-29 18:05:52.000000000 -0400 @@ -115,6 +115,7 @@ #ifdef CONFIG_IP_MROUTE #include @@ -26676,7 +26392,7 @@ DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; -@@ -317,9 +318,12 @@ lookup_protocol: +@@ -331,9 +332,12 @@ lookup_protocol: } err = -EPERM; @@ -26687,10 +26403,10 @@ goto out_rcu_unlock; - +override: - sock->ops = answer->ops; - answer_prot = answer->prot; - answer_no_check = answer->no_check; -@@ -433,6 +437,7 @@ int inet_bind(struct socket *sock, struc + err = -EAFNOSUPPORT; + if (!inet_netns_ok(net, protocol)) + goto out_rcu_unlock; +@@ -451,6 +455,7 @@ int inet_bind(struct socket *sock, struc struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock 
*sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -26698,20 +26414,20 @@ unsigned short snum; int chk_addr_ret; int err; -@@ -446,7 +451,11 @@ int inet_bind(struct socket *sock, struc +@@ -464,7 +469,11 @@ int inet_bind(struct socket *sock, struc if (addr_len < sizeof(struct sockaddr_in)) goto out; -- chk_addr_ret = inet_addr_type(&init_net, addr->sin_addr.s_addr); +- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + err = v4_map_sock_addr(inet, addr, &nsa); + if (err) + goto out; + -+ chk_addr_ret = inet_addr_type(&init_net, nsa.saddr); ++ chk_addr_ret = inet_addr_type(sock_net(sk), nsa.saddr); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since -@@ -458,7 +467,7 @@ int inet_bind(struct socket *sock, struc +@@ -476,7 +485,7 @@ int inet_bind(struct socket *sock, struc err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && !inet->freebind && @@ -26720,7 +26436,7 @@ chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) -@@ -483,7 +492,7 @@ int inet_bind(struct socket *sock, struc +@@ -501,7 +510,7 @@ int inet_bind(struct socket *sock, struc if (sk->sk_state != TCP_CLOSE || inet->num) goto out_release_sock; @@ -26729,7 +26445,7 @@ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->saddr = 0; /* Use device */ -@@ -676,11 +685,13 @@ int inet_getname(struct socket *sock, st +@@ -694,11 +703,13 @@ int inet_getname(struct socket *sock, st peer == 1)) return -ENOTCONN; sin->sin_port = inet->dport; @@ -26744,9 +26460,9 @@ sin->sin_port = inet->sport; sin->sin_addr.s_addr = addr; } ---- a/net/ipv4/devinet.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/ipv4/devinet.c 2008-04-19 15:14:52.000000000 -0400 -@@ -421,6 +421,7 @@ struct in_device *inetdev_by_index(struc +--- a/net/ipv4/devinet.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/devinet.c 2008-07-29 17:27:07.000000000 -0400 +@@ -420,6 +420,7 @@ struct 
in_device *inetdev_by_index(struc return in_dev; } @@ -26754,7 +26470,7 @@ /* Called only from RTNL semaphored context. No locks. */ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, -@@ -672,6 +673,8 @@ int devinet_ioctl(unsigned int cmd, void +@@ -662,6 +663,8 @@ int devinet_ioctl(struct net *net, unsig *colon = ':'; if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { @@ -26763,7 +26479,7 @@ if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ -@@ -680,6 +683,8 @@ int devinet_ioctl(unsigned int cmd, void +@@ -670,6 +673,8 @@ int devinet_ioctl(struct net *net, unsig This is checked above. */ for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) { @@ -26772,7 +26488,7 @@ if (!strcmp(ifr.ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_address) { -@@ -692,9 +697,12 @@ int devinet_ioctl(unsigned int cmd, void +@@ -682,9 +687,12 @@ int devinet_ioctl(struct net *net, unsig comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; @@ -26786,7 +26502,7 @@ } } -@@ -846,6 +854,8 @@ static int inet_gifconf(struct net_devic +@@ -835,6 +843,8 @@ static int inet_gifconf(struct net_devic goto out; for (; ifa; ifa = ifa->ifa_next) { @@ -26795,15 +26511,15 @@ if (!buf) { done += sizeof(ifr); continue; -@@ -1171,6 +1181,7 @@ static int inet_dump_ifaddr(struct sk_bu +@@ -1154,6 +1164,7 @@ static int inet_dump_ifaddr(struct sk_bu struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; + struct sock *sk = skb->sk; int s_ip_idx, s_idx = cb->args[0]; - if (net != &init_net) -@@ -1188,6 +1199,8 @@ static int inet_dump_ifaddr(struct sk_bu + s_ip_idx = ip_idx = cb->args[1]; +@@ -1168,6 +1179,8 @@ static int inet_dump_ifaddr(struct sk_bu for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; ifa = ifa->ifa_next, ip_idx++) { @@ -26812,19 +26528,19 @@ if (ip_idx < s_ip_idx) continue; if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, ---- 
a/net/ipv4/fib_hash.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/ipv4/fib_hash.c 2008-04-19 15:14:52.000000000 -0400 -@@ -1025,7 +1025,7 @@ static int fib_seq_show(struct seq_file +--- a/net/ipv4/fib_hash.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/fib_hash.c 2008-07-29 17:58:34.000000000 -0400 +@@ -1024,7 +1024,7 @@ static int fib_seq_show(struct seq_file prefix = f->fn_key; mask = FZ_MASK(iter->zone); flags = fib_flag_trans(fa->fa_type, mask, fi); - if (fi) + if (fi && nx_dev_visible(current->nx_info, fi->fib_dev)) - snprintf(bf, sizeof(bf), - "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + seq_printf(seq, + "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", fi->fib_dev ? fi->fib_dev->name : "*", prefix, ---- a/net/ipv4/inet_connection_sock.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/ipv4/inet_connection_sock.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv4/inet_connection_sock.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/inet_connection_sock.c 2008-07-29 17:27:07.000000000 -0400 @@ -47,10 +47,40 @@ void inet_get_local_port_range(int *low, } EXPORT_SYMBOL(inet_get_local_port_range); @@ -26867,7 +26583,7 @@ struct sock *sk2; struct hlist_node *node; int reuse = sk->sk_reuse; -@@ -63,9 +93,7 @@ int inet_csk_bind_conflict(const struct +@@ -70,9 +100,7 @@ int inet_csk_bind_conflict(const struct sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { @@ -26879,7 +26595,7 @@ } } --- a/net/ipv4/inet_diag.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/ipv4/inet_diag.c 2008-04-19 16:05:51.000000000 -0400 ++++ a/net/ipv4/inet_diag.c 2008-07-29 17:27:07.000000000 -0400 @@ -34,6 +34,8 @@ #include @@ -26981,8 +26697,8 @@ if (num < s_num) goto next_dying; if (r->id.idiag_sport != tw->tw_sport && ---- a/net/ipv4/inet_hashtables.c 2008-04-17 12:05:44.000000000 -0400 -+++ a/net/ipv4/inet_hashtables.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv4/inet_hashtables.c 
2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/inet_hashtables.c 2008-07-29 17:27:07.000000000 -0400 @@ -21,6 +21,7 @@ #include @@ -26991,7 +26707,7 @@ #include /* -@@ -144,11 +145,10 @@ static struct sock *inet_lookup_listener +@@ -161,11 +162,10 @@ static struct sock *inet_lookup_listener const __be32 rcv_saddr = inet->rcv_saddr; int score = sk->sk_family == PF_INET ? 1 : 0; @@ -27006,17 +26722,17 @@ if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) continue; -@@ -180,7 +180,7 @@ struct sock *__inet_lookup_listener(stru +@@ -197,7 +197,7 @@ struct sock *__inet_lookup_listener(stru const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); if (inet->num == hnum && !sk->sk_node.next && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + v4_inet_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) && (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - !sk->sk_bound_dev_if && sk->sk_net == net) + !sk->sk_bound_dev_if && net_eq(sock_net(sk), net)) goto sherry_cache; ---- a/net/ipv4/netfilter/nf_nat_helper.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv4/netfilter/nf_nat_helper.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv4/netfilter/nf_nat_helper.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/netfilter/nf_nat_helper.c 2008-07-16 22:41:36.000000000 -0400 @@ -18,6 +18,7 @@ #include @@ -27025,8 +26741,8 @@ #include #include #include ---- a/net/ipv4/netfilter.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv4/netfilter.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv4/netfilter.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/netfilter.c 2008-07-16 22:41:36.000000000 -0400 @@ -4,7 +4,7 @@ #include #include @@ -27036,18 +26752,18 @@ #include #include #include ---- a/net/ipv4/raw.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv4/raw.c 2008-04-29 20:28:52.000000000 -0400 -@@ -126,7 +126,7 @@ static struct sock *__raw_v4_lookup(stru +--- a/net/ipv4/raw.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/raw.c 
2008-07-29 17:31:11.000000000 -0400 +@@ -119,7 +119,7 @@ static struct sock *__raw_v4_lookup(stru - if (sk->sk_net == net && inet->num == num && + if (net_eq(sock_net(sk), net) && inet->num == num && !(inet->daddr && inet->daddr != raddr) && - !(inet->rcv_saddr && inet->rcv_saddr != laddr) && + v4_sock_addr_match(sk->sk_nx_info, inet, laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) goto found; /* gotcha */ } -@@ -382,6 +382,12 @@ static int raw_send_hdrinc(struct sock * +@@ -373,6 +373,12 @@ static int raw_send_hdrinc(struct sock * icmp_out_count(((struct icmphdr *) skb_transport_header(skb))->type); @@ -27060,7 +26776,7 @@ err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); if (err > 0) -@@ -393,6 +399,7 @@ out: +@@ -384,6 +390,7 @@ out: error_fault: err = -EFAULT; @@ -27068,21 +26784,21 @@ kfree_skb(skb); error: IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); -@@ -560,6 +567,13 @@ static int raw_sendmsg(struct kiocb *ioc +@@ -551,6 +558,13 @@ static int raw_sendmsg(struct kiocb *ioc } security_sk_classify_flow(sk, &fl); + if (sk->sk_nx_info) { -+ err = ip_v4_find_src(sk->sk_net, ++ err = ip_v4_find_src(sock_net(sk), + sk->sk_nx_info, &rt, &fl); + + if (err) + goto done; + } - err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); + err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); } if (err) -@@ -622,17 +636,19 @@ static int raw_bind(struct sock *sk, str +@@ -621,17 +635,19 @@ static int raw_bind(struct sock *sk, str { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; @@ -27092,9 +26808,9 @@ if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; -- chk_addr_ret = inet_addr_type(sk->sk_net, addr->sin_addr.s_addr); +- chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr); + v4_map_sock_addr(inet, addr, &nsa); -+ chk_addr_ret = inet_addr_type(sk->sk_net, nsa.saddr); ++ chk_addr_ret = inet_addr_type(sock_net(sk), nsa.saddr); ret = 
-EADDRNOTAVAIL; - if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && + if (nsa.saddr && chk_addr_ret != RTN_LOCAL && @@ -27105,7 +26821,7 @@ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->saddr = 0; /* Use device */ sk_dst_reset(sk); -@@ -684,7 +700,8 @@ static int raw_recvmsg(struct kiocb *ioc +@@ -683,7 +699,8 @@ static int raw_recvmsg(struct kiocb *ioc /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; @@ -27115,30 +26831,30 @@ sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); } -@@ -862,7 +879,8 @@ static struct sock *raw_get_first(struct +@@ -860,7 +877,8 @@ static struct sock *raw_get_first(struct struct hlist_node *node; sk_for_each(sk, node, &state->h->ht[state->bucket]) -- if (sk->sk_net == state->p.net) -+ if ((sk->sk_net == state->p.net) && +- if (sock_net(sk) == seq_file_net(seq)) ++ if ((sock_net(sk) == seq_file_net(seq)) && + nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) goto found; } sk = NULL; -@@ -878,7 +896,8 @@ static struct sock *raw_get_next(struct +@@ -876,7 +894,8 @@ static struct sock *raw_get_next(struct sk = sk_next(sk); try_again: ; -- } while (sk && sk->sk_net != state->p.net); -+ } while (sk && ((sk->sk_net != state->p.net) || +- } while (sk && sock_net(sk) != seq_file_net(seq)); ++ } while (sk && ((sock_net(sk) != seq_file_net(seq)) || + !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))); if (!sk && ++state->bucket < RAW_HTABLE_SIZE) { sk = sk_head(&state->h->ht[state->bucket]); -@@ -937,7 +956,10 @@ static void raw_sock_seq_show(struct seq +@@ -935,7 +954,10 @@ static void raw_sock_seq_show(struct seq seq_printf(seq, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", - i, src, srcp, dest, destp, sp->sk_state, + i, + nx_map_sock_lback(current_nx_info(), src), srcp, @@ -27147,19 +26863,19 @@ atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), 0, 0L, 0, sock_i_uid(sp), 0, 
sock_i_ino(sp), ---- a/net/ipv4/syncookies.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv4/syncookies.c 2008-04-19 15:14:52.000000000 -0400 -@@ -20,6 +20,7 @@ +--- a/net/ipv4/syncookies.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/syncookies.c 2008-07-16 22:41:36.000000000 -0400 +@@ -18,6 +18,7 @@ #include #include #include +#include - extern int sysctl_tcp_syncookies; - ---- a/net/ipv4/tcp.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv4/tcp.c 2008-04-19 15:14:52.000000000 -0400 -@@ -263,6 +263,7 @@ + /* Timestamps: lowest 9 bits store TCP options */ + #define TSBITS 9 +--- a/net/ipv4/tcp.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/tcp.c 2008-07-16 22:41:36.000000000 -0400 +@@ -266,6 +266,7 @@ #include #include #include @@ -27167,9 +26883,9 @@ #include #include ---- a/net/ipv4/tcp_ipv4.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv4/tcp_ipv4.c 2008-04-19 15:14:52.000000000 -0400 -@@ -1965,6 +1965,12 @@ static void *listening_get_next(struct s +--- a/net/ipv4/tcp_ipv4.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/tcp_ipv4.c 2008-07-29 17:42:09.000000000 -0400 +@@ -1959,6 +1959,12 @@ static void *listening_get_next(struct s req = req->dl_next; while (1) { while (req) { @@ -27179,10 +26895,10 @@ + if (req->sk && + !nx_check(req->sk->sk_nid, VS_WATCH_P | VS_IDENT)) + continue; - if (req->rsk_ops->family == st->family) { + if (req->rsk_ops->family == st->family && + net_eq(sock_net(req->sk), net)) { cur = req; - goto out; -@@ -1989,6 +1995,10 @@ get_req: +@@ -1984,6 +1990,10 @@ get_req: } get_sk: sk_for_each_from(sk, node) { @@ -27190,51 +26906,45 @@ + sk, sk->sk_nid, nx_current_nid()); + if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) + continue; - if (sk->sk_family == st->family) { + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { cur = sk; goto out; -@@ -2038,18 +2048,26 @@ static void *established_get_first(struc +@@ -2034,6 +2044,11 @@ static void *established_get_first(struc read_lock_bh(lock); 
sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { -- if (sk->sk_family != st->family) { + vxdprintk(VXD_CBIT(net, 6), + "sk,egf: %p [#%d] (from %d)", + sk, sk->sk_nid, nx_current_nid()); + if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) + continue; -+ if (sk->sk_family != st->family) + if (sk->sk_family != st->family || + !net_eq(sock_net(sk), net)) { continue; -- } - rc = sk; - goto out; - } +@@ -2044,6 +2059,11 @@ static void *established_get_first(struc st->state = TCP_SEQ_STATE_TIME_WAIT; inet_twsk_for_each(tw, node, &tcp_hashinfo.ehash[st->bucket].twchain) { -- if (tw->tw_family != st->family) { + vxdprintk(VXD_CBIT(net, 6), + "tw: %p [#%d] (from %d)", + tw, tw->tw_nid, nx_current_nid()); + if (!nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT)) + continue; -+ if (tw->tw_family != st->family) + if (tw->tw_family != st->family || + !net_eq(twsk_net(tw), net)) { continue; -- } - rc = tw; - goto out; - } -@@ -2073,7 +2091,8 @@ static void *established_get_next(struct +@@ -2072,7 +2092,9 @@ static void *established_get_next(struct tw = cur; tw = tw_next(tw); get_tw: -- while (tw && tw->tw_family != st->family) { +- while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) { + while (tw && (tw->tw_family != st->family || ++ !net_eq(twsk_net(tw), net) || + !nx_check(tw->tw_nid, VS_WATCH_P | VS_IDENT))) { tw = tw_next(tw); } if (tw) { -@@ -2094,6 +2113,11 @@ get_tw: +@@ -2093,6 +2115,11 @@ get_tw: sk = sk_next(sk); sk_for_each_from(sk, node) { @@ -27243,12 +26953,12 @@ + sk, sk->sk_nid, nx_current_nid()); + if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) + continue; - if (sk->sk_family == st->family) + if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) goto found; } -@@ -2266,9 +2290,9 @@ static void get_openreq4(struct sock *sk - sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", +@@ -2247,9 +2274,9 @@ static void get_openreq4(struct sock *sk + seq_printf(f, "%4d: %08X:%04X 
%08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", i, - ireq->loc_addr, + nx_map_sock_lback(current_nx_info(), ireq->loc_addr), @@ -27258,10 +26968,10 @@ ntohs(ireq->rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ -@@ -2310,7 +2334,10 @@ static void get_tcp4_sock(struct sock *s +@@ -2292,7 +2319,10 @@ static void get_tcp4_sock(struct sock *s - sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " - "%08X %5d %8d %lu %d %p %u %u %u %u %d", + seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " + "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", - i, src, srcp, dest, destp, sk->sk_state, + i, + nx_map_sock_lback(current_nx_info(), src), srcp, @@ -27270,20 +26980,20 @@ tp->write_seq - tp->snd_una, sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), -@@ -2345,7 +2372,10 @@ static void get_timewait4_sock(struct in +@@ -2328,7 +2358,10 @@ static void get_timewait4_sock(struct in - sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p", + seq_printf(f, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n", - i, src, srcp, dest, destp, tw->tw_substate, 0, 0, + i, + nx_map_sock_lback(current_nx_info(), src), srcp, + nx_map_sock_lback(current_nx_info(), dest), destp, + tw->tw_substate, 0, 0, 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, - atomic_read(&tw->tw_refcnt), tw); + atomic_read(&tw->tw_refcnt), tw, len); } ---- a/net/ipv4/tcp_minisocks.c 2008-04-17 11:31:40.000000000 -0400 -+++ a/net/ipv4/tcp_minisocks.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv4/tcp_minisocks.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv4/tcp_minisocks.c 2008-07-29 17:27:07.000000000 -0400 @@ -28,6 +28,10 @@ #include #include @@ -27295,7 +27005,7 @@ #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ #else -@@ -293,6 +297,11 @@ void tcp_time_wait(struct sock *sk, int +@@ -295,6 +299,11 @@ void 
tcp_time_wait(struct sock *sk, int tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; @@ -27307,13 +27017,13 @@ #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); ---- a/net/ipv4/udp.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv4/udp.c 2008-04-21 12:41:01.000000000 -0400 -@@ -246,14 +246,7 @@ int udp_get_port(struct sock *sk, unsign - return __udp_lib_get_port(sk, snum, udp_hash, scmp); +--- a/net/ipv4/udp.c 2008-08-12 01:41:51.000000000 -0400 ++++ a/net/ipv4/udp.c 2008-08-12 01:42:21.000000000 -0400 +@@ -239,20 +239,14 @@ fail: + return error; } --int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) +-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) -{ - struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); - @@ -27321,19 +27031,19 @@ - (!inet1->rcv_saddr || !inet2->rcv_saddr || - inet1->rcv_saddr == inet2->rcv_saddr )); -} -+extern int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2); ++extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *); - static inline int udp_v4_get_port(struct sock *sk, unsigned short snum) + int udp_v4_get_port(struct sock *sk, unsigned short snum) { -@@ -273,16 +266,23 @@ static struct sock *__udp4_lib_lookup(st - int badness = -1; + return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); + } - read_lock(&udp_hash_lock); + - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { - struct inet_sock *inet = inet_sk(sk); - - if (sk->sk_net == net && sk->sk_hash == hnum && + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try + * harder than this. -DaveM + */ +@@ -272,10 +266,16 @@ static struct sock *__udp4_lib_lookup(st + if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && !ipv6_only_sock(sk)) { int score = (sk->sk_family == PF_INET ? 
1 : 0); + @@ -27349,7 +27059,7 @@ } if (inet->daddr) { if (inet->daddr != saddr) -@@ -308,6 +308,7 @@ static struct sock *__udp4_lib_lookup(st +@@ -301,6 +301,7 @@ static struct sock *__udp4_lib_lookup(st } } } @@ -27357,7 +27067,7 @@ if (result) sock_hold(result); read_unlock(&udp_hash_lock); -@@ -329,7 +330,7 @@ static inline struct sock *udp_v4_mcast_ +@@ -322,7 +323,7 @@ static inline struct sock *udp_v4_mcast_ if (s->sk_hash != hnum || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || @@ -27366,12 +27076,12 @@ ipv6_only_sock(s) || (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) continue; -@@ -662,7 +663,15 @@ int udp_sendmsg(struct kiocb *iocb, stru +@@ -656,7 +657,15 @@ int udp_sendmsg(struct kiocb *iocb, stru .uli_u = { .ports = { .sport = inet->sport, .dport = dport } } }; + struct nx_info *nxi = sk->sk_nx_info; -+ struct net *net = sk->sk_net; ++ struct net *net = sock_net(sk); + security_sk_classify_flow(sk, &fl); + @@ -27379,10 +27089,10 @@ + if (err) + goto out; + - err = ip_route_output_flow(&init_net, &rt, &fl, sk, 1); + err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) -@@ -905,7 +914,8 @@ try_again: +@@ -899,7 +908,8 @@ try_again: { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; @@ -27392,30 +27102,30 @@ memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } if (inet->cmsg_flags) -@@ -1516,7 +1526,8 @@ static struct sock *udp_get_first(struct - for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { - struct hlist_node *node; +@@ -1515,6 +1525,8 @@ static struct sock *udp_get_first(struct sk_for_each(sk, node, state->hashtable + state->bucket) { -- if (sk->sk_family == state->family) -+ if (sk->sk_family == state->family && -+ nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) + if (!net_eq(sock_net(sk), net)) + continue; ++ if (!nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (sk->sk_family == state->family) goto found; } 
- } -@@ -1533,7 +1544,8 @@ static struct sock *udp_get_next(struct +@@ -1533,7 +1545,9 @@ static struct sock *udp_get_next(struct sk = sk_next(sk); try_again: ; -- } while (sk && sk->sk_family != state->family); -+ } while (sk && (sk->sk_family != state->family || +- } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); ++ } while (sk && (!net_eq(sock_net(sk), net) || ++ sk->sk_family != state->family || + !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))); if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { sk = sk_head(state->hashtable + state->bucket); -@@ -1648,7 +1660,10 @@ static void udp4_format_sock(struct sock +@@ -1634,7 +1648,10 @@ static void udp4_format_sock(struct sock - sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", + seq_printf(f, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p%n", - bucket, src, srcp, dest, destp, sp->sk_state, + bucket, + nx_map_sock_lback(current_nx_info(), src), srcp, @@ -27424,8 +27134,8 @@ atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), ---- a/net/ipv6/addrconf.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/addrconf.c 2008-04-19 16:44:23.000000000 -0400 +--- a/net/ipv6/addrconf.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/addrconf.c 2008-07-29 18:55:29.000000000 -0400 @@ -87,6 +87,8 @@ #include @@ -27435,41 +27145,16 @@ /* Set to 3 to get tracing... 
*/ #define ACONF_DEBUG 2 -@@ -918,7 +920,8 @@ static inline int ipv6_saddr_preferred(i - } +@@ -1078,7 +1080,7 @@ out: - int ipv6_dev_get_saddr(struct net_device *daddr_dev, -- struct in6_addr *daddr, struct in6_addr *saddr) -+ struct in6_addr *daddr, struct in6_addr *saddr, -+ struct nx_info *nxi) + int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, + const struct in6_addr *daddr, unsigned int prefs, +- struct in6_addr *saddr) ++ struct in6_addr *saddr, struct nx_info *nxi) { - struct ipv6_saddr_score hiscore; - struct inet6_ifaddr *ifa_result = NULL; -@@ -964,6 +967,10 @@ int ipv6_dev_get_saddr(struct net_device - - score.addr_type = __ipv6_addr_type(&ifa->addr); - -+ /* Use only addresses assigned to the context */ -+ if (!v6_ifa_in_nx_info(ifa, nxi)) -+ continue; -+ - /* Rule 0: - * - Tentative Address (RFC2462 section 5.4) - * - A tentative address is not considered -@@ -1181,9 +1188,10 @@ record_it: - - - int ipv6_get_saddr(struct dst_entry *dst, -- struct in6_addr *daddr, struct in6_addr *saddr) -+ struct in6_addr *daddr, struct in6_addr *saddr, -+ struct nx_info *nxi) - { -- return ipv6_dev_get_saddr(dst ? ip6_dst_idev(dst)->dev : NULL, daddr, saddr); -+ return ipv6_dev_get_saddr(dst ? 
ip6_dst_idev(dst)->dev : NULL, daddr, saddr, nxi); - } - - EXPORT_SYMBOL(ipv6_get_saddr); -@@ -1287,35 +1295,46 @@ struct inet6_ifaddr *ipv6_get_ifaddr(str + struct ipv6_saddr_score scores[2], + *score = &scores[0], *hiscore = &scores[1]; +@@ -1335,35 +1337,46 @@ struct inet6_ifaddr *ipv6_get_ifaddr(str return ifp; } @@ -27521,7 +27206,7 @@ return 1; return 0; -@@ -2846,7 +2865,10 @@ static void if6_seq_stop(struct seq_file +@@ -2935,7 +2948,10 @@ static void if6_seq_stop(struct seq_file static int if6_seq_show(struct seq_file *seq, void *v) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; @@ -27533,19 +27218,20 @@ NIP6_SEQFMT " %02x %02x %02x %02x %8s\n", NIP6(ifp->addr), ifp->idev->dev->ifindex, -@@ -3337,6 +3359,11 @@ static int inet6_dump_addr(struct sk_buf - struct inet6_ifaddr *ifa; +@@ -3430,6 +3446,12 @@ static int inet6_dump_addr(struct sk_buf struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; + struct net *net = sock_net(skb->sk); + struct nx_info *nxi = skb->sk ? skb->sk->sk_nx_info : NULL; + + /* disable ipv6 on non v6 guests */ + if (nxi && !nx_info_has_v6(nxi)) + return skb->len; ++ s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; -@@ -3358,6 +3385,8 @@ static int inet6_dump_addr(struct sk_buf +@@ -3451,6 +3473,8 @@ static int inet6_dump_addr(struct sk_buf ifa = ifa->if_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; @@ -27554,25 +27240,25 @@ err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, -@@ -3371,6 +3400,8 @@ static int inet6_dump_addr(struct sk_buf +@@ -3464,6 +3488,8 @@ static int inet6_dump_addr(struct sk_buf ifmca = ifmca->next, ip_idx++) { if (ip_idx < s_ip_idx) continue; -+ if (!v6_addr_in_nx_info(nxi, &ifa->addr, -1)) ++ if (!v6_addr_in_nx_info(nxi, &ifmca->mca_addr, -1)) + continue; err = inet6_fill_ifmcaddr(skb, ifmca, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, -@@ -3384,6 +3415,8 @@ static int inet6_dump_addr(struct sk_buf +@@ -3477,6 +3503,8 @@ static int inet6_dump_addr(struct sk_buf ifaca = 
ifaca->aca_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; -+ if (!v6_addr_in_nx_info(nxi, &ifa->addr, -1)) ++ if (!v6_addr_in_nx_info(nxi, &ifaca->aca_addr, -1)) + continue; err = inet6_fill_ifacaddr(skb, ifaca, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, -@@ -3678,6 +3711,11 @@ static int inet6_dump_ifinfo(struct sk_b +@@ -3760,12 +3788,19 @@ static int inet6_dump_ifinfo(struct sk_b int s_idx = cb->args[0]; struct net_device *dev; struct inet6_dev *idev; @@ -27582,10 +27268,9 @@ + if (skb->sk && skb->sk->sk_vx_info) + return skb->len; */ - if (net != &init_net) - return 0; -@@ -3687,6 +3725,8 @@ static int inet6_dump_ifinfo(struct sk_b - for_each_netdev(&init_net, dev) { + read_lock(&dev_base_lock); + idx = 0; + for_each_netdev(net, dev) { if (idx < s_idx) goto cont; + if (!v6_dev_in_nx_info(dev, nxi)) @@ -27593,8 +27278,8 @@ if ((idev = in6_dev_get(dev)) == NULL) goto cont; err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid, ---- a/net/ipv6/af_inet6.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/af_inet6.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv6/af_inet6.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/af_inet6.c 2008-07-29 18:22:35.000000000 -0400 @@ -43,6 +43,8 @@ #include #include @@ -27626,15 +27311,15 @@ sock->ops = answer->ops; answer_prot = answer->prot; answer_no_check = answer->no_check; -@@ -248,6 +254,7 @@ int inet6_bind(struct socket *sock, stru - struct sock *sk = sock->sk; +@@ -249,6 +255,7 @@ int inet6_bind(struct socket *sock, stru struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + struct nx_v6_sock_addr nsa; __be32 v4addr = 0; unsigned short snum; int addr_type = 0; -@@ -259,6 +266,11 @@ int inet6_bind(struct socket *sock, stru +@@ -260,6 +267,11 @@ int inet6_bind(struct socket *sock, stru if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -27646,7 +27331,7 @@ addr_type = ipv6_addr_type(&addr->sin6_addr); if ((addr_type & IPV6_ADDR_MULTICAST) && 
sock->type == SOCK_STREAM) return -EINVAL; -@@ -282,6 +294,10 @@ int inet6_bind(struct socket *sock, stru +@@ -283,6 +295,10 @@ int inet6_bind(struct socket *sock, stru err = -EADDRNOTAVAIL; goto out; } @@ -27657,7 +27342,7 @@ } else { if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; -@@ -307,6 +323,11 @@ int inet6_bind(struct socket *sock, stru +@@ -308,6 +324,11 @@ int inet6_bind(struct socket *sock, stru } } @@ -27669,7 +27354,7 @@ /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ -@@ -325,6 +346,8 @@ int inet6_bind(struct socket *sock, stru +@@ -326,6 +347,8 @@ int inet6_bind(struct socket *sock, stru } } @@ -27678,7 +27363,7 @@ inet->rcv_saddr = v4addr; inet->saddr = v4addr; -@@ -419,9 +442,11 @@ int inet6_getname(struct socket *sock, s +@@ -420,9 +443,11 @@ int inet6_getname(struct socket *sock, s return -ENOTCONN; sin->sin6_port = inet->dport; ipv6_addr_copy(&sin->sin6_addr, &np->daddr); @@ -27690,19 +27375,19 @@ if (ipv6_addr_any(&np->rcv_saddr)) ipv6_addr_copy(&sin->sin6_addr, &np->saddr); else ---- a/net/ipv6/fib6_rules.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/fib6_rules.c 2008-04-19 15:14:52.000000000 -0400 -@@ -86,7 +86,7 @@ static int fib6_rule_action(struct fib_r - r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { - struct in6_addr saddr; - if (ipv6_get_saddr(&rt->u.dst, &flp->fl6_dst, -- &saddr)) -+ &saddr, NULL)) +--- a/net/ipv6/fib6_rules.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/fib6_rules.c 2008-07-29 18:23:01.000000000 -0400 +@@ -95,7 +95,7 @@ static int fib6_rule_action(struct fib_r + if (ipv6_dev_get_saddr(net + ip6_dst_idev(&rt->u.dst)->dev, + &flp->fl6_dst, srcprefs, +- &saddr)) ++ &saddr, NULL)) goto again; if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) ---- a/net/ipv6/inet6_hashtables.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/inet6_hashtables.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv6/inet6_hashtables.c 
2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/inet6_hashtables.c 2008-07-29 18:12:38.000000000 -0400 @@ -16,6 +16,7 @@ #include @@ -27721,23 +27406,23 @@ } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) ---- a/net/ipv6/ip6_output.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/ip6_output.c 2008-04-19 15:14:52.000000000 -0400 -@@ -920,7 +920,7 @@ static int ip6_dst_lookup_tail(struct so - goto out_err_release; - - if (ipv6_addr_any(&fl->fl6_src)) { -- err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); -+ err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src, sk->sk_nx_info); +--- a/net/ipv6/ip6_output.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/ip6_output.c 2008-07-29 18:25:00.000000000 -0400 +@@ -924,7 +924,7 @@ static int ip6_dst_lookup_tail(struct so + err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev, + &fl->fl6_dst, + sk ? inet6_sk(sk)->srcprefs : 0, +- &fl->fl6_src); ++ &fl->fl6_src, sk->sk_nx_info); if (err) goto out_err_release; } ---- a/net/ipv6/Kconfig 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/Kconfig 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv6/Kconfig 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/Kconfig 2008-07-29 18:21:01.000000000 -0400 @@ -4,8 +4,8 @@ # IPv6 as module will cause a CRASH if you try to unload it - config IPV6 + menuconfig IPV6 - tristate "The IPv6 protocol" - default m + bool "The IPv6 protocol" @@ -27745,39 +27430,40 @@ ---help--- This is complemental support for the IP version 6. You will still be able to do traditional IPv4 networking as well. 
---- a/net/ipv6/ndisc.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/ndisc.c 2008-04-19 15:14:52.000000000 -0400 -@@ -563,7 +563,7 @@ static void ndisc_send_na(struct net_dev - override = 0; - in6_ifa_put(ifp); +--- a/net/ipv6/ndisc.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/ndisc.c 2008-07-29 18:19:07.000000000 -0400 +@@ -551,7 +551,7 @@ static void ndisc_send_na(struct net_dev } else { -- if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr)) -+ if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr, NULL)) + if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, + inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, +- &tmpaddr)) ++ &tmpaddr, NULL /* FIXME: ? */ )) return; src_addr = &tmpaddr; } ---- a/net/ipv6/route.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/route.c 2008-04-19 15:14:52.000000000 -0400 -@@ -2122,7 +2122,7 @@ static int rt6_fill_node(struct sk_buff - NLA_PUT_U32(skb, RTA_IIF, iif); - else if (dst) { +--- a/net/ipv6/route.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/route.c 2008-07-29 18:56:21.000000000 -0400 +@@ -2181,7 +2181,8 @@ static int rt6_fill_node(struct sk_buff + struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst); struct in6_addr saddr_buf; -- if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0) -+ if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf, (skb->sk ? skb->sk->sk_nx_info : NULL)) == 0) + if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL, +- dst, 0, &saddr_buf) == 0) ++ dst, 0, &saddr_buf, ++ (skb->sk ? 
skb->sk->sk_nx_info : NULL)) == 0) NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); } ---- a/net/ipv6/tcp_ipv6.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/tcp_ipv6.c 2008-04-19 15:14:52.000000000 -0400 -@@ -68,6 +68,7 @@ +--- a/net/ipv6/tcp_ipv6.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/tcp_ipv6.c 2008-07-29 18:12:38.000000000 -0400 +@@ -69,6 +69,7 @@ #include #include +#include - /* Socket used for sending RSTs and ACKs */ - static struct socket *tcp6_socket; -@@ -154,8 +155,15 @@ static int tcp_v6_connect(struct sock *s + static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); + static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); +@@ -152,8 +153,15 @@ static int tcp_v6_connect(struct sock *s * connect() to INADDR_ANY means loopback (BSD'ism). */ @@ -27795,8 +27481,8 @@ addr_type = ipv6_addr_type(&usin->sin6_addr); ---- a/net/ipv6/udp.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/udp.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/ipv6/udp.c 2008-07-14 17:22:58.000000000 -0400 ++++ a/net/ipv6/udp.c 2008-07-29 18:12:38.000000000 -0400 @@ -49,6 +49,7 @@ #include @@ -27804,7 +27490,7 @@ +#include #include "udp_impl.h" - static inline int udp_v6_get_port(struct sock *sk, unsigned short snum) + int udp_v6_get_port(struct sock *sk, unsigned short snum) @@ -83,6 +84,10 @@ static struct sock *__udp6_lib_lookup(st if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) continue; @@ -27816,22 +27502,22 @@ } if (!ipv6_addr_any(&np->daddr)) { if (!ipv6_addr_equal(&np->daddr, saddr)) ---- a/net/ipv6/xfrm6_policy.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/ipv6/xfrm6_policy.c 2008-04-19 16:13:21.000000000 -0400 -@@ -58,7 +58,7 @@ static int xfrm6_get_saddr(xfrm_address_ - return -EHOSTUNREACH; - - ipv6_get_saddr(dst, (struct in6_addr *)&daddr->a6, -- (struct in6_addr *)&saddr->a6); -+ (struct in6_addr *)&saddr->a6, NULL); +--- a/net/ipv6/xfrm6_policy.c 2008-07-14 17:22:58.000000000 -0400 ++++ 
a/net/ipv6/xfrm6_policy.c 2008-07-29 18:21:53.000000000 -0400 +@@ -59,7 +59,7 @@ static int xfrm6_get_saddr(xfrm_address_ + dev = ip6_dst_idev(dst)->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, + (struct in6_addr *)&daddr->a6, 0, +- (struct in6_addr *)&saddr->a6); ++ (struct in6_addr *)&saddr->a6, NULL); dst_release(dst); return 0; } ---- a/net/netlink/af_netlink.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/netlink/af_netlink.c 2008-04-19 15:14:52.000000000 -0400 -@@ -56,6 +56,9 @@ +--- a/net/netlink/af_netlink.c 2008-07-14 17:23:03.000000000 -0400 ++++ a/net/netlink/af_netlink.c 2008-07-16 22:51:45.000000000 -0400 +@@ -55,6 +55,9 @@ + #include #include - #include #include +#include +#include @@ -27839,19 +27525,49 @@ #include #include ---- a/net/sctp/ipv6.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/sctp/ipv6.c 2008-04-19 15:14:52.000000000 -0400 -@@ -316,7 +316,7 @@ static void sctp_v6_get_saddr(struct sct - __FUNCTION__, asoc, dst, NIP6(daddr->v6.sin6_addr)); +@@ -1760,6 +1763,8 @@ static struct sock *netlink_seq_socket_i + sk_for_each(s, node, &hash->table[j]) { + if (sock_net(s) != seq_file_net(seq)) + continue; ++ if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (off == pos) { + iter->link = i; + iter->hash_idx = j; +@@ -1794,7 +1799,8 @@ static void *netlink_seq_next(struct seq + s = v; + do { + s = sk_next(s); +- } while (s && sock_net(s) != seq_file_net(seq)); ++ } while (s && (sock_net(s) != seq_file_net(seq) || ++ !nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT))); + if (s) + return s; - if (!asoc) { -- ipv6_get_saddr(dst, &daddr->v6.sin6_addr,&saddr->v6.sin6_addr); -+ ipv6_get_saddr(dst, &daddr->v6.sin6_addr,&saddr->v6.sin6_addr, asoc->base.sk->sk_nx_info); +@@ -1806,7 +1812,8 @@ static void *netlink_seq_next(struct seq + + for (; j <= hash->mask; j++) { + s = sk_head(&hash->table[j]); +- while (s && sock_net(s) != seq_file_net(seq)) ++ while (s && (sock_net(s) != seq_file_net(seq) || ++ !nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT))) 
+ s = sk_next(s); + if (s) { + iter->link = i; +--- a/net/sctp/ipv6.c 2008-07-14 17:23:04.000000000 -0400 ++++ a/net/sctp/ipv6.c 2008-07-29 18:16:30.000000000 -0400 +@@ -320,7 +320,8 @@ static void sctp_v6_get_saddr(struct sct + ipv6_dev_get_saddr(dst ? ip6_dst_idev(dst)->dev : NULL, + &daddr->v6.sin6_addr, + inet6_sk(&sk->inet.sk)->srcprefs, +- &saddr->v6.sin6_addr); ++ &saddr->v6.sin6_addr, ++ asoc->base.sk->sk_nx_info); SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " NIP6_FMT "\n", NIP6(saddr->v6.sin6_addr)); return; ---- a/net/socket.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/socket.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/socket.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/net/socket.c 2008-07-29 18:12:38.000000000 -0400 @@ -93,6 +93,10 @@ #include @@ -27960,34 +27676,27 @@ err = sock1->ops->socketpair(sock1, sock2); if (err < 0) ---- a/net/sunrpc/auth.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/sunrpc/auth.c 2008-04-19 15:14:52.000000000 -0400 -@@ -13,6 +13,7 @@ - #include +--- a/net/sunrpc/auth.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/net/sunrpc/auth.c 2008-07-16 22:41:36.000000000 -0400 +@@ -14,6 +14,7 @@ + #include #include #include +#include #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH -@@ -345,6 +346,7 @@ rpcauth_lookupcred(struct rpc_auth *auth +@@ -353,6 +354,8 @@ rpcauth_lookupcred(struct rpc_auth *auth struct auth_cred acred = { .uid = current->fsuid, .gid = current->fsgid, + .tag = dx_current_tag(), - .group_info = current->group_info, - }; - struct rpc_cred *ret; -@@ -382,6 +384,7 @@ rpcauth_bindcred(struct rpc_task *task) - struct auth_cred acred = { - .uid = current->fsuid, - .gid = current->fsgid, + .tag = dx_current_tag(), .group_info = current->group_info, }; struct rpc_cred *ret; ---- a/net/sunrpc/auth_unix.c 2008-04-17 10:37:27.000000000 -0400 -+++ a/net/sunrpc/auth_unix.c 2008-04-19 15:14:52.000000000 -0400 +--- a/net/sunrpc/auth_unix.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/net/sunrpc/auth_unix.c 
2008-07-17 16:38:01.000000000 -0400 @@ -11,12 +11,14 @@ #include #include @@ -28003,33 +27712,26 @@ gid_t uc_gids[NFS_NGROUPS]; }; #define uc_uid uc_base.cr_uid -@@ -73,6 +75,7 @@ unx_create_cred(struct rpc_auth *auth, s - if (flags & RPCAUTH_LOOKUP_ROOTCREDS) { - cred->uc_uid = 0; - cred->uc_gid = 0; -+ cred->uc_tag = dx_current_tag(); - cred->uc_gids[0] = NOGROUP; - } else { - int groups = acred->group_info->ngroups; -@@ -80,6 +83,7 @@ unx_create_cred(struct rpc_auth *auth, s - groups = NFS_NGROUPS; +@@ -78,6 +80,7 @@ unx_create_cred(struct rpc_auth *auth, s + groups = NFS_NGROUPS; - cred->uc_gid = acred->gid; -+ cred->uc_tag = acred->tag; - for (i = 0; i < groups; i++) - cred->uc_gids[i] = GROUP_AT(acred->group_info, i); - if (i < NFS_NGROUPS) -@@ -124,7 +128,8 @@ unx_match(struct auth_cred *acred, struc - int groups; + cred->uc_gid = acred->gid; ++ cred->uc_tag = acred->tag; + for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + if (i < NFS_NGROUPS) +@@ -119,7 +122,9 @@ unx_match(struct auth_cred *acred, struc + unsigned int i; - if (cred->uc_uid != acred->uid -- || cred->uc_gid != acred->gid) -+ || cred->uc_gid != acred->gid -+ || cred->uc_tag != acred->tag) - return 0; - groups = acred->group_info->ngroups; -@@ -150,7 +155,7 @@ unx_marshal(struct rpc_task *task, __be3 +- if (cred->uc_uid != acred->uid || cred->uc_gid != acred->gid) ++ if (cred->uc_uid != acred->uid || ++ cred->uc_gid != acred->gid || ++ cred->uc_tag != acred->tag) + return 0; + + if (acred->group_info != NULL) +@@ -142,7 +147,7 @@ unx_marshal(struct rpc_task *task, __be3 struct rpc_clnt *clnt = task->tk_client; struct unx_cred *cred = container_of(task->tk_msg.rpc_cred, struct unx_cred, uc_base); __be32 *base, *hold; @@ -28038,7 +27740,7 @@ *p++ = htonl(RPC_AUTH_UNIX); base = p++; -@@ -160,9 +165,12 @@ unx_marshal(struct rpc_task *task, __be3 +@@ -152,9 +157,12 @@ unx_marshal(struct rpc_task *task, __be3 * Copy the UTS nodename captured when the client was 
created. */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); @@ -28053,8 +27755,8 @@ hold = p++; for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++) *p++ = htonl((u32) cred->uc_gids[i]); ---- a/net/sunrpc/clnt.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/sunrpc/clnt.c 2008-04-19 16:09:36.000000000 -0400 +--- a/net/sunrpc/clnt.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/net/sunrpc/clnt.c 2008-07-16 22:41:36.000000000 -0400 @@ -31,6 +31,7 @@ #include #include @@ -28074,8 +27776,8 @@ return clnt; } EXPORT_SYMBOL_GPL(rpc_create); ---- a/net/unix/af_unix.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/unix/af_unix.c 2008-04-19 15:55:39.000000000 -0400 +--- a/net/unix/af_unix.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/net/unix/af_unix.c 2008-07-16 22:52:51.000000000 -0400 @@ -116,6 +116,8 @@ #include #include @@ -28085,8 +27787,8 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -@@ -255,6 +257,8 @@ static struct sock *__unix_find_socket_b - if (s->sk_net != net) +@@ -260,6 +262,8 @@ static struct sock *__unix_find_socket_b + if (!net_eq(sock_net(s), net)) continue; + if (!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) @@ -28094,17 +27796,27 @@ if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) goto found; -@@ -819,7 +823,7 @@ static int unix_bind(struct socket *sock - */ - mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current->fs->umask); -- err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0); -+ err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0, NULL); - if (err) - goto out_mknod_dput; - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); ---- a/net/x25/af_x25.c 2008-04-17 12:05:45.000000000 -0400 -+++ a/net/x25/af_x25.c 2008-04-19 15:14:52.000000000 -0400 +@@ -2086,6 +2090,8 @@ static struct sock *unix_seq_idx(struct + for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { + if (sock_net(s) != seq_file_net(seq)) + continue; ++ if 
(!nx_check(s->sk_nid, VS_WATCH_P | VS_IDENT)) ++ continue; + if (off == pos) + return s; + ++off; +@@ -2111,7 +2117,8 @@ static void *unix_seq_next(struct seq_fi + sk = first_unix_socket(&iter->i); + else + sk = next_unix_socket(&iter->i, sk); +- while (sk && (sock_net(sk) != seq_file_net(seq))) ++ while (sk && (sock_net(sk) != seq_file_net(seq) || ++ !nx_check(sk->sk_nid, VS_WATCH_P | VS_IDENT))) + sk = next_unix_socket(&iter->i, sk); + return sk; + } +--- a/net/x25/af_x25.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/net/x25/af_x25.c 2008-07-16 22:41:36.000000000 -0400 @@ -506,7 +506,10 @@ static int x25_create(struct net *net, s x25 = x25_sk(sk); @@ -28118,7 +27830,7 @@ x25_init_timers(sk); --- a/scripts/checksyscalls.sh 2008-04-17 11:31:42.000000000 -0400 -+++ a/scripts/checksyscalls.sh 2008-04-19 15:14:52.000000000 -0400 ++++ a/scripts/checksyscalls.sh 2008-07-16 22:41:36.000000000 -0400 @@ -108,7 +108,6 @@ cat << EOF #define __IGNORE_afs_syscall #define __IGNORE_getpmsg @@ -28127,18 +27839,14 @@ EOF } ---- a/security/commoncap.c 2008-04-17 12:05:46.000000000 -0400 -+++ a/security/commoncap.c 2008-04-23 22:22:54.000000000 -0400 -@@ -24,6 +24,7 @@ - #include - #include +--- a/security/commoncap.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/security/commoncap.c 2008-07-16 22:48:47.000000000 -0400 +@@ -26,10 +26,11 @@ #include + #include + #include +#include - /* Global security state */ - -@@ -32,7 +33,7 @@ EXPORT_SYMBOL(securebits); - int cap_netlink_send(struct sock *sk, struct sk_buff *skb) { - NETLINK_CB(skb).eff_cap = current->cap_effective; @@ -28146,7 +27854,7 @@ return 0; } -@@ -53,9 +54,24 @@ EXPORT_SYMBOL(cap_netlink_recv); +@@ -50,9 +51,24 @@ EXPORT_SYMBOL(cap_netlink_recv); */ int cap_capable (struct task_struct *tsk, int cap) { @@ -28172,7 +27880,7 @@ return -EPERM; } -@@ -583,7 +599,8 @@ void cap_task_reparent_to_init (struct t +@@ -675,7 +691,8 @@ void cap_task_reparent_to_init (struct t int cap_syslog (int type) { @@ -28182,17 +27890,17 @@ return 
-EPERM; return 0; } ---- a/security/dummy.c 2008-04-17 12:05:46.000000000 -0400 -+++ a/security/dummy.c 2008-04-23 20:28:54.000000000 -0400 -@@ -27,6 +27,7 @@ - #include - #include +--- a/security/dummy.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/security/dummy.c 2008-07-16 22:48:12.000000000 -0400 +@@ -29,6 +29,7 @@ #include + #include + #include +#include static int dummy_ptrace (struct task_struct *parent, struct task_struct *child) { -@@ -714,7 +715,7 @@ static int dummy_sem_semop (struct sem_a +@@ -749,7 +750,7 @@ static int dummy_sem_semop (struct sem_a static int dummy_netlink_send (struct sock *sk, struct sk_buff *skb) { @@ -28201,9 +27909,9 @@ return 0; } ---- a/security/selinux/hooks.c 2008-05-21 14:30:05.000000000 -0400 -+++ a/security/selinux/hooks.c 2008-05-21 14:30:41.000000000 -0400 -@@ -64,7 +64,6 @@ +--- a/security/selinux/hooks.c 2008-07-14 17:23:05.000000000 -0400 ++++ a/security/selinux/hooks.c 2008-07-16 22:41:36.000000000 -0400 +@@ -65,7 +65,6 @@ #include #include #include /* for Unix socket types */ diff --git a/debian/patches/features/all/xen/console-hvc-overtake.patch b/debian/patches/features/all/xen/console-hvc-overtake.patch new file mode 100644 index 000000000..6ab48bfbc --- /dev/null +++ b/debian/patches/features/all/xen/console-hvc-overtake.patch @@ -0,0 +1,71 @@ +diff --git a/drivers/xen/console/console.c b/drivers/xen/console/console.c +index e4e5d41..03f8ce9 100644 +--- a/drivers/xen/console/console.c ++++ b/drivers/xen/console/console.c +@@ -66,20 +66,25 @@ + * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'. + * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'. + * 'xencons=xvc' [XC_XVC]: Console attached to '/dev/xvc0'. +- * default: XC_XVC ++ * 'xencons=hvc' [XC_HVC]: Console attached to '/dev/hvc0'. ++ * default: XC_HVC + * + * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses + * warnings from standard distro startup scripts. 
+ */ + static enum { +- XC_OFF, XC_TTY, XC_SERIAL, XC_XVC +-} xc_mode = XC_XVC; ++ XC_OFF, XC_TTY, XC_SERIAL, XC_XVC, XC_HVC ++} xc_mode = XC_HVC; + static int xc_num = -1; + + /* /dev/xvc0 device number allocated by lanana.org. */ + #define XEN_XVC_MAJOR 204 + #define XEN_XVC_MINOR 191 + ++/* /dev/hvc0 device number */ ++#define XEN_HVC_MAJOR 229 ++#define XEN_HVC_MINOR 0 ++ + static int __init xencons_setup(char *str) + { + char *q; +@@ -97,6 +102,9 @@ static int __init xencons_setup(char *str) + } else if (!strncmp(str, "xvc", 3)) { + xc_mode = XC_XVC; + str += 3; ++ } else if (!strncmp(str, "hvc", 3)) { ++ xc_mode = XC_HVC; ++ str += 3; + } else if (!strncmp(str, "off", 3)) { + xc_mode = XC_OFF; + str += 3; +@@ -205,6 +213,14 @@ static int __init xen_console_init(void) + xc_num = 0; + break; + ++ case XC_HVC: ++ strcpy(kcons_info.name, "hvc"); ++ if (xc_num == -1) ++ xc_num = 0; ++ if (!is_initial_xendomain()) ++ add_preferred_console(kcons_info.name, xc_num, NULL); ++ break; ++ + case XC_SERIAL: + strcpy(kcons_info.name, "ttyS"); + if (xc_num == -1) +@@ -681,6 +697,12 @@ static int __init xencons_init(void) + DRV(xencons_driver)->minor_start = XEN_XVC_MINOR; + DRV(xencons_driver)->name_base = xc_num; + break; ++ case XC_HVC: ++ DRV(xencons_driver)->name = "hvc"; ++ DRV(xencons_driver)->major = XEN_HVC_MAJOR; ++ DRV(xencons_driver)->minor_start = XEN_HVC_MINOR; ++ DRV(xencons_driver)->name_base = xc_num; ++ break; + case XC_SERIAL: + DRV(xencons_driver)->name = "ttyS"; + DRV(xencons_driver)->minor_start = 64 + xc_num; diff --git a/debian/patches/features/all/xen/disable-pat.patch b/debian/patches/features/all/xen/disable-pat.patch new file mode 100644 index 000000000..7425043c6 --- /dev/null +++ b/debian/patches/features/all/xen/disable-pat.patch @@ -0,0 +1,25 @@ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 170d743..ae1e15b 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1141,6 +1141,7 @@ config X86_PAT + bool + prompt "x86 PAT support" 
+ depends on MTRR ++ depends on !XEN + help + Use PAT attributes to setup page level cache control. + +diff --git a/include/asm-x86/mach-xen/asm/pgtable.h b/include/asm-x86/mach-xen/asm/pgtable.h +index a9ff073..3c37ff4 100644 +--- a/include/asm-x86/mach-xen/asm/pgtable.h ++++ b/include/asm-x86/mach-xen/asm/pgtable.h +@@ -74,7 +74,7 @@ extern unsigned int __kernel_page_user; + * PAT settings are part of the hypervisor interface, which sets the + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). + */ +-#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) ++#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) + #define _PAGE_CACHE_WB (0) + #define _PAGE_CACHE_WT (_PAGE_PWT) + #define _PAGE_CACHE_WC (_PAGE_PAT) diff --git a/debian/patches/features/all/xen/dom0-fix-processor-throttling-when-pr-id-is-minus-1.patch b/debian/patches/features/all/xen/dom0-fix-processor-throttling-when-pr-id-is-minus-1.patch new file mode 100644 index 000000000..b9a694028 --- /dev/null +++ b/debian/patches/features/all/xen/dom0-fix-processor-throttling-when-pr-id-is-minus-1.patch @@ -0,0 +1,32 @@ +# HG changeset patch +# User Keir Fraser +# Date 1225190351 0 +# Node ID de7f94bd650b7e00cd57191280c5f0959b2a286e +# Parent f40f4f86d5a2c5caa0261512279f3590e95f3d91 +dom0: Fix for throttling while pr->id == -1 + +Signed-off-by: Wei Gang + +diff -r f40f4f86d5a2 -r de7f94bd650b drivers/acpi/processor_core.c +--- a/drivers/acpi/processor_core.c Mon Oct 27 13:47:07 2008 +0000 ++++ b/drivers/acpi/processor_core.c Tue Oct 28 10:39:11 2008 +0000 +@@ -677,8 +677,17 @@ + #if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL) + acpi_processor_ppc_has_changed(pr); + #endif +- acpi_processor_get_throttling_info(pr); +- acpi_processor_get_limit_info(pr); ++ ++ /* ++ * pr->id may equal to -1 while processor_cntl_external enabled. ++ * throttle and thermal module don't support this case. 
++ * Tx only works when dom0 vcpu == pcpu num by far, as we give ++ * control to dom0. ++ */ ++ if (pr->id != -1) { ++ acpi_processor_get_throttling_info(pr); ++ acpi_processor_get_limit_info(pr); ++ } + + + acpi_processor_power_init(pr, device); diff --git a/debian/patches/features/all/xen/fix-pci-hook.patch b/debian/patches/features/all/xen/fix-pci-hook.patch new file mode 100644 index 000000000..63e47dd92 --- /dev/null +++ b/debian/patches/features/all/xen/fix-pci-hook.patch @@ -0,0 +1,51 @@ +diff --git a/drivers/xen/core/pci.c b/drivers/xen/core/pci.c +index 2710c18..3fd5167 100644 +--- a/drivers/xen/core/pci.c ++++ b/drivers/xen/core/pci.c +@@ -12,7 +12,7 @@ static int (*pci_bus_remove)(struct device *dev); + + static int pci_bus_probe_wrapper(struct device *dev) + { +- int r; ++ int r, r1; + struct pci_dev *pci_dev = to_pci_dev(dev); + struct physdev_manage_pci manage_pci; + manage_pci.bus = pci_dev->bus->number; +@@ -23,16 +23,19 @@ static int pci_bus_probe_wrapper(struct device *dev) + return r; + + r = pci_bus_probe(dev); +- if (r) +- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, +- &manage_pci)); ++ if (r) { ++ r1 = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, ++ &manage_pci); ++ if (r1 && r1 != -ENOSYS) ++ WARN_ON(1); ++ } + + return r; + } + + static int pci_bus_remove_wrapper(struct device *dev) + { +- int r; ++ int r, r1; + struct pci_dev *pci_dev = to_pci_dev(dev); + struct physdev_manage_pci manage_pci; + manage_pci.bus = pci_dev->bus->number; +@@ -41,8 +44,11 @@ static int pci_bus_remove_wrapper(struct device *dev) + r = pci_bus_remove(dev); + /* dev and pci_dev are no longer valid!! 
*/ + +- WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, +- &manage_pci)); ++ r1 = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, ++ &manage_pci); ++ if (r1 && r1 != -ENOSYS) ++ WARN_ON(1); ++ + return r; + } + diff --git a/debian/patches/features/all/xen/hardcode-xen-makefile.patch b/debian/patches/features/all/xen/hardcode-xen-makefile.patch new file mode 100644 index 000000000..ca7978c00 --- /dev/null +++ b/debian/patches/features/all/xen/hardcode-xen-makefile.patch @@ -0,0 +1,50 @@ +diff --git a/scripts/Makefile.build b/scripts/Makefile.build +index e89ae81..302d893 100644 +--- a/scripts/Makefile.build ++++ b/scripts/Makefile.build +@@ -74,12 +74,6 @@ $(warning kbuild: Makefile.build is included improperly) + endif + + ifeq ($(CONFIG_XEN),y) +-$(objtree)/scripts/Makefile.xen: $(srctree)/scripts/Makefile.xen.awk $(srctree)/scripts/Makefile.build +- @echo ' Updating $@' +- $(if $(shell echo a | $(AWK) '{ print gensub(/a/, "AA", "g"); }'),\ +- ,$(error 'Your awk program does not define gensub. 
Use gawk or another awk with gensub')) +- @$(AWK) -f $< $(filter-out $<,$^) >$@ +- + xen-src-single-used-m := $(patsubst $(srctree)/%,%,$(wildcard $(addprefix $(srctree)/,$(single-used-m:.o=-xen.c)))) + xen-single-used-m := $(xen-src-single-used-m:-xen.c=.o) + single-used-m := $(filter-out $(xen-single-used-m),$(single-used-m)) +diff --git a/scripts/Makefile.xen b/scripts/Makefile.xen +new file mode 100644 +index 0000000..c1cf128 +--- /dev/null ++++ b/scripts/Makefile.xen +@@ -0,0 +1,27 @@ ++$(obj)/%.s: $(src)/%-xen.c FORCE ++ $(call if_changed_dep,cc_s_c) ++ ++$(obj)/%.i: $(src)/%-xen.c FORCE ++ $(call if_changed_dep,cc_i_c) ++ ++$(obj)/%.symtypes : $(src)/%-xen.c FORCE ++ $(call if_changed_dep,cc_symtypes_c) ++ ++$(obj)/%.o: $(src)/%-xen.c FORCE ++ $(call cmd,force_checksrc) ++ $(call if_changed_rule,cc_o_c) ++ ++$(xen-single-used-m): $(obj)/%.o: $(src)/%-xen.c FORCE ++ $(call cmd,force_checksrc) ++ $(call if_changed_rule,cc_o_c) ++ @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod) ++ ++$(obj)/%.lst: $(src)/%-xen.c FORCE ++ $(call if_changed_dep,cc_lst_c) ++ ++$(obj)/%.s: $(src)/%-xen.S FORCE ++ $(call if_changed_dep,as_s_S) ++ ++$(obj)/%.o: $(src)/%-xen.S FORCE ++ $(call if_changed_dep,as_o_S) ++ diff --git a/debian/patches/features/all/xen/remove-4gb-warning.patch b/debian/patches/features/all/xen/remove-4gb-warning.patch new file mode 100644 index 000000000..3d1f35106 --- /dev/null +++ b/debian/patches/features/all/xen/remove-4gb-warning.patch @@ -0,0 +1,55 @@ +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index 81f7529..c41a5c8 100644 +--- a/arch/x86/kernel/Makefile ++++ b/arch/x86/kernel/Makefile +@@ -91,8 +91,6 @@ scx200-y += scx200_32.o + + obj-$(CONFIG_OLPC) += olpc.o + +-obj-$(CONFIG_X86_XEN) += fixup.o +- + ### + # 64 bit specific files + ifeq ($(CONFIG_X86_64),y) +diff --git a/arch/x86/kernel/entry_32-xen.S b/arch/x86/kernel/entry_32-xen.S +index ca66938..37f6f70 100644 +--- a/arch/x86/kernel/entry_32-xen.S ++++ 
b/arch/x86/kernel/entry_32-xen.S +@@ -1238,15 +1238,8 @@ ENTRY(spurious_interrupt_bug) + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +-#endif /* !CONFIG_XEN */ +- +-ENTRY(fixup_4gb_segment) +- RING0_EC_FRAME +- pushl $do_fixup_4gb_segment +- CFI_ADJUST_CFA_OFFSET 4 +- jmp error_code +- CFI_ENDPROC + END(spurious_interrupt_bug) ++#endif /* !CONFIG_XEN */ + + ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder +diff --git a/arch/x86/kernel/traps_32-xen.c b/arch/x86/kernel/traps_32-xen.c +index c174679..0ce86ca 100644 +--- a/arch/x86/kernel/traps_32-xen.c ++++ b/arch/x86/kernel/traps_32-xen.c +@@ -101,8 +101,6 @@ asmlinkage void simd_coprocessor_error(void); + asmlinkage void alignment_check(void); + #ifndef CONFIG_XEN + asmlinkage void spurious_interrupt_bug(void); +-#else +-asmlinkage void fixup_4gb_segment(void); + #endif + asmlinkage void machine_check(void); + +@@ -1172,7 +1170,6 @@ static const trap_info_t __cpuinitconst trap_table[] = { + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0|4, __KERNEL_CS, (unsigned long)page_fault }, +- { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, + #ifdef CONFIG_X86_MCE diff --git a/debian/patches/features/all/xen/suse-20080808143035.patch b/debian/patches/features/all/xen/suse-20080808143035.patch new file mode 100644 index 000000000..4475d2059 --- /dev/null +++ b/debian/patches/features/all/xen/suse-20080808143035.patch @@ -0,0 +1,119931 @@ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index bf07b6f..170d743 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -23,8 +23,8 @@ config X86 + select HAVE_OPROFILE + select HAVE_KPROBES + select HAVE_KRETPROBES +- select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) +- select HAVE_ARCH_KGDB if !X86_VOYAGER 
++ select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) && !XEN ++ select HAVE_ARCH_KGDB if !X86_VOYAGER && !XEN + + config ARCH_DEFCONFIG + string +@@ -43,13 +43,16 @@ config GENERIC_CMOS_UPDATE + + config CLOCKSOURCE_WATCHDOG + def_bool y ++ depends on !XEN + + config GENERIC_CLOCKEVENTS + def_bool y ++ depends on !XEN + + config GENERIC_CLOCKEVENTS_BROADCAST + def_bool y + depends on X86_64 || (X86_32 && X86_LOCAL_APIC) ++ depends on !XEN + + config LOCKDEP_SUPPORT + def_bool y +@@ -129,6 +132,7 @@ config HAVE_CPUMASK_OF_CPU_MAP + config ARCH_HIBERNATION_POSSIBLE + def_bool y + depends on !SMP || !X86_VOYAGER ++ depends on !XEN + + config ARCH_SUSPEND_POSSIBLE + def_bool y +@@ -182,18 +186,29 @@ config X86_HT + bool + depends on SMP + depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64 ++ depends on !XEN + default y + + config X86_BIOS_REBOOT + bool + depends on !X86_VISWS && !X86_VOYAGER ++ depends on !XEN + default y + + config X86_TRAMPOLINE + bool + depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) ++ depends on !XEN + default y + ++config X86_NO_TSS ++ def_bool y ++ depends on XEN ++ ++config X86_NO_IDT ++ def_bool y ++ depends on XEN ++ + config KTIME_SCALAR + def_bool X86_32 + source "init/Kconfig" +@@ -239,6 +254,18 @@ config X86_PC + help + Choose this option if your computer is a standard PC or compatible. + ++config X86_XEN ++ bool "Xen-compatible" ++ depends on X86_32 ++ select XEN ++ select X86_PAE ++ select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST ++ select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST ++ select SWIOTLB ++ help ++ Choose this option if you plan to run this kernel on top of the ++ Xen Hypervisor. ++ + config X86_ELAN + bool "AMD Elan" + depends on X86_32 +@@ -334,6 +361,14 @@ config X86_RDC321X + as R-8610-(G). + If you don't have one of these chips, you should say N here. 
+ ++config X86_64_XEN ++ bool "Enable Xen compatible kernel" ++ depends on X86_64 ++ select XEN ++ select SWIOTLB ++ help ++ This option will compile a kernel compatible with Xen hypervisor ++ + config X86_VSMP + bool "Support for ScaleMP vSMP" + select PARAVIRT +@@ -359,6 +394,7 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER + + menuconfig PARAVIRT_GUEST + bool "Paravirtualized guest support" ++ depends on !XEN + help + Say Y here to get to see options related to running Linux under + various hypervisors. This option alone does not add any kernel code. +@@ -419,7 +455,7 @@ endif + + config MEMTEST_BOOTPARAM + bool "Memtest boot parameter" +- depends on X86_64 ++ depends on X86_64 && !XEN + default y + help + This option adds a kernel parameter 'memtest', which allows memtest +@@ -472,6 +508,7 @@ source "arch/x86/Kconfig.cpu" + config HPET_TIMER + def_bool X86_64 + prompt "HPET Timer Support" if X86_32 ++ depends on !XEN + help + Use the IA-PC HPET (High Precision Event Timer) to manage + time in preference to the PIT and RTC, if a HPET is +@@ -508,7 +545,7 @@ config GART_IOMMU + default y + select SWIOTLB + select AGP +- depends on X86_64 && PCI ++ depends on X86_64 && PCI && !X86_64_XEN + help + Support for full DMA access of devices with 32bit memory access only + on systems with more than 3GB. This is usually needed for USB, +@@ -523,7 +560,7 @@ config GART_IOMMU + config CALGARY_IOMMU + bool "IBM Calgary IOMMU support" + select SWIOTLB +- depends on X86_64 && PCI && EXPERIMENTAL ++ depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL + help + Support for hardware IOMMUs in IBM's xSeries x366 and x460 + systems. 
Needed to run systems with more than 3GB of memory +@@ -567,6 +604,7 @@ config NR_CPUS + range 2 255 + depends on SMP + default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 ++ default "16" if X86_64_XEN + default "8" + help + This allows you to specify the maximum number of CPUs which this +@@ -598,7 +636,7 @@ source "kernel/Kconfig.preempt" + + config X86_UP_APIC + bool "Local APIC support on uniprocessors" +- depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) ++ depends on X86_32 && !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH || XEN_UNPRIVILEGED_GUEST) + help + A local APIC (Advanced Programmable Interrupt Controller) is an + integrated interrupt controller in the CPU. If you have a single-CPU +@@ -624,18 +662,24 @@ config X86_UP_IOAPIC + config X86_LOCAL_APIC + def_bool y + depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) ++ depends on !XEN_UNPRIVILEGED_GUEST + + config X86_IO_APIC + def_bool y + depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) ++ depends on !XEN_UNPRIVILEGED_GUEST + + config X86_VISWS_APIC + def_bool y + depends on X86_32 && X86_VISWS + ++config X86_XEN_GENAPIC ++ def_bool y ++ depends on X86_64_XEN ++ + config X86_MCE + bool "Machine Check Exception" +- depends on !X86_VOYAGER ++ depends on !(X86_VOYAGER || XEN) + ---help--- + Machine Check Exception support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, component failure). +@@ -735,7 +779,7 @@ config I8K + config X86_REBOOTFIXUPS + def_bool n + prompt "Enable X86 board specific fixups for reboot" +- depends on X86_32 && X86 ++ depends on X86_32 && !XEN + ---help--- + This enables chipset and/or board specific fixups to be done + in order to get reboot to work correctly. 
This is only needed on +@@ -752,6 +796,7 @@ config X86_REBOOTFIXUPS + + config MICROCODE + tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" ++ depends on !XEN_UNPRIVILEGED_GUEST + select FW_LOADER + ---help--- + If you say Y here, you will be able to update the microcode on +@@ -910,7 +955,7 @@ config X86_PAE + # Common NUMA Features + config NUMA + bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" +- depends on SMP ++ depends on SMP && !X86_64_XEN + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) && EXPERIMENTAL) + default n if X86_PC + default y if (X86_NUMAQ || X86_SUMMIT) +@@ -1003,11 +1048,11 @@ config ARCH_DISCONTIGMEM_DEFAULT + + config ARCH_SPARSEMEM_DEFAULT + def_bool y +- depends on X86_64 ++ depends on X86_64 && !X86_64_XEN + + config ARCH_SPARSEMEM_ENABLE + def_bool y +- depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) ++ depends on (X86_64 && !X86_64_XEN) || NUMA || (EXPERIMENTAL && X86_PC) + select SPARSEMEM_STATIC if X86_32 + select SPARSEMEM_VMEMMAP_ENABLE if X86_64 + +@@ -1033,6 +1078,7 @@ config HIGHPTE + config MATH_EMULATION + bool + prompt "Math emulation" if X86_32 ++ depends on !XEN + ---help--- + Linux can emulate a math coprocessor (used for floating point + operations) if you don't have one. 486DX and Pentium processors have +@@ -1058,6 +1104,7 @@ config MATH_EMULATION + + config MTRR + bool "MTRR (Memory Type Range Register) support" ++ depends on !XEN_UNPRIVILEGED_GUEST + ---help--- + On Intel P6 family processors (Pentium Pro, Pentium II and later) + the Memory Type Range Registers (MTRRs) may be used to control +@@ -1108,7 +1155,7 @@ config X86_PAT + config EFI + def_bool n + prompt "EFI runtime service support" +- depends on ACPI ++ depends on ACPI && !XEN + ---help--- + This enables the kernel to use EFI runtime services that are + available (such as the EFI variable services). 
+@@ -1123,7 +1170,7 @@ config EFI + config IRQBALANCE + def_bool y + prompt "Enable kernel irq balancing" +- depends on X86_32 && SMP && X86_IO_APIC ++ depends on X86_32 && SMP && X86_IO_APIC && !XEN + help + The default yes will allow the kernel to do irq load balancing. + Saying no will keep the kernel from doing irq load balancing. +@@ -1174,6 +1221,7 @@ source kernel/Kconfig.hz + config KEXEC + bool "kexec system call" + depends on X86_BIOS_REBOOT ++ depends on !XEN_UNPRIVILEGED_GUEST + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot +@@ -1192,6 +1240,7 @@ config CRASH_DUMP + bool "kernel crash dumps (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on X86_64 || (X86_32 && HIGHMEM) ++ depends on !XEN + help + Generate crash dump after being started by kexec. + This should be normally only set in special crash dump kernels +@@ -1248,7 +1297,7 @@ config PHYSICAL_START + + config RELOCATABLE + bool "Build a relocatable kernel (EXPERIMENTAL)" +- depends on EXPERIMENTAL ++ depends on EXPERIMENTAL && !XEN + help + This builds a kernel image that retains relocation information + so it can be loaded someplace besides the default 1MB. +@@ -1322,7 +1371,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID + depends on NUMA + + menu "Power management options" +- depends on !X86_VOYAGER ++ depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) + + config ARCH_HIBERNATION_HEADER + def_bool y +@@ -1339,7 +1388,7 @@ config X86_APM_BOOT + + menuconfig APM + tristate "APM (Advanced Power Management) BIOS support" +- depends on X86_32 && PM_SLEEP && !X86_VISWS ++ depends on X86_32 && PM_SLEEP && !(X86_VISWS || XEN) + ---help--- + APM is a BIOS specification for saving power using several different + techniques. 
This is mostly useful for battery powered laptops with +@@ -1506,6 +1555,7 @@ choice + + config PCI_GOBIOS + bool "BIOS" ++ depends on !XEN + + config PCI_GOMMCONFIG + bool "MMConfig" +@@ -1517,6 +1567,13 @@ config PCI_GOOLPC + bool "OLPC" + depends on OLPC + ++config PCI_GOXEN_FE ++ bool "Xen PCI Frontend" ++ depends on X86_XEN ++ help ++ The PCI device frontend driver allows the kernel to import arbitrary ++ PCI devices from a PCI backend to support PCI driver domains. ++ + config PCI_GOANY + bool "Any" + +@@ -1524,7 +1581,7 @@ endchoice + + config PCI_BIOS + def_bool y +- depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) ++ depends on X86_32 && !(X86_VISWS || XEN) && PCI && (PCI_GOBIOS || PCI_GOANY) + + # x86-64 doesn't support PCI BIOS access from long mode so always go direct. + config PCI_DIRECT +@@ -1547,9 +1604,25 @@ config PCI_MMCONFIG + bool "Support mmconfig PCI config space access" + depends on X86_64 && PCI && ACPI + ++config XEN_PCIDEV_FRONTEND ++ def_bool y ++ prompt "Xen PCI Frontend" if X86_64 ++ depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64) ++ select HOTPLUG ++ help ++ The PCI device frontend driver allows the kernel to import arbitrary ++ PCI devices from a PCI backend to support PCI driver domains. ++ ++config XEN_PCIDEV_FE_DEBUG ++ bool "Xen PCI Frontend Debugging" ++ depends on XEN_PCIDEV_FRONTEND ++ help ++ Enables some debug statements within the PCI Frontend. ++ + config DMAR + bool "Support for DMA Remapping Devices (EXPERIMENTAL)" + depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL ++ depends on !XEN + help + DMA remapping (DMAR) devices support enables independent address + translations for Direct Memory Access (DMA) from devices. +@@ -1589,7 +1662,7 @@ if X86_32 + + config ISA + bool "ISA support" +- depends on !(X86_VOYAGER || X86_VISWS) ++ depends on !(X86_VOYAGER || X86_VISWS || XEN) + help + Find out whether you have ISA slots on your motherboard. ISA is the + name of a bus system, i.e. 
the way the CPU talks to the other stuff +@@ -1616,7 +1689,7 @@ config EISA + source "drivers/eisa/Kconfig" + + config MCA +- bool "MCA support" if !(X86_VISWS || X86_VOYAGER) ++ bool "MCA support" if !(X86_VISWS || X86_VOYAGER || XEN) + default y if X86_VOYAGER + help + MicroChannel Architecture is found in some IBM PS/2 machines and +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index dd138a2..25d4d15 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -340,7 +340,7 @@ config X86_PPRO_FENCE + + config X86_F00F_BUG + def_bool y +- depends on M586MMX || M586TSC || M586 || M486 || M386 ++ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT + + config X86_WP_WORKS_OK + def_bool y +@@ -398,6 +398,7 @@ config X86_P6_NOP + config X86_TSC + def_bool y + depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 ++ depends on !X86_XEN && !X86_64_XEN + + # this should be set for all -march=.. options where the compiler + # generates cmov. +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 1836337..ab8966c 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -93,7 +93,7 @@ config DEBUG_RODATA + + config DIRECT_GBPAGES + bool "Enable gbpages-mapped kernel pagetables" +- depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 ++ depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 && !XEN + help + Enable gigabyte pages support (if the CPU supports it). 
This can + improve the kernel's performance a tiny bit by reducing TLB +@@ -137,11 +137,12 @@ config X86_FIND_SMP_CONFIG + config X86_MPPARSE + def_bool y + depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64 ++ depends on !XEN_UNPRIVILEGED_GUEST + + config DOUBLEFAULT + default y + bool "Enable doublefault exception handler" if EMBEDDED +- depends on X86_32 ++ depends on X86_32 && !X86_NO_TSS + help + This option allows trapping of rare doublefault exceptions that + would otherwise cause a system to silently reboot. Disabling this +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 3cff3c8..862c302 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -129,6 +129,10 @@ mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/ + mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit + mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/ + ++# Xen subarch support ++mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-x86/mach-xen ++mcore-$(CONFIG_X86_XEN) := arch/x86/mach-xen/ ++ + # generic subarchitecture + mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic + fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ +@@ -168,7 +172,7 @@ libs-y += arch/x86/lib/ + core-y += $(fcore-y) + + # Xen paravirtualization support +-core-$(CONFIG_XEN) += arch/x86/xen/ ++core-$(CONFIG_PARAVIRT_XEN) += arch/x86/xen/ + + # lguest paravirtualization support + core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ +@@ -202,9 +206,28 @@ endif + + boot := arch/x86/boot + +-PHONY += zImage bzImage compressed zlilo bzlilo \ ++PHONY += zImage bzImage vmlinuz compressed zlilo bzlilo \ + zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install + ++ifdef CONFIG_XEN ++KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ ++ -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(KBUILD_CPPFLAGS) ++ ++ifdef CONFIG_X86_64 ++LDFLAGS_vmlinux := -e startup_64 ++endif ++ ++# Default kernel to build ++all: vmlinuz ++ ++# KBUILD_IMAGE specifies the 
target image being built ++KBUILD_IMAGE := $(boot)/vmlinuz ++ ++vmlinuz: vmlinux ++ $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) ++ $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot ++ $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@ ++else + # Default kernel to build + all: bzImage + +@@ -227,6 +250,7 @@ zdisk bzdisk: vmlinux + + fdimage fdimage144 fdimage288 isoimage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ ++endif + + install: + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install +diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile +index 7ee102f..cb1904e 100644 +--- a/arch/x86/boot/Makefile ++++ b/arch/x86/boot/Makefile +@@ -25,7 +25,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA + + #RAMDISK := -DRAMDISK=512 + +-targets := vmlinux.bin setup.bin setup.elf zImage bzImage ++targets := vmlinux.bin setup.bin setup.elf zImage bzImage vmlinuz vmlinux-stripped + subdir- := compressed + + setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o +@@ -190,5 +190,13 @@ zlilo: $(BOOTIMAGE) + cp System.map $(INSTALL_PATH)/ + if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi + ++$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE ++ $(call if_changed,gzip) ++ @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' ++ ++$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded ++$(obj)/vmlinux-stripped: vmlinux FORCE ++ $(call if_changed,objcopy) ++ + install: + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" +diff --git a/arch/x86/ia32/ia32entry-xen.S b/arch/x86/ia32/ia32entry-xen.S +new file mode 100644 +index 0000000..3164c99 +--- /dev/null ++++ b/arch/x86/ia32/ia32entry-xen.S +@@ -0,0 +1,673 @@ ++/* ++ * Compatibility mode system call entry point for x86-64. ++ * ++ * Copyright 2000-2002 Andi Kleen, SuSE Labs. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) ++ ++ .macro IA32_ARG_FIXUP noebp=0 ++ movl %edi,%r8d ++ .if \noebp ++ .else ++ movl %ebp,%r9d ++ .endif ++ xchg %ecx,%esi ++ movl %ebx,%edi ++ movl %edx,%edx /* zero extension */ ++ .endm ++ ++ /* clobbers %eax */ ++ .macro CLEAR_RREGS ++ xorl %eax,%eax ++ movq %rax,R11(%rsp) ++ movq %rax,R10(%rsp) ++ movq %rax,R9(%rsp) ++ movq %rax,R8(%rsp) ++ .endm ++ ++ .macro LOAD_ARGS32 offset ++ movl \offset(%rsp),%r11d ++ movl \offset+8(%rsp),%r10d ++ movl \offset+16(%rsp),%r9d ++ movl \offset+24(%rsp),%r8d ++ movl \offset+40(%rsp),%ecx ++ movl \offset+48(%rsp),%edx ++ movl \offset+56(%rsp),%esi ++ movl \offset+64(%rsp),%edi ++ movl \offset+72(%rsp),%eax ++ .endm ++ ++ .macro CFI_STARTPROC32 simple ++ CFI_STARTPROC \simple ++ CFI_UNDEFINED r8 ++ CFI_UNDEFINED r9 ++ CFI_UNDEFINED r10 ++ CFI_UNDEFINED r11 ++ CFI_UNDEFINED r12 ++ CFI_UNDEFINED r13 ++ CFI_UNDEFINED r14 ++ CFI_UNDEFINED r15 ++ .endm ++ ++/* ++ * 32bit SYSENTER instruction entry. ++ * ++ * Arguments: ++ * %eax System call number. ++ * %ebx Arg1 ++ * %ecx Arg2 ++ * %edx Arg3 ++ * %esi Arg4 ++ * %edi Arg5 ++ * %ebp user stack ++ * 0(%ebp) Arg6 ++ * ++ * Interrupts on. ++ * ++ * This is purely a fast path. For anything complicated we use the int 0x80 ++ * path below. Set up a complete hardware stack frame to share code ++ * with the int 0x80 path. 
++ */ ++ENTRY(ia32_sysenter_target) ++ CFI_STARTPROC32 simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA rsp,SS+8-RIP+16 ++ /*CFI_REL_OFFSET ss,SS-RIP+16*/ ++ CFI_REL_OFFSET rsp,RSP-RIP+16 ++ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ ++ /*CFI_REL_OFFSET cs,CS-RIP+16*/ ++ CFI_REL_OFFSET rip,RIP-RIP+16 ++ CFI_REL_OFFSET r11,8 ++ CFI_REL_OFFSET rcx,0 ++ movq 8(%rsp),%r11 ++ CFI_RESTORE r11 ++ popq %rcx ++ CFI_ADJUST_CFA_OFFSET -8 ++ CFI_RESTORE rcx ++ movl %ebp,%ebp /* zero extension */ ++ movl %eax,%eax ++ movl 48-THREAD_SIZE+threadinfo_sysenter_return(%rsp),%r10d ++ movl $__USER32_DS,40(%rsp) ++ movq %rbp,32(%rsp) ++ movl $__USER32_CS,16(%rsp) ++ movq %r10,8(%rsp) ++ movq %rax,(%rsp) ++ cld ++ SAVE_ARGS 0,0,1 ++ /* no need to do an access_ok check here because rbp has been ++ 32bit zero extended */ ++1: movl (%rbp),%r9d ++ .section __ex_table,"a" ++ .quad 1b,ia32_badarg ++ .previous ++ GET_THREAD_INFO(%r10) ++ orl $TS_COMPAT,threadinfo_status(%r10) ++ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) ++ jnz sysenter_tracesys ++sysenter_do_call: ++ cmpl $(IA32_NR_syscalls-1),%eax ++ ja ia32_badsys ++ IA32_ARG_FIXUP 1 ++ call *ia32_sys_call_table(,%rax,8) ++ movq %rax,RAX-ARGOFFSET(%rsp) ++ jmp int_ret_from_sys_call ++ ++sysenter_tracesys: ++ xchgl %r9d,%ebp ++ SAVE_REST ++ CLEAR_RREGS ++ movq %r9,R9(%rsp) ++ movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ ++ movq %rsp,%rdi /* &pt_regs -> arg1 */ ++ call syscall_trace_enter ++ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ ++ RESTORE_REST ++ xchgl %ebp,%r9d ++ cmpl $(IA32_NR_syscalls-1),%eax ++ ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ ++ jmp sysenter_do_call ++ CFI_ENDPROC ++ENDPROC(ia32_sysenter_target) ++ ++/* ++ * 32bit SYSCALL instruction entry. ++ * ++ * Arguments: ++ * %eax System call number. 
++ * %ebx Arg1 ++ * %ecx return EIP ++ * %edx Arg3 ++ * %esi Arg4 ++ * %edi Arg5 ++ * %ebp Arg2 [note: not saved in the stack frame, should not be touched] ++ * %esp user stack ++ * 0(%esp) Arg6 ++ * ++ * Interrupts on. ++ * ++ * This is purely a fast path. For anything complicated we use the int 0x80 ++ * path below. Set up a complete hardware stack frame to share code ++ * with the int 0x80 path. ++ */ ++ENTRY(ia32_cstar_target) ++ CFI_STARTPROC32 simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA rsp,SS+8-RIP+16 ++ /*CFI_REL_OFFSET ss,SS-RIP+16*/ ++ CFI_REL_OFFSET rsp,RSP-RIP+16 ++ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ ++ /*CFI_REL_OFFSET cs,CS-RIP+16*/ ++ CFI_REL_OFFSET rip,RIP-RIP+16 ++ movl %eax,%eax /* zero extension */ ++ movl RSP-RIP+16(%rsp),%r8d ++ SAVE_ARGS -8,1,1 ++ movq %rax,ORIG_RAX-ARGOFFSET(%rsp) ++ movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ ++ movl %ebp,%ecx ++ movl $__USER32_CS,CS-ARGOFFSET(%rsp) ++ movl $__USER32_DS,SS-ARGOFFSET(%rsp) ++ /* no need to do an access_ok check here because r8 has been ++ 32bit zero extended */ ++ /* hardware stack frame is complete now */ ++1: movl (%r8),%r9d ++ .section __ex_table,"a" ++ .quad 1b,ia32_badarg ++ .previous ++ GET_THREAD_INFO(%r10) ++ orl $TS_COMPAT,threadinfo_status(%r10) ++ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) ++ jnz cstar_tracesys ++cstar_do_call: ++ cmpl $IA32_NR_syscalls-1,%eax ++ ja ia32_badsys ++ IA32_ARG_FIXUP 1 ++ call *ia32_sys_call_table(,%rax,8) ++ movq %rax,RAX-ARGOFFSET(%rsp) ++ jmp int_ret_from_sys_call ++ ++cstar_tracesys: ++ xchgl %r9d,%ebp ++ SAVE_REST ++ CLEAR_RREGS ++ movq %r9,R9(%rsp) ++ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ ++ movq %rsp,%rdi /* &pt_regs -> arg1 */ ++ call syscall_trace_enter ++ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ ++ RESTORE_REST ++ xchgl %ebp,%r9d ++ movl RSP-ARGOFFSET(%rsp), %r8d ++ cmpl $(IA32_NR_syscalls-1),%eax ++ ja 
int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ ++ jmp cstar_do_call ++END(ia32_cstar_target) ++ ++ia32_badarg: ++ movq $-EFAULT,%rax ++ jmp ia32_sysret ++ CFI_ENDPROC ++ ++/* ++ * Emulated IA32 system calls via int 0x80. ++ * ++ * Arguments: ++ * %eax System call number. ++ * %ebx Arg1 ++ * %ecx Arg2 ++ * %edx Arg3 ++ * %esi Arg4 ++ * %edi Arg5 ++ * %ebp Arg6 [note: not saved in the stack frame, should not be touched] ++ * ++ * Notes: ++ * Uses the same stack frame as the x86-64 version. ++ * All registers except %eax must be saved (but ptrace may violate that) ++ * Arguments are zero extended. For system calls that want sign extension and ++ * take long arguments a wrapper is needed. Most calls can just be called ++ * directly. ++ * Assumes it is only called from user space and entered with interrupts on. ++ */ ++ ++ENTRY(ia32_syscall) ++ CFI_STARTPROC32 simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA rsp,SS+8-RIP+16 ++ /*CFI_REL_OFFSET ss,SS-RIP+16*/ ++ CFI_REL_OFFSET rsp,RSP-RIP+16 ++ /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ ++ /*CFI_REL_OFFSET cs,CS-RIP+16*/ ++ CFI_REL_OFFSET rip,RIP-RIP+16 ++ CFI_REL_OFFSET r11,8 ++ CFI_REL_OFFSET rcx,0 ++ movq 8(%rsp),%r11 ++ CFI_RESTORE r11 ++ popq %rcx ++ CFI_ADJUST_CFA_OFFSET -8 ++ CFI_RESTORE rcx ++ movl %eax,%eax ++ movq %rax,(%rsp) ++ cld ++ /* note the registers are not zero extended to the sf. ++ this could be a problem. 
*/ ++ SAVE_ARGS 0,0,1 ++ GET_THREAD_INFO(%r10) ++ orl $TS_COMPAT,threadinfo_status(%r10) ++ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) ++ jnz ia32_tracesys ++ia32_do_syscall: ++ cmpl $(IA32_NR_syscalls-1),%eax ++ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ ++ IA32_ARG_FIXUP ++ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ++ia32_sysret: ++ movq %rax,RAX-ARGOFFSET(%rsp) ++ jmp int_ret_from_sys_call ++ ++ia32_tracesys: ++ SAVE_REST ++ CLEAR_RREGS ++ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ ++ movq %rsp,%rdi /* &pt_regs -> arg1 */ ++ call syscall_trace_enter ++ LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ ++ RESTORE_REST ++ jmp ia32_do_syscall ++END(ia32_syscall) ++ ++ia32_badsys: ++ movq $0,ORIG_RAX-ARGOFFSET(%rsp) ++ movq $-ENOSYS,RAX-ARGOFFSET(%rsp) ++ jmp int_ret_from_sys_call ++ ++quiet_ni_syscall: ++ movq $-ENOSYS,%rax ++ ret ++ CFI_ENDPROC ++ ++ .macro PTREGSCALL label, func, arg ++ .globl \label ++\label: ++ leaq \func(%rip),%rax ++ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ ++ jmp ia32_ptregs_common ++ .endm ++ ++ CFI_STARTPROC32 ++ ++ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi ++ PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi ++ PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx ++ PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx ++ PTREGSCALL stub32_execve, sys32_execve, %rcx ++ PTREGSCALL stub32_fork, sys_fork, %rdi ++ PTREGSCALL stub32_clone, sys32_clone, %rdx ++ PTREGSCALL stub32_vfork, sys_vfork, %rdi ++ PTREGSCALL stub32_iopl, sys_iopl, %rsi ++ PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx ++ ++ENTRY(ia32_ptregs_common) ++ popq %r11 ++ CFI_ENDPROC ++ CFI_STARTPROC32 simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA rsp,SS+8-ARGOFFSET ++ CFI_REL_OFFSET rax,RAX-ARGOFFSET ++ CFI_REL_OFFSET rcx,RCX-ARGOFFSET ++ CFI_REL_OFFSET rdx,RDX-ARGOFFSET ++ CFI_REL_OFFSET rsi,RSI-ARGOFFSET ++ 
CFI_REL_OFFSET rdi,RDI-ARGOFFSET ++ CFI_REL_OFFSET rip,RIP-ARGOFFSET ++/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ ++/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ ++ CFI_REL_OFFSET rsp,RSP-ARGOFFSET ++/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ ++ SAVE_REST ++ call *%rax ++ RESTORE_REST ++ jmp ia32_sysret /* misbalances the return cache */ ++ CFI_ENDPROC ++END(ia32_ptregs_common) ++ ++ .section .rodata,"a" ++ .align 8 ++ia32_sys_call_table: ++ .quad sys_restart_syscall ++ .quad sys_exit ++ .quad stub32_fork ++ .quad sys_read ++ .quad sys_write ++ .quad compat_sys_open /* 5 */ ++ .quad sys_close ++ .quad sys32_waitpid ++ .quad sys_creat ++ .quad sys_link ++ .quad sys_unlink /* 10 */ ++ .quad stub32_execve ++ .quad sys_chdir ++ .quad compat_sys_time ++ .quad sys_mknod ++ .quad sys_chmod /* 15 */ ++ .quad sys_lchown16 ++ .quad quiet_ni_syscall /* old break syscall holder */ ++ .quad sys_stat ++ .quad sys32_lseek ++ .quad sys_getpid /* 20 */ ++ .quad compat_sys_mount /* mount */ ++ .quad sys_oldumount /* old_umount */ ++ .quad sys_setuid16 ++ .quad sys_getuid16 ++ .quad compat_sys_stime /* stime */ /* 25 */ ++ .quad compat_sys_ptrace /* ptrace */ ++ .quad sys_alarm ++ .quad sys_fstat /* (old)fstat */ ++ .quad sys_pause ++ .quad compat_sys_utime /* 30 */ ++ .quad quiet_ni_syscall /* old stty syscall holder */ ++ .quad quiet_ni_syscall /* old gtty syscall holder */ ++ .quad sys_access ++ .quad sys_nice ++ .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ ++ .quad sys_sync ++ .quad sys32_kill ++ .quad sys_rename ++ .quad sys_mkdir ++ .quad sys_rmdir /* 40 */ ++ .quad sys_dup ++ .quad sys32_pipe ++ .quad compat_sys_times ++ .quad quiet_ni_syscall /* old prof syscall holder */ ++ .quad sys_brk /* 45 */ ++ .quad sys_setgid16 ++ .quad sys_getgid16 ++ .quad sys_signal ++ .quad sys_geteuid16 ++ .quad sys_getegid16 /* 50 */ ++ .quad sys_acct ++ .quad sys_umount /* new_umount */ ++ .quad quiet_ni_syscall /* old lock syscall holder */ ++ .quad compat_sys_ioctl ++ .quad 
compat_sys_fcntl64 /* 55 */ ++ .quad quiet_ni_syscall /* old mpx syscall holder */ ++ .quad sys_setpgid ++ .quad quiet_ni_syscall /* old ulimit syscall holder */ ++ .quad sys32_olduname ++ .quad sys_umask /* 60 */ ++ .quad sys_chroot ++ .quad sys32_ustat ++ .quad sys_dup2 ++ .quad sys_getppid ++ .quad sys_getpgrp /* 65 */ ++ .quad sys_setsid ++ .quad sys32_sigaction ++ .quad sys_sgetmask ++ .quad sys_ssetmask ++ .quad sys_setreuid16 /* 70 */ ++ .quad sys_setregid16 ++ .quad stub32_sigsuspend ++ .quad compat_sys_sigpending ++ .quad sys_sethostname ++ .quad compat_sys_setrlimit /* 75 */ ++ .quad compat_sys_old_getrlimit /* old_getrlimit */ ++ .quad compat_sys_getrusage ++ .quad sys32_gettimeofday ++ .quad sys32_settimeofday ++ .quad sys_getgroups16 /* 80 */ ++ .quad sys_setgroups16 ++ .quad sys32_old_select ++ .quad sys_symlink ++ .quad sys_lstat ++ .quad sys_readlink /* 85 */ ++ .quad sys_uselib ++ .quad sys_swapon ++ .quad sys_reboot ++ .quad compat_sys_old_readdir ++ .quad sys32_mmap /* 90 */ ++ .quad sys_munmap ++ .quad sys_truncate ++ .quad sys_ftruncate ++ .quad sys_fchmod ++ .quad sys_fchown16 /* 95 */ ++ .quad sys_getpriority ++ .quad sys_setpriority ++ .quad quiet_ni_syscall /* old profil syscall holder */ ++ .quad compat_sys_statfs ++ .quad compat_sys_fstatfs /* 100 */ ++ .quad sys_ioperm ++ .quad compat_sys_socketcall ++ .quad sys_syslog ++ .quad compat_sys_setitimer ++ .quad compat_sys_getitimer /* 105 */ ++ .quad compat_sys_newstat ++ .quad compat_sys_newlstat ++ .quad compat_sys_newfstat ++ .quad sys32_uname ++ .quad stub32_iopl /* 110 */ ++ .quad sys_vhangup ++ .quad quiet_ni_syscall /* old "idle" system call */ ++ .quad sys32_vm86_warning /* vm86old */ ++ .quad compat_sys_wait4 ++ .quad sys_swapoff /* 115 */ ++ .quad compat_sys_sysinfo ++ .quad sys32_ipc ++ .quad sys_fsync ++ .quad stub32_sigreturn ++ .quad stub32_clone /* 120 */ ++ .quad sys_setdomainname ++ .quad sys_uname ++ .quad sys_modify_ldt ++ .quad compat_sys_adjtimex ++ .quad sys32_mprotect 
/* 125 */ ++ .quad compat_sys_sigprocmask ++ .quad quiet_ni_syscall /* create_module */ ++ .quad sys_init_module ++ .quad sys_delete_module ++ .quad quiet_ni_syscall /* 130 get_kernel_syms */ ++ .quad sys32_quotactl ++ .quad sys_getpgid ++ .quad sys_fchdir ++ .quad quiet_ni_syscall /* bdflush */ ++ .quad sys_sysfs /* 135 */ ++ .quad sys_personality ++ .quad quiet_ni_syscall /* for afs_syscall */ ++ .quad sys_setfsuid16 ++ .quad sys_setfsgid16 ++ .quad sys_llseek /* 140 */ ++ .quad compat_sys_getdents ++ .quad compat_sys_select ++ .quad sys_flock ++ .quad sys_msync ++ .quad compat_sys_readv /* 145 */ ++ .quad compat_sys_writev ++ .quad sys_getsid ++ .quad sys_fdatasync ++ .quad sys32_sysctl /* sysctl */ ++ .quad sys_mlock /* 150 */ ++ .quad sys_munlock ++ .quad sys_mlockall ++ .quad sys_munlockall ++ .quad sys_sched_setparam ++ .quad sys_sched_getparam /* 155 */ ++ .quad sys_sched_setscheduler ++ .quad sys_sched_getscheduler ++ .quad sys_sched_yield ++ .quad sys_sched_get_priority_max ++ .quad sys_sched_get_priority_min /* 160 */ ++ .quad sys32_sched_rr_get_interval ++ .quad compat_sys_nanosleep ++ .quad sys_mremap ++ .quad sys_setresuid16 ++ .quad sys_getresuid16 /* 165 */ ++ .quad sys32_vm86_warning /* vm86 */ ++ .quad quiet_ni_syscall /* query_module */ ++ .quad sys_poll ++ .quad compat_sys_nfsservctl ++ .quad sys_setresgid16 /* 170 */ ++ .quad sys_getresgid16 ++ .quad sys_prctl ++ .quad stub32_rt_sigreturn ++ .quad sys32_rt_sigaction ++ .quad sys32_rt_sigprocmask /* 175 */ ++ .quad sys32_rt_sigpending ++ .quad compat_sys_rt_sigtimedwait ++ .quad sys32_rt_sigqueueinfo ++ .quad stub32_rt_sigsuspend ++ .quad sys32_pread /* 180 */ ++ .quad sys32_pwrite ++ .quad sys_chown16 ++ .quad sys_getcwd ++ .quad sys_capget ++ .quad sys_capset ++ .quad stub32_sigaltstack ++ .quad sys32_sendfile ++ .quad quiet_ni_syscall /* streams1 */ ++ .quad quiet_ni_syscall /* streams2 */ ++ .quad stub32_vfork /* 190 */ ++ .quad compat_sys_getrlimit ++ .quad sys32_mmap2 ++ .quad 
sys32_truncate64 ++ .quad sys32_ftruncate64 ++ .quad sys32_stat64 /* 195 */ ++ .quad sys32_lstat64 ++ .quad sys32_fstat64 ++ .quad sys_lchown ++ .quad sys_getuid ++ .quad sys_getgid /* 200 */ ++ .quad sys_geteuid ++ .quad sys_getegid ++ .quad sys_setreuid ++ .quad sys_setregid ++ .quad sys_getgroups /* 205 */ ++ .quad sys_setgroups ++ .quad sys_fchown ++ .quad sys_setresuid ++ .quad sys_getresuid ++ .quad sys_setresgid /* 210 */ ++ .quad sys_getresgid ++ .quad sys_chown ++ .quad sys_setuid ++ .quad sys_setgid ++ .quad sys_setfsuid /* 215 */ ++ .quad sys_setfsgid ++ .quad sys_pivot_root ++ .quad sys_mincore ++ .quad sys_madvise ++ .quad compat_sys_getdents64 /* 220 getdents64 */ ++ .quad compat_sys_fcntl64 ++ .quad quiet_ni_syscall /* tux */ ++ .quad quiet_ni_syscall /* security */ ++ .quad sys_gettid ++ .quad sys32_readahead /* 225 */ ++ .quad sys_setxattr ++ .quad sys_lsetxattr ++ .quad sys_fsetxattr ++ .quad sys_getxattr ++ .quad sys_lgetxattr /* 230 */ ++ .quad sys_fgetxattr ++ .quad sys_listxattr ++ .quad sys_llistxattr ++ .quad sys_flistxattr ++ .quad sys_removexattr /* 235 */ ++ .quad sys_lremovexattr ++ .quad sys_fremovexattr ++ .quad sys_tkill ++ .quad sys_sendfile64 ++ .quad compat_sys_futex /* 240 */ ++ .quad compat_sys_sched_setaffinity ++ .quad compat_sys_sched_getaffinity ++ .quad sys_set_thread_area ++ .quad sys_get_thread_area ++ .quad compat_sys_io_setup /* 245 */ ++ .quad sys_io_destroy ++ .quad compat_sys_io_getevents ++ .quad compat_sys_io_submit ++ .quad sys_io_cancel ++ .quad sys32_fadvise64 /* 250 */ ++ .quad quiet_ni_syscall /* free_huge_pages */ ++ .quad sys_exit_group ++ .quad sys32_lookup_dcookie ++ .quad sys_epoll_create ++ .quad sys_epoll_ctl /* 255 */ ++ .quad sys_epoll_wait ++ .quad sys_remap_file_pages ++ .quad sys_set_tid_address ++ .quad compat_sys_timer_create ++ .quad compat_sys_timer_settime /* 260 */ ++ .quad compat_sys_timer_gettime ++ .quad sys_timer_getoverrun ++ .quad sys_timer_delete ++ .quad compat_sys_clock_settime ++ 
.quad compat_sys_clock_gettime /* 265 */ ++ .quad compat_sys_clock_getres ++ .quad compat_sys_clock_nanosleep ++ .quad compat_sys_statfs64 ++ .quad compat_sys_fstatfs64 ++ .quad sys_tgkill /* 270 */ ++ .quad compat_sys_utimes ++ .quad sys32_fadvise64_64 ++ .quad quiet_ni_syscall /* sys_vserver */ ++ .quad sys_mbind ++ .quad compat_sys_get_mempolicy /* 275 */ ++ .quad sys_set_mempolicy ++ .quad compat_sys_mq_open ++ .quad sys_mq_unlink ++ .quad compat_sys_mq_timedsend ++ .quad compat_sys_mq_timedreceive /* 280 */ ++ .quad compat_sys_mq_notify ++ .quad compat_sys_mq_getsetattr ++ .quad compat_sys_kexec_load /* reserved for kexec */ ++ .quad compat_sys_waitid ++ .quad quiet_ni_syscall /* 285: sys_altroot */ ++ .quad sys_add_key ++ .quad sys_request_key ++ .quad sys_keyctl ++ .quad sys_ioprio_set ++ .quad sys_ioprio_get /* 290 */ ++ .quad sys_inotify_init ++ .quad sys_inotify_add_watch ++ .quad sys_inotify_rm_watch ++ .quad sys_migrate_pages ++ .quad compat_sys_openat /* 295 */ ++ .quad sys_mkdirat ++ .quad sys_mknodat ++ .quad sys_fchownat ++ .quad compat_sys_futimesat ++ .quad sys32_fstatat /* 300 */ ++ .quad sys_unlinkat ++ .quad sys_renameat ++ .quad sys_linkat ++ .quad sys_symlinkat ++ .quad sys_readlinkat /* 305 */ ++ .quad sys_fchmodat ++ .quad sys_faccessat ++ .quad compat_sys_pselect6 ++ .quad compat_sys_ppoll ++ .quad sys_unshare /* 310 */ ++ .quad compat_sys_set_robust_list ++ .quad compat_sys_get_robust_list ++ .quad sys_splice ++ .quad sys32_sync_file_range ++ .quad sys_tee /* 315 */ ++ .quad compat_sys_vmsplice ++ .quad compat_sys_move_pages ++ .quad sys_getcpu ++ .quad sys_epoll_pwait ++ .quad compat_sys_utimensat /* 320 */ ++ .quad compat_sys_signalfd ++ .quad sys_timerfd_create ++ .quad sys_eventfd ++ .quad sys32_fallocate ++ .quad compat_sys_timerfd_settime /* 325 */ ++ .quad compat_sys_timerfd_gettime ++ia32_syscall_end: +diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile +index 77807d4..81f7529 100644 +--- a/arch/x86/kernel/Makefile 
++++ b/arch/x86/kernel/Makefile +@@ -91,10 +91,13 @@ scx200-y += scx200_32.o + + obj-$(CONFIG_OLPC) += olpc.o + ++obj-$(CONFIG_X86_XEN) += fixup.o ++ + ### + # 64 bit specific files + ifeq ($(CONFIG_X86_64),y) +- obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o ++ obj-$(CONFIG_X86_LOCAL_APIC) += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o ++ obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o + obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o + obj-$(CONFIG_AUDIT) += audit_64.o + +@@ -103,4 +106,12 @@ ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o + + obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o ++ ++ obj-$(CONFIG_XEN) += nmi_64.o ++ apic_64-$(CONFIG_XEN) += apic_32.o ++ time_64-$(CONFIG_XEN) += time_32.o + endif ++ ++disabled-obj-$(CONFIG_XEN) := early-quirks.o genapic_flat_$(BITS).o genx2apic_uv_x.o \ ++ hpet.o i8253.o i8259_$(BITS).o pci-swiotlb_64.o reboot.o smpboot.o \ ++ tlb_$(BITS).o tsc_$(BITS).o tsc_sync.o vsmp_64.o +diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile +index fd5ca97..52c5db9 100644 +--- a/arch/x86/kernel/acpi/Makefile ++++ b/arch/x86/kernel/acpi/Makefile +@@ -5,6 +5,9 @@ obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o + + ifneq ($(CONFIG_ACPI_PROCESSOR),) + obj-y += cstate.o processor.o ++ifneq ($(CONFIG_PROCESSOR_EXTERNAL_CONTROL),) ++obj-$(CONFIG_XEN) += processor_extcntl_xen.o ++endif + endif + + $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin +@@ -12,3 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin + $(obj)/realmode/wakeup.bin: FORCE + $(Q)$(MAKE) $(build)=$(obj)/realmode + ++disabled-obj-$(CONFIG_XEN) := cstate.o wakeup_%.o +diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c +index 33c5216..08d6740 100644 +--- a/arch/x86/kernel/acpi/boot.c ++++ b/arch/x86/kernel/acpi/boot.c +@@ -88,7 +88,7 @@ int acpi_sci_override_gsi __initdata; + int acpi_skip_timer_override __initdata; + int acpi_use_timer_override __initdata; + +-#ifdef 
CONFIG_X86_LOCAL_APIC ++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + #endif + +@@ -106,7 +106,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + */ + enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; + +-#ifdef CONFIG_X86_64 ++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + + /* rely on all ACPI tables being in the direct mapping */ + char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) +@@ -139,8 +139,13 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) + unsigned long base, offset, mapped_size; + int idx; + ++#ifndef CONFIG_XEN + if (phys + size < 8 * 1024 * 1024) + return __va(phys); ++#else ++ if (phys + size <= (NR_FIX_ISAMAPS << PAGE_SHIFT)) ++ return isa_bus_to_virt(phys); ++#endif + + offset = phys & (PAGE_SIZE - 1); + mapped_size = PAGE_SIZE - offset; +@@ -228,12 +233,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) + return -ENODEV; + } + ++#ifndef CONFIG_XEN + if (madt->address) { + acpi_lapic_addr = (u64) madt->address; + + printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", + madt->address); + } ++#endif + + acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); + +@@ -242,19 +249,23 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) + + static void __cpuinit acpi_register_lapic(int id, u8 enabled) + { ++#ifndef CONFIG_XEN + unsigned int ver = 0; ++#endif + + if (!enabled) { + ++disabled_cpus; + return; + } + ++#ifndef CONFIG_XEN + #ifdef CONFIG_X86_32 + if (boot_cpu_physical_apicid != -1U) + ver = apic_version[boot_cpu_physical_apicid]; + #endif + + generic_processor_info(id, ver); ++#endif + } + + static int __init +@@ -304,6 +315,7 @@ static int __init + acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, + const unsigned long end) + { ++#ifndef CONFIG_XEN + struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; 
+ + lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header; +@@ -312,6 +324,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, + return -EINVAL; + + acpi_lapic_addr = lapic_addr_ovr->address; ++#endif + + return 0; + } +@@ -769,6 +782,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) + * returns 0 on success, < 0 on error + */ + ++#ifndef CONFIG_XEN + static void __init acpi_register_lapic_address(unsigned long address) + { + mp_lapic_addr = address; +@@ -782,6 +796,9 @@ static void __init acpi_register_lapic_address(unsigned long address) + #endif + } + } ++#else ++#define acpi_register_lapic_address(address) ++#endif + + static int __init early_acpi_parse_madt_lapic_addr_ovr(void) + { +diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c +index de2d2e4..6dd1d80 100644 +--- a/arch/x86/kernel/acpi/processor.c ++++ b/arch/x86/kernel/acpi/processor.c +@@ -69,7 +69,18 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) + /* Initialize _PDC data based on the CPU vendor */ + void arch_acpi_processor_init_pdc(struct acpi_processor *pr) + { ++#ifdef CONFIG_XEN ++ /* ++ * As a work-around, just use cpu0's cpuinfo for all processors. ++ * Further work is required to expose xen hypervisor interface of ++ * getting physical cpuinfo to dom0 kernel and then ++ * arch_acpi_processor_init_pdc can set _PDC parameters according ++ * to Xen's phys information. 
++ */ ++ struct cpuinfo_x86 *c = &boot_cpu_data; ++#else + struct cpuinfo_x86 *c = &cpu_data(pr->id); ++#endif + + pr->pdc = NULL; + if (c->x86_vendor == X86_VENDOR_INTEL) +diff --git a/arch/x86/kernel/acpi/processor_extcntl_xen.c b/arch/x86/kernel/acpi/processor_extcntl_xen.c +new file mode 100644 +index 0000000..cab3a42 +--- /dev/null ++++ b/arch/x86/kernel/acpi/processor_extcntl_xen.c +@@ -0,0 +1,229 @@ ++/* ++ * processor_extcntl_xen.c - interface to notify Xen ++ * ++ * Copyright (C) 2008, Intel corporation ++ * ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or (at ++ * your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++static int xen_processor_pmbits; ++ ++static int xen_cx_notifier(struct acpi_processor *pr, int action) ++{ ++ int ret, count = 0, i; ++ xen_platform_op_t op = { ++ .cmd = XENPF_set_processor_pminfo, ++ .interface_version = XENPF_INTERFACE_VERSION, ++ .u.set_pminfo.id = pr->acpi_id, ++ .u.set_pminfo.type = XEN_PM_CX, ++ }; ++ struct xen_processor_cx *data, *buf; ++ struct acpi_processor_cx *cx; ++ ++ if (action == PROCESSOR_PM_CHANGE) ++ return -EINVAL; ++ ++ /* Convert to Xen defined structure and hypercall */ ++ buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx), ++ GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ ++ data = buf; ++ for (i = 1; i <= pr->power.count; i++) { ++ cx = &pr->power.states[i]; ++ /* Skip invalid cstate entry */ ++ if (!cx->valid) ++ continue; ++ ++ data->type = cx->type; ++ data->latency = cx->latency; ++ data->power = cx->power; ++ data->reg.space_id = cx->reg.space_id; ++ data->reg.bit_width = cx->reg.bit_width; ++ data->reg.bit_offset = cx->reg.bit_offset; ++ data->reg.access_size = cx->reg.reserved; ++ data->reg.address = cx->reg.address; ++ ++ /* Get dependency relationships */ ++ if (cx->csd_count) { ++ printk("Wow! _CSD is found. 
Not support for now!\n"); ++ kfree(buf); ++ return -EINVAL; ++ } else { ++ data->dpcnt = 0; ++ set_xen_guest_handle(data->dp, NULL); ++ } ++ ++ data++; ++ count++; ++ } ++ ++ if (!count) { ++ printk("No available Cx info for cpu %d\n", pr->acpi_id); ++ kfree(buf); ++ return -EINVAL; ++ } ++ ++ op.u.set_pminfo.power.count = count; ++ op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control; ++ op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check; ++ op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst; ++ op.u.set_pminfo.power.flags.power_setup_done = pr->flags.power_setup_done; ++ ++ set_xen_guest_handle(op.u.set_pminfo.power.states, buf); ++ ret = HYPERVISOR_platform_op(&op); ++ kfree(buf); ++ return ret; ++} ++ ++static void convert_pct_reg(struct xen_pct_register *xpct, ++ struct acpi_pct_register *apct) ++{ ++ xpct->descriptor = apct->descriptor; ++ xpct->length = apct->length; ++ xpct->space_id = apct->space_id; ++ xpct->bit_width = apct->bit_width; ++ xpct->bit_offset = apct->bit_offset; ++ xpct->reserved = apct->reserved; ++ xpct->address = apct->address; ++} ++ ++static void convert_pss_states(struct xen_processor_px *xpss, ++ struct acpi_processor_px *apss, int state_count) ++{ ++ int i; ++ for(i=0; icore_frequency = apss->core_frequency; ++ xpss->power = apss->power; ++ xpss->transition_latency = apss->transition_latency; ++ xpss->bus_master_latency = apss->bus_master_latency; ++ xpss->control = apss->control; ++ xpss->status = apss->status; ++ xpss++; ++ apss++; ++ } ++} ++ ++static void convert_psd_pack(struct xen_psd_package *xpsd, ++ struct acpi_psd_package *apsd) ++{ ++ xpsd->num_entries = apsd->num_entries; ++ xpsd->revision = apsd->revision; ++ xpsd->domain = apsd->domain; ++ xpsd->coord_type = apsd->coord_type; ++ xpsd->num_processors = apsd->num_processors; ++} ++ ++static int xen_px_notifier(struct acpi_processor *pr, int action) ++{ ++ int ret; ++ xen_platform_op_t op = { ++ .cmd = XENPF_set_processor_pminfo, ++ .interface_version 
= XENPF_INTERFACE_VERSION, ++ .u.set_pminfo.id = pr->acpi_id, ++ .u.set_pminfo.type = XEN_PM_PX, ++ }; ++ struct xen_processor_performance *perf; ++ struct xen_processor_px *states = NULL; ++ struct acpi_processor_performance *px; ++ struct acpi_psd_package *pdomain; ++ ++ /* leave dynamic ppc handle in the future */ ++ if (action == PROCESSOR_PM_CHANGE) ++ return 0; ++ ++ perf = &op.u.set_pminfo.perf; ++ px = pr->performance; ++ ++ perf->flags = XEN_PX_PPC | ++ XEN_PX_PCT | ++ XEN_PX_PSS | ++ XEN_PX_PSD; ++ ++ /* ppc */ ++ perf->ppc = pr->performance_platform_limit; ++ ++ /* pct */ ++ convert_pct_reg(&perf->control_register, &px->control_register); ++ convert_pct_reg(&perf->status_register, &px->status_register); ++ ++ /* pss */ ++ perf->state_count = px->state_count; ++ states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL); ++ if (!states) ++ return -ENOMEM; ++ convert_pss_states(states, px->states, px->state_count); ++ set_xen_guest_handle(perf->states, states); ++ ++ /* psd */ ++ pdomain = &px->domain_info; ++ convert_psd_pack(&perf->domain_info, pdomain); ++ if (perf->domain_info.num_processors) { ++ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) ++ perf->shared_type = CPUFREQ_SHARED_TYPE_ALL; ++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) ++ perf->shared_type = CPUFREQ_SHARED_TYPE_ANY; ++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) ++ perf->shared_type = CPUFREQ_SHARED_TYPE_HW; ++ } else ++ perf->shared_type = CPUFREQ_SHARED_TYPE_NONE; ++ ++ ret = HYPERVISOR_platform_op(&op); ++ kfree(states); ++ return ret; ++} ++ ++static int xen_tx_notifier(struct acpi_processor *pr, int action) ++{ ++ return -EINVAL; ++} ++static int xen_hotplug_notifier(struct acpi_processor *pr, int event) ++{ ++ return -EINVAL; ++} ++ ++static struct processor_extcntl_ops xen_extcntl_ops = { ++ .hotplug = xen_hotplug_notifier, ++}; ++ ++void arch_acpi_processor_init_extcntl(const struct processor_extcntl_ops **ops) ++{ ++ 
xen_processor_pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8; ++ ++ if (xen_processor_pmbits & XEN_PROCESSOR_PM_CX) ++ xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier; ++ if (xen_processor_pmbits & XEN_PROCESSOR_PM_PX) ++ xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier; ++ if (xen_processor_pmbits & XEN_PROCESSOR_PM_TX) ++ xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier; ++ ++ *ops = &xen_extcntl_ops; ++} ++EXPORT_SYMBOL(arch_acpi_processor_init_extcntl); +diff --git a/arch/x86/kernel/acpi/sleep-xen.c b/arch/x86/kernel/acpi/sleep-xen.c +new file mode 100644 +index 0000000..26edc70 +--- /dev/null ++++ b/arch/x86/kernel/acpi/sleep-xen.c +@@ -0,0 +1,156 @@ ++/* ++ * sleep.c - x86-specific ACPI sleep support. ++ * ++ * Copyright (C) 2001-2003 Patrick Mochel ++ * Copyright (C) 2001-2003 Pavel Machek ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "realmode/wakeup.h" ++#include "sleep.h" ++ ++#ifndef CONFIG_ACPI_PV_SLEEP ++unsigned long acpi_wakeup_address; ++unsigned long acpi_realmode_flags; ++ ++/* address in low memory of the wakeup routine. */ ++static unsigned long acpi_realmode; ++ ++#ifdef CONFIG_64BIT ++static char temp_stack[10240]; ++#endif ++#endif ++ ++/** ++ * acpi_save_state_mem - save kernel state ++ * ++ * Create an identity mapped page table and copy the wakeup routine to ++ * low memory. ++ * ++ * Note that this is too late to change acpi_wakeup_address. 
++ */ ++int acpi_save_state_mem(void) ++{ ++#ifndef CONFIG_ACPI_PV_SLEEP ++ struct wakeup_header *header; ++ ++ if (!acpi_realmode) { ++ printk(KERN_ERR "Could not allocate memory during boot, " ++ "S3 disabled\n"); ++ return -ENOMEM; ++ } ++ memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE); ++ ++ header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); ++ if (header->signature != 0x51ee1111) { ++ printk(KERN_ERR "wakeup header does not match\n"); ++ return -EINVAL; ++ } ++ ++ header->video_mode = saved_video_mode; ++ ++ header->wakeup_jmp_seg = acpi_wakeup_address >> 4; ++ /* GDT[0]: GDT self-pointer */ ++ header->wakeup_gdt[0] = ++ (u64)(sizeof(header->wakeup_gdt) - 1) + ++ ((u64)(acpi_wakeup_address + ++ ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) ++ << 16); ++ /* GDT[1]: real-mode-like code segment */ ++ header->wakeup_gdt[1] = (0x009bULL << 40) + ++ ((u64)acpi_wakeup_address << 16) + 0xffff; ++ /* GDT[2]: real-mode-like data segment */ ++ header->wakeup_gdt[2] = (0x0093ULL << 40) + ++ ((u64)acpi_wakeup_address << 16) + 0xffff; ++ ++#ifndef CONFIG_64BIT ++ store_gdt((struct desc_ptr *)&header->pmode_gdt); ++ ++ header->pmode_efer_low = nx_enabled; ++ if (header->pmode_efer_low & 1) { ++ /* This is strange, why not save efer, always? 
*/ ++ rdmsr(MSR_EFER, header->pmode_efer_low, ++ header->pmode_efer_high); ++ } ++#endif /* !CONFIG_64BIT */ ++ ++ header->pmode_cr0 = read_cr0(); ++ header->pmode_cr4 = read_cr4(); ++ header->realmode_flags = acpi_realmode_flags; ++ header->real_magic = 0x12345678; ++ ++#ifndef CONFIG_64BIT ++ header->pmode_entry = (u32)&wakeup_pmode_return; ++ header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); ++ saved_magic = 0x12345678; ++#else /* CONFIG_64BIT */ ++ header->trampoline_segment = setup_trampoline() >> 4; ++ init_rsp = (unsigned long)temp_stack + 4096; ++ initial_code = (unsigned long)wakeup_long64; ++ saved_magic = 0x123456789abcdef0; ++#endif /* CONFIG_64BIT */ ++#endif ++ ++ return 0; ++} ++ ++/* ++ * acpi_restore_state - undo effects of acpi_save_state_mem ++ */ ++void acpi_restore_state_mem(void) ++{ ++} ++ ++ ++/** ++ * acpi_reserve_bootmem - do _very_ early ACPI initialisation ++ * ++ * We allocate a page from the first 1MB of memory for the wakeup ++ * routine for when we come back from a sleep state. The ++ * runtime allocator allows specification of <16MB pages, but not ++ * <1MB pages. 
++ */ ++void __init acpi_reserve_bootmem(void) ++{ ++#ifndef CONFIG_ACPI_PV_SLEEP ++ if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { ++ printk(KERN_ERR ++ "ACPI: Wakeup code way too big, S3 disabled.\n"); ++ return; ++ } ++ ++ acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); ++ ++ if (!acpi_realmode) { ++ printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); ++ return; ++ } ++ ++ acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); ++#endif ++} ++ ++ ++#ifndef CONFIG_ACPI_PV_SLEEP ++static int __init acpi_sleep_setup(char *str) ++{ ++ while ((str != NULL) && (*str != '\0')) { ++ if (strncmp(str, "s3_bios", 7) == 0) ++ acpi_realmode_flags |= 1; ++ if (strncmp(str, "s3_mode", 7) == 0) ++ acpi_realmode_flags |= 2; ++ if (strncmp(str, "s3_beep", 7) == 0) ++ acpi_realmode_flags |= 4; ++ str = strchr(str, ','); ++ if (str != NULL) ++ str += strspn(str, ", \t"); ++ } ++ return 1; ++} ++ ++__setup("acpi_sleep=", acpi_sleep_setup); ++#endif /* CONFIG_ACPI_PV_SLEEP */ +diff --git a/arch/x86/kernel/apic_32-xen.c b/arch/x86/kernel/apic_32-xen.c +new file mode 100644 +index 0000000..b8e1e9b +--- /dev/null ++++ b/arch/x86/kernel/apic_32-xen.c +@@ -0,0 +1,44 @@ ++/* ++ * Local APIC handling stubs ++ */ ++ ++#include ++#include ++ ++#include ++ ++/* ++ * Debug level, exported for io_apic.c ++ */ ++int apic_verbosity; ++ ++static int __init apic_set_verbosity(char *str) ++{ ++ if (strcmp("debug", str) == 0) ++ apic_verbosity = APIC_DEBUG; ++ else if (strcmp("verbose", str) == 0) ++ apic_verbosity = APIC_VERBOSE; ++ return 1; ++} ++ ++__setup("apic=", apic_set_verbosity); ++ ++int setup_profiling_timer(unsigned int multiplier) ++{ ++ return -EINVAL; ++} ++ ++/* ++ * This initializes the IO-APIC and APIC hardware if this is ++ * a UP kernel. 
++ */ ++int __init APIC_init_uniprocessor(void) ++{ ++#ifdef CONFIG_X86_IO_APIC ++ if (smp_found_config) ++ if (!skip_ioapic_setup && nr_ioapics) ++ setup_IO_APIC(); ++#endif ++ ++ return 0; ++} +diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c +index 9258808..4cfdb26 100644 +--- a/arch/x86/kernel/asm-offsets_32.c ++++ b/arch/x86/kernel/asm-offsets_32.c +@@ -19,10 +19,14 @@ + #include + #include + ++#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN) + #include ++#endif + ++#ifdef CONFIG_LGUEST_GUEST + #include + #include "../../../drivers/lguest/lg.h" ++#endif + + /* workaround for a warning with -Wmissing-prototypes */ + void foo(void); +@@ -54,6 +58,7 @@ void foo(void) + OFFSET(TI_exec_domain, thread_info, exec_domain); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); ++ OFFSET(TI_cpu, thread_info, cpu); + OFFSET(TI_preempt_count, thread_info, preempt_count); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_restart_block, thread_info, restart_block); +@@ -91,9 +96,14 @@ void foo(void) + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + ++#ifndef CONFIG_X86_NO_TSS + /* Offset from the sysenter stack to tss.sp0 */ +- DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - ++ DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) - + sizeof(struct tss_struct)); ++#else ++ /* sysenter stack points directly to sp0 */ ++ DEFINE(SYSENTER_stack_sp0, 0); ++#endif + + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); +@@ -103,6 +113,11 @@ void foo(void) + + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + ++#ifdef CONFIG_XEN ++ BLANK(); ++ OFFSET(XEN_START_mfn_list, start_info, mfn_list); ++#endif ++ + #ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); +@@ -115,7 +130,7 @@ void foo(void) + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); + #endif + +-#ifdef CONFIG_XEN 
++#ifdef CONFIG_PARAVIRT_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c +index f126c05..2a472c8 100644 +--- a/arch/x86/kernel/asm-offsets_64.c ++++ b/arch/x86/kernel/asm-offsets_64.c +@@ -117,8 +117,10 @@ int main(void) + ENTRY(cr8); + BLANK(); + #undef ENTRY ++#ifndef CONFIG_X86_NO_TSS + DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); + BLANK(); ++#endif + DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); +diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile +index a0c6f81..7c518c7 100644 +--- a/arch/x86/kernel/cpu/Makefile ++++ b/arch/x86/kernel/cpu/Makefile +@@ -17,4 +17,6 @@ obj-$(CONFIG_X86_MCE) += mcheck/ + obj-$(CONFIG_MTRR) += mtrr/ + obj-$(CONFIG_CPU_FREQ) += cpufreq/ + ++ifneq ($(CONFIG_XEN),y) + obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o ++endif +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 2458668..81ded4d 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -24,7 +24,7 @@ + extern void vide(void); + __asm__(".align 4\nvide: ret"); + +-#ifdef CONFIG_X86_LOCAL_APIC ++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + #define ENABLE_C1E_MASK 0x18000000 + #define CPUID_PROCESSOR_SIGNATURE 1 + #define CPUID_XFAM 0x0ff00000 +@@ -297,7 +297,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) + num_cache_leaves = 3; + } + +-#ifdef CONFIG_X86_LOCAL_APIC ++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + if (amd_apic_timer_broken()) + local_apic_timer_disabled = 1; + #endif +diff --git a/arch/x86/kernel/cpu/common-xen.c b/arch/x86/kernel/cpu/common-xen.c +new file mode 100644 +index 0000000..768f066 +--- /dev/null ++++ b/arch/x86/kernel/cpu/common-xen.c +@@ -0,0 +1,741 @@ ++#include 
++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_X86_LOCAL_APIC ++#include ++#include ++#include ++#endif ++#include ++ ++#include "cpu.h" ++ ++DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { ++ [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, ++ [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, ++ [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, ++ [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, ++#ifndef CONFIG_XEN ++ /* ++ * Segments used for calling PnP BIOS have byte granularity. ++ * They code segments and data segments have fixed 64k limits, ++ * the transfer segment sizes are set at run time. ++ */ ++ /* 32-bit code */ ++ [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, ++ /* 16-bit code */ ++ [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, ++ /* 16-bit data */ ++ [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, ++ /* 16-bit data */ ++ [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, ++ /* 16-bit data */ ++ [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, ++ /* ++ * The APM segments have byte granularity and their bases ++ * are set at run time. All have 64k limits. 
++ */ ++ /* 32-bit code */ ++ [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, ++ /* 16-bit code */ ++ [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, ++ /* data */ ++ [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, ++ ++ [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, ++#endif ++ [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, ++} }; ++EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); ++ ++__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; ++ ++static int cachesize_override __cpuinitdata = -1; ++static int disable_x86_serial_nr __cpuinitdata = 1; ++ ++struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; ++ ++static void __cpuinit default_init(struct cpuinfo_x86 *c) ++{ ++ /* Not much we can do here... */ ++ /* Check if at least it has cpuid */ ++ if (c->cpuid_level == -1) { ++ /* No cpuid. It must be an ancient CPU */ ++ if (c->x86 == 4) ++ strcpy(c->x86_model_id, "486"); ++ else if (c->x86 == 3) ++ strcpy(c->x86_model_id, "386"); ++ } ++} ++ ++static struct cpu_dev __cpuinitdata default_cpu = { ++ .c_init = default_init, ++ .c_vendor = "Unknown", ++}; ++static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; ++ ++static int __init cachesize_setup(char *str) ++{ ++ get_option(&str, &cachesize_override); ++ return 1; ++} ++__setup("cachesize=", cachesize_setup); ++ ++int __cpuinit get_model_name(struct cpuinfo_x86 *c) ++{ ++ unsigned int *v; ++ char *p, *q; ++ ++ if (cpuid_eax(0x80000000) < 0x80000004) ++ return 0; ++ ++ v = (unsigned int *) c->x86_model_id; ++ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); ++ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); ++ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); ++ c->x86_model_id[48] = 0; ++ ++ /* Intel chips right-justify this string for some dumb reason; ++ undo that brain damage */ ++ p = q = &c->x86_model_id[0]; ++ while (*p == ' ') ++ p++; ++ if (p != q) { ++ while (*p) ++ *q++ = *p++; ++ while (q <= &c->x86_model_id[48]) ++ *q++ = '\0'; /* Zero-pad the rest */ 
++ } ++ ++ return 1; ++} ++ ++ ++void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) ++{ ++ unsigned int n, dummy, ecx, edx, l2size; ++ ++ n = cpuid_eax(0x80000000); ++ ++ if (n >= 0x80000005) { ++ cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); ++ printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", ++ edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); ++ c->x86_cache_size = (ecx>>24)+(edx>>24); ++ } ++ ++ if (n < 0x80000006) /* Some chips just has a large L1. */ ++ return; ++ ++ ecx = cpuid_ecx(0x80000006); ++ l2size = ecx >> 16; ++ ++ /* do processor-specific cache resizing */ ++ if (this_cpu->c_size_cache) ++ l2size = this_cpu->c_size_cache(c, l2size); ++ ++ /* Allow user to override all this if necessary. */ ++ if (cachesize_override != -1) ++ l2size = cachesize_override; ++ ++ if (l2size == 0) ++ return; /* Again, no L2 cache is possible */ ++ ++ c->x86_cache_size = l2size; ++ ++ printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", ++ l2size, ecx & 0xFF); ++} ++ ++/* ++ * Naming convention should be: [()] ++ * This table only is used unless init_() below doesn't set it; ++ * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used ++ * ++ */ ++ ++/* Look up CPU names by table lookup. 
*/ ++static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) ++{ ++ struct cpu_model_info *info; ++ ++ if (c->x86_model >= 16) ++ return NULL; /* Range check */ ++ ++ if (!this_cpu) ++ return NULL; ++ ++ info = this_cpu->c_models; ++ ++ while (info && info->family) { ++ if (info->family == c->x86) ++ return info->model_names[c->x86_model]; ++ info++; ++ } ++ return NULL; /* Not found */ ++} ++ ++ ++static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) ++{ ++ char *v = c->x86_vendor_id; ++ int i; ++ static int printed; ++ ++ for (i = 0; i < X86_VENDOR_NUM; i++) { ++ if (cpu_devs[i]) { ++ if (!strcmp(v, cpu_devs[i]->c_ident[0]) || ++ (cpu_devs[i]->c_ident[1] && ++ !strcmp(v, cpu_devs[i]->c_ident[1]))) { ++ c->x86_vendor = i; ++ if (!early) ++ this_cpu = cpu_devs[i]; ++ return; ++ } ++ } ++ } ++ if (!printed) { ++ printed++; ++ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); ++ printk(KERN_ERR "CPU: Your system may be unstable.\n"); ++ } ++ c->x86_vendor = X86_VENDOR_UNKNOWN; ++ this_cpu = &default_cpu; ++} ++ ++ ++static int __init x86_fxsr_setup(char *s) ++{ ++ setup_clear_cpu_cap(X86_FEATURE_FXSR); ++ setup_clear_cpu_cap(X86_FEATURE_XMM); ++ return 1; ++} ++__setup("nofxsr", x86_fxsr_setup); ++ ++ ++static int __init x86_sep_setup(char *s) ++{ ++ setup_clear_cpu_cap(X86_FEATURE_SEP); ++ return 1; ++} ++__setup("nosep", x86_sep_setup); ++ ++ ++/* Standard macro to see if a specific flag is changeable */ ++static inline int flag_is_changeable_p(u32 flag) ++{ ++ u32 f1, f2; ++ ++ asm("pushfl\n\t" ++ "pushfl\n\t" ++ "popl %0\n\t" ++ "movl %0,%1\n\t" ++ "xorl %2,%0\n\t" ++ "pushl %0\n\t" ++ "popfl\n\t" ++ "pushfl\n\t" ++ "popl %0\n\t" ++ "popfl\n\t" ++ : "=&r" (f1), "=&r" (f2) ++ : "ir" (flag)); ++ ++ return ((f1^f2) & flag) != 0; ++} ++ ++ ++/* Probe for the CPUID instruction */ ++static int __cpuinit have_cpuid_p(void) ++{ ++ return flag_is_changeable_p(X86_EFLAGS_ID); ++} ++ ++void __init cpu_detect(struct cpuinfo_x86 *c) ++{ 
++ /* Get vendor name */ ++ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, ++ (unsigned int *)&c->x86_vendor_id[0], ++ (unsigned int *)&c->x86_vendor_id[8], ++ (unsigned int *)&c->x86_vendor_id[4]); ++ ++ c->x86 = 4; ++ if (c->cpuid_level >= 0x00000001) { ++ u32 junk, tfms, cap0, misc; ++ cpuid(0x00000001, &tfms, &misc, &junk, &cap0); ++ c->x86 = (tfms >> 8) & 15; ++ c->x86_model = (tfms >> 4) & 15; ++ if (c->x86 == 0xf) ++ c->x86 += (tfms >> 20) & 0xff; ++ if (c->x86 >= 0x6) ++ c->x86_model += ((tfms >> 16) & 0xF) << 4; ++ c->x86_mask = tfms & 15; ++ if (cap0 & (1<<19)) { ++ c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; ++ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; ++ } ++ } ++} ++static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) ++{ ++ u32 tfms, xlvl; ++ unsigned int ebx; ++ ++ memset(&c->x86_capability, 0, sizeof c->x86_capability); ++ if (have_cpuid_p()) { ++ /* Intel-defined flags: level 0x00000001 */ ++ if (c->cpuid_level >= 0x00000001) { ++ u32 capability, excap; ++ cpuid(0x00000001, &tfms, &ebx, &excap, &capability); ++ c->x86_capability[0] = capability; ++ c->x86_capability[4] = excap; ++ } ++ ++ /* AMD-defined flags: level 0x80000001 */ ++ xlvl = cpuid_eax(0x80000000); ++ if ((xlvl & 0xffff0000) == 0x80000000) { ++ if (xlvl >= 0x80000001) { ++ c->x86_capability[1] = cpuid_edx(0x80000001); ++ c->x86_capability[6] = cpuid_ecx(0x80000001); ++ } ++ } ++ ++ } ++ ++} ++ ++/* ++ * Do minimum CPU detection early. ++ * Fields really needed: vendor, cpuid_level, family, model, mask, ++ * cache alignment. ++ * The others are not touched to avoid unwanted side effects. ++ * ++ * WARNING: this function is only called on the BP. Don't add code here ++ * that is supposed to run on all CPUs. 
++ */ ++static void __init early_cpu_detect(void) ++{ ++ struct cpuinfo_x86 *c = &boot_cpu_data; ++ ++ c->x86_cache_alignment = 32; ++ c->x86_clflush_size = 32; ++ ++ if (!have_cpuid_p()) ++ return; ++ ++ cpu_detect(c); ++ ++ get_cpu_vendor(c, 1); ++ ++ if (c->x86_vendor != X86_VENDOR_UNKNOWN && ++ cpu_devs[c->x86_vendor]->c_early_init) ++ cpu_devs[c->x86_vendor]->c_early_init(c); ++ ++ early_get_cap(c); ++} ++ ++static void __cpuinit generic_identify(struct cpuinfo_x86 *c) ++{ ++ u32 tfms, xlvl; ++ unsigned int ebx; ++ ++ if (have_cpuid_p()) { ++ /* Get vendor name */ ++ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, ++ (unsigned int *)&c->x86_vendor_id[0], ++ (unsigned int *)&c->x86_vendor_id[8], ++ (unsigned int *)&c->x86_vendor_id[4]); ++ ++ get_cpu_vendor(c, 0); ++ /* Initialize the standard set of capabilities */ ++ /* Note that the vendor-specific code below might override */ ++ /* Intel-defined flags: level 0x00000001 */ ++ if (c->cpuid_level >= 0x00000001) { ++ u32 capability, excap; ++ cpuid(0x00000001, &tfms, &ebx, &excap, &capability); ++ c->x86_capability[0] = capability; ++ c->x86_capability[4] = excap; ++ c->x86 = (tfms >> 8) & 15; ++ c->x86_model = (tfms >> 4) & 15; ++ if (c->x86 == 0xf) ++ c->x86 += (tfms >> 20) & 0xff; ++ if (c->x86 >= 0x6) ++ c->x86_model += ((tfms >> 16) & 0xF) << 4; ++ c->x86_mask = tfms & 15; ++ c->initial_apicid = (ebx >> 24) & 0xFF; ++#ifdef CONFIG_X86_HT ++ c->apicid = phys_pkg_id(c->initial_apicid, 0); ++ c->phys_proc_id = c->initial_apicid; ++#else ++ c->apicid = c->initial_apicid; ++#endif ++ if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) ++ c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; ++ } else { ++ /* Have CPUID level 0 only - unheard of */ ++ c->x86 = 4; ++ } ++ ++ /* AMD-defined flags: level 0x80000001 */ ++ xlvl = cpuid_eax(0x80000000); ++ if ((xlvl & 0xffff0000) == 0x80000000) { ++ if (xlvl >= 0x80000001) { ++ c->x86_capability[1] = cpuid_edx(0x80000001); ++ c->x86_capability[6] = cpuid_ecx(0x80000001); ++ } ++ if 
(xlvl >= 0x80000004) ++ get_model_name(c); /* Default name */ ++ } ++ ++ init_scattered_cpuid_features(c); ++ } ++ ++} ++ ++static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) ++{ ++ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { ++ /* Disable processor serial number */ ++ unsigned long lo, hi; ++ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); ++ lo |= 0x200000; ++ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); ++ printk(KERN_NOTICE "CPU serial number disabled.\n"); ++ clear_cpu_cap(c, X86_FEATURE_PN); ++ ++ /* Disabling the serial number may affect the cpuid level */ ++ c->cpuid_level = cpuid_eax(0); ++ } ++} ++ ++static int __init x86_serial_nr_setup(char *s) ++{ ++ disable_x86_serial_nr = 0; ++ return 1; ++} ++__setup("serialnumber", x86_serial_nr_setup); ++ ++ ++ ++/* ++ * This does the hard work of actually picking apart the CPU stuff... ++ */ ++void __cpuinit identify_cpu(struct cpuinfo_x86 *c) ++{ ++ int i; ++ ++ c->loops_per_jiffy = loops_per_jiffy; ++ c->x86_cache_size = -1; ++ c->x86_vendor = X86_VENDOR_UNKNOWN; ++ c->cpuid_level = -1; /* CPUID not detected */ ++ c->x86_model = c->x86_mask = 0; /* So far unknown... */ ++ c->x86_vendor_id[0] = '\0'; /* Unset */ ++ c->x86_model_id[0] = '\0'; /* Unset */ ++ c->x86_max_cores = 1; ++ c->x86_clflush_size = 32; ++ memset(&c->x86_capability, 0, sizeof c->x86_capability); ++ ++ if (!have_cpuid_p()) { ++ /* ++ * First of all, decide if this is a 486 or higher ++ * It's a 486 if we can modify the AC flag ++ */ ++ if (flag_is_changeable_p(X86_EFLAGS_AC)) ++ c->x86 = 4; ++ else ++ c->x86 = 3; ++ } ++ ++ generic_identify(c); ++ ++ if (this_cpu->c_identify) ++ this_cpu->c_identify(c); ++ ++ /* ++ * Vendor-specific initialization. In this section we ++ * canonicalize the feature flags, meaning if there are ++ * features a certain CPU supports which CPUID doesn't ++ * tell us, CPUID claiming incorrect flags, or other bugs, ++ * we handle them here. 
++ * ++ * At the end of this section, c->x86_capability better ++ * indicate the features this CPU genuinely supports! ++ */ ++ if (this_cpu->c_init) ++ this_cpu->c_init(c); ++ ++ /* Disable the PN if appropriate */ ++ squash_the_stupid_serial_number(c); ++ ++ /* ++ * The vendor-specific functions might have changed features. Now ++ * we do "generic changes." ++ */ ++ ++ /* If the model name is still unset, do table lookup. */ ++ if (!c->x86_model_id[0]) { ++ char *p; ++ p = table_lookup_model(c); ++ if (p) ++ strcpy(c->x86_model_id, p); ++ else ++ /* Last resort... */ ++ sprintf(c->x86_model_id, "%02x/%02x", ++ c->x86, c->x86_model); ++ } ++ ++ /* ++ * On SMP, boot_cpu_data holds the common feature set between ++ * all CPUs; so make sure that we indicate which features are ++ * common between the CPUs. The first time this routine gets ++ * executed, c == &boot_cpu_data. ++ */ ++ if (c != &boot_cpu_data) { ++ /* AND the already accumulated flags with these */ ++ for (i = 0 ; i < NCAPINTS ; i++) ++ boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; ++ } ++ ++ /* Clear all flags overriden by options */ ++ for (i = 0; i < NCAPINTS; i++) ++ c->x86_capability[i] &= ~cleared_cpu_caps[i]; ++ ++ /* Init Machine Check Exception if available. 
*/ ++ mcheck_init(c); ++ ++ select_idle_routine(c); ++} ++ ++void __init identify_boot_cpu(void) ++{ ++ identify_cpu(&boot_cpu_data); ++ sysenter_setup(); ++ enable_sep_cpu(); ++} ++ ++void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) ++{ ++ BUG_ON(c == &boot_cpu_data); ++ identify_cpu(c); ++ enable_sep_cpu(); ++ mtrr_ap_init(); ++} ++ ++#ifdef CONFIG_X86_HT ++void __cpuinit detect_ht(struct cpuinfo_x86 *c) ++{ ++ u32 eax, ebx, ecx, edx; ++ int index_msb, core_bits; ++ ++ cpuid(1, &eax, &ebx, &ecx, &edx); ++ ++ if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) ++ return; ++ ++ smp_num_siblings = (ebx & 0xff0000) >> 16; ++ ++ if (smp_num_siblings == 1) { ++ printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); ++ } else if (smp_num_siblings > 1) { ++ ++ if (smp_num_siblings > NR_CPUS) { ++ printk(KERN_WARNING "CPU: Unsupported number of the " ++ "siblings %d", smp_num_siblings); ++ smp_num_siblings = 1; ++ return; ++ } ++ ++ index_msb = get_count_order(smp_num_siblings); ++ c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); ++ ++ printk(KERN_INFO "CPU: Physical Processor ID: %d\n", ++ c->phys_proc_id); ++ ++ smp_num_siblings = smp_num_siblings / c->x86_max_cores; ++ ++ index_msb = get_count_order(smp_num_siblings) ; ++ ++ core_bits = get_count_order(c->x86_max_cores); ++ ++ c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & ++ ((1 << core_bits) - 1); ++ ++ if (c->x86_max_cores > 1) ++ printk(KERN_INFO "CPU: Processor Core ID: %d\n", ++ c->cpu_core_id); ++ } ++} ++#endif ++ ++static __init int setup_noclflush(char *arg) ++{ ++ setup_clear_cpu_cap(X86_FEATURE_CLFLSH); ++ return 1; ++} ++__setup("noclflush", setup_noclflush); ++ ++void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) ++{ ++ char *vendor = NULL; ++ ++ if (c->x86_vendor < X86_VENDOR_NUM) ++ vendor = this_cpu->c_vendor; ++ else if (c->cpuid_level >= 0) ++ vendor = c->x86_vendor_id; ++ ++ if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) ++ 
printk("%s ", vendor); ++ ++ if (!c->x86_model_id[0]) ++ printk("%d86", c->x86); ++ else ++ printk("%s", c->x86_model_id); ++ ++ if (c->x86_mask || c->cpuid_level >= 0) ++ printk(" stepping %02x\n", c->x86_mask); ++ else ++ printk("\n"); ++} ++ ++static __init int setup_disablecpuid(char *arg) ++{ ++ int bit; ++ if (get_option(&arg, &bit) && bit < NCAPINTS*32) ++ setup_clear_cpu_cap(bit); ++ else ++ return 0; ++ return 1; ++} ++__setup("clearcpuid=", setup_disablecpuid); ++ ++cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; ++ ++void __init early_cpu_init(void) ++{ ++ struct cpu_vendor_dev *cvdev; ++ ++ for (cvdev = __x86cpuvendor_start ; ++ cvdev < __x86cpuvendor_end ; ++ cvdev++) ++ cpu_devs[cvdev->vendor] = cvdev->cpu_dev; ++ ++ early_cpu_detect(); ++ validate_pat_support(&boot_cpu_data); ++} ++ ++/* Make sure %fs is initialized properly in idle threads */ ++struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) ++{ ++ memset(regs, 0, sizeof(struct pt_regs)); ++ regs->fs = __KERNEL_PERCPU; ++ return regs; ++} ++ ++/* Current gdt points %fs at the "master" per-cpu area: after this, ++ * it's on the real one. */ ++void switch_to_new_gdt(void) ++{ ++ struct desc_ptr gdt_descr; ++ unsigned long va, frames[16]; ++ int f; ++ ++ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); ++ gdt_descr.size = GDT_SIZE - 1; ++ ++ for (va = gdt_descr.address, f = 0; ++ va < gdt_descr.address + gdt_descr.size; ++ va += PAGE_SIZE, f++) { ++ frames[f] = virt_to_mfn(va); ++ make_lowmem_page_readonly( ++ (void *)va, XENFEAT_writable_descriptor_tables); ++ } ++ if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8)) ++ BUG(); ++ asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); ++} ++ ++/* ++ * cpu_init() initializes state that is per-CPU. Some data is already ++ * initialized (naturally) in the bootstrap process, such as the GDT ++ * and IDT. We reload them nevertheless, this function acts as a ++ * 'CPU state barrier', nothing should get across. 
++ */ ++void __cpuinit cpu_init(void) ++{ ++ int cpu = smp_processor_id(); ++ struct task_struct *curr = current; ++#ifndef CONFIG_X86_NO_TSS ++ struct tss_struct *t = &per_cpu(init_tss, cpu); ++#endif ++ struct thread_struct *thread = &curr->thread; ++ ++ if (cpu_test_and_set(cpu, cpu_initialized)) { ++ printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); ++ for (;;) local_irq_enable(); ++ } ++ ++ printk(KERN_INFO "Initializing CPU#%d\n", cpu); ++ ++ if (cpu_has_vme || cpu_has_de) ++ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); ++ ++ switch_to_new_gdt(); ++ ++ /* ++ * Set up and load the per-CPU TSS and LDT ++ */ ++ atomic_inc(&init_mm.mm_count); ++ curr->active_mm = &init_mm; ++ if (curr->mm) ++ BUG(); ++ enter_lazy_tlb(&init_mm, curr); ++ ++ load_sp0(t, thread); ++ ++ load_LDT(&init_mm.context); ++ ++#ifdef CONFIG_DOUBLEFAULT ++ /* Set up doublefault TSS pointer in the GDT */ ++ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); ++#endif ++ ++ /* Clear %gs. 
*/ ++ asm volatile ("mov %0, %%gs" : : "r" (0)); ++ ++ /* Clear all 6 debug registers: */ ++ set_debugreg(0, 0); ++ set_debugreg(0, 1); ++ set_debugreg(0, 2); ++ set_debugreg(0, 3); ++ set_debugreg(0, 6); ++ set_debugreg(0, 7); ++ ++ /* ++ * Force FPU initialization: ++ */ ++ current_thread_info()->status = 0; ++ clear_used_math(); ++ mxcsr_feature_mask_init(); ++} ++ ++#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN) ++void __cpuinit cpu_uninit(void) ++{ ++ int cpu = raw_smp_processor_id(); ++ cpu_clear(cpu, cpu_initialized); ++ ++ /* lazy TLB state */ ++ per_cpu(cpu_tlbstate, cpu).state = 0; ++ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; ++} ++#endif +diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile +index 191fc05..949d678 100644 +--- a/arch/x86/kernel/cpu/mtrr/Makefile ++++ b/arch/x86/kernel/cpu/mtrr/Makefile +@@ -1,3 +1,4 @@ + obj-y := main.o if.o generic.o state.o + obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o + ++obj-$(CONFIG_XEN) := main.o if.o +diff --git a/arch/x86/kernel/cpu/mtrr/main-xen.c b/arch/x86/kernel/cpu/mtrr/main-xen.c +new file mode 100644 +index 0000000..29aa598 +--- /dev/null ++++ b/arch/x86/kernel/cpu/mtrr/main-xen.c +@@ -0,0 +1,330 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "mtrr.h" ++ ++static DEFINE_MUTEX(mtrr_mutex); ++ ++void generic_get_mtrr(unsigned int reg, unsigned long *base, ++ unsigned long *size, mtrr_type * type) ++{ ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = reg; ++ if (unlikely(HYPERVISOR_platform_op(&op))) ++ memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype)); ++ ++ *size = op.u.read_memtype.nr_mfns; ++ *base = op.u.read_memtype.mfn; ++ *type = op.u.read_memtype.type; ++} ++ ++struct mtrr_ops generic_mtrr_ops = { ++ .use_intel_if = 1, ++ .get = generic_get_mtrr, ++}; ++ ++struct mtrr_ops *mtrr_if = &generic_mtrr_ops; ++unsigned int num_var_ranges; ++unsigned int 
mtrr_usage_table[MAX_VAR_RANGES]; ++ ++static u64 tom2; ++ ++static void __init set_num_var_ranges(void) ++{ ++ struct xen_platform_op op; ++ ++ for (num_var_ranges = 0; ; num_var_ranges++) { ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = num_var_ranges; ++ if (HYPERVISOR_platform_op(&op) != 0) ++ break; ++ } ++} ++ ++static void __init init_table(void) ++{ ++ int i, max; ++ ++ max = num_var_ranges; ++ for (i = 0; i < max; i++) ++ mtrr_usage_table[i] = 0; ++} ++ ++int mtrr_add_page(unsigned long base, unsigned long size, ++ unsigned int type, bool increment) ++{ ++ int error; ++ struct xen_platform_op op; ++ ++ mutex_lock(&mtrr_mutex); ++ ++ op.cmd = XENPF_add_memtype; ++ op.u.add_memtype.mfn = base; ++ op.u.add_memtype.nr_mfns = size; ++ op.u.add_memtype.type = type; ++ error = HYPERVISOR_platform_op(&op); ++ if (error) { ++ mutex_unlock(&mtrr_mutex); ++ BUG_ON(error > 0); ++ return error; ++ } ++ ++ if (increment) ++ ++mtrr_usage_table[op.u.add_memtype.reg]; ++ ++ mutex_unlock(&mtrr_mutex); ++ ++ return op.u.add_memtype.reg; ++} ++ ++static int mtrr_check(unsigned long base, unsigned long size) ++{ ++ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { ++ printk(KERN_WARNING ++ "mtrr: size and base must be multiples of 4 kiB\n"); ++ printk(KERN_DEBUG ++ "mtrr: size: 0x%lx base: 0x%lx\n", size, base); ++ dump_stack(); ++ return -1; ++ } ++ return 0; ++} ++ ++int ++mtrr_add(unsigned long base, unsigned long size, unsigned int type, ++ bool increment) ++{ ++ if (mtrr_check(base, size)) ++ return -EINVAL; ++ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, ++ increment); ++} ++ ++int mtrr_del_page(int reg, unsigned long base, unsigned long size) ++{ ++ unsigned i; ++ mtrr_type ltype; ++ unsigned long lbase, lsize; ++ int error = -EINVAL; ++ struct xen_platform_op op; ++ ++ mutex_lock(&mtrr_mutex); ++ ++ if (reg < 0) { ++ /* Search for existing MTRR */ ++ for (i = 0; i < num_var_ranges; ++i) { ++ mtrr_if->get(i, &lbase, &lsize, 
<ype); ++ if (lbase == base && lsize == size) { ++ reg = i; ++ break; ++ } ++ } ++ if (reg < 0) { ++ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, ++ size); ++ goto out; ++ } ++ } ++ if (mtrr_usage_table[reg] < 1) { ++ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); ++ goto out; ++ } ++ if (--mtrr_usage_table[reg] < 1) { ++ op.cmd = XENPF_del_memtype; ++ op.u.del_memtype.handle = 0; ++ op.u.del_memtype.reg = reg; ++ error = HYPERVISOR_platform_op(&op); ++ if (error) { ++ BUG_ON(error > 0); ++ goto out; ++ } ++ } ++ error = reg; ++ out: ++ mutex_unlock(&mtrr_mutex); ++ return error; ++} ++ ++int ++mtrr_del(int reg, unsigned long base, unsigned long size) ++{ ++ if (mtrr_check(base, size)) ++ return -EINVAL; ++ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); ++} ++ ++EXPORT_SYMBOL(mtrr_add); ++EXPORT_SYMBOL(mtrr_del); ++ ++/* ++ * Returns the effective MTRR type for the region ++ * Error returns: ++ * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR ++ * - 0xFF - when MTRR is not enabled ++ */ ++u8 mtrr_type_lookup(u64 start, u64 end) ++{ ++ int i, error; ++ u64 start_mfn, end_mfn, base_mfn, top_mfn; ++ u8 prev_match, curr_match; ++ struct xen_platform_op op; ++ ++ if (!is_initial_xendomain()) ++ return MTRR_TYPE_WRBACK; ++ ++ if (!num_var_ranges) ++ return 0xFF; ++ ++ start_mfn = start >> PAGE_SHIFT; ++ /* Make end inclusive end, instead of exclusive */ ++ end_mfn = --end >> PAGE_SHIFT; ++ ++ /* Look in fixed ranges. 
Just return the type as per start */ ++ if (start_mfn < 0x100) { ++#if 0//todo ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = ???; ++ error = HYPERVISOR_platform_op(&op); ++ if (!error) ++ return op.u.read_memtype.type; ++#endif ++ return MTRR_TYPE_UNCACHABLE; ++ } ++ ++ /* ++ * Look in variable ranges ++ * Look of multiple ranges matching this address and pick type ++ * as per MTRR precedence ++ */ ++ prev_match = 0xFF; ++ for (i = 0; i < num_var_ranges; ++i) { ++ op.cmd = XENPF_read_memtype; ++ op.u.read_memtype.reg = i; ++ error = HYPERVISOR_platform_op(&op); ++ ++ if (error || !op.u.read_memtype.nr_mfns) ++ continue; ++ ++ base_mfn = op.u.read_memtype.mfn; ++ top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1; ++ ++ if (base_mfn > end_mfn || start_mfn > top_mfn) { ++ continue; ++ } ++ ++ if (base_mfn > start_mfn || end_mfn > top_mfn) { ++ return 0xFE; ++ } ++ ++ curr_match = op.u.read_memtype.type; ++ if (prev_match == 0xFF) { ++ prev_match = curr_match; ++ continue; ++ } ++ ++ if (prev_match == MTRR_TYPE_UNCACHABLE || ++ curr_match == MTRR_TYPE_UNCACHABLE) { ++ return MTRR_TYPE_UNCACHABLE; ++ } ++ ++ if ((prev_match == MTRR_TYPE_WRBACK && ++ curr_match == MTRR_TYPE_WRTHROUGH) || ++ (prev_match == MTRR_TYPE_WRTHROUGH && ++ curr_match == MTRR_TYPE_WRBACK)) { ++ prev_match = MTRR_TYPE_WRTHROUGH; ++ curr_match = MTRR_TYPE_WRTHROUGH; ++ } ++ ++ if (prev_match != curr_match) { ++ return MTRR_TYPE_UNCACHABLE; ++ } ++ } ++ ++ if (tom2) { ++ if (start >= (1ULL<<32) && (end < tom2)) ++ return MTRR_TYPE_WRBACK; ++ } ++ ++ if (prev_match != 0xFF) ++ return prev_match; ++ ++#if 0//todo ++ op.cmd = XENPF_read_def_memtype; ++ error = HYPERVISOR_platform_op(&op); ++ if (!error) ++ return op.u.read_def_memtype.type; ++#endif ++ return MTRR_TYPE_UNCACHABLE; ++} ++ ++/* ++ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB ++ * for memory >4GB. Check for that here. 
++ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't ++ * apply to are wrong, but so far we don't know of any such case in the wild. ++ */ ++#define Tom2Enabled (1U << 21) ++#define Tom2ForceMemTypeWB (1U << 22) ++ ++int __init amd_special_default_mtrr(void) ++{ ++ u32 l, h; ++ ++ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ++ return 0; ++ if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) ++ return 0; ++ /* In case some hypervisor doesn't pass SYSCFG through */ ++ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) ++ return 0; ++ /* ++ * Memory between 4GB and top of mem is forced WB by this magic bit. ++ * Reserved before K8RevF, but should be zero there. ++ */ ++ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == ++ (Tom2Enabled | Tom2ForceMemTypeWB)) ++ return 1; ++ return 0; ++} ++ ++void __init mtrr_bp_init(void) ++{ ++ if (amd_special_default_mtrr()) { ++ /* TOP_MEM2 */ ++ rdmsrl(MSR_K8_TOP_MEM2, tom2); ++ tom2 &= 0xffffff8000000ULL; ++ } ++} ++ ++void mtrr_ap_init(void) ++{ ++} ++ ++static int __init mtrr_init(void) ++{ ++ struct cpuinfo_x86 *c = &boot_cpu_data; ++ ++ if (!is_initial_xendomain()) ++ return -ENODEV; ++ ++ if ((!cpu_has(c, X86_FEATURE_MTRR)) && ++ (!cpu_has(c, X86_FEATURE_K6_MTRR)) && ++ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && ++ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) ++ return -ENODEV; ++ ++ set_num_var_ranges(); ++ init_table(); ++ ++ return 0; ++} ++ ++subsys_initcall(mtrr_init); +diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c +index 2685538..6bb025d 100644 +--- a/arch/x86/kernel/crash.c ++++ b/arch/x86/kernel/crash.c +@@ -29,6 +29,7 @@ + + #include + ++#ifndef CONFIG_XEN + /* This keeps a track of which one is crashing cpu. 
*/ + static int crashing_cpu; + +@@ -117,6 +118,7 @@ static void nmi_shootdown_cpus(void) + /* There are no cpus to shootdown */ + } + #endif ++#endif /* CONFIG_XEN */ + + void native_machine_crash_shutdown(struct pt_regs *regs) + { +@@ -131,6 +133,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + ++#ifndef CONFIG_XEN + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); + nmi_shootdown_cpus(); +@@ -138,6 +141,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) + #if defined(CONFIG_X86_IO_APIC) + disable_IO_APIC(); + #endif ++#endif /* CONFIG_XEN */ + #ifdef CONFIG_HPET_TIMER + hpet_disable(); + #endif +diff --git a/arch/x86/kernel/e820_32-xen.c b/arch/x86/kernel/e820_32-xen.c +new file mode 100644 +index 0000000..99e8b4c +--- /dev/null ++++ b/arch/x86/kernel/e820_32-xen.c +@@ -0,0 +1,873 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++struct e820map __initdata e820; ++struct change_member { ++ struct e820entry *pbios; /* pointer to original bios entry */ ++ unsigned long long addr; /* address for this change point */ ++}; ++static struct change_member change_point_list[2*E820MAX] __initdata; ++static struct change_member *change_point[2*E820MAX] __initdata; ++static struct e820entry *overlap_list[E820MAX] __initdata; ++static struct e820entry new_bios[E820MAX] __initdata; ++/* For PCI or other memory-mapped resources */ ++unsigned long pci_mem_start = 0x10000000; ++#ifdef CONFIG_PCI ++EXPORT_SYMBOL(pci_mem_start); ++#endif ++extern int user_defined_memmap; ++ ++static struct resource system_rom_resource = { ++ .name = "System ROM", ++ .start = 0xf0000, ++ .end = 0xfffff, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}; ++ ++static struct 
resource extension_rom_resource = { ++ .name = "Extension ROM", ++ .start = 0xe0000, ++ .end = 0xeffff, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}; ++ ++static struct resource adapter_rom_resources[] = { { ++ .name = "Adapter ROM", ++ .start = 0xc8000, ++ .end = 0, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}, { ++ .name = "Adapter ROM", ++ .start = 0, ++ .end = 0, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}, { ++ .name = "Adapter ROM", ++ .start = 0, ++ .end = 0, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}, { ++ .name = "Adapter ROM", ++ .start = 0, ++ .end = 0, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}, { ++ .name = "Adapter ROM", ++ .start = 0, ++ .end = 0, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}, { ++ .name = "Adapter ROM", ++ .start = 0, ++ .end = 0, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++} }; ++ ++static struct resource video_rom_resource = { ++ .name = "Video ROM", ++ .start = 0xc0000, ++ .end = 0xc7fff, ++ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM ++}; ++ ++#define ROMSIGNATURE 0xaa55 ++ ++static int __init romsignature(const unsigned char *rom) ++{ ++ const unsigned short * const ptr = (const unsigned short *)rom; ++ unsigned short sig; ++ ++ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; ++} ++ ++static int __init romchecksum(const unsigned char *rom, unsigned long length) ++{ ++ unsigned char sum, c; ++ ++ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) ++ sum += c; ++ return !length && !sum; ++} ++ ++static void __init probe_roms(void) ++{ ++ const unsigned char *rom; ++ unsigned long start, length, upper; ++ unsigned char c; ++ int i; ++ ++#ifdef CONFIG_XEN ++ /* Nothing to do if not running in dom0. 
*/ ++ if (!is_initial_xendomain()) ++ return; ++#endif ++ ++ /* video rom */ ++ upper = adapter_rom_resources[0].start; ++ for (start = video_rom_resource.start; start < upper; start += 2048) { ++ rom = isa_bus_to_virt(start); ++ if (!romsignature(rom)) ++ continue; ++ ++ video_rom_resource.start = start; ++ ++ if (probe_kernel_address(rom + 2, c) != 0) ++ continue; ++ ++ /* 0 < length <= 0x7f * 512, historically */ ++ length = c * 512; ++ ++ /* if checksum okay, trust length byte */ ++ if (length && romchecksum(rom, length)) ++ video_rom_resource.end = start + length - 1; ++ ++ request_resource(&iomem_resource, &video_rom_resource); ++ break; ++ } ++ ++ start = (video_rom_resource.end + 1 + 2047) & ~2047UL; ++ if (start < upper) ++ start = upper; ++ ++ /* system rom */ ++ request_resource(&iomem_resource, &system_rom_resource); ++ upper = system_rom_resource.start; ++ ++ /* check for extension rom (ignore length byte!) */ ++ rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start); ++ if (romsignature(rom)) { ++ length = extension_rom_resource.end - extension_rom_resource.start + 1; ++ if (romchecksum(rom, length)) { ++ request_resource(&iomem_resource, &extension_rom_resource); ++ upper = extension_rom_resource.start; ++ } ++ } ++ ++ /* check for adapter roms on 2k boundaries */ ++ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { ++ rom = isa_bus_to_virt(start); ++ if (!romsignature(rom)) ++ continue; ++ ++ if (probe_kernel_address(rom + 2, c) != 0) ++ continue; ++ ++ /* 0 < length <= 0x7f * 512, historically */ ++ length = c * 512; ++ ++ /* but accept any length that fits if checksum okay */ ++ if (!length || start + length > upper || !romchecksum(rom, length)) ++ continue; ++ ++ adapter_rom_resources[i].start = start; ++ adapter_rom_resources[i].end = start + length - 1; ++ request_resource(&iomem_resource, &adapter_rom_resources[i]); ++ ++ start = adapter_rom_resources[i++].end & ~2047UL; ++ } ++} ++ ++#ifdef 
CONFIG_XEN ++static struct e820map machine_e820; ++#define e820 machine_e820 ++#endif ++ ++/* ++ * Request address space for all standard RAM and ROM resources ++ * and also for regions reported as reserved by the e820. ++ */ ++void __init init_iomem_resources(struct resource *code_resource, ++ struct resource *data_resource, ++ struct resource *bss_resource) ++{ ++ int i; ++ ++ probe_roms(); ++ for (i = 0; i < e820.nr_map; i++) { ++ struct resource *res; ++#ifndef CONFIG_RESOURCES_64BIT ++ if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) ++ continue; ++#endif ++ res = kzalloc(sizeof(struct resource), GFP_ATOMIC); ++ switch (e820.map[i].type) { ++ case E820_RAM: res->name = "System RAM"; break; ++ case E820_ACPI: res->name = "ACPI Tables"; break; ++ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; ++ default: res->name = "reserved"; ++ } ++ res->start = e820.map[i].addr; ++ res->end = res->start + e820.map[i].size - 1; ++ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; ++ if (request_resource(&iomem_resource, res)) { ++ kfree(res); ++ continue; ++ } ++ if (e820.map[i].type == E820_RAM) { ++ /* ++ * We don't know which RAM region contains kernel data, ++ * so we try it repeatedly and let the resource manager ++ * test it. ++ */ ++#ifndef CONFIG_XEN ++ request_resource(res, code_resource); ++ request_resource(res, data_resource); ++ request_resource(res, bss_resource); ++#endif ++#ifdef CONFIG_KEXEC ++ if (crashk_res.start != crashk_res.end) ++ request_resource(res, &crashk_res); ++#ifdef CONFIG_XEN ++ xen_machine_kexec_register_resources(res); ++#endif ++#endif ++ } ++ } ++} ++ ++#undef e820 ++ ++#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) ++/** ++ * e820_mark_nosave_regions - Find the ranges of physical addresses that do not ++ * correspond to e820 RAM areas and mark the corresponding pages as nosave for ++ * hibernation. 
++ * ++ * This function requires the e820 map to be sorted and without any ++ * overlapping entries and assumes the first e820 area to be RAM. ++ */ ++void __init e820_mark_nosave_regions(void) ++{ ++ int i; ++ unsigned long pfn; ++ ++ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); ++ for (i = 1; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++ ++ if (pfn < PFN_UP(ei->addr)) ++ register_nosave_region(pfn, PFN_UP(ei->addr)); ++ ++ pfn = PFN_DOWN(ei->addr + ei->size); ++ if (ei->type != E820_RAM) ++ register_nosave_region(PFN_UP(ei->addr), pfn); ++ ++ if (pfn >= max_low_pfn) ++ break; ++ } ++} ++#endif ++ ++void __init add_memory_region(unsigned long long start, ++ unsigned long long size, int type) ++{ ++ int x; ++ ++ x = e820.nr_map; ++ ++ if (x == E820MAX) { ++ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); ++ return; ++ } ++ ++ e820.map[x].addr = start; ++ e820.map[x].size = size; ++ e820.map[x].type = type; ++ e820.nr_map++; ++} /* add_memory_region */ ++ ++/* ++ * Sanitize the BIOS e820 map. ++ * ++ * Some e820 responses include overlapping entries. The following ++ * replaces the original e820 map with a new one, removing overlaps. ++ * ++ */ ++int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) ++{ ++ struct change_member *change_tmp; ++ unsigned long current_type, last_type; ++ unsigned long long last_addr; ++ int chgidx, still_changing; ++ int overlap_entries; ++ int new_bios_entry; ++ int old_nr, new_nr, chg_nr; ++ int i; ++ ++ /* ++ Visually we're performing the following (1,2,3,4 = memory types)... 
++ ++ Sample memory map (w/overlaps): ++ ____22__________________ ++ ______________________4_ ++ ____1111________________ ++ _44_____________________ ++ 11111111________________ ++ ____________________33__ ++ ___________44___________ ++ __________33333_________ ++ ______________22________ ++ ___________________2222_ ++ _________111111111______ ++ _____________________11_ ++ _________________4______ ++ ++ Sanitized equivalent (no overlap): ++ 1_______________________ ++ _44_____________________ ++ ___1____________________ ++ ____22__________________ ++ ______11________________ ++ _________1______________ ++ __________3_____________ ++ ___________44___________ ++ _____________33_________ ++ _______________2________ ++ ________________1_______ ++ _________________4______ ++ ___________________2____ ++ ____________________33__ ++ ______________________4_ ++ */ ++ /* if there's only one memory region, don't bother */ ++ if (*pnr_map < 2) { ++ return -1; ++ } ++ ++ old_nr = *pnr_map; ++ ++ /* bail out if we find any unreasonable addresses in bios map */ ++ for (i=0; iaddr = biosmap[i].addr; ++ change_point[chgidx++]->pbios = &biosmap[i]; ++ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; ++ change_point[chgidx++]->pbios = &biosmap[i]; ++ } ++ } ++ chg_nr = chgidx; /* true number of change-points */ ++ ++ /* sort change-point list by memory addresses (low -> high) */ ++ still_changing = 1; ++ while (still_changing) { ++ still_changing = 0; ++ for (i=1; i < chg_nr; i++) { ++ /* if > , swap */ ++ /* or, if current= & last=, swap */ ++ if ((change_point[i]->addr < change_point[i-1]->addr) || ++ ((change_point[i]->addr == change_point[i-1]->addr) && ++ (change_point[i]->addr == change_point[i]->pbios->addr) && ++ (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) ++ ) ++ { ++ change_tmp = change_point[i]; ++ change_point[i] = change_point[i-1]; ++ change_point[i-1] = change_tmp; ++ still_changing=1; ++ } ++ } ++ } ++ ++ /* create a new bios memory 
map, removing overlaps */ ++ overlap_entries=0; /* number of entries in the overlap table */ ++ new_bios_entry=0; /* index for creating new bios map entries */ ++ last_type = 0; /* start with undefined memory type */ ++ last_addr = 0; /* start with 0 as last starting address */ ++ /* loop through change-points, determining affect on the new bios map */ ++ for (chgidx=0; chgidx < chg_nr; chgidx++) ++ { ++ /* keep track of all overlapping bios entries */ ++ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) ++ { ++ /* add map entry to overlap list (> 1 entry implies an overlap) */ ++ overlap_list[overlap_entries++]=change_point[chgidx]->pbios; ++ } ++ else ++ { ++ /* remove entry from list (order independent, so swap with last) */ ++ for (i=0; ipbios) ++ overlap_list[i] = overlap_list[overlap_entries-1]; ++ } ++ overlap_entries--; ++ } ++ /* if there are overlapping entries, decide which "type" to use */ ++ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ ++ current_type = 0; ++ for (i=0; itype > current_type) ++ current_type = overlap_list[i]->type; ++ /* continue building up new bios map based on this information */ ++ if (current_type != last_type) { ++ if (last_type != 0) { ++ new_bios[new_bios_entry].size = ++ change_point[chgidx]->addr - last_addr; ++ /* move forward only if the new size was non-zero */ ++ if (new_bios[new_bios_entry].size != 0) ++ if (++new_bios_entry >= E820MAX) ++ break; /* no more space left for new bios entries */ ++ } ++ if (current_type != 0) { ++ new_bios[new_bios_entry].addr = change_point[chgidx]->addr; ++ new_bios[new_bios_entry].type = current_type; ++ last_addr=change_point[chgidx]->addr; ++ } ++ last_type = current_type; ++ } ++ } ++ new_nr = new_bios_entry; /* retain count for new bios entries */ ++ ++ /* copy new bios mapping into original location */ ++ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); ++ *pnr_map = new_nr; ++ ++ return 0; ++} ++ ++/* ++ * Copy the BIOS e820 map 
into a safe place. ++ * ++ * Sanity-check it while we're at it.. ++ * ++ * If we're lucky and live on a modern system, the setup code ++ * will have given us a memory map that we can use to properly ++ * set up memory. If we aren't, we'll fake a memory map. ++ * ++ * We check to see that the memory map contains at least 2 elements ++ * before we'll use it, because the detection code in setup.S may ++ * not be perfect and most every PC known to man has two memory ++ * regions: one from 0 to 640k, and one from 1mb up. (The IBM ++ * thinkpad 560x, for example, does not cooperate with the memory ++ * detection code.) ++ */ ++int __init copy_e820_map(struct e820entry *biosmap, int nr_map) ++{ ++#ifndef CONFIG_XEN ++ /* Only one memory region (or negative)? Ignore it */ ++ if (nr_map < 2) ++ return -1; ++#else ++ BUG_ON(nr_map < 1); ++#endif ++ ++ do { ++ u64 start = biosmap->addr; ++ u64 size = biosmap->size; ++ u64 end = start + size; ++ u32 type = biosmap->type; ++ ++ /* Overflow in 64 bits? Ignore the memory map. */ ++ if (start > end) ++ return -1; ++ ++ add_memory_region(start, size, type); ++ } while (biosmap++, --nr_map); ++ ++#ifdef CONFIG_XEN ++ if (is_initial_xendomain()) { ++ struct xen_memory_map memmap; ++ ++ memmap.nr_entries = E820MAX; ++ set_xen_guest_handle(memmap.buffer, machine_e820.map); ++ ++ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) ++ BUG(); ++ machine_e820.nr_map = memmap.nr_entries; ++ } else ++ machine_e820 = e820; ++#endif ++ ++ return 0; ++} ++ ++/* ++ * Find the highest page frame number we have available ++ */ ++void __init propagate_e820_map(void) ++{ ++ int i; ++ ++ max_pfn = 0; ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ unsigned long start, end; ++ /* RAM? 
*/ ++ if (e820.map[i].type != E820_RAM) ++ continue; ++ start = PFN_UP(e820.map[i].addr); ++ end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); ++ if (start >= end) ++ continue; ++ if (end > max_pfn) ++ max_pfn = end; ++ memory_present(0, start, end); ++ } ++} ++ ++/* ++ * Register fully available low RAM pages with the bootmem allocator. ++ */ ++void __init register_bootmem_low_pages(unsigned long max_low_pfn) ++{ ++ int i; ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ unsigned long curr_pfn, last_pfn, size; ++ /* ++ * Reserve usable low memory ++ */ ++ if (e820.map[i].type != E820_RAM) ++ continue; ++ /* ++ * We are rounding up the start address of usable memory: ++ */ ++ curr_pfn = PFN_UP(e820.map[i].addr); ++ if (curr_pfn >= max_low_pfn) ++ continue; ++ /* ++ * ... and at the end of the usable range downwards: ++ */ ++ last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); ++ ++#ifdef CONFIG_XEN ++ /* ++ * Truncate to the number of actual pages currently ++ * present. ++ */ ++ if (last_pfn > xen_start_info->nr_pages) ++ last_pfn = xen_start_info->nr_pages; ++#endif ++ ++ if (last_pfn > max_low_pfn) ++ last_pfn = max_low_pfn; ++ ++ /* ++ * .. finally, did all the rounding and playing ++ * around just make the area go away? ++ */ ++ if (last_pfn <= curr_pfn) ++ continue; ++ ++ size = last_pfn - curr_pfn; ++ free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); ++ } ++} ++ ++void __init e820_register_memory(void) ++{ ++ unsigned long gapstart, gapsize, round; ++ unsigned long long last; ++ int i; ++ ++#ifdef CONFIG_XEN ++ if (is_initial_xendomain()) { ++ struct xen_memory_map memmap; ++ ++ memmap.nr_entries = E820MAX; ++ set_xen_guest_handle(memmap.buffer, machine_e820.map); ++ ++ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) ++ BUG(); ++ machine_e820.nr_map = memmap.nr_entries; ++ } ++ else ++ machine_e820 = e820; ++#define e820 machine_e820 ++#endif ++ ++ /* ++ * Search for the biggest gap in the low 32 bits of the e820 ++ * memory space. 
++ */ ++ last = 0x100000000ull; ++ gapstart = 0x10000000; ++ gapsize = 0x400000; ++ i = e820.nr_map; ++ while (--i >= 0) { ++ unsigned long long start = e820.map[i].addr; ++ unsigned long long end = start + e820.map[i].size; ++ ++ /* ++ * Since "last" is at most 4GB, we know we'll ++ * fit in 32 bits if this condition is true ++ */ ++ if (last > end) { ++ unsigned long gap = last - end; ++ ++ if (gap > gapsize) { ++ gapsize = gap; ++ gapstart = end; ++ } ++ } ++ if (start < last) ++ last = start; ++ } ++#undef e820 ++ ++ /* ++ * See how much we want to round up: start off with ++ * rounding to the next 1MB area. ++ */ ++ round = 0x100000; ++ while ((gapsize >> 4) > round) ++ round += round; ++ /* Fun with two's complement */ ++ pci_mem_start = (gapstart + round) & -round; ++ ++ printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", ++ pci_mem_start, gapstart, gapsize); ++} ++ ++void __init print_memory_map(char *who) ++{ ++ int i; ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ printk(" %s: %016Lx - %016Lx ", who, ++ e820.map[i].addr, ++ e820.map[i].addr + e820.map[i].size); ++ switch (e820.map[i].type) { ++ case E820_RAM: printk("(usable)\n"); ++ break; ++ case E820_RESERVED: ++ printk("(reserved)\n"); ++ break; ++ case E820_ACPI: ++ printk("(ACPI data)\n"); ++ break; ++ case E820_NVS: ++ printk("(ACPI NVS)\n"); ++ break; ++ default: printk("type %u\n", e820.map[i].type); ++ break; ++ } ++ } ++} ++ ++void __init limit_regions(unsigned long long size) ++{ ++ unsigned long long current_addr = 0; ++ int i; ++ ++ print_memory_map("limit_regions start"); ++ for (i = 0; i < e820.nr_map; i++) { ++ current_addr = e820.map[i].addr + e820.map[i].size; ++ if (current_addr < size) ++ continue; ++ ++ if (e820.map[i].type != E820_RAM) ++ continue; ++ ++ if (e820.map[i].addr >= size) { ++ /* ++ * This region starts past the end of the ++ * requested size, skip it completely. 
++ */ ++ e820.nr_map = i; ++ } else { ++ e820.nr_map = i + 1; ++ e820.map[i].size -= current_addr - size; ++ } ++ print_memory_map("limit_regions endfor"); ++ return; ++ } ++#ifdef CONFIG_XEN ++ if (current_addr < size) { ++ /* ++ * The e820 map finished before our requested size so ++ * extend the final entry to the requested address. ++ */ ++ --i; ++ if (e820.map[i].type == E820_RAM) ++ e820.map[i].size -= current_addr - size; ++ else ++ add_memory_region(current_addr, size - current_addr, E820_RAM); ++ } ++#endif ++ print_memory_map("limit_regions endfunc"); ++} ++ ++/* ++ * This function checks if any part of the range is mapped ++ * with type. ++ */ ++int ++e820_any_mapped(u64 start, u64 end, unsigned type) ++{ ++ int i; ++ ++#ifndef CONFIG_XEN ++ for (i = 0; i < e820.nr_map; i++) { ++ const struct e820entry *ei = &e820.map[i]; ++#else ++ if (!is_initial_xendomain()) ++ return 0; ++ for (i = 0; i < machine_e820.nr_map; ++i) { ++ const struct e820entry *ei = &machine_e820.map[i]; ++#endif ++ ++ if (type && ei->type != type) ++ continue; ++ if (ei->addr >= end || ei->addr + ei->size <= start) ++ continue; ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL_GPL(e820_any_mapped); ++ ++ /* ++ * This function checks if the entire range is mapped with type. 
++ * ++ * Note: this function only works correct if the e820 table is sorted and ++ * not-overlapping, which is the case ++ */ ++int __init ++e820_all_mapped(unsigned long s, unsigned long e, unsigned type) ++{ ++ u64 start = s; ++ u64 end = e; ++ int i; ++ ++#ifndef CONFIG_XEN ++ for (i = 0; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++#else ++ if (!is_initial_xendomain()) ++ return 0; ++ for (i = 0; i < machine_e820.nr_map; ++i) { ++ const struct e820entry *ei = &machine_e820.map[i]; ++#endif ++ ++ if (type && ei->type != type) ++ continue; ++ /* is the region (part) in overlap with the current region ?*/ ++ if (ei->addr >= end || ei->addr + ei->size <= start) ++ continue; ++ /* if the region is at the beginning of we move ++ * start to the end of the region since it's ok until there ++ */ ++ if (ei->addr <= start) ++ start = ei->addr + ei->size; ++ /* if start is now at or beyond end, we're done, full ++ * coverage */ ++ if (start >= end) ++ return 1; /* we're done */ ++ } ++ return 0; ++} ++ ++static int __init parse_memmap(char *arg) ++{ ++ if (!arg) ++ return -EINVAL; ++ ++ if (strcmp(arg, "exactmap") == 0) { ++#ifdef CONFIG_CRASH_DUMP ++ /* If we are doing a crash dump, we ++ * still need to know the real mem ++ * size before original memory map is ++ * reset. ++ */ ++ propagate_e820_map(); ++ saved_max_pfn = max_pfn; ++#endif ++ e820.nr_map = 0; ++ user_defined_memmap = 1; ++ } else { ++ /* If the user specifies memory size, we ++ * limit the BIOS-provided memory map to ++ * that size. exactmap can be used to specify ++ * the exact map. mem=number can be used to ++ * trim the existing memory map. 
++ */ ++ unsigned long long start_at, mem_size; ++ ++ mem_size = memparse(arg, &arg); ++ if (*arg == '@') { ++ start_at = memparse(arg+1, &arg); ++ add_memory_region(start_at, mem_size, E820_RAM); ++ } else if (*arg == '#') { ++ start_at = memparse(arg+1, &arg); ++ add_memory_region(start_at, mem_size, E820_ACPI); ++ } else if (*arg == '$') { ++ start_at = memparse(arg+1, &arg); ++ add_memory_region(start_at, mem_size, E820_RESERVED); ++ } else { ++ limit_regions(mem_size); ++ user_defined_memmap = 1; ++ } ++ } ++ return 0; ++} ++early_param("memmap", parse_memmap); ++ ++#ifndef CONFIG_XEN ++void __init update_memory_range(u64 start, u64 size, unsigned old_type, ++ unsigned new_type) ++{ ++ int i; ++ ++ BUG_ON(old_type == new_type); ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++ u64 final_start, final_end; ++ if (ei->type != old_type) ++ continue; ++ /* totally covered? */ ++ if (ei->addr >= start && ei->size <= size) { ++ ei->type = new_type; ++ continue; ++ } ++ /* partially covered */ ++ final_start = max(start, ei->addr); ++ final_end = min(start + size, ei->addr + ei->size); ++ if (final_start >= final_end) ++ continue; ++ add_memory_region(final_start, final_end - final_start, ++ new_type); ++ } ++} ++ ++void __init update_e820(void) ++{ ++ u8 nr_map; ++ ++ nr_map = e820.nr_map; ++ if (sanitize_e820_map(e820.map, &nr_map)) ++ return; ++ e820.nr_map = nr_map; ++ printk(KERN_INFO "modified physical RAM map:\n"); ++ print_memory_map("modified"); ++} ++#endif +diff --git a/arch/x86/kernel/e820_64-xen.c b/arch/x86/kernel/e820_64-xen.c +new file mode 100644 +index 0000000..371c948 +--- /dev/null ++++ b/arch/x86/kernel/e820_64-xen.c +@@ -0,0 +1,1045 @@ ++/* ++ * Handle the memory map. ++ * The functions here do the job until bootmem takes over. ++ * ++ * Getting sanitize_e820_map() in sync with i386 version by applying change: ++ * - Provisions for empty E820 memory regions (reported by certain BIOSes). 
++ * Alex Achenbach , December 2002. ++ * Venkatesh Pallipadi ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct e820map e820 __initdata; ++#ifdef CONFIG_XEN ++struct e820map machine_e820; ++#endif ++ ++/* ++ * PFN of last memory page. ++ */ ++unsigned long end_pfn; ++ ++/* ++ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. ++ * The direct mapping extends to max_pfn_mapped, so that we can directly access ++ * apertures, ACPI and other tables without having to play with fixmaps. ++ */ ++unsigned long max_pfn_mapped; ++ ++/* ++ * Last pfn which the user wants to use. ++ */ ++static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; ++ ++/* ++ * Early reserved memory areas. ++ */ ++#define MAX_EARLY_RES 20 ++ ++struct early_res { ++ unsigned long start, end; ++ char name[16]; ++}; ++static struct early_res early_res[MAX_EARLY_RES] __initdata = { ++#ifndef CONFIG_XEN ++ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ ++#ifdef CONFIG_X86_TRAMPOLINE ++ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, ++#endif ++#endif ++ {} ++}; ++ ++void __init reserve_early(unsigned long start, unsigned long end, char *name) ++{ ++ int i; ++ struct early_res *r; ++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { ++ r = &early_res[i]; ++ if (end > r->start && start < r->end) ++ panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n", ++ start, end - 1, name?name:"", r->start, r->end - 1, r->name); ++ } ++ if (i >= MAX_EARLY_RES) ++ panic("Too many early reservations"); ++ r = &early_res[i]; ++ r->start = start; ++ r->end = end; ++ if (name) ++ strncpy(r->name, name, sizeof(r->name) - 1); ++} ++ ++void __init free_early(unsigned long start, unsigned long end) ++{ ++ struct early_res *r; ++ int i, j; ++ ++ for (i = 0; i < 
MAX_EARLY_RES && early_res[i].end; i++) { ++ r = &early_res[i]; ++ if (start == r->start && end == r->end) ++ break; ++ } ++ if (i >= MAX_EARLY_RES || !early_res[i].end) ++ panic("free_early on not reserved area: %lx-%lx!", start, end); ++ ++ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) ++ ; ++ ++ memmove(&early_res[i], &early_res[i + 1], ++ (j - 1 - i) * sizeof(struct early_res)); ++ ++ early_res[j - 1].end = 0; ++} ++ ++void __init early_res_to_bootmem(unsigned long start, unsigned long end) ++{ ++ int i; ++ unsigned long final_start, final_end; ++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { ++ struct early_res *r = &early_res[i]; ++ final_start = max(start, r->start); ++ final_end = min(end, r->end); ++ if (final_start >= final_end) ++ continue; ++ printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, ++ final_start, final_end - 1, r->name); ++ reserve_bootmem_generic(final_start, final_end - final_start); ++ } ++} ++ ++/* Check for already reserved areas */ ++static inline int __init ++bad_addr(unsigned long *addrp, unsigned long size, unsigned long align) ++{ ++ int i; ++ unsigned long addr = *addrp, last; ++ int changed = 0; ++again: ++ last = addr + size; ++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { ++ struct early_res *r = &early_res[i]; ++ if (last >= r->start && addr < r->end) { ++ *addrp = addr = round_up(r->end, align); ++ changed = 1; ++ goto again; ++ } ++ } ++ return changed; ++} ++ ++/* Check for already reserved areas */ ++static inline int __init ++bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align) ++{ ++ int i; ++ unsigned long addr = *addrp, last; ++ unsigned long size = *sizep; ++ int changed = 0; ++again: ++ last = addr + size; ++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { ++ struct early_res *r = &early_res[i]; ++ if (last > r->start && addr < r->start) { ++ size = r->start - addr; ++ changed = 1; ++ goto again; ++ } ++ if (last > r->end && addr < r->end) { 
++ addr = round_up(r->end, align); ++ size = last - addr; ++ changed = 1; ++ goto again; ++ } ++ if (last <= r->end && addr >= r->start) { ++ (*sizep)++; ++ return 0; ++ } ++ } ++ if (changed) { ++ *addrp = addr; ++ *sizep = size; ++ } ++ return changed; ++} ++/* ++ * This function checks if any part of the range is mapped ++ * with type. ++ */ ++int ++e820_any_mapped(unsigned long start, unsigned long end, unsigned type) ++{ ++ int i; ++ ++#ifndef CONFIG_XEN ++ for (i = 0; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++#else ++ if (!is_initial_xendomain()) ++ return 0; ++ for (i = 0; i < machine_e820.nr_map; i++) { ++ const struct e820entry *ei = &machine_e820.map[i]; ++#endif ++ ++ if (type && ei->type != type) ++ continue; ++ if (ei->addr >= end || ei->addr + ei->size <= start) ++ continue; ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL_GPL(e820_any_mapped); ++ ++/* ++ * This function checks if the entire range is mapped with type. ++ * ++ * Note: this function only works correct if the e820 table is sorted and ++ * not-overlapping, which is the case ++ */ ++int __init e820_all_mapped(unsigned long start, unsigned long end, ++ unsigned type) ++{ ++ int i; ++ ++#ifndef CONFIG_XEN ++ for (i = 0; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++#else ++ if (!is_initial_xendomain()) ++ return 0; ++ for (i = 0; i < machine_e820.nr_map; i++) { ++ const struct e820entry *ei = &machine_e820.map[i]; ++#endif ++ ++ if (type && ei->type != type) ++ continue; ++ /* is the region (part) in overlap with the current region ?*/ ++ if (ei->addr >= end || ei->addr + ei->size <= start) ++ continue; ++ ++ /* if the region is at the beginning of we move ++ * start to the end of the region since it's ok until there ++ */ ++ if (ei->addr <= start) ++ start = ei->addr + ei->size; ++ /* ++ * if start is now at or beyond end, we're done, full ++ * coverage ++ */ ++ if (start >= end) ++ return 1; ++ } ++ return 0; ++} ++ ++/* ++ * Find a free area 
with specified alignment in a specific range. ++ */ ++unsigned long __init find_e820_area(unsigned long start, unsigned long end, ++ unsigned long size, unsigned long align) ++{ ++ int i; ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++ unsigned long addr, last; ++ unsigned long ei_last; ++ ++ if (ei->type != E820_RAM) ++ continue; ++ addr = round_up(ei->addr, align); ++ ei_last = ei->addr + ei->size; ++ if (addr < start) ++ addr = round_up(start, align); ++ if (addr >= ei_last) ++ continue; ++ while (bad_addr(&addr, size, align) && addr+size <= ei_last) ++ ; ++ last = addr + size; ++ if (last > ei_last) ++ continue; ++ if (last > end) ++ continue; ++ return addr; ++ } ++ return -1UL; ++} ++ ++/* ++ * Find next free range after *start ++ */ ++unsigned long __init find_e820_area_size(unsigned long start, ++ unsigned long *sizep, ++ unsigned long align) ++{ ++ int i; ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++ unsigned long addr, last; ++ unsigned long ei_last; ++ ++ if (ei->type != E820_RAM) ++ continue; ++ addr = round_up(ei->addr, align); ++ ei_last = ei->addr + ei->size; ++ if (addr < start) ++ addr = round_up(start, align); ++ if (addr >= ei_last) ++ continue; ++ *sizep = ei_last - addr; ++ while (bad_addr_size(&addr, sizep, align) && ++ addr + *sizep <= ei_last) ++ ; ++ last = addr + *sizep; ++ if (last > ei_last) ++ continue; ++ return addr; ++ } ++ return -1UL; ++ ++} ++/* ++ * Find the highest page frame number we have available ++ */ ++unsigned long __init e820_end_of_ram(void) ++{ ++ unsigned long end_pfn; ++ ++ end_pfn = find_max_pfn_with_active_regions(); ++ ++ if (end_pfn > max_pfn_mapped) ++ max_pfn_mapped = end_pfn; ++ if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT) ++ max_pfn_mapped = MAXMEM>>PAGE_SHIFT; ++ if (end_pfn > end_user_pfn) ++ end_pfn = end_user_pfn; ++ if (end_pfn > max_pfn_mapped) ++ end_pfn = max_pfn_mapped; ++ ++ printk(KERN_INFO "max_pfn_mapped = %lu\n", 
max_pfn_mapped); ++ return end_pfn; ++} ++ ++/* ++ * Mark e820 reserved areas as busy for the resource manager. ++ */ ++void __init e820_reserve_resources(struct e820entry *e820, int nr_map) ++{ ++ int i; ++ struct resource *res; ++ ++ res = alloc_bootmem_low(sizeof(struct resource) * nr_map); ++ for (i = 0; i < nr_map; i++) { ++ switch (e820[i].type) { ++ case E820_RAM: res->name = "System RAM"; break; ++ case E820_ACPI: res->name = "ACPI Tables"; break; ++ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; ++ default: res->name = "reserved"; ++ } ++ res->start = e820[i].addr; ++ res->end = res->start + e820[i].size - 1; ++ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; ++ insert_resource(&iomem_resource, res); ++ res++; ++ } ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * Find the ranges of physical addresses that do not correspond to ++ * e820 RAM areas and mark the corresponding pages as nosave for software ++ * suspend and suspend to RAM. ++ * ++ * This function requires the e820 map to be sorted and without any ++ * overlapping entries and assumes the first e820 area to be RAM. ++ */ ++void __init e820_mark_nosave_regions(void) ++{ ++ int i; ++ unsigned long paddr; ++ ++ paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); ++ for (i = 1; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++ ++ if (paddr < ei->addr) ++ register_nosave_region(PFN_DOWN(paddr), ++ PFN_UP(ei->addr)); ++ ++ paddr = round_down(ei->addr + ei->size, PAGE_SIZE); ++ if (ei->type != E820_RAM) ++ register_nosave_region(PFN_UP(ei->addr), ++ PFN_DOWN(paddr)); ++ ++ if (paddr >= (end_pfn << PAGE_SHIFT)) ++ break; ++ } ++} ++#endif ++ ++/* ++ * Finds an active region in the address range from start_pfn to end_pfn and ++ * returns its range in ei_startpfn and ei_endpfn for the e820 entry. 
++ */ ++static int __init e820_find_active_region(const struct e820entry *ei, ++ unsigned long start_pfn, ++ unsigned long end_pfn, ++ unsigned long *ei_startpfn, ++ unsigned long *ei_endpfn) ++{ ++ *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; ++ *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; ++ ++ /* Skip map entries smaller than a page */ ++ if (*ei_startpfn >= *ei_endpfn) ++ return 0; ++ ++ /* Check if max_pfn_mapped should be updated */ ++ if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped) ++ max_pfn_mapped = *ei_endpfn; ++ ++ /* Skip if map is outside the node */ ++ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || ++ *ei_startpfn >= end_pfn) ++ return 0; ++ ++ /* Check for overlaps */ ++ if (*ei_startpfn < start_pfn) ++ *ei_startpfn = start_pfn; ++ if (*ei_endpfn > end_pfn) ++ *ei_endpfn = end_pfn; ++ ++ /* Obey end_user_pfn to save on memmap */ ++ if (*ei_startpfn >= end_user_pfn) ++ return 0; ++ if (*ei_endpfn > end_user_pfn) ++ *ei_endpfn = end_user_pfn; ++ ++ return 1; ++} ++ ++/* Walk the e820 map and register active regions within a node */ ++void __init ++e820_register_active_regions(int nid, unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ unsigned long ei_startpfn; ++ unsigned long ei_endpfn; ++ int i; ++ ++ for (i = 0; i < e820.nr_map; i++) ++ if (e820_find_active_region(&e820.map[i], ++ start_pfn, end_pfn, ++ &ei_startpfn, &ei_endpfn)) ++ add_active_range(nid, ei_startpfn, ei_endpfn); ++} ++ ++/* ++ * Add a memory region to the kernel e820 map. ++ */ ++void __init add_memory_region(unsigned long start, unsigned long size, int type) ++{ ++ int x = e820.nr_map; ++ ++ if (x == E820MAX) { ++ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); ++ return; ++ } ++ ++ e820.map[x].addr = start; ++ e820.map[x].size = size; ++ e820.map[x].type = type; ++ e820.nr_map++; ++} ++ ++/* ++ * Find the hole size (in bytes) in the memory range. 
++ * @start: starting address of the memory range to scan ++ * @end: ending address of the memory range to scan ++ */ ++unsigned long __init e820_hole_size(unsigned long start, unsigned long end) ++{ ++ unsigned long start_pfn = start >> PAGE_SHIFT; ++ unsigned long end_pfn = end >> PAGE_SHIFT; ++ unsigned long ei_startpfn, ei_endpfn, ram = 0; ++ int i; ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ if (e820_find_active_region(&e820.map[i], ++ start_pfn, end_pfn, ++ &ei_startpfn, &ei_endpfn)) ++ ram += ei_endpfn - ei_startpfn; ++ } ++ return end - start - (ram << PAGE_SHIFT); ++} ++ ++static void __init e820_print_map(char *who) ++{ ++ int i; ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ printk(KERN_INFO " %s: %016Lx - %016Lx ", who, ++ (unsigned long long) e820.map[i].addr, ++ (unsigned long long) ++ (e820.map[i].addr + e820.map[i].size)); ++ switch (e820.map[i].type) { ++ case E820_RAM: ++ printk(KERN_CONT "(usable)\n"); ++ break; ++ case E820_RESERVED: ++ printk(KERN_CONT "(reserved)\n"); ++ break; ++ case E820_ACPI: ++ printk(KERN_CONT "(ACPI data)\n"); ++ break; ++ case E820_NVS: ++ printk(KERN_CONT "(ACPI NVS)\n"); ++ break; ++ default: ++ printk(KERN_CONT "type %u\n", e820.map[i].type); ++ break; ++ } ++ } ++} ++ ++/* ++ * Sanitize the BIOS e820 map. ++ * ++ * Some e820 responses include overlapping entries. The following ++ * replaces the original e820 map with a new one, removing overlaps. 
++ * ++ */ ++static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) ++{ ++ struct change_member { ++ struct e820entry *pbios; /* pointer to original bios entry */ ++ unsigned long long addr; /* address for this change point */ ++ }; ++ static struct change_member change_point_list[2*E820MAX] __initdata; ++ static struct change_member *change_point[2*E820MAX] __initdata; ++ static struct e820entry *overlap_list[E820MAX] __initdata; ++ static struct e820entry new_bios[E820MAX] __initdata; ++ struct change_member *change_tmp; ++ unsigned long current_type, last_type; ++ unsigned long long last_addr; ++ int chgidx, still_changing; ++ int overlap_entries; ++ int new_bios_entry; ++ int old_nr, new_nr, chg_nr; ++ int i; ++ ++ /* ++ Visually we're performing the following ++ (1,2,3,4 = memory types)... ++ ++ Sample memory map (w/overlaps): ++ ____22__________________ ++ ______________________4_ ++ ____1111________________ ++ _44_____________________ ++ 11111111________________ ++ ____________________33__ ++ ___________44___________ ++ __________33333_________ ++ ______________22________ ++ ___________________2222_ ++ _________111111111______ ++ _____________________11_ ++ _________________4______ ++ ++ Sanitized equivalent (no overlap): ++ 1_______________________ ++ _44_____________________ ++ ___1____________________ ++ ____22__________________ ++ ______11________________ ++ _________1______________ ++ __________3_____________ ++ ___________44___________ ++ _____________33_________ ++ _______________2________ ++ ________________1_______ ++ _________________4______ ++ ___________________2____ ++ ____________________33__ ++ ______________________4_ ++ */ ++ ++ /* if there's only one memory region, don't bother */ ++ if (*pnr_map < 2) ++ return -1; ++ ++ old_nr = *pnr_map; ++ ++ /* bail out if we find any unreasonable addresses in bios map */ ++ for (i = 0; i < old_nr; i++) ++ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) ++ return -1; ++ 
++ /* create pointers for initial change-point information (for sorting) */ ++ for (i = 0; i < 2 * old_nr; i++) ++ change_point[i] = &change_point_list[i]; ++ ++ /* record all known change-points (starting and ending addresses), ++ omitting those that are for empty memory regions */ ++ chgidx = 0; ++ for (i = 0; i < old_nr; i++) { ++ if (biosmap[i].size != 0) { ++ change_point[chgidx]->addr = biosmap[i].addr; ++ change_point[chgidx++]->pbios = &biosmap[i]; ++ change_point[chgidx]->addr = biosmap[i].addr + ++ biosmap[i].size; ++ change_point[chgidx++]->pbios = &biosmap[i]; ++ } ++ } ++ chg_nr = chgidx; ++ ++ /* sort change-point list by memory addresses (low -> high) */ ++ still_changing = 1; ++ while (still_changing) { ++ still_changing = 0; ++ for (i = 1; i < chg_nr; i++) { ++ unsigned long long curaddr, lastaddr; ++ unsigned long long curpbaddr, lastpbaddr; ++ ++ curaddr = change_point[i]->addr; ++ lastaddr = change_point[i - 1]->addr; ++ curpbaddr = change_point[i]->pbios->addr; ++ lastpbaddr = change_point[i - 1]->pbios->addr; ++ ++ /* ++ * swap entries, when: ++ * ++ * curaddr > lastaddr or ++ * curaddr == lastaddr and curaddr == curpbaddr and ++ * lastaddr != lastpbaddr ++ */ ++ if (curaddr < lastaddr || ++ (curaddr == lastaddr && curaddr == curpbaddr && ++ lastaddr != lastpbaddr)) { ++ change_tmp = change_point[i]; ++ change_point[i] = change_point[i-1]; ++ change_point[i-1] = change_tmp; ++ still_changing = 1; ++ } ++ } ++ } ++ ++ /* create a new bios memory map, removing overlaps */ ++ overlap_entries = 0; /* number of entries in the overlap table */ ++ new_bios_entry = 0; /* index for creating new bios map entries */ ++ last_type = 0; /* start with undefined memory type */ ++ last_addr = 0; /* start with 0 as last starting address */ ++ ++ /* loop through change-points, determining affect on the new bios map */ ++ for (chgidx = 0; chgidx < chg_nr; chgidx++) { ++ /* keep track of all overlapping bios entries */ ++ if (change_point[chgidx]->addr == ++ 
change_point[chgidx]->pbios->addr) { ++ /* ++ * add map entry to overlap list (> 1 entry ++ * implies an overlap) ++ */ ++ overlap_list[overlap_entries++] = ++ change_point[chgidx]->pbios; ++ } else { ++ /* ++ * remove entry from list (order independent, ++ * so swap with last) ++ */ ++ for (i = 0; i < overlap_entries; i++) { ++ if (overlap_list[i] == ++ change_point[chgidx]->pbios) ++ overlap_list[i] = ++ overlap_list[overlap_entries-1]; ++ } ++ overlap_entries--; ++ } ++ /* ++ * if there are overlapping entries, decide which ++ * "type" to use (larger value takes precedence -- ++ * 1=usable, 2,3,4,4+=unusable) ++ */ ++ current_type = 0; ++ for (i = 0; i < overlap_entries; i++) ++ if (overlap_list[i]->type > current_type) ++ current_type = overlap_list[i]->type; ++ /* ++ * continue building up new bios map based on this ++ * information ++ */ ++ if (current_type != last_type) { ++ if (last_type != 0) { ++ new_bios[new_bios_entry].size = ++ change_point[chgidx]->addr - last_addr; ++ /* ++ * move forward only if the new size ++ * was non-zero ++ */ ++ if (new_bios[new_bios_entry].size != 0) ++ /* ++ * no more space left for new ++ * bios entries ? ++ */ ++ if (++new_bios_entry >= E820MAX) ++ break; ++ } ++ if (current_type != 0) { ++ new_bios[new_bios_entry].addr = ++ change_point[chgidx]->addr; ++ new_bios[new_bios_entry].type = current_type; ++ last_addr = change_point[chgidx]->addr; ++ } ++ last_type = current_type; ++ } ++ } ++ /* retain count for new bios entries */ ++ new_nr = new_bios_entry; ++ ++ /* copy new bios mapping into original location */ ++ memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); ++ *pnr_map = new_nr; ++ ++ return 0; ++} ++ ++/* ++ * Copy the BIOS e820 map into a safe place. ++ * ++ * Sanity-check it while we're at it.. ++ * ++ * If we're lucky and live on a modern system, the setup code ++ * will have given us a memory map that we can use to properly ++ * set up memory. If we aren't, we'll fake a memory map. 
++ */ ++static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) ++{ ++#ifndef CONFIG_XEN ++ /* Only one memory region (or negative)? Ignore it */ ++ if (nr_map < 2) ++ return -1; ++#else ++ BUG_ON(nr_map < 1); ++#endif ++ ++ do { ++ u64 start = biosmap->addr; ++ u64 size = biosmap->size; ++ u64 end = start + size; ++ u32 type = biosmap->type; ++ ++ /* Overflow in 64 bits? Ignore the memory map. */ ++ if (start > end) ++ return -1; ++ ++ add_memory_region(start, size, type); ++ } while (biosmap++, --nr_map); ++ ++#ifdef CONFIG_XEN ++ if (is_initial_xendomain()) { ++ struct xen_memory_map memmap; ++ ++ memmap.nr_entries = E820MAX; ++ set_xen_guest_handle(memmap.buffer, machine_e820.map); ++ ++ if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) ++ BUG(); ++ machine_e820.nr_map = memmap.nr_entries; ++ } else ++ machine_e820 = e820; ++#endif ++ ++ return 0; ++} ++ ++static void early_panic(char *msg) ++{ ++ early_printk(msg); ++ panic(msg); ++} ++ ++/* We're not void only for x86 32-bit compat */ ++char * __init machine_specific_memory_setup(void) ++{ ++#ifndef CONFIG_XEN ++ char *who = "BIOS-e820"; ++ /* ++ * Try to copy the BIOS-supplied E820-map. ++ * ++ * Otherwise fake a memory map; one section from 0k->640k, ++ * the next section from 1mb->appropriate_mem_k ++ */ ++ sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); ++ if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) ++ early_panic("Cannot find a valid memory map"); ++#else /* CONFIG_XEN */ ++ char *who = "Xen"; ++ int rc; ++ struct xen_memory_map memmap; ++ /* ++ * This is rather large for a stack variable but this early in ++ * the boot process we know we have plenty slack space. 
++ */ ++ struct e820entry map[E820MAX]; ++ ++ memmap.nr_entries = E820MAX; ++ set_xen_guest_handle(memmap.buffer, map); ++ ++ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); ++ if ( rc == -ENOSYS ) { ++ memmap.nr_entries = 1; ++ map[0].addr = 0ULL; ++ map[0].size = xen_start_info->nr_pages << PAGE_SHIFT; ++ /* 8MB slack (to balance backend allocations). */ ++ map[0].size += 8 << 20; ++ map[0].type = E820_RAM; ++ rc = 0; ++ } ++ BUG_ON(rc); ++ ++ sanitize_e820_map(map, (char *)&memmap.nr_entries); ++ ++ if (copy_e820_map(map, (char)memmap.nr_entries) < 0) ++ early_panic("Cannot find a valid memory map"); ++#endif ++ printk(KERN_INFO "BIOS-provided physical RAM map:\n"); ++ e820_print_map(who); ++ ++ /* In case someone cares... */ ++ return who; ++} ++ ++static int __init parse_memopt(char *p) ++{ ++ int i; ++ unsigned long current_end; ++ unsigned long end; ++ ++ if (!p) ++ return -EINVAL; ++ end_user_pfn = memparse(p, &p); ++ end_user_pfn >>= PAGE_SHIFT; ++ ++ end = end_user_pfn<> PAGE_SHIFT); ++ } ++ return *p == '\0' ? 0 : -EINVAL; ++} ++early_param("memmap", parse_memmap_opt); ++ ++void __init finish_e820_parsing(void) ++{ ++ if (userdef) { ++ char nr = e820.nr_map; ++ ++ if (sanitize_e820_map(e820.map, &nr) < 0) ++ early_panic("Invalid user supplied memory map"); ++ e820.nr_map = nr; ++ ++ printk(KERN_INFO "user-defined physical RAM map:\n"); ++ e820_print_map("user"); ++ } ++} ++ ++#ifndef CONFIG_XEN ++void __init update_memory_range(u64 start, u64 size, unsigned old_type, ++ unsigned new_type) ++{ ++ int i; ++ ++ BUG_ON(old_type == new_type); ++ ++ for (i = 0; i < e820.nr_map; i++) { ++ struct e820entry *ei = &e820.map[i]; ++ u64 final_start, final_end; ++ if (ei->type != old_type) ++ continue; ++ /* totally covered? 
*/ ++ if (ei->addr >= start && ei->size <= size) { ++ ei->type = new_type; ++ continue; ++ } ++ /* partially covered */ ++ final_start = max(start, ei->addr); ++ final_end = min(start + size, ei->addr + ei->size); ++ if (final_start >= final_end) ++ continue; ++ add_memory_region(final_start, final_end - final_start, ++ new_type); ++ } ++} ++ ++void __init update_e820(void) ++{ ++ u8 nr_map; ++ ++ nr_map = e820.nr_map; ++ if (sanitize_e820_map(e820.map, &nr_map)) ++ return; ++ e820.nr_map = nr_map; ++ printk(KERN_INFO "modified physical RAM map:\n"); ++ e820_print_map("modified"); ++} ++#endif ++ ++unsigned long pci_mem_start = 0xaeedbabe; ++EXPORT_SYMBOL(pci_mem_start); ++ ++/* ++ * Search for the biggest gap in the low 32 bits of the e820 ++ * memory space. We pass this space to PCI to assign MMIO resources ++ * for hotplug or unconfigured devices in. ++ * Hopefully the BIOS let enough space left. ++ */ ++__init void e820_setup_gap(struct e820entry *e820, int nr_map) ++{ ++ unsigned long gapstart, gapsize, round; ++ unsigned long last; ++ int i; ++ int found = 0; ++ ++ last = 0x100000000ull; ++ gapstart = 0x10000000; ++ gapsize = 0x400000; ++ i = nr_map; ++ while (--i >= 0) { ++ unsigned long long start = e820[i].addr; ++ unsigned long long end = start + e820[i].size; ++ ++ /* ++ * Since "last" is at most 4GB, we know we'll ++ * fit in 32 bits if this condition is true ++ */ ++ if (last > end) { ++ unsigned long gap = last - end; ++ ++ if (gap > gapsize) { ++ gapsize = gap; ++ gapstart = end; ++ found = 1; ++ } ++ } ++ if (start < last) ++ last = start; ++ } ++ ++ if (!found) { ++ gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; ++ printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " ++ "address range\n" ++ KERN_ERR "PCI: Unassigned devices with 32bit resource " ++ "registers may break!\n"); ++ } ++ ++ /* ++ * See how much we want to round up: start off with ++ * rounding to the next 1MB area. 
++ */ ++ round = 0x100000; ++ while ((gapsize >> 4) > round) ++ round += round; ++ /* Fun with two's complement */ ++ pci_mem_start = (gapstart + round) & -round; ++ ++ printk(KERN_INFO ++ "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", ++ pci_mem_start, gapstart, gapsize); ++} ++ ++int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) ++{ ++ int i; ++ ++ if (slot < 0 || slot >= e820.nr_map) ++ return -1; ++ for (i = slot; i < e820.nr_map; i++) { ++ if (e820.map[i].type != E820_RAM) ++ continue; ++ break; ++ } ++ if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) ++ return -1; ++ *addr = e820.map[i].addr; ++ *size = min_t(u64, e820.map[i].size + e820.map[i].addr, ++ max_pfn << PAGE_SHIFT) - *addr; ++ return i + 1; ++} +diff --git a/arch/x86/kernel/early_printk-xen.c b/arch/x86/kernel/early_printk-xen.c +new file mode 100644 +index 0000000..72d0463 +--- /dev/null ++++ b/arch/x86/kernel/early_printk-xen.c +@@ -0,0 +1,285 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Simple VGA output */ ++#define VGABASE (__ISA_IO_base + 0xb8000) ++ ++#ifndef CONFIG_XEN ++static int max_ypos = 25, max_xpos = 80; ++static int current_ypos = 25, current_xpos; ++ ++static void early_vga_write(struct console *con, const char *str, unsigned n) ++{ ++ char c; ++ int i, k, j; ++ ++ while ((c = *str++) != '\0' && n-- > 0) { ++ if (current_ypos >= max_ypos) { ++ /* scroll 1 line up */ ++ for (k = 1, j = 0; k < max_ypos; k++, j++) { ++ for (i = 0; i < max_xpos; i++) { ++ writew(readw(VGABASE+2*(max_xpos*k+i)), ++ VGABASE + 2*(max_xpos*j + i)); ++ } ++ } ++ for (i = 0; i < max_xpos; i++) ++ writew(0x720, VGABASE + 2*(max_xpos*j + i)); ++ current_ypos = max_ypos-1; ++ } ++ if (c == '\n') { ++ current_xpos = 0; ++ current_ypos++; ++ } else if (c != '\r') { ++ writew(((0x7 << 8) | (unsigned short) c), ++ VGABASE + 2*(max_xpos*current_ypos + ++ current_xpos++)); ++ if (current_xpos >= max_xpos) { 
++ current_xpos = 0; ++ current_ypos++; ++ } ++ } ++ } ++} ++ ++static struct console early_vga_console = { ++ .name = "earlyvga", ++ .write = early_vga_write, ++ .flags = CON_PRINTBUFFER, ++ .index = -1, ++}; ++ ++/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ ++ ++static int early_serial_base = 0x3f8; /* ttyS0 */ ++ ++#define XMTRDY 0x20 ++ ++#define DLAB 0x80 ++ ++#define TXR 0 /* Transmit register (WRITE) */ ++#define RXR 0 /* Receive register (READ) */ ++#define IER 1 /* Interrupt Enable */ ++#define IIR 2 /* Interrupt ID */ ++#define FCR 2 /* FIFO control */ ++#define LCR 3 /* Line control */ ++#define MCR 4 /* Modem control */ ++#define LSR 5 /* Line Status */ ++#define MSR 6 /* Modem Status */ ++#define DLL 0 /* Divisor Latch Low */ ++#define DLH 1 /* Divisor latch High */ ++ ++static int early_serial_putc(unsigned char ch) ++{ ++ unsigned timeout = 0xffff; ++ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) ++ cpu_relax(); ++ outb(ch, early_serial_base + TXR); ++ return timeout ? 
0 : -1; ++} ++ ++static void early_serial_write(struct console *con, const char *s, unsigned n) ++{ ++ while (*s && n-- > 0) { ++ if (*s == '\n') ++ early_serial_putc('\r'); ++ early_serial_putc(*s); ++ s++; ++ } ++} ++ ++#define DEFAULT_BAUD 9600 ++ ++static __init void early_serial_init(char *s) ++{ ++ unsigned char c; ++ unsigned divisor; ++ unsigned baud = DEFAULT_BAUD; ++ char *e; ++ ++ if (*s == ',') ++ ++s; ++ ++ if (*s) { ++ unsigned port; ++ if (!strncmp(s, "0x", 2)) { ++ early_serial_base = simple_strtoul(s, &e, 16); ++ } else { ++ static int bases[] = { 0x3f8, 0x2f8 }; ++ ++ if (!strncmp(s, "ttyS", 4)) ++ s += 4; ++ port = simple_strtoul(s, &e, 10); ++ if (port > 1 || s == e) ++ port = 0; ++ early_serial_base = bases[port]; ++ } ++ s += strcspn(s, ","); ++ if (*s == ',') ++ s++; ++ } ++ ++ outb(0x3, early_serial_base + LCR); /* 8n1 */ ++ outb(0, early_serial_base + IER); /* no interrupt */ ++ outb(0, early_serial_base + FCR); /* no fifo */ ++ outb(0x3, early_serial_base + MCR); /* DTR + RTS */ ++ ++ if (*s) { ++ baud = simple_strtoul(s, &e, 0); ++ if (baud == 0 || s == e) ++ baud = DEFAULT_BAUD; ++ } ++ ++ divisor = 115200 / baud; ++ c = inb(early_serial_base + LCR); ++ outb(c | DLAB, early_serial_base + LCR); ++ outb(divisor & 0xff, early_serial_base + DLL); ++ outb((divisor >> 8) & 0xff, early_serial_base + DLH); ++ outb(c & ~DLAB, early_serial_base + LCR); ++} ++ ++#else /* CONFIG_XEN */ ++ ++static void ++early_serial_write(struct console *con, const char *s, unsigned count) ++{ ++ int n; ++ ++ while (count > 0) { ++ n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s); ++ if (n <= 0) ++ break; ++ count -= n; ++ s += n; ++ } ++} ++ ++static __init void early_serial_init(char *s) ++{ ++} ++ ++/* ++ * No early VGA console on Xen, as we do not have convenient ISA-space ++ * mappings. Someone should fix this for domain 0. For now, use fake serial. 
++ */ ++#define early_vga_console early_serial_console ++#define xenboot_console early_serial_console ++ ++#endif ++ ++static struct console early_serial_console = { ++ .name = "earlyser", ++ .write = early_serial_write, ++ .flags = CON_PRINTBUFFER, ++ .index = -1, ++}; ++ ++/* Console interface to a host file on AMD's SimNow! */ ++ ++static int simnow_fd; ++ ++enum { ++ MAGIC1 = 0xBACCD00A, ++ MAGIC2 = 0xCA110000, ++ XOPEN = 5, ++ XWRITE = 4, ++}; ++ ++static noinline long simnow(long cmd, long a, long b, long c) ++{ ++ long ret; ++ asm volatile("cpuid" : ++ "=a" (ret) : ++ "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); ++ return ret; ++} ++ ++static void __init simnow_init(char *str) ++{ ++ char *fn = "klog"; ++ if (*str == '=') ++ fn = ++str; ++ /* error ignored */ ++ simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); ++} ++ ++static void simnow_write(struct console *con, const char *s, unsigned n) ++{ ++ simnow(XWRITE, simnow_fd, (unsigned long)s, n); ++} ++ ++static struct console simnow_console = { ++ .name = "simnow", ++ .write = simnow_write, ++ .flags = CON_PRINTBUFFER, ++ .index = -1, ++}; ++ ++/* Direct interface for emergencies */ ++static struct console *early_console = &early_vga_console; ++static int early_console_initialized; ++ ++void early_printk(const char *fmt, ...) 
++{ ++ char buf[512]; ++ int n; ++ va_list ap; ++ ++ va_start(ap, fmt); ++ n = vscnprintf(buf, 512, fmt, ap); ++ early_console->write(early_console, buf, n); ++ va_end(ap); ++} ++ ++static int __initdata keep_early; ++ ++static int __init setup_early_printk(char *buf) ++{ ++ if (!buf) ++ return 0; ++ ++ if (early_console_initialized) ++ return 0; ++ early_console_initialized = 1; ++ ++ if (strstr(buf, "keep")) ++ keep_early = 1; ++ ++ if (!strncmp(buf, "serial", 6)) { ++ early_serial_init(buf + 6); ++ early_console = &early_serial_console; ++ } else if (!strncmp(buf, "ttyS", 4)) { ++ early_serial_init(buf); ++ early_console = &early_serial_console; ++ } else if (!strncmp(buf, "vga", 3)) { ++#ifndef CONFIG_XEN ++ && boot_params.screen_info.orig_video_isVGA == 1) { ++ max_xpos = boot_params.screen_info.orig_video_cols; ++ max_ypos = boot_params.screen_info.orig_video_lines; ++ current_ypos = boot_params.screen_info.orig_y; ++#endif ++ early_console = &early_vga_console; ++ } else if (!strncmp(buf, "simnow", 6)) { ++ simnow_init(buf + 6); ++ early_console = &simnow_console; ++ keep_early = 1; ++#ifdef CONFIG_XEN ++ } else if (!strncmp(buf, "xen", 3)) { ++ early_console = &xenboot_console; ++#endif ++ } ++ ++ if (keep_early) ++ early_console->flags &= ~CON_BOOT; ++ else ++ early_console->flags |= CON_BOOT; ++ register_console(early_console); ++ return 0; ++} ++early_param("earlyprintk", setup_early_printk); +diff --git a/arch/x86/kernel/entry_32-xen.S b/arch/x86/kernel/entry_32-xen.S +new file mode 100644 +index 0000000..ca66938 +--- /dev/null ++++ b/arch/x86/kernel/entry_32-xen.S +@@ -0,0 +1,1404 @@ ++/* ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ */ ++ ++/* ++ * entry.S contains the system-call and fault low-level handling routines. ++ * This also contains the timer-interrupt handler, as well as all interrupts ++ * and faults that can result in a task-switch. 
++ * ++ * NOTE: This code handles signal-recognition, which happens every time ++ * after a timer-interrupt and after each system call. ++ * ++ * I changed all the .align's to 4 (16 byte alignment), as that's faster ++ * on a 486. ++ * ++ * Stack layout in 'syscall_exit': ++ * ptrace needs to have all regs on the stack. ++ * if the order here is changed, it needs to be ++ * updated in fork.c:copy_process, signal.c:do_signal, ++ * ptrace.c and ptrace.h ++ * ++ * 0(%esp) - %ebx ++ * 4(%esp) - %ecx ++ * 8(%esp) - %edx ++ * C(%esp) - %esi ++ * 10(%esp) - %edi ++ * 14(%esp) - %ebp ++ * 18(%esp) - %eax ++ * 1C(%esp) - %ds ++ * 20(%esp) - %es ++ * 24(%esp) - %fs ++ * 28(%esp) - orig_eax ++ * 2C(%esp) - %eip ++ * 30(%esp) - %cs ++ * 34(%esp) - %eflags ++ * 38(%esp) - %oldesp ++ * 3C(%esp) - %oldss ++ * ++ * "current" is in register %ebx during any slow entries. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "irq_vectors.h" ++#include ++ ++/* ++ * We use macros for low-level operations which need to be overridden ++ * for paravirtualization. The following will never clobber any registers: ++ * INTERRUPT_RETURN (aka. "iret") ++ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") ++ * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). ++ * ++ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must ++ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). ++ * Allowing a register to be clobbered can shrink the paravirt replacement ++ * enough to patch inline, increasing performance. ++ */ ++ ++#define nr_syscalls ((syscall_table_size)/4) ++ ++/* Pseudo-eflags. 
*/ ++NMI_MASK = 0x80000000 ++ ++#ifdef CONFIG_PREEMPT ++#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF ++#else ++#define preempt_stop(clobbers) ++#define resume_kernel restore_nocheck ++#endif ++ ++.macro TRACE_IRQS_IRET ++#ifdef CONFIG_TRACE_IRQFLAGS ++ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? ++ jz 1f ++ TRACE_IRQS_ON ++1: ++#endif ++.endm ++ ++#ifdef CONFIG_VM86 ++#define resume_userspace_sig check_userspace ++#else ++#define resume_userspace_sig resume_userspace ++#endif ++ ++#define SAVE_ALL \ ++ cld; \ ++ pushl %fs; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ /*CFI_REL_OFFSET fs, 0;*/\ ++ pushl %es; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ /*CFI_REL_OFFSET es, 0;*/\ ++ pushl %ds; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ /*CFI_REL_OFFSET ds, 0;*/\ ++ pushl %eax; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET eax, 0;\ ++ pushl %ebp; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET ebp, 0;\ ++ pushl %edi; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET edi, 0;\ ++ pushl %esi; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET esi, 0;\ ++ pushl %edx; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET edx, 0;\ ++ pushl %ecx; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET ecx, 0;\ ++ pushl %ebx; \ ++ CFI_ADJUST_CFA_OFFSET 4;\ ++ CFI_REL_OFFSET ebx, 0;\ ++ movl $(__USER_DS), %edx; \ ++ movl %edx, %ds; \ ++ movl %edx, %es; \ ++ movl $(__KERNEL_PERCPU), %edx; \ ++ movl %edx, %fs ++ ++#define RESTORE_INT_REGS \ ++ popl %ebx; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE ebx;\ ++ popl %ecx; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE ecx;\ ++ popl %edx; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE edx;\ ++ popl %esi; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE esi;\ ++ popl %edi; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE edi;\ ++ popl %ebp; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE ebp;\ ++ popl %eax; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ CFI_RESTORE eax ++ ++#define RESTORE_REGS \ ++ RESTORE_INT_REGS; \ ++1: popl %ds; \ ++ 
CFI_ADJUST_CFA_OFFSET -4;\ ++ /*CFI_RESTORE ds;*/\ ++2: popl %es; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ /*CFI_RESTORE es;*/\ ++3: popl %fs; \ ++ CFI_ADJUST_CFA_OFFSET -4;\ ++ /*CFI_RESTORE fs;*/\ ++.pushsection .fixup,"ax"; \ ++4: movl $0,(%esp); \ ++ jmp 1b; \ ++5: movl $0,(%esp); \ ++ jmp 2b; \ ++6: movl $0,(%esp); \ ++ jmp 3b; \ ++.section __ex_table,"a";\ ++ .align 4; \ ++ .long 1b,4b; \ ++ .long 2b,5b; \ ++ .long 3b,6b; \ ++.popsection ++ ++#define RING0_INT_FRAME \ ++ CFI_STARTPROC simple;\ ++ CFI_SIGNAL_FRAME;\ ++ CFI_DEF_CFA esp, 3*4;\ ++ /*CFI_OFFSET cs, -2*4;*/\ ++ CFI_OFFSET eip, -3*4 ++ ++#define RING0_EC_FRAME \ ++ CFI_STARTPROC simple;\ ++ CFI_SIGNAL_FRAME;\ ++ CFI_DEF_CFA esp, 4*4;\ ++ /*CFI_OFFSET cs, -2*4;*/\ ++ CFI_OFFSET eip, -3*4 ++ ++#define RING0_PTREGS_FRAME \ ++ CFI_STARTPROC simple;\ ++ CFI_SIGNAL_FRAME;\ ++ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ ++ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ ++ CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ ++ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ ++ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ ++ CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ ++ CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ ++ CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ ++ CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ ++ CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ ++ CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ ++ CFI_OFFSET ebx, PT_EBX-PT_OLDESP ++ ++ENTRY(ret_from_fork) ++ CFI_STARTPROC ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ call schedule_tail ++ GET_THREAD_INFO(%ebp) ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ pushl $0x0202 # Reset kernel eflags ++ CFI_ADJUST_CFA_OFFSET 4 ++ popfl ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp syscall_exit ++ CFI_ENDPROC ++END(ret_from_fork) ++ ++/* ++ * Return to user mode is not as complex as all this looks, ++ * but we want the default path for a system call return to ++ * go as quickly as possible which is why some of this is ++ * less clear than it otherwise should be. 
++ */ ++ ++ # userspace resumption stub bypassing syscall exit tracing ++ ALIGN ++ RING0_PTREGS_FRAME ++ret_from_exception: ++ preempt_stop(CLBR_ANY) ++ret_from_intr: ++ GET_THREAD_INFO(%ebp) ++check_userspace: ++ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS ++ movb PT_CS(%esp), %al ++ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax ++ cmpl $USER_RPL, %eax ++ jb resume_kernel # not returning to v8086 or userspace ++ ++ENTRY(resume_userspace) ++ LOCKDEP_SYS_EXIT ++ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt ++ # setting need_resched or sigpending ++ # between sampling and the iret ++ TRACE_IRQS_OFF ++ movl TI_flags(%ebp), %ecx ++ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on ++ # int/exception return? ++ jne work_pending ++ jmp restore_all ++END(ret_from_exception) ++ ++#ifdef CONFIG_PREEMPT ++ENTRY(resume_kernel) ++ DISABLE_INTERRUPTS(CLBR_ANY) ++ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? ++ jnz restore_nocheck ++need_resched: ++ movl TI_flags(%ebp), %ecx # need_resched set ? ++ testb $_TIF_NEED_RESCHED, %cl ++ jz restore_all ++ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? ++ jz restore_all ++ call preempt_schedule_irq ++ jmp need_resched ++END(resume_kernel) ++#endif ++ CFI_ENDPROC ++ ++ .macro test_tif ti_reg # system call tracing in operation / emulation ++ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ ++ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(\ti_reg) ++ .endm ++ ++/* SYSENTER_RETURN points to after the "sysenter" instruction in ++ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. 
*/ ++ ++ # sysenter call handler stub ++ENTRY(ia32_sysenter_target) ++ CFI_STARTPROC simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA esp, 0 ++ CFI_REGISTER esp, ebp ++ movl SYSENTER_stack_sp0(%esp),%esp ++sysenter_past_esp: ++ /* ++ * Interrupts are disabled here, but we can't trace it until ++ * enough kernel state to call TRACE_IRQS_OFF can be called - but ++ * we immediately enable interrupts at that point anyway. ++ */ ++ pushl $(__USER_DS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET ss, 0*/ ++ pushl %ebp ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET esp, 0 ++ pushfl ++ orl $X86_EFLAGS_IF, (%esp) ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $(__USER_CS) ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET cs, 0*/ ++ /* ++ * Push current_thread_info()->sysenter_return to the stack. ++ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words ++ * pushed above; +8 corresponds to copy_thread's esp0 setting. ++ */ ++ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET eip, 0 ++ ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ ++/* ++ * Load the potential sixth argument from user stack. ++ * Careful about security. 
++ */ ++ cmpl $__PAGE_OFFSET-3,%ebp ++ jae syscall_fault ++1: movl (%ebp),%ebp ++ movl %ebp,PT_EBP(%esp) ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,syscall_fault ++.previous ++ ++ GET_THREAD_INFO(%ebp) ++ test_tif %ebp ++ jnz syscall_trace_entry ++ cmpl $(nr_syscalls), %eax ++ jae syscall_badsys ++ call *sys_call_table(,%eax,4) ++ movl %eax,PT_EAX(%esp) ++ LOCKDEP_SYS_EXIT ++ DISABLE_INTERRUPTS(CLBR_ANY) ++ TRACE_IRQS_OFF ++ movl TI_flags(%ebp), %ecx ++ testw $_TIF_ALLWORK_MASK, %cx ++ jne syscall_exit_work ++/* if something modifies registers it must also disable sysexit */ ++ movl PT_EIP(%esp), %edx ++ movl PT_OLDESP(%esp), %ecx ++ xorl %ebp,%ebp ++ TRACE_IRQS_ON ++1: mov PT_FS(%esp), %fs ++ ENABLE_INTERRUPTS_SYSCALL_RET ++ CFI_ENDPROC ++.pushsection .fixup,"ax" ++2: movl $0,PT_FS(%esp) ++ jmp 1b ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,2b ++.popsection ++ENDPROC(ia32_sysenter_target) ++ ++ # pv sysenter call handler stub ++ENTRY(ia32pv_sysenter_target) ++ RING0_INT_FRAME ++ movl $__USER_DS,16(%esp) ++ movl %ebp,12(%esp) ++ movl $__USER_CS,4(%esp) ++ addl $4,%esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ ++ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) ++ CFI_ADJUST_CFA_OFFSET 4 ++/* ++ * Load the potential sixth argument from user stack. ++ * Careful about security. 
++ */ ++ cmpl $__PAGE_OFFSET-3,%ebp ++ jae syscall_fault ++1: movl (%ebp),%ebp ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,syscall_fault ++.previous ++ /* fall through */ ++ CFI_ENDPROC ++ENDPROC(ia32pv_sysenter_target) ++ ++ # system call handler stub ++ENTRY(system_call) ++ RING0_INT_FRAME # can't unwind into user space anyway ++ pushl %eax # save orig_eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ GET_THREAD_INFO(%ebp) ++ test_tif %ebp ++ jnz syscall_trace_entry ++ cmpl $(nr_syscalls), %eax ++ jae syscall_badsys ++syscall_call: ++ call *sys_call_table(,%eax,4) ++ movl %eax,PT_EAX(%esp) # store the return value ++syscall_exit: ++ LOCKDEP_SYS_EXIT ++ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt ++ # setting need_resched or sigpending ++ # between sampling and the iret ++ TRACE_IRQS_OFF ++ testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit ++ jz no_singlestep ++ orl $_TIF_SINGLESTEP,TI_flags(%ebp) ++no_singlestep: ++ movl TI_flags(%ebp), %ecx ++ testw $_TIF_ALLWORK_MASK, %cx # current->work ++ jne syscall_exit_work ++ ++restore_all: ++#ifndef CONFIG_XEN ++ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS ++ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we ++ # are returning to the kernel. ++ # See comments in process.c:copy_thread() for details. 
++ movb PT_OLDSS(%esp), %ah ++ movb PT_CS(%esp), %al ++ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax ++ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax ++ CFI_REMEMBER_STATE ++ je ldt_ss # returning to user-space with LDT SS ++restore_nocheck: ++#else ++restore_nocheck: ++ movl PT_EFLAGS(%esp), %eax ++ testl $(X86_EFLAGS_VM|NMI_MASK), %eax ++ CFI_REMEMBER_STATE ++ jnz hypervisor_iret ++ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF ++ GET_VCPU_INFO ++ andb evtchn_upcall_mask(%esi),%al ++ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask ++ CFI_REMEMBER_STATE ++ jnz restore_all_enable_events # != 0 => enable event delivery ++#endif ++ TRACE_IRQS_IRET ++restore_nocheck_notrace: ++ RESTORE_REGS ++ addl $4, %esp # skip orig_eax/error_code ++ CFI_ADJUST_CFA_OFFSET -4 ++irq_return: ++ INTERRUPT_RETURN ++.section .fixup,"ax" ++ENTRY(iret_exc) ++ pushl $0 # no error code ++ pushl $do_iret_error ++ jmp error_code ++.previous ++.section __ex_table,"a" ++ .align 4 ++ .long irq_return,iret_exc ++.previous ++ ++ CFI_RESTORE_STATE ++#ifndef CONFIG_XEN ++ldt_ss: ++ larl PT_OLDSS(%esp), %eax ++ jnz restore_nocheck ++ testl $0x00400000, %eax # returning to 32bit stack? ++ jnz restore_nocheck # allright, normal return ++ ++#ifdef CONFIG_PARAVIRT ++ /* ++ * The kernel can't run on a non-flat stack if paravirt mode ++ * is active. Rather than try to fixup the high bits of ++ * ESP, bypass this code entirely. This may break DOSemu ++ * and/or Wine support in a paravirt VM, although the option ++ * is still available to implement the setting of the high ++ * 16-bits in the INTERRUPT_RETURN paravirt-op. ++ */ ++ cmpl $0, pv_info+PARAVIRT_enabled ++ jne restore_nocheck ++#endif ++ ++ /* If returning to userspace with 16bit stack, ++ * try to fix the higher word of ESP, as the CPU ++ * won't restore it. ++ * This is an "official" bug of all the x86-compatible ++ * CPUs, which we can try to work around to make ++ * dosemu and wine happy. 
*/ ++ movl PT_OLDESP(%esp), %eax ++ movl %esp, %edx ++ call patch_espfix_desc ++ pushl $__ESPFIX_SS ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ DISABLE_INTERRUPTS(CLBR_EAX) ++ TRACE_IRQS_OFF ++ lss (%esp), %esp ++ CFI_ADJUST_CFA_OFFSET -8 ++ jmp restore_nocheck ++#else ++ ALIGN ++restore_all_enable_events: ++ TRACE_IRQS_ON ++ __ENABLE_INTERRUPTS ++scrit: /**** START OF CRITICAL REGION ****/ ++ __TEST_PENDING ++ jnz 14f # process more events if necessary... ++ RESTORE_REGS ++ addl $4, %esp ++ CFI_ADJUST_CFA_OFFSET -4 ++1: INTERRUPT_RETURN ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,iret_exc ++.previous ++14: __DISABLE_INTERRUPTS ++ TRACE_IRQS_OFF ++ jmp 11f ++ecrit: /**** END OF CRITICAL REGION ****/ ++ ++ CFI_RESTORE_STATE ++hypervisor_iret: ++ andl $~NMI_MASK, PT_EFLAGS(%esp) ++ RESTORE_REGS ++ addl $4, %esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp hypercall_page + (__HYPERVISOR_iret * 32) ++#endif ++ CFI_ENDPROC ++ENDPROC(system_call) ++ ++ # perform work that needs to be done immediately before resumption ++ ALIGN ++ RING0_PTREGS_FRAME # can't unwind into user space anyway ++work_pending: ++ testb $_TIF_NEED_RESCHED, %cl ++ jz work_notifysig ++work_resched: ++ call schedule ++ LOCKDEP_SYS_EXIT ++ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt ++ # setting need_resched or sigpending ++ # between sampling and the iret ++ TRACE_IRQS_OFF ++ movl TI_flags(%ebp), %ecx ++ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other ++ # than syscall tracing? 
++ jz restore_all ++ testb $_TIF_NEED_RESCHED, %cl ++ jnz work_resched ++ ++work_notifysig: # deal with pending signals and ++ # notify-resume requests ++#ifdef CONFIG_VM86 ++ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) ++ movl %esp, %eax ++ jne work_notifysig_v86 # returning to kernel-space or ++ # vm86-space ++ xorl %edx, %edx ++ call do_notify_resume ++ jmp resume_userspace_sig ++ ++ ALIGN ++work_notifysig_v86: ++ pushl %ecx # save ti_flags for do_notify_resume ++ CFI_ADJUST_CFA_OFFSET 4 ++ call save_v86_state # %eax contains pt_regs pointer ++ popl %ecx ++ CFI_ADJUST_CFA_OFFSET -4 ++ movl %eax, %esp ++#else ++ movl %esp, %eax ++#endif ++ xorl %edx, %edx ++ call do_notify_resume ++ jmp resume_userspace_sig ++END(work_pending) ++ ++ # perform syscall exit tracing ++ ALIGN ++syscall_trace_entry: ++ movl $-ENOSYS,PT_EAX(%esp) ++ movl %esp, %eax ++ xorl %edx,%edx ++ call do_syscall_trace ++ cmpl $0, %eax ++ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, ++ # so must skip actual syscall ++ movl PT_ORIG_EAX(%esp), %eax ++ cmpl $(nr_syscalls), %eax ++ jnae syscall_call ++ jmp syscall_exit ++END(syscall_trace_entry) ++ ++ # perform syscall exit tracing ++ ALIGN ++syscall_exit_work: ++ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl ++ jz work_pending ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call ++ # schedule() instead ++ movl %esp, %eax ++ movl $1, %edx ++ call do_syscall_trace ++ jmp resume_userspace ++END(syscall_exit_work) ++ CFI_ENDPROC ++ ++ RING0_INT_FRAME # can't unwind into user space anyway ++syscall_fault: ++ GET_THREAD_INFO(%ebp) ++ movl $-EFAULT,PT_EAX(%esp) ++ jmp resume_userspace ++END(syscall_fault) ++ ++syscall_badsys: ++ movl $-ENOSYS,PT_EAX(%esp) ++ jmp resume_userspace ++END(syscall_badsys) ++ CFI_ENDPROC ++ ++#ifndef CONFIG_XEN ++#define FIXUP_ESPFIX_STACK \ ++ /* since we are on a wrong stack, we cant make it a C code :( */ \ ++ PER_CPU(gdt_page, %ebx); \ ++ 
GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ ++ addl %esp, %eax; \ ++ pushl $__KERNEL_DS; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ pushl %eax; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ lss (%esp), %esp; \ ++ CFI_ADJUST_CFA_OFFSET -8; ++#define UNWIND_ESPFIX_STACK \ ++ movl %ss, %eax; \ ++ /* see if on espfix stack */ \ ++ cmpw $__ESPFIX_SS, %ax; \ ++ jne 27f; \ ++ movl $__KERNEL_DS, %eax; \ ++ movl %eax, %ds; \ ++ movl %eax, %es; \ ++ /* switch to normal stack */ \ ++ FIXUP_ESPFIX_STACK; \ ++27:; ++ ++/* ++ * Build the entry stubs and pointer table with ++ * some assembler magic. ++ */ ++.section .rodata,"a" ++ENTRY(interrupt) ++.text ++ ++ENTRY(irq_entries_start) ++ RING0_INT_FRAME ++vector=0 ++.rept NR_IRQS ++ ALIGN ++ .if vector ++ CFI_ADJUST_CFA_OFFSET -4 ++ .endif ++1: pushl $~(vector) ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp common_interrupt ++ .previous ++ .long 1b ++ .text ++vector=vector+1 ++.endr ++END(irq_entries_start) ++ ++.previous ++END(interrupt) ++.previous ++ ++/* ++ * the CPU automatically disables interrupts when executing an IRQ vector, ++ * so IRQ-flags tracing has to follow that: ++ */ ++ ALIGN ++common_interrupt: ++ SAVE_ALL ++ TRACE_IRQS_OFF ++ movl %esp,%eax ++ call do_IRQ ++ jmp ret_from_intr ++ENDPROC(common_interrupt) ++ CFI_ENDPROC ++ ++#define BUILD_INTERRUPT(name, nr) \ ++ENTRY(name) \ ++ RING0_INT_FRAME; \ ++ pushl $~(nr); \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ SAVE_ALL; \ ++ TRACE_IRQS_OFF \ ++ movl %esp,%eax; \ ++ call smp_##name; \ ++ jmp ret_from_intr; \ ++ CFI_ENDPROC; \ ++ENDPROC(name) ++ ++/* The include is where all of the SMP etc. 
interrupts come from */ ++#include "entry_arch.h" ++ ++#else ++#define UNWIND_ESPFIX_STACK ++#endif ++ ++KPROBE_ENTRY(page_fault) ++ RING0_EC_FRAME ++ pushl $do_page_fault ++ CFI_ADJUST_CFA_OFFSET 4 ++ ALIGN ++error_code: ++ /* the function address is in %fs's slot on the stack */ ++ pushl %es ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET es, 0*/ ++ pushl %ds ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET ds, 0*/ ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET eax, 0 ++ pushl %ebp ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ebp, 0 ++ pushl %edi ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET edi, 0 ++ pushl %esi ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET esi, 0 ++ pushl %edx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET edx, 0 ++ pushl %ecx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ecx, 0 ++ pushl %ebx ++ CFI_ADJUST_CFA_OFFSET 4 ++ CFI_REL_OFFSET ebx, 0 ++ cld ++ pushl %fs ++ CFI_ADJUST_CFA_OFFSET 4 ++ /*CFI_REL_OFFSET fs, 0*/ ++ movl $(__KERNEL_PERCPU), %ecx ++ movl %ecx, %fs ++ UNWIND_ESPFIX_STACK ++ popl %ecx ++ CFI_ADJUST_CFA_OFFSET -4 ++ /*CFI_REGISTER es, ecx*/ ++ movl PT_FS(%esp), %edi # get the function address ++ movl PT_ORIG_EAX(%esp), %edx # get the error code ++ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart ++ mov %ecx, PT_FS(%esp) ++ /*CFI_REL_OFFSET fs, ES*/ ++ movl $(__USER_DS), %ecx ++ movl %ecx, %ds ++ movl %ecx, %es ++ movl %esp,%eax # pt_regs pointer ++ call *%edi ++ jmp ret_from_exception ++ CFI_ENDPROC ++KPROBE_END(page_fault) ++ ++#ifdef CONFIG_XEN ++# A note on the "critical region" in our callback handler. ++# We want to avoid stacking callback handlers due to events occurring ++# during handling of the last event. To do this, we keep events disabled ++# until we've done all processing. HOWEVER, we must enable events before ++# popping the stack frame (can't be done atomically) and so it would still ++# be possible to get enough handler activations to overflow the stack. 
++# Although unlikely, bugs of that kind are hard to track down, so we'd ++# like to avoid the possibility. ++# So, on entry to the handler we detect whether we interrupted an ++# existing activation in its critical region -- if so, we pop the current ++# activation and restart the handler using the previous one. ++# ++# The sysexit critical region is slightly different. sysexit ++# atomically removes the entire stack frame. If we interrupt in the ++# critical region we know that the entire frame is present and correct ++# so we can simply throw away the new one. ++ENTRY(hypervisor_callback) ++ RING0_INT_FRAME ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ movl PT_EIP(%esp),%eax ++ cmpl $scrit,%eax ++ jb 11f ++ cmpl $ecrit,%eax ++ jb critical_region_fixup ++ cmpl $sysexit_scrit,%eax ++ jb 11f ++ cmpl $sysexit_ecrit,%eax ++ ja 11f ++ addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame. ++11: push %esp ++ CFI_ADJUST_CFA_OFFSET 4 ++ call evtchn_do_upcall ++ add $4,%esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp ret_from_intr ++ CFI_ENDPROC ++ ++# [How we do the fixup]. We want to merge the current stack frame with the ++# just-interrupted frame. How we do this depends on where in the critical ++# region the interrupted handler was executing, and so how many saved ++# registers are in each frame. We do this quickly using the lookup table ++# 'critical_fixup_table'. For each byte offset in the critical region, it ++# provides the number of bytes which have already been popped from the ++# interrupted stack frame. 
++critical_region_fixup: ++ movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped ++ cmpb $0xff,%cl # 0xff => vcpu_info critical region ++ jne 15f ++ xorl %ecx,%ecx ++15: leal (%esp,%ecx),%esi # %esi points at end of src region ++ leal PT_OLDESP(%esp),%edi # %edi points at end of dst region ++ shrl $2,%ecx # convert words to bytes ++ je 17f # skip loop if nothing to copy ++16: subl $4,%esi # pre-decrementing copy loop ++ subl $4,%edi ++ movl (%esi),%eax ++ movl %eax,(%edi) ++ loop 16b ++17: movl %edi,%esp # final %edi is top of merged stack ++ jmp 11b ++ ++.section .rodata,"a" ++critical_fixup_table: ++ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING ++ .byte 0xff,0xff # jnz 14f ++ .byte 0x00 # pop %ebx ++ .byte 0x04 # pop %ecx ++ .byte 0x08 # pop %edx ++ .byte 0x0c # pop %esi ++ .byte 0x10 # pop %edi ++ .byte 0x14 # pop %ebp ++ .byte 0x18 # pop %eax ++ .byte 0x1c # pop %ds ++ .byte 0x20 # pop %es ++ .byte 0x24,0x24 # pop %fs ++ .byte 0x28,0x28,0x28 # add $4,%esp ++ .byte 0x2c # iret ++ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) ++ .byte 0x00,0x00 # jmp 11b ++.previous ++ ++# Hypervisor uses this for application faults while it executes. ++# We get here for two reasons: ++# 1. Fault while reloading DS, ES, FS or GS ++# 2. Fault while executing IRET ++# Category 1 we fix up by reattempting the load, and zeroing the segment ++# register if the load fails. ++# Category 2 we fix up by jumping to do_iret_error. We cannot use the ++# normal Linux return path in this case because if we use the IRET hypercall ++# to pop the stack frame we end up in an infinite loop of failsafe callbacks. ++# We distinguish between categories by maintaining a status value in EAX. 
++ENTRY(failsafe_callback) ++ pushl %eax ++ movl $1,%eax ++1: mov 4(%esp),%ds ++2: mov 8(%esp),%es ++3: mov 12(%esp),%fs ++4: mov 16(%esp),%gs ++ testl %eax,%eax ++ popl %eax ++ jz 5f ++ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) ++ jmp iret_exc ++5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) ++ RING0_INT_FRAME ++ pushl $0 ++ SAVE_ALL ++ jmp ret_from_exception ++.section .fixup,"ax"; \ ++6: xorl %eax,%eax; \ ++ movl %eax,4(%esp); \ ++ jmp 1b; \ ++7: xorl %eax,%eax; \ ++ movl %eax,8(%esp); \ ++ jmp 2b; \ ++8: xorl %eax,%eax; \ ++ movl %eax,12(%esp); \ ++ jmp 3b; \ ++9: xorl %eax,%eax; \ ++ movl %eax,16(%esp); \ ++ jmp 4b; \ ++.previous; \ ++.section __ex_table,"a"; \ ++ .align 4; \ ++ .long 1b,6b; \ ++ .long 2b,7b; \ ++ .long 3b,8b; \ ++ .long 4b,9b; \ ++.previous ++#endif ++ CFI_ENDPROC ++ ++ENTRY(coprocessor_error) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_coprocessor_error ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(coprocessor_error) ++ ++ENTRY(simd_coprocessor_error) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_simd_coprocessor_error ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(simd_coprocessor_error) ++ ++ENTRY(device_not_available) ++ RING0_INT_FRAME ++ pushl $-1 # mark this as an int ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++#ifndef CONFIG_XEN ++ GET_CR0_INTO_EAX ++ testl $0x4, %eax # EM (math emulation bit) ++ je device_available_emulate ++ pushl $0 # temporary storage for ORIG_EIP ++ CFI_ADJUST_CFA_OFFSET 4 ++ call math_emulate ++ addl $4, %esp ++ CFI_ADJUST_CFA_OFFSET -4 ++ jmp ret_from_exception ++device_available_emulate: ++#endif ++ preempt_stop(CLBR_ANY) ++ call math_state_restore ++ jmp ret_from_exception ++ CFI_ENDPROC ++END(device_not_available) ++ ++#ifndef CONFIG_XEN ++/* ++ * Debug traps and NMI can happen at the one SYSENTER instruction ++ * that sets up the real kernel stack. 
Check here, since we can't ++ * allow the wrong stack to be used. ++ * ++ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have ++ * already pushed 3 words if it hits on the sysenter instruction: ++ * eflags, cs and eip. ++ * ++ * We just load the right stack, and push the three (known) values ++ * by hand onto the new stack - while updating the return eip past ++ * the instruction that would have done it for sysenter. ++ */ ++#define FIX_STACK(offset, ok, label) \ ++ cmpw $__KERNEL_CS,4(%esp); \ ++ jne ok; \ ++label: \ ++ movl SYSENTER_stack_sp0+offset(%esp),%esp; \ ++ CFI_DEF_CFA esp, 0; \ ++ CFI_UNDEFINED eip; \ ++ pushfl; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ pushl $__KERNEL_CS; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ pushl $sysenter_past_esp; \ ++ CFI_ADJUST_CFA_OFFSET 4; \ ++ CFI_REL_OFFSET eip, 0 ++#endif /* CONFIG_XEN */ ++ ++KPROBE_ENTRY(debug) ++ RING0_INT_FRAME ++#ifndef CONFIG_XEN ++ cmpl $ia32_sysenter_target,(%esp) ++ jne debug_stack_correct ++ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) ++debug_stack_correct: ++#endif /* !CONFIG_XEN */ ++ pushl $-1 # mark this as an int ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # error code 0 ++ movl %esp,%eax # pt_regs pointer ++ call do_debug ++ jmp ret_from_exception ++ CFI_ENDPROC ++KPROBE_END(debug) ++ ++#ifndef CONFIG_XEN ++/* ++ * NMI is doubly nasty. It can happen _while_ we're handling ++ * a debug fault, and the debug fault hasn't yet been able to ++ * clear up the stack. So we first check whether we got an ++ * NMI on the sysenter entry path, but after that we need to ++ * check whether we got an NMI on the debug path where the debug ++ * fault happened on the sysenter path. 
++ */ ++KPROBE_ENTRY(nmi) ++ RING0_INT_FRAME ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ movl %ss, %eax ++ cmpw $__ESPFIX_SS, %ax ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ je nmi_espfix_stack ++ cmpl $ia32_sysenter_target,(%esp) ++ je nmi_stack_fixup ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ movl %esp,%eax ++ /* Do not access memory above the end of our stack page, ++ * it might not exist. ++ */ ++ andl $(THREAD_SIZE-1),%eax ++ cmpl $(THREAD_SIZE-20),%eax ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ jae nmi_stack_correct ++ cmpl $ia32_sysenter_target,12(%esp) ++ je nmi_debug_stack_check ++nmi_stack_correct: ++ /* We have a RING0_INT_FRAME here */ ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # zero error code ++ movl %esp,%eax # pt_regs pointer ++ call do_nmi ++ jmp restore_nocheck_notrace ++ CFI_ENDPROC ++ ++nmi_stack_fixup: ++ RING0_INT_FRAME ++ FIX_STACK(12,nmi_stack_correct, 1) ++ jmp nmi_stack_correct ++ ++nmi_debug_stack_check: ++ /* We have a RING0_INT_FRAME here */ ++ cmpw $__KERNEL_CS,16(%esp) ++ jne nmi_stack_correct ++ cmpl $debug,(%esp) ++ jb nmi_stack_correct ++ cmpl $debug_esp_fix_insn,(%esp) ++ ja nmi_stack_correct ++ FIX_STACK(24,nmi_stack_correct, 1) ++ jmp nmi_stack_correct ++ ++nmi_espfix_stack: ++ /* We have a RING0_INT_FRAME here. 
++ * ++ * create the pointer to lss back ++ */ ++ pushl %ss ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl %esp ++ CFI_ADJUST_CFA_OFFSET 4 ++ addw $4, (%esp) ++ /* copy the iret frame of 12 bytes */ ++ .rept 3 ++ pushl 16(%esp) ++ CFI_ADJUST_CFA_OFFSET 4 ++ .endr ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ FIXUP_ESPFIX_STACK # %eax == %esp ++ xorl %edx,%edx # zero error code ++ call do_nmi ++ RESTORE_REGS ++ lss 12+4(%esp), %esp # back to espfix stack ++ CFI_ADJUST_CFA_OFFSET -24 ++ jmp irq_return ++ CFI_ENDPROC ++#else ++KPROBE_ENTRY(nmi) ++ RING0_INT_FRAME ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # zero error code ++ movl %esp,%eax # pt_regs pointer ++ call do_nmi ++ orl $NMI_MASK, PT_EFLAGS(%esp) ++ jmp restore_all ++ CFI_ENDPROC ++#endif ++KPROBE_END(nmi) ++ ++#ifdef CONFIG_PARAVIRT ++ENTRY(native_iret) ++ iret ++.section __ex_table,"a" ++ .align 4 ++ .long native_iret, iret_exc ++.previous ++END(native_iret) ++ ++ENTRY(native_irq_enable_syscall_ret) ++ sti ++ sysexit ++END(native_irq_enable_syscall_ret) ++#endif ++ ++KPROBE_ENTRY(int3) ++ RING0_INT_FRAME ++ pushl $-1 # mark this as an int ++ CFI_ADJUST_CFA_OFFSET 4 ++ SAVE_ALL ++ xorl %edx,%edx # zero error code ++ movl %esp,%eax # pt_regs pointer ++ call do_int3 ++ jmp ret_from_exception ++ CFI_ENDPROC ++KPROBE_END(int3) ++ ++ENTRY(overflow) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_overflow ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(overflow) ++ ++ENTRY(bounds) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_bounds ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(bounds) ++ ++ENTRY(invalid_op) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_invalid_op ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(invalid_op) ++ ++ENTRY(coprocessor_segment_overrun) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_coprocessor_segment_overrun 
++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(coprocessor_segment_overrun) ++ ++ENTRY(invalid_TSS) ++ RING0_EC_FRAME ++ pushl $do_invalid_TSS ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(invalid_TSS) ++ ++ENTRY(segment_not_present) ++ RING0_EC_FRAME ++ pushl $do_segment_not_present ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(segment_not_present) ++ ++ENTRY(stack_segment) ++ RING0_EC_FRAME ++ pushl $do_stack_segment ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(stack_segment) ++ ++KPROBE_ENTRY(general_protection) ++ RING0_EC_FRAME ++ pushl $do_general_protection ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++KPROBE_END(general_protection) ++ ++ENTRY(alignment_check) ++ RING0_EC_FRAME ++ pushl $do_alignment_check ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(alignment_check) ++ ++ENTRY(divide_error) ++ RING0_INT_FRAME ++ pushl $0 # no error code ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_divide_error ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(divide_error) ++ ++#ifdef CONFIG_X86_MCE ++ENTRY(machine_check) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl machine_check_vector ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(machine_check) ++#endif ++ ++#ifndef CONFIG_XEN ++ENTRY(spurious_interrupt_bug) ++ RING0_INT_FRAME ++ pushl $0 ++ CFI_ADJUST_CFA_OFFSET 4 ++ pushl $do_spurious_interrupt_bug ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++#endif /* !CONFIG_XEN */ ++ ++ENTRY(fixup_4gb_segment) ++ RING0_EC_FRAME ++ pushl $do_fixup_4gb_segment ++ CFI_ADJUST_CFA_OFFSET 4 ++ jmp error_code ++ CFI_ENDPROC ++END(spurious_interrupt_bug) ++ ++ENTRY(kernel_thread_helper) ++ pushl $0 # fake return address for unwinder ++ CFI_STARTPROC ++ movl %edx,%eax ++ push %edx ++ CFI_ADJUST_CFA_OFFSET 4 ++ call *%ebx ++ push %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ call do_exit ++ CFI_ENDPROC 
++ENDPROC(kernel_thread_helper) ++ ++#include ++ ++ # pv syscall call handler stub ++ENTRY(ia32pv_cstar_target) ++ RING0_INT_FRAME ++ movl $__USER_DS,16(%esp) ++ movl %ebp,%ecx ++ movl $__USER_CS,4(%esp) ++ movl 12(%esp),%ebp ++ pushl %eax # save orig_eax ++ CFI_ADJUST_CFA_OFFSET 4 ++/* ++ * Load the potential sixth argument from user stack. ++ * Careful about security. ++ */ ++ cmpl $__PAGE_OFFSET-4,%ebp ++ CFI_REMEMBER_STATE ++ ja cstar_fault ++1: movl (%ebp),%ebp ++.section __ex_table,"a" ++ .align 4 ++ .long 1b,cstar_fault ++.previous ++ SAVE_ALL ++ GET_THREAD_INFO(%ebp) ++ test_tif %ebp ++ jnz cstar_trace_entry ++ cmpl $nr_syscalls,%eax ++ jae cstar_badsys ++.Lcstar_call: ++ btl %eax,cstar_special ++ jc .Lcstar_special ++ call *cstar_call_table(,%eax,4) ++ movl %eax,PT_EAX(%esp) # store the return value ++.Lcstar_exit: ++ movl PT_ECX(%esp),%ecx ++ movl %ecx,PT_EBP(%esp) # put user EBP back in place ++ jmp syscall_exit ++.Lcstar_special: ++ movl PT_ECX(%esp),%ecx ++ movl %ecx,PT_EBP(%esp) # put user EBP back in place ++ jmp syscall_call ++cstar_set_tif: ++ movl $cstar_clear_tif,(%esp) # replace return address ++ LOCK_PREFIX ++ orl $_TIF_CSTAR,TI_flags(%ebp) ++ jmp *sys_call_table(,%eax,4) ++cstar_clear_tif: ++ movl %eax,PT_EAX(%esp) # store the return value ++ LOCK_PREFIX ++ andl $~_TIF_CSTAR,TI_flags(%ebp) ++ jmp .Lcstar_exit ++cstar_trace_entry: ++ movl $-ENOSYS,PT_EAX(%esp) ++ cmpl $nr_syscalls,%eax ++ jae 1f ++ btl %eax,cstar_special ++ jc .Lcstar_trace_special ++1: movl %esp,%eax ++ xorl %edx,%edx ++ LOCK_PREFIX ++ orl $_TIF_CSTAR,TI_flags(%ebp) ++ call do_syscall_trace ++ LOCK_PREFIX ++ andl $~_TIF_CSTAR,TI_flags(%ebp) ++ testl %eax,%eax ++ jne .Lcstar_resume # ret != 0 -> running under PTRACE_SYSEMU, ++ # so must skip actual syscall ++ movl PT_ORIG_EAX(%esp),%eax ++ cmpl $nr_syscalls,%eax ++ jb .Lcstar_call ++ jmp .Lcstar_exit ++.Lcstar_trace_special: ++ movl PT_ECX(%esp),%ecx ++ movl %esp,%eax ++ xorl %edx,%edx ++ movl %ecx,PT_EBP(%esp) # put user EBP 
back in place ++ call do_syscall_trace ++ testl %eax,%eax ++ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, ++ # so must skip actual syscall ++ movl PT_ORIG_EAX(%esp),%eax ++ cmpl $nr_syscalls,%eax ++ jb syscall_call ++ jmp syscall_exit ++cstar_badsys: ++ movl $-ENOSYS,PT_EAX(%esp) ++.Lcstar_resume: ++ movl PT_ECX(%esp),%ecx ++ movl %ecx,PT_EBP(%esp) # put user EBP back in place ++ jmp resume_userspace ++ CFI_RESTORE_STATE ++cstar_fault: ++ movl $-EFAULT,%eax ++ SAVE_ALL ++ GET_THREAD_INFO(%ebp) ++ jmp .Lcstar_resume ++ CFI_ENDPROC ++ENDPROC(ia32pv_cstar_target) ++ ++ENTRY(cstar_ret_from_fork) ++ CFI_STARTPROC ++ movl PT_ECX(%esp),%ecx ++ GET_THREAD_INFO(%ebp) ++ movl %ecx,PT_EBP(%esp) # put user EBP back in place ++ LOCK_PREFIX ++ andl $~_TIF_CSTAR,TI_flags(%ebp) ++ jmp ret_from_fork ++ CFI_ENDPROC ++END(ret_from_fork) ++ ++.section .rodata,"a" ++#include "syscall_table_32.S" ++ ++syscall_table_size=(.-sys_call_table) ++ ++#include ++cstar_special: ++nr=0 ++mask=0 ++.rept nr_syscalls+31 ++ .irp n, __NR_sigreturn, __NR_rt_sigreturn ++ .if nr == \n ++ mask = mask | (1 << (\n & 31)) ++ .endif ++ .endr ++ nr = nr + 1 ++ .if (nr & 31) == 0 ++ .long mask ++ mask = 0 ++ .endif ++.endr ++#define sys_call_table cstar_call_table ++#define sys_fork cstar_set_tif ++#define sys_clone cstar_set_tif ++#define sys_vfork cstar_set_tif ++#include "syscall_table_32.S" ++#undef sys_call_table ++#undef sys_fork ++#undef sys_clone ++#undef sys_vfork +diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S +index c778e4f..b61ba3e 100644 +--- a/arch/x86/kernel/entry_32.S ++++ b/arch/x86/kernel/entry_32.S +@@ -282,7 +282,7 @@ ENTRY(ia32_sysenter_target) + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp +- movl TSS_sysenter_sp0(%esp),%esp ++ movl SYSENTER_stack_sp0(%esp),%esp + sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until +@@ -739,7 +739,7 @@ END(device_not_available) + * that sets up the real kernel 
stack. Check here, since we can't + * allow the wrong stack to be used. + * +- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have ++ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * +@@ -751,7 +751,7 @@ END(device_not_available) + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ + label: \ +- movl TSS_sysenter_sp0+offset(%esp),%esp; \ ++ movl SYSENTER_stack_sp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ +@@ -1017,7 +1017,7 @@ ENTRY(kernel_thread_helper) + CFI_ENDPROC + ENDPROC(kernel_thread_helper) + +-#ifdef CONFIG_XEN ++#ifdef CONFIG_PARAVIRT_XEN + /* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. */ + ENTRY(xen_sysenter_target) +@@ -1108,7 +1108,7 @@ ENTRY(xen_failsafe_callback) + .previous + ENDPROC(xen_failsafe_callback) + +-#endif /* CONFIG_XEN */ ++#endif /* CONFIG_PARAVIRT_XEN */ + + .section .rodata,"a" + #include "syscall_table_32.S" +diff --git a/arch/x86/kernel/entry_64-xen.S b/arch/x86/kernel/entry_64-xen.S +new file mode 100644 +index 0000000..7da37a7 +--- /dev/null ++++ b/arch/x86/kernel/entry_64-xen.S +@@ -0,0 +1,1258 @@ ++/* ++ * linux/arch/x86_64/entry.S ++ * ++ * Copyright (C) 1991, 1992 Linus Torvalds ++ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs ++ * Copyright (C) 2000 Pavel Machek ++ * Jun Nakajima ++ * Asit Mallick ++ * Modified for Xen ++ */ ++ ++/* ++ * entry.S contains the system-call and fault low-level handling routines. ++ * ++ * NOTE: This code handles signal-recognition, which happens every time ++ * after an interrupt and after each system call. ++ * ++ * Normal syscalls and interrupts don't save a full stack frame, this is ++ * only done for syscall tracing, signals or fork/exec et.al. 
++ * ++ * A note on terminology: ++ * - top of stack: Architecture defined interrupt frame from SS to RIP ++ * at the top of the kernel process stack. ++ * - partial stack frame: partially saved registers upto R11. ++ * - full stack frame: Like partial stack frame, but all register saved. ++ * ++ * Some macro usage: ++ * - CFI macros are used to generate dwarf2 unwind information for better ++ * backtraces. They don't change any code. ++ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers ++ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. ++ * There are unfortunately lots of special cases where some registers ++ * not touched. The macro is a big mess that should be cleaned up. ++ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. ++ * Gives a full stack frame. ++ * - ENTRY/END Define functions in the symbol table. ++ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack ++ * frame that is otherwise undefined after a SYSCALL ++ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. ++ * - errorentry/paranoidentry/zeroentry - Define exception entry points. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ .code64 ++ ++#ifndef CONFIG_PREEMPT ++#define retint_kernel retint_restore_args ++#endif ++ ++#ifdef CONFIG_PARAVIRT ++ENTRY(native_irq_enable_syscall_ret) ++ movq %gs:pda_oldrsp,%rsp ++ swapgs ++ sysretq ++#endif /* CONFIG_PARAVIRT */ ++ ++ ++.macro TRACE_IRQS_IRETQ offset=ARGOFFSET ++#ifdef CONFIG_TRACE_IRQFLAGS ++ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ ++ jnc 1f ++ TRACE_IRQS_ON ++1: ++#endif ++.endm ++ ++NMI_MASK = 0x80000000 ++ ++/* ++ * C code is not supposed to know about undefined top of stack. Every time ++ * a C function with an pt_regs argument is called from the SYSCALL based ++ * fast path FIXUP_TOP_OF_STACK is needed. 
++ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs ++ * manipulation. ++ */ ++ ++ /* %rsp:at FRAMEEND */ ++ .macro FIXUP_TOP_OF_STACK tmp ++ movq $__USER_CS,CS(%rsp) ++ movq $-1,RCX(%rsp) ++ .endm ++ ++ .macro RESTORE_TOP_OF_STACK tmp,offset=0 ++ .endm ++ ++ .macro FAKE_STACK_FRAME child_rip ++ /* push in order ss, rsp, eflags, cs, rip */ ++ xorl %eax, %eax ++ pushq %rax /* ss */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ /*CFI_REL_OFFSET ss,0*/ ++ pushq %rax /* rsp */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rsp,0 ++ pushq $(1<<9) /* eflags - interrupts on */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ /*CFI_REL_OFFSET rflags,0*/ ++ pushq $__KERNEL_CS /* cs */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ /*CFI_REL_OFFSET cs,0*/ ++ pushq \child_rip /* rip */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rip,0 ++ pushq %rax /* orig rax */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ .endm ++ ++ .macro UNFAKE_STACK_FRAME ++ addq $8*6, %rsp ++ CFI_ADJUST_CFA_OFFSET -(6*8) ++ .endm ++ ++ .macro CFI_DEFAULT_STACK start=1,adj=0 ++ .if \start ++ CFI_STARTPROC simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA rsp,SS+8-(\adj*ARGOFFSET) ++ .else ++ CFI_DEF_CFA_OFFSET SS+8-(\adj*ARGOFFSET) ++ .endif ++ .if \adj == 0 ++ CFI_REL_OFFSET r15,R15 ++ CFI_REL_OFFSET r14,R14 ++ CFI_REL_OFFSET r13,R13 ++ CFI_REL_OFFSET r12,R12 ++ CFI_REL_OFFSET rbp,RBP ++ CFI_REL_OFFSET rbx,RBX ++ .endif ++ CFI_REL_OFFSET r11,R11 ++ CFI_REL_OFFSET r10,R10 ++ CFI_REL_OFFSET r9,R9 ++ CFI_REL_OFFSET r8,R8 ++ CFI_REL_OFFSET rax,RAX ++ CFI_REL_OFFSET rcx,RCX ++ CFI_REL_OFFSET rdx,RDX ++ CFI_REL_OFFSET rsi,RSI ++ CFI_REL_OFFSET rdi,RDI ++ CFI_REL_OFFSET rip,RIP ++ /*CFI_REL_OFFSET cs,CS*/ ++ /*CFI_REL_OFFSET rflags,EFLAGS*/ ++ CFI_REL_OFFSET rsp,RSP ++ /*CFI_REL_OFFSET ss,SS*/ ++ .endm ++ ++ /* ++ * Must be consistent with the definition in arch-x86/xen-x86_64.h: ++ * struct iret_context { ++ * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; ++ * }; ++ * with rax, r11, and rcx being taken care of in the hypercall stub. 
++ */ ++ .macro HYPERVISOR_IRET flag ++ testb $3,1*8(%rsp) ++ jnz 2f ++ testl $NMI_MASK,2*8(%rsp) ++ jnz 2f ++ ++ cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip) ++ jne 1f ++ ++ /* Direct iret to kernel space. Correct CS and SS. */ ++ orl $3,1*8(%rsp) ++ orl $3,4*8(%rsp) ++1: iretq ++ ++2: /* Slow iret via hypervisor. */ ++ andl $~NMI_MASK, 2*8(%rsp) ++ pushq $\flag ++ jmp hypercall_page + (__HYPERVISOR_iret * 32) ++ .endm ++ ++/* ++ * A newly forked process directly context switches into this. ++ */ ++/* rdi: prev */ ++ENTRY(ret_from_fork) ++ CFI_DEFAULT_STACK ++ push kernel_eflags(%rip) ++ CFI_ADJUST_CFA_OFFSET 4 ++ popf # reset kernel eflags ++ CFI_ADJUST_CFA_OFFSET -4 ++ call schedule_tail ++ GET_THREAD_INFO(%rcx) ++ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) ++ jnz rff_trace ++rff_action: ++ RESTORE_REST ++ testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? ++ je int_ret_from_sys_call ++ testl $_TIF_IA32,threadinfo_flags(%rcx) ++ jnz int_ret_from_sys_call ++ RESTORE_TOP_OF_STACK %rdi,ARGOFFSET ++ jmp ret_from_sys_call ++rff_trace: ++ movq %rsp,%rdi ++ call syscall_trace_leave ++ GET_THREAD_INFO(%rcx) ++ jmp rff_action ++ CFI_ENDPROC ++END(ret_from_fork) ++ ++/* ++ * initial frame state for interrupts and exceptions ++ */ ++ .macro _frame ref ++ CFI_STARTPROC simple ++ CFI_SIGNAL_FRAME ++ CFI_DEF_CFA rsp,SS+8-\ref ++ /*CFI_REL_OFFSET ss,SS-\ref*/ ++ CFI_REL_OFFSET rsp,RSP-\ref ++ /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ ++ /*CFI_REL_OFFSET cs,CS-\ref*/ ++ CFI_REL_OFFSET rip,RIP-\ref ++ .endm ++ ++/* ++ * System call entry. Upto 6 arguments in registers are supported. ++ * ++ * SYSCALL does not save anything on the stack and does not change the ++ * stack pointer. 
++ */ ++ ++/* ++ * Register setup: ++ * rax system call number ++ * rdi arg0 ++ * rcx return address for syscall/sysret, C arg3 ++ * rsi arg1 ++ * rdx arg2 ++ * r10 arg3 (--> moved to rcx for C) ++ * r8 arg4 ++ * r9 arg5 ++ * r11 eflags for syscall/sysret, temporary for C ++ * r12-r15,rbp,rbx saved by C code, not touched. ++ * ++ * Interrupts are enabled on entry. ++ * Only called from user space. ++ * ++ * XXX if we had a free scratch register we could save the RSP into the stack frame ++ * and report it properly in ps. Unfortunately we haven't. ++ * ++ * When user can change the frames always force IRET. That is because ++ * it deals with uncanonical addresses better. SYSRET has trouble ++ * with them due to bugs in both AMD and Intel CPUs. ++ */ ++ ++ENTRY(system_call) ++ _frame (RIP-0x10) ++ SAVE_ARGS -8,0 ++ movq %rax,ORIG_RAX-ARGOFFSET(%rsp) ++ GET_THREAD_INFO(%rcx) ++ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) ++ jnz tracesys ++ cmpq $__NR_syscall_max,%rax ++ ja badsys ++ movq %r10,%rcx ++ call *sys_call_table(,%rax,8) # XXX: rip relative ++ movq %rax,RAX-ARGOFFSET(%rsp) ++/* ++ * Syscall return path ending with SYSRET (fast path) ++ * Has incomplete stack frame and undefined top of stack. 
++ */ ++ret_from_sys_call: ++ movl $_TIF_ALLWORK_MASK,%edi ++ /* edi: flagmask */ ++sysret_check: ++ LOCKDEP_SYS_EXIT ++ GET_THREAD_INFO(%rcx) ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ movl threadinfo_flags(%rcx),%edx ++ andl %edi,%edx ++ jnz sysret_careful ++ CFI_REMEMBER_STATE ++ /* ++ * sysretq will re-enable interrupts: ++ */ ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ RESTORE_ARGS 0,8,0 ++ HYPERVISOR_IRET VGCF_IN_SYSCALL ++ ++ CFI_RESTORE_STATE ++ /* Handle reschedules */ ++ /* edx: work, edi: workmask */ ++sysret_careful: ++ bt $TIF_NEED_RESCHED,%edx ++ jnc sysret_signal ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ pushq %rdi ++ CFI_ADJUST_CFA_OFFSET 8 ++ call schedule ++ popq %rdi ++ CFI_ADJUST_CFA_OFFSET -8 ++ jmp sysret_check ++ ++ /* Handle a signal */ ++sysret_signal: ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ testl $_TIF_DO_NOTIFY_MASK,%edx ++ jz 1f ++ ++ /* Really a signal */ ++ /* edx: work flags (arg3) */ ++ leaq do_notify_resume(%rip),%rax ++ leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 ++ xorl %esi,%esi # oldset -> arg2 ++ call ptregscall_common ++1: movl $_TIF_NEED_RESCHED,%edi ++ /* Use IRET because user could have changed frame. This ++ works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
*/ ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ jmp int_with_check ++ ++badsys: ++ movq $-ENOSYS,RAX-ARGOFFSET(%rsp) ++ jmp ret_from_sys_call ++ ++ /* Do syscall tracing */ ++tracesys: ++ SAVE_REST ++ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ ++ FIXUP_TOP_OF_STACK %rdi ++ movq %rsp,%rdi ++ call syscall_trace_enter ++ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ ++ RESTORE_REST ++ cmpq $__NR_syscall_max,%rax ++ ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ ++ movq %r10,%rcx /* fixup for C */ ++ call *sys_call_table(,%rax,8) ++ movq %rax,RAX-ARGOFFSET(%rsp) ++ /* Use IRET because user could have changed frame */ ++ ++/* ++ * Syscall return path ending with IRET. ++ * Has correct top of stack, but partial stack frame. ++ */ ++ .globl int_ret_from_sys_call ++int_ret_from_sys_call: ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ testb $3,CS-ARGOFFSET(%rsp) ++ jnz 1f ++ /* Need to set the proper %ss (not NULL) for ring 3 iretq */ ++ movl $__KERNEL_DS,SS-ARGOFFSET(%rsp) ++ jmp retint_restore_args # retrun from ring3 kernel ++1: ++ movl $_TIF_ALLWORK_MASK,%edi ++ /* edi: mask to check */ ++int_with_check: ++ LOCKDEP_SYS_EXIT_IRQ ++ GET_THREAD_INFO(%rcx) ++ movl threadinfo_flags(%rcx),%edx ++ andl %edi,%edx ++ jnz int_careful ++ andl $~TS_COMPAT,threadinfo_status(%rcx) ++ jmp retint_restore_args ++ ++ /* Either reschedule or signal or syscall exit tracking needed. */ ++ /* First do a reschedule test. 
*/ ++ /* edx: work, edi: workmask */ ++int_careful: ++ bt $TIF_NEED_RESCHED,%edx ++ jnc int_very_careful ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ pushq %rdi ++ CFI_ADJUST_CFA_OFFSET 8 ++ call schedule ++ popq %rdi ++ CFI_ADJUST_CFA_OFFSET -8 ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ jmp int_with_check ++ ++ /* handle signals and tracing -- both require a full stack frame */ ++int_very_careful: ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ SAVE_REST ++ /* Check for syscall exit trace */ ++ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx ++ jz int_signal ++ pushq %rdi ++ CFI_ADJUST_CFA_OFFSET 8 ++ leaq 8(%rsp),%rdi # &ptregs -> arg1 ++ call syscall_trace_leave ++ popq %rdi ++ CFI_ADJUST_CFA_OFFSET -8 ++ andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi ++ jmp int_restore_rest ++ ++int_signal: ++ testl $_TIF_DO_NOTIFY_MASK,%edx ++ jz 1f ++ movq %rsp,%rdi # &ptregs -> arg1 ++ xorl %esi,%esi # oldset -> arg2 ++ call do_notify_resume ++1: movl $_TIF_NEED_RESCHED,%edi ++int_restore_rest: ++ RESTORE_REST ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ jmp int_with_check ++ CFI_ENDPROC ++END(system_call) ++ ++/* ++ * Certain special system calls that need to save a complete full stack frame. 
++ */ ++ ++ .macro PTREGSCALL label,func,arg ++ .globl \label ++\label: ++ leaq \func(%rip),%rax ++ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ ++ jmp ptregscall_common ++END(\label) ++ .endm ++ ++ CFI_STARTPROC ++ ++ PTREGSCALL stub_clone, sys_clone, %r8 ++ PTREGSCALL stub_fork, sys_fork, %rdi ++ PTREGSCALL stub_vfork, sys_vfork, %rdi ++ PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx ++ PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx ++ PTREGSCALL stub_iopl, sys_iopl, %rsi ++ ++ENTRY(ptregscall_common) ++ popq %r11 ++ CFI_ADJUST_CFA_OFFSET -8 ++ CFI_REGISTER rip, r11 ++ SAVE_REST ++ movq %r11, %r15 ++ CFI_REGISTER rip, r15 ++ FIXUP_TOP_OF_STACK %r11 ++ call *%rax ++ RESTORE_TOP_OF_STACK %r11 ++ movq %r15, %r11 ++ CFI_REGISTER rip, r11 ++ RESTORE_REST ++ pushq %r11 ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rip, 0 ++ ret ++ CFI_ENDPROC ++END(ptregscall_common) ++ ++ENTRY(stub_execve) ++ CFI_STARTPROC ++ popq %r11 ++ CFI_ADJUST_CFA_OFFSET -8 ++ CFI_REGISTER rip, r11 ++ SAVE_REST ++ FIXUP_TOP_OF_STACK %r11 ++ movq %rsp, %rcx ++ call sys_execve ++ RESTORE_TOP_OF_STACK %r11 ++ movq %rax,RAX(%rsp) ++ RESTORE_REST ++ jmp int_ret_from_sys_call ++ CFI_ENDPROC ++END(stub_execve) ++ ++/* ++ * sigreturn is special because it needs to restore all registers on return. ++ * This cannot be done with SYSRET, so use the IRET return path instead. 
++ */ ++ENTRY(stub_rt_sigreturn) ++ CFI_STARTPROC ++ addq $8, %rsp ++ CFI_ADJUST_CFA_OFFSET -8 ++ SAVE_REST ++ movq %rsp,%rdi ++ FIXUP_TOP_OF_STACK %r11 ++ call sys_rt_sigreturn ++ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer ++ RESTORE_REST ++ jmp int_ret_from_sys_call ++ CFI_ENDPROC ++END(stub_rt_sigreturn) ++ ++/* initial frame state for interrupts (and exceptions without error code) */ ++#define INTR_FRAME _frame (RIP-0x10); \ ++ CFI_REL_OFFSET rcx,0; \ ++ CFI_REL_OFFSET r11,8 ++ ++/* initial frame state for exceptions with error code (and interrupts with ++ vector already pushed) */ ++#define XCPT_FRAME _frame (RIP-0x18); \ ++ CFI_REL_OFFSET rcx,0; \ ++ CFI_REL_OFFSET r11,8 ++ ++/* ++ * Interrupt exit. ++ * ++ */ ++ ++retint_check: ++ CFI_DEFAULT_STACK adj=1 ++ LOCKDEP_SYS_EXIT_IRQ ++ movl threadinfo_flags(%rcx),%edx ++ andl %edi,%edx ++ CFI_REMEMBER_STATE ++ jnz retint_careful ++retint_restore_args: /* return to kernel space */ ++ movl EFLAGS-REST_SKIP(%rsp), %eax ++ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF ++ GET_VCPU_INFO ++ andb evtchn_upcall_mask(%rsi),%al ++ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask ++ jnz restore_all_enable_events # != 0 => enable event delivery ++ ++ RESTORE_ARGS 0,8,0 ++ HYPERVISOR_IRET 0 ++ ++ /* edi: workmask, edx: work */ ++retint_careful: ++ CFI_RESTORE_STATE ++ bt $TIF_NEED_RESCHED,%edx ++ jnc retint_signal ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ pushq %rdi ++ CFI_ADJUST_CFA_OFFSET 8 ++ call schedule ++ popq %rdi ++ CFI_ADJUST_CFA_OFFSET -8 ++ GET_THREAD_INFO(%rcx) ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ jmp retint_check ++ ++retint_signal: ++ testl $_TIF_DO_NOTIFY_MASK,%edx ++ jz retint_restore_args ++ TRACE_IRQS_ON ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ SAVE_REST ++ movq $-1,ORIG_RAX(%rsp) ++ xorl %esi,%esi # oldset ++ movq %rsp,%rdi # &pt_regs ++ call do_notify_resume ++ RESTORE_REST ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ movl $_TIF_NEED_RESCHED,%edi ++ 
GET_THREAD_INFO(%rcx) ++ jmp retint_check ++ ++#ifdef CONFIG_PREEMPT ++ /* Returning to kernel space. Check if we need preemption */ ++ /* rcx: threadinfo. interrupts off. */ ++ENTRY(retint_kernel) ++ cmpl $0,threadinfo_preempt_count(%rcx) ++ jnz retint_restore_args ++ bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) ++ jnc retint_restore_args ++ bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ ++ jnc retint_restore_args ++ call preempt_schedule_irq ++ jmp retint_kernel /* check again */ ++#endif ++ ++ CFI_ENDPROC ++END(retint_check) ++ ++#ifndef CONFIG_XEN ++/* ++ * APIC interrupts. ++ */ ++ .macro apicinterrupt num,func ++ INTR_FRAME ++ pushq $~(\num) ++ CFI_ADJUST_CFA_OFFSET 8 ++ interrupt \func ++ jmp error_entry ++ CFI_ENDPROC ++ .endm ++ ++ENTRY(thermal_interrupt) ++ apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt ++END(thermal_interrupt) ++ ++ENTRY(threshold_interrupt) ++ apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt ++END(threshold_interrupt) ++ ++#ifdef CONFIG_SMP ++ENTRY(reschedule_interrupt) ++ apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt ++END(reschedule_interrupt) ++ ++ .macro INVALIDATE_ENTRY num ++ENTRY(invalidate_interrupt\num) ++ apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt ++END(invalidate_interrupt\num) ++ .endm ++ ++ INVALIDATE_ENTRY 0 ++ INVALIDATE_ENTRY 1 ++ INVALIDATE_ENTRY 2 ++ INVALIDATE_ENTRY 3 ++ INVALIDATE_ENTRY 4 ++ INVALIDATE_ENTRY 5 ++ INVALIDATE_ENTRY 6 ++ INVALIDATE_ENTRY 7 ++ ++ENTRY(call_function_interrupt) ++ apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt ++END(call_function_interrupt) ++ENTRY(irq_move_cleanup_interrupt) ++ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt ++END(irq_move_cleanup_interrupt) ++#endif ++ ++ENTRY(apic_timer_interrupt) ++ apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt ++END(apic_timer_interrupt) ++ ++ENTRY(error_interrupt) ++ apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt ++END(error_interrupt) 
++ ++ENTRY(spurious_interrupt) ++ apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt ++END(spurious_interrupt) ++#endif /* !CONFIG_XEN */ ++ ++/* ++ * Exception entry points. ++ */ ++ .macro zeroentry sym ++ INTR_FRAME ++ movq (%rsp),%rcx ++ CFI_RESTORE rcx ++ movq 8(%rsp),%r11 ++ CFI_RESTORE r11 ++ addq $0x10,%rsp /* skip rcx and r11 */ ++ CFI_ADJUST_CFA_OFFSET -0x10 ++ pushq $0 /* push error code/oldrax */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ pushq %rax /* push real oldrax to the rdi slot */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rax,0 ++ leaq \sym(%rip),%rax ++ jmp error_entry ++ CFI_ENDPROC ++ .endm ++ ++ .macro errorentry sym ++ XCPT_FRAME ++ movq (%rsp),%rcx ++ CFI_RESTORE rcx ++ movq 8(%rsp),%r11 ++ CFI_RESTORE r11 ++ addq $0x10,%rsp /* rsp points to the error code */ ++ CFI_ADJUST_CFA_OFFSET -0x10 ++ pushq %rax ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rax,0 ++ leaq \sym(%rip),%rax ++ jmp error_entry ++ CFI_ENDPROC ++ .endm ++ ++#if 0 /* not XEN */ ++ /* error code is on the stack already */ ++ /* handle NMI like exceptions that can happen everywhere */ ++ .macro paranoidentry sym, ist=0, irqtrace=1 ++ movq (%rsp),%rcx ++ movq 8(%rsp),%r11 ++ addq $0x10,%rsp /* skip rcx and r11 */ ++ SAVE_ALL ++ cld ++#if 0 /* not XEN */ ++ movl $1,%ebx ++ movl $MSR_GS_BASE,%ecx ++ rdmsr ++ testl %edx,%edx ++ js 1f ++ SWAPGS ++ xorl %ebx,%ebx ++1: ++#endif ++ .if \ist ++ movq %gs:pda_data_offset, %rbp ++ .endif ++ movq %rsp,%rdi ++ movq ORIG_RAX(%rsp),%rsi ++ movq $-1,ORIG_RAX(%rsp) ++ .if \ist ++ subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) ++ .endif ++ call \sym ++ .if \ist ++ addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) ++ .endif ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ .if \irqtrace ++ TRACE_IRQS_OFF ++ .endif ++ .endm ++ ++ /* ++ * "Paranoid" exit path from exception stack. ++ * Paranoid because this is used by NMIs and cannot take ++ * any kernel state for granted. 
++ * We don't do kernel preemption checks here, because only ++ * NMI should be common and it does not enable IRQs and ++ * cannot get reschedule ticks. ++ * ++ * "trace" is 0 for the NMI handler only, because irq-tracing ++ * is fundamentally NMI-unsafe. (we cannot change the soft and ++ * hard flags at once, atomically) ++ */ ++ .macro paranoidexit trace=1 ++ /* ebx: no swapgs flag */ ++paranoid_exit\trace: ++ testl %ebx,%ebx /* swapgs needed? */ ++ jnz paranoid_restore\trace ++ testl $3,CS(%rsp) ++ jnz paranoid_userspace\trace ++paranoid_swapgs\trace: ++ .if \trace ++ TRACE_IRQS_IRETQ 0 ++ .endif ++ SWAPGS_UNSAFE_STACK ++paranoid_restore\trace: ++ RESTORE_ALL 8 ++ jmp irq_return ++paranoid_userspace\trace: ++ GET_THREAD_INFO(%rcx) ++ movl threadinfo_flags(%rcx),%ebx ++ andl $_TIF_WORK_MASK,%ebx ++ jz paranoid_swapgs\trace ++ movq %rsp,%rdi /* &pt_regs */ ++ call sync_regs ++ movq %rax,%rsp /* switch stack for scheduling */ ++ testl $_TIF_NEED_RESCHED,%ebx ++ jnz paranoid_schedule\trace ++ movl %ebx,%edx /* arg3: thread flags */ ++ .if \trace ++ TRACE_IRQS_ON ++ .endif ++ ENABLE_INTERRUPTS(CLBR_NONE) ++ xorl %esi,%esi /* arg2: oldset */ ++ movq %rsp,%rdi /* arg1: &pt_regs */ ++ call do_notify_resume ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ .if \trace ++ TRACE_IRQS_OFF ++ .endif ++ jmp paranoid_userspace\trace ++paranoid_schedule\trace: ++ .if \trace ++ TRACE_IRQS_ON ++ .endif ++ ENABLE_INTERRUPTS(CLBR_ANY) ++ call schedule ++ DISABLE_INTERRUPTS(CLBR_ANY) ++ .if \trace ++ TRACE_IRQS_OFF ++ .endif ++ jmp paranoid_userspace\trace ++ CFI_ENDPROC ++ .endm ++#endif ++ ++/* ++ * Exception entry point. This expects an error code/orig_rax on the stack ++ * and the exception handler in %rax. 
++ */ ++KPROBE_ENTRY(error_entry) ++ _frame RDI ++ CFI_REL_OFFSET rax,0 ++ /* rdi slot contains rax, oldrax contains error code */ ++ cld ++ subq $14*8,%rsp ++ CFI_ADJUST_CFA_OFFSET (14*8) ++ movq %rsi,13*8(%rsp) ++ CFI_REL_OFFSET rsi,RSI ++ movq 14*8(%rsp),%rsi /* load rax from rdi slot */ ++ CFI_REGISTER rax,rsi ++ movq %rdx,12*8(%rsp) ++ CFI_REL_OFFSET rdx,RDX ++ movq %rcx,11*8(%rsp) ++ CFI_REL_OFFSET rcx,RCX ++ movq %rsi,10*8(%rsp) /* store rax */ ++ CFI_REL_OFFSET rax,RAX ++ movq %r8, 9*8(%rsp) ++ CFI_REL_OFFSET r8,R8 ++ movq %r9, 8*8(%rsp) ++ CFI_REL_OFFSET r9,R9 ++ movq %r10,7*8(%rsp) ++ CFI_REL_OFFSET r10,R10 ++ movq %r11,6*8(%rsp) ++ CFI_REL_OFFSET r11,R11 ++ movq %rbx,5*8(%rsp) ++ CFI_REL_OFFSET rbx,RBX ++ movq %rbp,4*8(%rsp) ++ CFI_REL_OFFSET rbp,RBP ++ movq %r12,3*8(%rsp) ++ CFI_REL_OFFSET r12,R12 ++ movq %r13,2*8(%rsp) ++ CFI_REL_OFFSET r13,R13 ++ movq %r14,1*8(%rsp) ++ CFI_REL_OFFSET r14,R14 ++ movq %r15,(%rsp) ++ CFI_REL_OFFSET r15,R15 ++#if 0 ++ cmpl $__KERNEL_CS,CS(%rsp) ++ CFI_REMEMBER_STATE ++ je error_kernelspace ++#endif ++error_call_handler: ++ movq %rdi, RDI(%rsp) ++ CFI_REL_OFFSET rdi,RDI ++ movq %rsp,%rdi ++ movq ORIG_RAX(%rsp),%rsi # get error code ++ movq $-1,ORIG_RAX(%rsp) ++ call *%rax ++error_exit: ++ RESTORE_REST ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ GET_THREAD_INFO(%rcx) ++ testb $3,CS-ARGOFFSET(%rsp) ++ jz retint_kernel ++ LOCKDEP_SYS_EXIT_IRQ ++ movl threadinfo_flags(%rcx),%edx ++ movl $_TIF_WORK_MASK,%edi ++ andl %edi,%edx ++ jnz retint_careful ++ jmp retint_restore_args ++ ++#if 0 ++ /* ++ * We need to re-write the logic here because we don't do iretq to ++ * to return to user mode. It's still possible that we get trap/fault ++ * in the kernel (when accessing buffers pointed to by system calls, ++ * for example). ++ * ++ */ ++ CFI_RESTORE_STATE ++error_kernelspace: ++ incl %ebx ++ /* There are two places in the kernel that can potentially fault with ++ usergs. Handle them here. 
The exception handlers after ++ iret run with kernel gs again, so don't set the user space flag. ++ B stepping K8s sometimes report an truncated RIP for IRET ++ exceptions returning to compat mode. Check for these here too. */ ++ leaq irq_return(%rip),%rbp ++ cmpq %rbp,RIP(%rsp) ++ je error_swapgs ++ movl %ebp,%ebp /* zero extend */ ++ cmpq %rbp,RIP(%rsp) ++ je error_swapgs ++ cmpq $gs_change,RIP(%rsp) ++ je error_swapgs ++ jmp error_sti ++#endif ++ CFI_ENDPROC ++KPROBE_END(error_entry) ++ ++ENTRY(hypervisor_callback) ++ zeroentry do_hypervisor_callback ++END(hypervisor_callback) ++ ++/* ++ * Copied from arch/xen/i386/kernel/entry.S ++ */ ++# A note on the "critical region" in our callback handler. ++# We want to avoid stacking callback handlers due to events occurring ++# during handling of the last event. To do this, we keep events disabled ++# until we've done all processing. HOWEVER, we must enable events before ++# popping the stack frame (can't be done atomically) and so it would still ++# be possible to get enough handler activations to overflow the stack. ++# Although unlikely, bugs of that kind are hard to track down, so we'd ++# like to avoid the possibility. ++# So, on entry to the handler we detect whether we interrupted an ++# existing activation in its critical region -- if so, we pop the current ++# activation and restart the handler using the previous one. 
++ENTRY(do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) ++ CFI_STARTPROC ++# Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will ++# see the correct pointer to the pt_regs ++ movq %rdi, %rsp # we don't return, adjust the stack frame ++ CFI_ENDPROC ++ CFI_DEFAULT_STACK ++11: incl %gs:pda_irqcount ++ movq %rsp,%rbp ++ CFI_DEF_CFA_REGISTER rbp ++ cmovzq %gs:pda_irqstackptr,%rsp ++ pushq %rbp # backlink for old unwinder ++ call evtchn_do_upcall ++ popq %rsp ++ CFI_DEF_CFA_REGISTER rsp ++ decl %gs:pda_irqcount ++ jmp error_exit ++ CFI_ENDPROC ++END(do_hypervisor_callback) ++ ++ ALIGN ++restore_all_enable_events: ++ CFI_DEFAULT_STACK adj=1 ++ TRACE_IRQS_ON ++ __ENABLE_INTERRUPTS ++ ++scrit: /**** START OF CRITICAL REGION ****/ ++ __TEST_PENDING ++ CFI_REMEMBER_STATE ++ jnz 14f # process more events if necessary... ++ RESTORE_ARGS 0,8,0 ++ HYPERVISOR_IRET 0 ++ ++ CFI_RESTORE_STATE ++14: __DISABLE_INTERRUPTS ++ SAVE_REST ++ movq %rsp,%rdi # set the argument again ++ jmp 11b ++ CFI_ENDPROC ++ecrit: /**** END OF CRITICAL REGION ****/ ++# At this point, unlike on x86-32, we don't do the fixup to simplify the ++# code and the stack frame is more complex on x86-64. ++# When the kernel is interrupted in the critical section, the kernel ++# will do IRET in that case, and everything will be restored at that point, ++# i.e. it just resumes from the next instruction interrupted with the same context. ++ ++# Hypervisor uses this for application faults while it executes. ++# We get here for two reasons: ++# 1. Fault while reloading DS, ES, FS or GS ++# 2. Fault while executing IRET ++# Category 1 we do not need to fix up as Xen has already reloaded all segment ++# registers that could be reloaded and zeroed the others. ++# Category 2 we fix up by killing the current process. We cannot use the ++# normal Linux return path in this case because if we use the IRET hypercall ++# to pop the stack frame we end up in an infinite loop of failsafe callbacks. 
++# We distinguish between categories by comparing each saved segment register ++# with its current contents: any discrepancy means we in category 1. ++ENTRY(failsafe_callback) ++ _frame (RIP-0x30) ++ CFI_REL_OFFSET rcx, 0 ++ CFI_REL_OFFSET r11, 8 ++ movw %ds,%cx ++ cmpw %cx,0x10(%rsp) ++ CFI_REMEMBER_STATE ++ jne 1f ++ movw %es,%cx ++ cmpw %cx,0x18(%rsp) ++ jne 1f ++ movw %fs,%cx ++ cmpw %cx,0x20(%rsp) ++ jne 1f ++ movw %gs,%cx ++ cmpw %cx,0x28(%rsp) ++ jne 1f ++ /* All segments match their saved values => Category 2 (Bad IRET). */ ++ movq (%rsp),%rcx ++ CFI_RESTORE rcx ++ movq 8(%rsp),%r11 ++ CFI_RESTORE r11 ++ addq $0x30,%rsp ++ CFI_ADJUST_CFA_OFFSET -0x30 ++ movq $11,%rdi /* SIGSEGV */ ++ jmp do_exit ++ CFI_RESTORE_STATE ++1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ ++ movq (%rsp),%rcx ++ CFI_RESTORE rcx ++ movq 8(%rsp),%r11 ++ CFI_RESTORE r11 ++ addq $0x30,%rsp ++ CFI_ADJUST_CFA_OFFSET -0x30 ++ pushq $0 ++ CFI_ADJUST_CFA_OFFSET 8 ++ SAVE_ALL ++ jmp error_exit ++ CFI_ENDPROC ++#if 0 ++ .section __ex_table,"a" ++ .align 8 ++ .quad gs_change,bad_gs ++ .previous ++ .section .fixup,"ax" ++ /* running with kernelgs */ ++bad_gs: ++/* swapgs */ /* switch back to user gs */ ++ xorl %eax,%eax ++ movl %eax,%gs ++ jmp 2b ++ .previous ++#endif ++ ++/* ++ * Create a kernel thread. 
++ * ++ * C extern interface: ++ * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) ++ * ++ * asm input arguments: ++ * rdi: fn, rsi: arg, rdx: flags ++ */ ++ENTRY(kernel_thread) ++ CFI_STARTPROC ++ FAKE_STACK_FRAME $child_rip ++ SAVE_ALL ++ ++ # rdi: flags, rsi: usp, rdx: will be &pt_regs ++ movq %rdx,%rdi ++ orq kernel_thread_flags(%rip),%rdi ++ movq $-1, %rsi ++ movq %rsp, %rdx ++ ++ xorl %r8d,%r8d ++ xorl %r9d,%r9d ++ ++ # clone now ++ call do_fork ++ movq %rax,RAX(%rsp) ++ xorl %edi,%edi ++ ++ /* ++ * It isn't worth to check for reschedule here, ++ * so internally to the x86_64 port you can rely on kernel_thread() ++ * not to reschedule the child before returning, this avoids the need ++ * of hacks for example to fork off the per-CPU idle tasks. ++ * [Hopefully no generic code relies on the reschedule -AK] ++ */ ++ RESTORE_ALL ++ UNFAKE_STACK_FRAME ++ ret ++ CFI_ENDPROC ++ENDPROC(kernel_thread) ++ ++child_rip: ++ pushq $0 # fake return address ++ CFI_STARTPROC ++ /* ++ * Here we are in the child and the registers are set as they were ++ * at kernel_thread() invocation in the parent. ++ */ ++ movq %rdi, %rax ++ movq %rsi, %rdi ++ call *%rax ++ # exit ++ mov %eax, %edi ++ call do_exit ++ CFI_ENDPROC ++ENDPROC(child_rip) ++ ++/* ++ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
++ * ++ * C extern interface: ++ * extern long execve(char *name, char **argv, char **envp) ++ * ++ * asm input arguments: ++ * rdi: name, rsi: argv, rdx: envp ++ * ++ * We want to fallback into: ++ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) ++ * ++ * do_sys_execve asm fallback arguments: ++ * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack ++ */ ++ENTRY(kernel_execve) ++ CFI_STARTPROC ++ FAKE_STACK_FRAME $0 ++ SAVE_ALL ++ movq %rsp,%rcx ++ call sys_execve ++ movq %rax, RAX(%rsp) ++ RESTORE_REST ++ testq %rax,%rax ++ jne 1f ++ jmp int_ret_from_sys_call ++1: RESTORE_ARGS ++ UNFAKE_STACK_FRAME ++ ret ++ CFI_ENDPROC ++ENDPROC(kernel_execve) ++ ++KPROBE_ENTRY(page_fault) ++ errorentry do_page_fault ++KPROBE_END(page_fault) ++ ++ENTRY(coprocessor_error) ++ zeroentry do_coprocessor_error ++END(coprocessor_error) ++ ++ENTRY(simd_coprocessor_error) ++ zeroentry do_simd_coprocessor_error ++END(simd_coprocessor_error) ++ ++ENTRY(device_not_available) ++ zeroentry math_state_restore ++END(device_not_available) ++ ++ /* runs on exception stack */ ++KPROBE_ENTRY(debug) ++/* INTR_FRAME ++ pushq $0 ++ CFI_ADJUST_CFA_OFFSET 8 */ ++ zeroentry do_debug ++/* paranoidexit ++ CFI_ENDPROC */ ++KPROBE_END(debug) ++ ++KPROBE_ENTRY(nmi) ++ zeroentry do_nmi_callback ++KPROBE_END(nmi) ++do_nmi_callback: ++ CFI_STARTPROC ++ addq $8, %rsp ++ CFI_ENDPROC ++ CFI_DEFAULT_STACK ++ call do_nmi ++ orl $NMI_MASK,EFLAGS(%rsp) ++ RESTORE_REST ++ DISABLE_INTERRUPTS(CLBR_NONE) ++ TRACE_IRQS_OFF ++ GET_THREAD_INFO(%rcx) ++ jmp retint_restore_args ++ CFI_ENDPROC ++END(do_nmi_callback) ++ ++KPROBE_ENTRY(int3) ++/* INTR_FRAME ++ pushq $0 ++ CFI_ADJUST_CFA_OFFSET 8 */ ++ zeroentry do_int3 ++/* jmp paranoid_exit1 ++ CFI_ENDPROC */ ++KPROBE_END(int3) ++ ++ENTRY(overflow) ++ zeroentry do_overflow ++END(overflow) ++ ++ENTRY(bounds) ++ zeroentry do_bounds ++END(bounds) ++ ++ENTRY(invalid_op) ++ zeroentry do_invalid_op ++END(invalid_op) ++ 
++ENTRY(coprocessor_segment_overrun) ++ zeroentry do_coprocessor_segment_overrun ++END(coprocessor_segment_overrun) ++ ++ENTRY(reserved) ++ zeroentry do_reserved ++END(reserved) ++ ++#if 0 ++ /* runs on exception stack */ ++ENTRY(double_fault) ++ XCPT_FRAME ++ paranoidentry do_double_fault ++ jmp paranoid_exit1 ++ CFI_ENDPROC ++END(double_fault) ++#endif ++ ++ENTRY(invalid_TSS) ++ errorentry do_invalid_TSS ++END(invalid_TSS) ++ ++ENTRY(segment_not_present) ++ errorentry do_segment_not_present ++END(segment_not_present) ++ ++ /* runs on exception stack */ ++ENTRY(stack_segment) ++/* XCPT_FRAME ++ paranoidentry do_stack_segment */ ++ errorentry do_stack_segment ++/* jmp paranoid_exit1 ++ CFI_ENDPROC */ ++END(stack_segment) ++ ++KPROBE_ENTRY(general_protection) ++ errorentry do_general_protection ++KPROBE_END(general_protection) ++ ++ENTRY(alignment_check) ++ errorentry do_alignment_check ++END(alignment_check) ++ ++ENTRY(divide_error) ++ zeroentry do_divide_error ++END(divide_error) ++ ++ENTRY(spurious_interrupt_bug) ++ zeroentry do_spurious_interrupt_bug ++END(spurious_interrupt_bug) ++ ++#ifdef CONFIG_X86_MCE ++ /* runs on exception stack */ ++ENTRY(machine_check) ++ INTR_FRAME ++ pushq $0 ++ CFI_ADJUST_CFA_OFFSET 8 ++ paranoidentry do_machine_check ++ jmp paranoid_exit1 ++ CFI_ENDPROC ++END(machine_check) ++#endif ++ ++/* Call softirq on interrupt stack. Interrupts are off. 
*/ ++ENTRY(call_softirq) ++ CFI_STARTPROC ++ push %rbp ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rbp,0 ++ mov %rsp,%rbp ++ CFI_DEF_CFA_REGISTER rbp ++ incl %gs:pda_irqcount ++ cmove %gs:pda_irqstackptr,%rsp ++ push %rbp # backlink for old unwinder ++ call __do_softirq ++ leaveq ++ CFI_DEF_CFA_REGISTER rsp ++ CFI_ADJUST_CFA_OFFSET -8 ++ decl %gs:pda_irqcount ++ ret ++ CFI_ENDPROC ++ENDPROC(call_softirq) ++ ++KPROBE_ENTRY(ignore_sysret) ++ CFI_STARTPROC ++ mov $-ENOSYS,%eax ++ HYPERVISOR_IRET 0 ++ CFI_ENDPROC ++ENDPROC(ignore_sysret) +diff --git a/arch/x86/kernel/fixup.c b/arch/x86/kernel/fixup.c +new file mode 100644 +index 0000000..cbd6eb5 +--- /dev/null ++++ b/arch/x86/kernel/fixup.c +@@ -0,0 +1,88 @@ ++/****************************************************************************** ++ * fixup.c ++ * ++ * Binary-rewriting of certain IA32 instructions, on notification by Xen. ++ * Used to avoid repeated slow emulation of common instructions used by the ++ * user-space TLS (Thread-Local Storage) libraries. ++ * ++ * **** NOTE **** ++ * Issues with the binary rewriting have caused it to be removed. Instead ++ * we rely on Xen's emulator to boot the kernel, and then print a banner ++ * message recommending that the user disables /lib/tls. ++ * ++ * Copyright (c) 2004, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) ++ ++void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) ++{ ++ static unsigned long printed = 0; ++ char info[100]; ++ int i; ++ ++ /* Ignore statically-linked init. */ ++ if (current->tgid == 1) ++ return; ++ ++ VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, ++ VMASST_TYPE_4gb_segments_notify)); ++ ++ if (test_and_set_bit(0, &printed)) ++ return; ++ ++ sprintf(info, "%s (pid=%d)", current->comm, current->tgid); ++ ++ DP(""); ++ DP("***************************************************************"); ++ DP("***************************************************************"); ++ DP("** WARNING: Currently emulating unsupported memory accesses **"); ++ DP("** in /lib/tls glibc libraries. The emulation is **"); ++ DP("** slow. To ensure full performance you should **"); ++ DP("** install a 'xen-friendly' (nosegneg) version of **"); ++ DP("** the library, or disable tls support by executing **"); ++ DP("** the following as root: **"); ++ DP("** mv /lib/tls /lib/tls.disabled **"); ++ DP("** Offending process: %-38.38s **", info); ++ DP("***************************************************************"); ++ DP("***************************************************************"); ++ DP(""); ++ ++ for (i = 5; i > 0; i--) { ++ touch_softlockup_watchdog(); ++ printk("Pausing... 
%d", i); ++ mdelay(1000); ++ printk("\b\b\b\b\b\b\b\b\b\b\b\b"); ++ } ++ ++ printk("Continuing...\n\n"); ++} ++ ++static int __init fixup_init(void) ++{ ++ WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, ++ VMASST_TYPE_4gb_segments_notify)); ++ return 0; ++} ++__initcall(fixup_init); +diff --git a/arch/x86/kernel/genapic_64-xen.c b/arch/x86/kernel/genapic_64-xen.c +new file mode 100644 +index 0000000..2ccd7cf +--- /dev/null ++++ b/arch/x86/kernel/genapic_64-xen.c +@@ -0,0 +1,123 @@ ++/* ++ * Copyright 2004 James Cleverdon, IBM. ++ * Subject to the GNU Public License, v.2 ++ * ++ * Generic APIC sub-arch probe layer. ++ * ++ * Hacked for x86-64 by James Cleverdon from i386 architecture code by ++ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and ++ * James Cleverdon. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#ifdef CONFIG_ACPI ++#include ++#endif ++ ++#ifndef CONFIG_XEN ++DEFINE_PER_CPU(int, x2apic_extra_bits); ++ ++struct genapic __read_mostly *genapic = &apic_flat; ++ ++static enum uv_system_type uv_system_type; ++#else ++extern struct genapic apic_xen; ++struct genapic __read_mostly *genapic = &apic_xen; ++#endif ++ ++ ++/* ++ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. ++ */ ++void __init setup_apic_routing(void) ++{ ++#ifndef CONFIG_XEN ++ if (uv_system_type == UV_NON_UNIQUE_APIC) ++ genapic = &apic_x2apic_uv_x; ++ else ++#ifdef CONFIG_ACPI ++ /* ++ * Quirk: some x86_64 machines can only use physical APIC mode ++ * regardless of how many processors are present (x86_64 ES7000 ++ * is an example). 
++ */ ++ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && ++ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) ++ genapic = &apic_physflat; ++ else ++#endif ++ ++ if (num_possible_cpus() <= 8) ++ genapic = &apic_flat; ++ else ++ genapic = &apic_physflat; ++ ++#else ++ /* hardcode to xen apic functions */ ++ genapic = &apic_xen; ++#endif ++ printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); ++} ++ ++/* Same for both flat and physical. */ ++ ++#ifdef CONFIG_XEN ++extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector); ++#endif ++ ++void send_IPI_self(int vector) ++{ ++#ifndef CONFIG_XEN ++ __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); ++#else ++ xen_send_IPI_shortcut(APIC_DEST_SELF, vector); ++#endif ++} ++ ++int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) ++{ ++#ifndef CONFIG_XEN ++ if (!strcmp(oem_id, "SGI")) { ++ if (!strcmp(oem_table_id, "UVL")) ++ uv_system_type = UV_LEGACY_APIC; ++ else if (!strcmp(oem_table_id, "UVX")) ++ uv_system_type = UV_X2APIC; ++ else if (!strcmp(oem_table_id, "UVH")) ++ uv_system_type = UV_NON_UNIQUE_APIC; ++ } ++#endif ++ return 0; ++} ++ ++#ifndef CONFIG_XEN ++unsigned int read_apic_id(void) ++{ ++ unsigned int id; ++ ++ WARN_ON(preemptible() && num_online_cpus() > 1); ++ id = apic_read(APIC_ID); ++ if (uv_system_type >= UV_X2APIC) ++ id |= __get_cpu_var(x2apic_extra_bits); ++ return id; ++} ++ ++enum uv_system_type get_uv_system_type(void) ++{ ++ return uv_system_type; ++} ++ ++int is_uv_system(void) ++{ ++ return uv_system_type != UV_NONE; ++} ++#endif +diff --git a/arch/x86/kernel/genapic_xen_64.c b/arch/x86/kernel/genapic_xen_64.c +new file mode 100644 +index 0000000..a729031 +--- /dev/null ++++ b/arch/x86/kernel/genapic_xen_64.c +@@ -0,0 +1,164 @@ ++/* ++ * Copyright 2004 James Cleverdon, IBM. ++ * Subject to the GNU Public License, v.2 ++ * ++ * Xen APIC subarch code. Maximum 8 CPUs, logical delivery. 
++ * ++ * Hacked for x86-64 by James Cleverdon from i386 architecture code by ++ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and ++ * James Cleverdon. ++ * ++ * Hacked to pieces for Xen by Chris Wright. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_XEN_PRIVILEGED_GUEST ++#include ++#else ++#include ++#endif ++#include ++#include ++ ++DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); ++ ++static inline void __send_IPI_one(unsigned int cpu, int vector) ++{ ++ int irq = per_cpu(ipi_to_irq, cpu)[vector]; ++ BUG_ON(irq < 0); ++ notify_remote_via_irq(irq); ++} ++ ++void xen_send_IPI_shortcut(unsigned int shortcut, int vector) ++{ ++ int cpu; ++ ++ switch (shortcut) { ++ case APIC_DEST_SELF: ++ __send_IPI_one(smp_processor_id(), vector); ++ break; ++ case APIC_DEST_ALLBUT: ++ for (cpu = 0; cpu < NR_CPUS; ++cpu) { ++ if (cpu == smp_processor_id()) ++ continue; ++ if (cpu_isset(cpu, cpu_online_map)) { ++ __send_IPI_one(cpu, vector); ++ } ++ } ++ break; ++ case APIC_DEST_ALLINC: ++ for (cpu = 0; cpu < NR_CPUS; ++cpu) { ++ if (cpu_isset(cpu, cpu_online_map)) { ++ __send_IPI_one(cpu, vector); ++ } ++ } ++ break; ++ default: ++ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, ++ vector); ++ break; ++ } ++} ++ ++static cpumask_t xen_target_cpus(void) ++{ ++ return cpu_online_map; ++} ++ ++static cpumask_t xen_vector_allocation_domain(int cpu) ++{ ++ return cpumask_of_cpu(cpu); ++} ++ ++/* ++ * Set up the logical destination ID. ++ * Do nothing, not called now. ++ */ ++static void xen_init_apic_ldr(void) ++{ ++ Dprintk("%s\n", __FUNCTION__); ++ return; ++} ++ ++static void xen_send_IPI_allbutself(int vector) ++{ ++ /* ++ * if there are no other CPUs in the system then ++ * we get an APIC send error if we try to broadcast. ++ * thus we have to avoid sending IPIs in this case. 
++ */ ++ Dprintk("%s\n", __FUNCTION__); ++ if (num_online_cpus() > 1) ++ xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector); ++} ++ ++static void xen_send_IPI_all(int vector) ++{ ++ Dprintk("%s\n", __FUNCTION__); ++ xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector); ++} ++ ++static void xen_send_IPI_mask(cpumask_t cpumask, int vector) ++{ ++ unsigned long mask = cpus_addr(cpumask)[0]; ++ unsigned int cpu; ++ unsigned long flags; ++ ++ Dprintk("%s\n", __FUNCTION__); ++ local_irq_save(flags); ++ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); ++ ++ for (cpu = 0; cpu < NR_CPUS; ++cpu) { ++ if (cpu_isset(cpu, cpumask)) { ++ __send_IPI_one(cpu, vector); ++ } ++ } ++ local_irq_restore(flags); ++} ++ ++#ifdef CONFIG_XEN_PRIVILEGED_GUEST ++static int xen_apic_id_registered(void) ++{ ++ /* better be set */ ++ Dprintk("%s\n", __FUNCTION__); ++ return physid_isset(smp_processor_id(), phys_cpu_present_map); ++} ++#endif ++ ++static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask) ++{ ++ Dprintk("%s\n", __FUNCTION__); ++ return cpus_addr(cpumask)[0]; ++} ++ ++static unsigned int phys_pkg_id(int index_msb) ++{ ++ u32 ebx; ++ ++ Dprintk("%s\n", __FUNCTION__); ++ ebx = cpuid_ebx(1); ++ return ((ebx >> 24) & 0xFF) >> index_msb; ++} ++ ++struct genapic apic_xen = { ++ .name = "xen", ++#ifdef CONFIG_XEN_PRIVILEGED_GUEST ++ .int_delivery_mode = dest_LowestPrio, ++#endif ++ .int_dest_mode = 1, ++ .target_cpus = xen_target_cpus, ++ .vector_allocation_domain = xen_vector_allocation_domain, ++#ifdef CONFIG_XEN_PRIVILEGED_GUEST ++ .apic_id_registered = xen_apic_id_registered, ++#endif ++ .init_apic_ldr = xen_init_apic_ldr, ++ .send_IPI_all = xen_send_IPI_all, ++ .send_IPI_allbutself = xen_send_IPI_allbutself, ++ .send_IPI_mask = xen_send_IPI_mask, ++ .cpu_mask_to_apicid = xen_cpu_mask_to_apicid, ++ .phys_pkg_id = phys_pkg_id, ++}; +diff --git a/arch/x86/kernel/head64-xen.c b/arch/x86/kernel/head64-xen.c +new file mode 100644 +index 0000000..b25ebf6 +--- /dev/null ++++ 
b/arch/x86/kernel/head64-xen.c +@@ -0,0 +1,229 @@ ++/* ++ * prepare to run common code ++ * ++ * Copyright (C) 2000 Andrea Arcangeli SuSE ++ * ++ * Jun Nakajima ++ * Modified for Xen. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++unsigned long start_pfn; ++ ++#ifndef CONFIG_XEN ++static void __init zap_identity_mappings(void) ++{ ++ pgd_t *pgd = pgd_offset_k(0UL); ++ pgd_clear(pgd); ++ __flush_tlb_all(); ++} ++ ++/* Don't add a printk in there. printk relies on the PDA which is not initialized ++ yet. */ ++static void __init clear_bss(void) ++{ ++ memset(__bss_start, 0, ++ (unsigned long) __bss_stop - (unsigned long) __bss_start); ++} ++#endif ++ ++static void __init copy_bootdata(char *real_mode_data) ++{ ++#ifndef CONFIG_XEN ++ char * command_line; ++ ++ memcpy(&boot_params, real_mode_data, sizeof boot_params); ++ if (boot_params.hdr.cmd_line_ptr) { ++ command_line = __va(boot_params.hdr.cmd_line_ptr); ++ memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); ++ } ++#else ++ int max_cmdline; ++ ++ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) ++ max_cmdline = COMMAND_LINE_SIZE; ++ memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline); ++ boot_command_line[max_cmdline-1] = '\0'; ++#endif ++} ++ ++#include ++unsigned long *machine_to_phys_mapping; ++EXPORT_SYMBOL(machine_to_phys_mapping); ++unsigned int machine_to_phys_order; ++EXPORT_SYMBOL(machine_to_phys_order); ++ ++#define BIOS_LOWMEM_KILOBYTES 0x413 ++ ++/* ++ * The BIOS places the EBDA/XBDA at the top of conventional ++ * memory, and usually decreases the reported amount of ++ * conventional memory (int 0x12) too. This also contains a ++ * workaround for Dell systems that neglect to reserve EBDA. 
++ * The same workaround also avoids a problem with the AMD768MPX ++ * chipset: reserve a page before VGA to prevent PCI prefetch ++ * into it (errata #56). Usually the page is reserved anyways, ++ * unless you have no PS/2 mouse plugged in. ++ */ ++static void __init reserve_ebda_region(void) ++{ ++#ifndef CONFIG_XEN ++ unsigned int lowmem, ebda_addr; ++ ++ /* To determine the position of the EBDA and the */ ++ /* end of conventional memory, we need to look at */ ++ /* the BIOS data area. In a paravirtual environment */ ++ /* that area is absent. We'll just have to assume */ ++ /* that the paravirt case can handle memory setup */ ++ /* correctly, without our help. */ ++ if (paravirt_enabled()) ++ return; ++ ++ /* end of low (conventional) memory */ ++ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); ++ lowmem <<= 10; ++ ++ /* start of EBDA area */ ++ ebda_addr = get_bios_ebda(); ++ ++ /* Fixup: bios puts an EBDA in the top 64K segment */ ++ /* of conventional memory, but does not adjust lowmem. */ ++ if ((lowmem - ebda_addr) <= 0x10000) ++ lowmem = ebda_addr; ++ ++ /* Fixup: bios does not report an EBDA at all. */ ++ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ ++ if ((ebda_addr == 0) && (lowmem >= 0x9f000)) ++ lowmem = 0x9f000; ++ ++ /* Paranoia: should never happen, but... 
*/ ++ if ((lowmem == 0) || (lowmem >= 0x100000)) ++ lowmem = 0x9f000; ++ ++ /* reserve all memory between lowmem and the 1MB mark */ ++ reserve_early(lowmem, 0x100000, "BIOS reserved"); ++#endif ++} ++ ++static void __init reserve_setup_data(void) ++{ ++#ifndef CONFIG_XEN ++ struct setup_data *data; ++ unsigned long pa_data; ++ char buf[32]; ++ ++ if (boot_params.hdr.version < 0x0209) ++ return; ++ pa_data = boot_params.hdr.setup_data; ++ while (pa_data) { ++ data = early_ioremap(pa_data, sizeof(*data)); ++ sprintf(buf, "setup data %x", data->type); ++ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); ++ pa_data = data->next; ++ early_iounmap(data, sizeof(*data)); ++ } ++#endif ++} ++ ++void __init x86_64_start_kernel(char * real_mode_data) ++{ ++ struct xen_machphys_mapping mapping; ++ unsigned long machine_to_phys_nr_ents; ++ int i; ++ ++ /* ++ * Build-time sanity checks on the kernel image and module ++ * area mappings. (these are purely build-time and produce no code) ++ */ ++ BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START); ++ BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE); ++ BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); ++ BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0); ++ BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); ++ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); ++ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == ++ (__START_KERNEL & PGDIR_MASK))); ++ ++ xen_setup_features(); ++ ++ xen_start_info = (struct start_info *)real_mode_data; ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) ++ phys_to_machine_mapping = ++ (unsigned long *)xen_start_info->mfn_list; ++ start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) + ++ xen_start_info->nr_pt_frames; ++ ++ machine_to_phys_mapping = (unsigned long *)MACH2PHYS_VIRT_START; ++ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; ++ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { ++ machine_to_phys_mapping = (unsigned long 
*)mapping.v_start; ++ machine_to_phys_nr_ents = mapping.max_mfn + 1; ++ } ++ while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) ++ machine_to_phys_order++; ++ ++#ifndef CONFIG_XEN ++ /* clear bss before set_intr_gate with early_idt_handler */ ++ clear_bss(); ++ ++ /* Make NULL pointers segfault */ ++ zap_identity_mappings(); ++ ++ /* Cleanup the over mapped high alias */ ++ cleanup_highmap(); ++ ++ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { ++#ifdef CONFIG_EARLY_PRINTK ++ set_intr_gate(i, &early_idt_handlers[i]); ++#else ++ set_intr_gate(i, early_idt_handler); ++#endif ++ } ++ load_idt((const struct desc_ptr *)&idt_descr); ++#endif ++ ++ early_printk("Kernel alive\n"); ++ ++ for (i = 0; i < NR_CPUS; i++) ++ cpu_pda(i) = &boot_cpu_pda[i]; ++ ++ pda_init(0); ++ copy_bootdata(__va(real_mode_data)); ++ ++ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); ++ ++ reserve_early(round_up(__pa_symbol(&_end), PAGE_SIZE), ++ start_pfn << PAGE_SHIFT, "Xen provided"); ++ ++ reserve_ebda_region(); ++ reserve_setup_data(); ++ ++ /* ++ * At this point everything still needed from the boot loader ++ * or BIOS or kernel text should be early reserved or marked not ++ * RAM in e820. All other memory is free game. ++ */ ++ ++ start_kernel(); ++} +diff --git a/arch/x86/kernel/head_32-xen.S b/arch/x86/kernel/head_32-xen.S +new file mode 100644 +index 0000000..3a4160f +--- /dev/null ++++ b/arch/x86/kernel/head_32-xen.S +@@ -0,0 +1,164 @@ ++ ++ ++.text ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * References to members of the new_cpu_data structure. 
++ */ ++ ++#define X86 new_cpu_data+CPUINFO_x86 ++#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor ++#define X86_MODEL new_cpu_data+CPUINFO_x86_model ++#define X86_MASK new_cpu_data+CPUINFO_x86_mask ++#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math ++#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level ++#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability ++#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id ++ ++.section .text.head,"ax",@progbits ++#define VIRT_ENTRY_OFFSET 0x0 ++.org VIRT_ENTRY_OFFSET ++ENTRY(startup_32) ++ movl %esi,xen_start_info ++ cld ++ ++ /* Set up the stack pointer */ ++ movl $(init_thread_union+THREAD_SIZE),%esp ++ ++ movl %ss,%eax ++ movl %eax,%fs # gets reset once there's real percpu ++ ++ /* get vendor info */ ++ xorl %eax,%eax # call CPUID with 0 -> return vendor ID ++ XEN_CPUID ++ movl %eax,X86_CPUID # save CPUID level ++ movl %ebx,X86_VENDOR_ID # lo 4 chars ++ movl %edx,X86_VENDOR_ID+4 # next 4 chars ++ movl %ecx,X86_VENDOR_ID+8 # last 4 chars ++ ++ movl $1,%eax # Use the CPUID instruction to get CPU type ++ XEN_CPUID ++ movb %al,%cl # save reg for future use ++ andb $0x0f,%ah # mask processor family ++ movb %ah,X86 ++ andb $0xf0,%al # mask model ++ shrb $4,%al ++ movb %al,X86_MODEL ++ andb $0x0f,%cl # mask mask revision ++ movb %cl,X86_MASK ++ movl %edx,X86_CAPABILITY ++ ++ movb $1,X86_HARD_MATH ++ ++ xorl %eax,%eax # Clear GS ++ movl %eax,%gs ++ ++ cld # gcc2 wants the direction flag cleared at all times ++ ++ pushl $0 # fake return address for unwinder ++ jmp i386_start_kernel ++ ++#define HYPERCALL_PAGE_OFFSET 0x1000 ++.org HYPERCALL_PAGE_OFFSET ++ENTRY(hypercall_page) ++ CFI_STARTPROC ++.skip 0x1000 ++ CFI_ENDPROC ++ ++/* ++ * Real beginning of normal "text" segment ++ */ ++ENTRY(stext) ++ENTRY(_stext) ++ ++/* ++ * BSS section ++ */ ++.section ".bss.page_aligned","wa" ++ .align PAGE_SIZE_asm ++ENTRY(swapper_pg_fixmap) ++ .fill 1024,4,0 ++ENTRY(empty_zero_page) ++ .fill 4096,1,0 ++ ++/* ++ * This starts the data 
section. ++ */ ++.data ++ ++#if CONFIG_XEN_COMPAT <= 0x030002 ++/* ++ * __xen_guest information ++ */ ++.macro utoa value ++ .if (\value) < 0 || (\value) >= 0x10 ++ utoa (((\value)>>4)&0x0fffffff) ++ .endif ++ .if ((\value) & 0xf) < 10 ++ .byte '0' + ((\value) & 0xf) ++ .else ++ .byte 'A' + ((\value) & 0xf) - 10 ++ .endif ++.endm ++ ++.section __xen_guest ++ .ascii "GUEST_OS=linux,GUEST_VER=2.6" ++ .ascii ",XEN_VER=xen-3.0" ++ .ascii ",VIRT_BASE=0x" ++ utoa __PAGE_OFFSET ++ .ascii ",ELF_PADDR_OFFSET=0x" ++ utoa __PAGE_OFFSET ++ .ascii ",VIRT_ENTRY=0x" ++ utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) ++ .ascii ",HYPERCALL_PAGE=0x" ++ utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) ++ .ascii ",FEATURES=writable_page_tables" ++ .ascii "|writable_descriptor_tables" ++ .ascii "|auto_translated_physmap" ++ .ascii "|pae_pgdir_above_4gb" ++ .ascii "|supervisor_mode_kernel" ++#ifdef CONFIG_X86_PAE ++ .ascii ",PAE=yes[extended-cr3]" ++#else ++ .ascii ",PAE=no" ++#endif ++ .ascii ",LOADER=generic" ++ .byte 0 ++#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ ++ ++ ++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ++ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") ++ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) ++#if CONFIG_XEN_COMPAT <= 0x030002 ++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long __PAGE_OFFSET) ++#else ++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long 0) ++#endif ++ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_32) ++ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) ++ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long HYPERVISOR_VIRT_START) ++ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") ++#ifdef CONFIG_X86_PAE ++ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ++ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) 
++#else ++ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") ++ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long _PAGE_PRESENT, _PAGE_PRESENT) ++#endif ++ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ++ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) +diff --git a/arch/x86/kernel/head_64-xen.S b/arch/x86/kernel/head_64-xen.S +new file mode 100644 +index 0000000..fdd4408 +--- /dev/null ++++ b/arch/x86/kernel/head_64-xen.S +@@ -0,0 +1,211 @@ ++/* ++ * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit ++ * ++ * Copyright (C) 2000 Andrea Arcangeli SuSE ++ * Copyright (C) 2000 Pavel Machek ++ * Copyright (C) 2000 Karsten Keil ++ * Copyright (C) 2001,2002 Andi Kleen ++ * Copyright (C) 2005 Eric Biederman ++ * Jun Nakajima ++ * Modified for Xen ++ */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ .section .text.head, "ax", @progbits ++ .code64 ++ .globl startup_64 ++startup_64: ++ movq $(init_thread_union+THREAD_SIZE-8),%rsp ++ ++ /* rsi is pointer to startup info structure. ++ pass it to C */ ++ movq %rsi,%rdi ++ pushq $0 # fake return address ++ jmp x86_64_start_kernel ++ ++#ifdef CONFIG_ACPI_SLEEP ++.org 0xf00 ++ .globl pGDT32 ++pGDT32: ++ .word gdt_end-cpu_gdt_table-1 ++ .long cpu_gdt_table-__START_KERNEL_map ++#endif ++ ++.balign PAGE_SIZE ++ ++#define NEXT_PAGE(name) \ ++ .balign PAGE_SIZE; \ ++ phys_##name = . - .text.head; \ ++ENTRY(name) ++ ++NEXT_PAGE(init_level4_pgt) ++ .fill 512,8,0 ++ /* ++ * We update two pgd entries to make kernel and user pgd consistent ++ * at pgd_populate(). It can be used for kernel modules. So we place ++ * this page here for those cases to avoid memory corruption. ++ * We also use this page to establish the initial mapping for the ++ * vsyscall area. 
++ */ ++ .fill 512,8,0 ++ ++NEXT_PAGE(level3_kernel_pgt) ++ .fill 512,8,0 ++ ++ /* ++ * This is used for vsyscall area mapping as we have a different ++ * level4 page table for user. ++ */ ++NEXT_PAGE(level3_user_pgt) ++ .fill 512,8,0 ++ ++NEXT_PAGE(level2_kernel_pgt) ++ .fill 512,8,0 ++ ++NEXT_PAGE(level2_fixmap_pgt) ++ .fill 512,8,0 ++ ++NEXT_PAGE(level1_fixmap_pgt) ++ .fill 512,8,0 ++ ++NEXT_PAGE(hypercall_page) ++ CFI_STARTPROC ++ .rept 0x1000 / 0x20 ++ .skip 1 /* push %rcx */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rcx,0 ++ .skip 2 /* push %r11 */ ++ CFI_ADJUST_CFA_OFFSET 8 ++ CFI_REL_OFFSET rcx,0 ++ .skip 5 /* mov $#,%eax */ ++ .skip 2 /* syscall */ ++ .skip 2 /* pop %r11 */ ++ CFI_ADJUST_CFA_OFFSET -8 ++ CFI_RESTORE r11 ++ .skip 1 /* pop %rcx */ ++ CFI_ADJUST_CFA_OFFSET -8 ++ CFI_RESTORE rcx ++ .align 0x20,0 /* ret */ ++ .endr ++ CFI_ENDPROC ++ ++#undef NEXT_PAGE ++ ++ .data ++/* Just dummy symbol to allow compilation. Not used in sleep path */ ++#ifdef CONFIG_ACPI_SLEEP ++ .align PAGE_SIZE ++ENTRY(wakeup_level4_pgt) ++ .fill 512,8,0 ++#endif ++ ++ .data ++ ++ .align 16 ++ .globl cpu_gdt_descr ++cpu_gdt_descr: ++ .word gdt_end-cpu_gdt_table-1 ++gdt: ++ .quad cpu_gdt_table ++#ifdef CONFIG_SMP ++ .rept NR_CPUS-1 ++ .word 0 ++ .quad 0 ++ .endr ++#endif ++ ++/* We need valid kernel segments for data and code in long mode too ++ * IRET will check the segment types kkeil 2000/10/28 ++ * Also sysret mandates a special GDT layout ++ */ ++ ++ .section .data.page_aligned, "aw" ++ .align PAGE_SIZE ++ ++/* The TLS descriptors are currently at a different place compared to i386. ++ Hopefully nobody expects them at a fixed place (Wine?) 
*/ ++ ++ENTRY(cpu_gdt_table) ++ .quad 0x0000000000000000 /* NULL descriptor */ ++ .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ ++ .quad 0x00af9b000000ffff /* __KERNEL_CS */ ++ .quad 0x00cf93000000ffff /* __KERNEL_DS */ ++ .quad 0x00cffb000000ffff /* __USER32_CS */ ++ .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ ++ .quad 0x00affb000000ffff /* __USER_CS */ ++ .quad 0x0 /* unused */ ++ .quad 0,0 /* TSS */ ++ .quad 0,0 /* LDT */ ++ .quad 0,0,0 /* three TLS descriptors */ ++ .quad 0x0000f40000000000 /* node/CPU stored in limit */ ++gdt_end: ++ /* asm/segment.h:GDT_ENTRIES must match this */ ++ /* This should be a multiple of the cache line size */ ++ /* GDTs of other CPUs are now dynamically allocated */ ++ ++ /* zero the remaining page */ ++ .fill PAGE_SIZE / 8 - GDT_ENTRIES,8,0 ++ ++ .section .bss.page_aligned, "aw", @nobits ++ .align PAGE_SIZE ++ENTRY(empty_zero_page) ++ .skip PAGE_SIZE ++ ++#if CONFIG_XEN_COMPAT <= 0x030002 ++/* ++ * __xen_guest information ++ */ ++.macro utoh value ++ i = 64 ++ .rept 16 ++ i = i - 4 ++ .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) ++ .endr ++.endm ++ ++.section __xen_guest ++ .ascii "GUEST_OS=linux,GUEST_VER=2.6" ++ .ascii ",XEN_VER=xen-3.0" ++ .ascii ",VIRT_BASE=0x" ++ utoh __START_KERNEL_map ++ .ascii ",ELF_PADDR_OFFSET=0x" ++ utoh __START_KERNEL_map ++ .ascii ",VIRT_ENTRY=0x" ++ utoh (__START_KERNEL_map + __PHYSICAL_START) ++ .ascii ",HYPERCALL_PAGE=0x" ++ utoh (phys_hypercall_page >> PAGE_SHIFT) ++ .ascii ",FEATURES=writable_page_tables" ++ .ascii "|writable_descriptor_tables" ++ .ascii "|auto_translated_physmap" ++ .ascii "|supervisor_mode_kernel" ++ .ascii ",LOADER=generic" ++ .byte 0 ++#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ ++ ++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ++ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ++ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") ++ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map) ++#if 
CONFIG_XEN_COMPAT <= 0x030002 ++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad __START_KERNEL_map) ++#else ++ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0) ++#endif ++ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) ++ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) ++ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) ++ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") ++ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ++ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) +diff --git a/arch/x86/kernel/init_task-xen.c b/arch/x86/kernel/init_task-xen.c +new file mode 100644 +index 0000000..420e4c7 +--- /dev/null ++++ b/arch/x86/kernel/init_task-xen.c +@@ -0,0 +1,51 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++static struct fs_struct init_fs = INIT_FS; ++static struct signal_struct init_signals = INIT_SIGNALS(init_signals); ++static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); ++#ifdef CONFIG_X86_XEN ++#define swapper_pg_dir ((pgd_t *)NULL) ++#endif ++struct mm_struct init_mm = INIT_MM(init_mm); ++#undef swapper_pg_dir ++EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ ++ ++/* ++ * Initial thread structure. ++ * ++ * We need to make sure that this is THREAD_SIZE aligned due to the ++ * way process stacks are handled. This is done by having a special ++ * "init_task" linker map entry.. ++ */ ++union thread_union init_thread_union ++ __attribute__((__section__(".data.init_task"))) = ++ { INIT_THREAD_INFO(init_task) }; ++ ++/* ++ * Initial task structure. ++ * ++ * All other task structs will be allocated on slabs in fork.c ++ */ ++struct task_struct init_task = INIT_TASK(init_task); ++EXPORT_SYMBOL(init_task); ++ ++#ifndef CONFIG_X86_NO_TSS ++/* ++ * no more per-task TSS's. 
The TSS size is kept cacheline-aligned ++ * so they are allowed to end up in the .data.cacheline_aligned ++ * section. Since TSS's are completely CPU-local, we want them ++ * on exact cacheline boundaries, to eliminate cacheline ping-pong. ++ */ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; ++#endif ++ +diff --git a/arch/x86/kernel/io_apic_32-xen.c b/arch/x86/kernel/io_apic_32-xen.c +new file mode 100644 +index 0000000..5646681 +--- /dev/null ++++ b/arch/x86/kernel/io_apic_32-xen.c +@@ -0,0 +1,2913 @@ ++/* ++ * Intel IO-APIC support for multi-Pentium hosts. ++ * ++ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo ++ * ++ * Many thanks to Stig Venaas for trying out countless experimental ++ * patches and reporting/debugging problems patiently! ++ * ++ * (c) 1999, Multiple IO-APIC support, developed by ++ * Ken-ichi Yaku and ++ * Hidemi Kishimoto , ++ * further tested and cleaned up by Zach Brown ++ * and Ingo Molnar ++ * ++ * Fixes ++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; ++ * thanks to Eric Gilmore ++ * and Rolf G. 
Tews ++ * for testing these extensively ++ * Paul Diefenbaugh : Added full ACPI support ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* time_after() */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_XEN ++#include ++#include ++ ++/* Fake i8259 */ ++#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) ++#define disable_8259A_irq(_irq) ((void)0) ++#define i8259A_irq_pending(_irq) (0) ++ ++unsigned long io_apic_irqs; ++ ++#define clear_IO_APIC() ((void)0) ++#else ++int (*ioapic_renumber_irq)(int ioapic, int irq); ++atomic_t irq_mis_count; ++#endif /* CONFIG_XEN */ ++ ++/* Where if anywhere is the i8259 connect in external int mode */ ++static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; ++ ++static DEFINE_SPINLOCK(ioapic_lock); ++static DEFINE_SPINLOCK(vector_lock); ++ ++#ifndef CONFIG_XEN ++int timer_over_8254 __initdata = 1; ++#endif ++ ++/* ++ * Is the SiS APIC rmw bug present ? ++ * -1 = don't know, 0 = no, 1 = yes ++ */ ++int sis_apic_bug = -1; ++ ++/* ++ * # of IRQ routing registers ++ */ ++int nr_ioapic_registers[MAX_IO_APICS]; ++ ++/* I/O APIC entries */ ++struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; ++int nr_ioapics; ++ ++/* MP IRQ source entries */ ++struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; ++ ++/* # of MP IRQ source entries */ ++int mp_irq_entries; ++ ++#ifndef CONFIG_XEN ++static int disable_timer_pin_1 __initdata; ++#endif ++ ++/* ++ * Rough estimation of how many shared IRQs there are, can ++ * be changed anytime. ++ */ ++#define MAX_PLUS_SHARED_IRQS NR_IRQS ++#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) ++ ++/* ++ * This is performance-critical, we want to do it O(1) ++ * ++ * the indexing order of this array favors 1:1 mappings ++ * between pins and IRQs. 
++ */ ++ ++static struct irq_pin_list { ++ int apic, pin, next; ++} irq_2_pin[PIN_MAP_SIZE]; ++ ++#ifndef CONFIG_XEN ++struct io_apic { ++ unsigned int index; ++ unsigned int unused[3]; ++ unsigned int data; ++}; ++ ++static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) ++{ ++ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) ++ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); ++} ++#endif ++ ++static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) ++{ ++#ifndef CONFIG_XEN ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ return readl(&io_apic->data); ++#else ++ struct physdev_apic apic_op; ++ int ret; ++ ++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; ++ apic_op.reg = reg; ++ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); ++ if (ret) ++ return ret; ++ return apic_op.value; ++#endif ++} ++ ++static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++#ifndef CONFIG_XEN ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ writel(value, &io_apic->data); ++#else ++ struct physdev_apic apic_op; ++ ++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; ++ apic_op.reg = reg; ++ apic_op.value = value; ++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); ++#endif ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * Re-write a value: to be used for read-modify-write ++ * cycles where the read already set up the index register. 
++ * ++ * Older SiS APIC requires we rewrite the index register ++ */ ++static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++ volatile struct io_apic __iomem *io_apic = io_apic_base(apic); ++ if (sis_apic_bug) ++ writel(reg, &io_apic->index); ++ writel(value, &io_apic->data); ++} ++#else ++#define io_apic_modify io_apic_write ++#endif ++ ++union entry_union { ++ struct { u32 w1, w2; }; ++ struct IO_APIC_route_entry entry; ++}; ++ ++static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) ++{ ++ union entry_union eu; ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); ++ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ return eu.entry; ++} ++ ++/* ++ * When we write a new IO APIC routing entry, we need to write the high ++ * word first! If the mask bit in the low word is clear, we will enable ++ * the interrupt, and we need to make sure the entry is fully populated ++ * before that happens. ++ */ ++static void ++__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ union entry_union eu; ++ eu.entry = e; ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++} ++ ++static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __ioapic_write_entry(apic, pin, e); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * When we mask an IO APIC routing entry, we need to write the low ++ * word first, in order to set the mask bit before we change the ++ * high bits! 
++ */ ++static void ioapic_mask_entry(int apic, int pin) ++{ ++ unsigned long flags; ++ union entry_union eu = { .entry.mask = 1 }; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++#endif ++ ++/* ++ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are ++ * shared ISA-space IRQs, so we have to support them. We are super ++ * fast in the common case, and fast for shared ISA-space IRQs. ++ */ ++static void add_pin_to_irq(unsigned int irq, int apic, int pin) ++{ ++ static int first_free_entry = NR_IRQS; ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ ++ while (entry->next) ++ entry = irq_2_pin + entry->next; ++ ++ if (entry->pin != -1) { ++ entry->next = first_free_entry; ++ entry = irq_2_pin + entry->next; ++ if (++first_free_entry >= PIN_MAP_SIZE) ++ panic("io_apic.c: whoops"); ++ } ++ entry->apic = apic; ++ entry->pin = pin; ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * Reroute an IRQ to a different pin. 
++ */ ++static void __init replace_pin_at_irq(unsigned int irq, ++ int oldapic, int oldpin, ++ int newapic, int newpin) ++{ ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ ++ while (1) { ++ if (entry->apic == oldapic && entry->pin == oldpin) { ++ entry->apic = newapic; ++ entry->pin = newpin; ++ } ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++} ++ ++static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) ++{ ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ unsigned int pin, reg; ++ ++ for (;;) { ++ pin = entry->pin; ++ if (pin == -1) ++ break; ++ reg = io_apic_read(entry->apic, 0x10 + pin*2); ++ reg &= ~disable; ++ reg |= enable; ++ io_apic_modify(entry->apic, 0x10 + pin*2, reg); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++} ++ ++/* mask = 1 */ ++static void __mask_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0x00010000, 0); ++} ++ ++/* mask = 0 */ ++static void __unmask_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0, 0x00010000); ++} ++ ++/* mask = 1, trigger = 0 */ ++static void __mask_and_edge_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); ++} ++ ++/* mask = 0, trigger = 1 */ ++static void __unmask_and_level_IO_APIC_irq (unsigned int irq) ++{ ++ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); ++} ++ ++static void mask_IO_APIC_irq (unsigned int irq) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __mask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void unmask_IO_APIC_irq (unsigned int irq) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __unmask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ /* Check delivery_mode to be sure we're not clearing 
an SMI pin */ ++ entry = ioapic_read_entry(apic, pin); ++ if (entry.delivery_mode == dest_SMI) ++ return; ++ ++ /* ++ * Disable it in the IO-APIC irq-routing table: ++ */ ++ ioapic_mask_entry(apic, pin); ++} ++ ++static void clear_IO_APIC (void) ++{ ++ int apic, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ++ clear_IO_APIC_pin(apic, pin); ++} ++ ++#ifdef CONFIG_SMP ++static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) ++{ ++ unsigned long flags; ++ int pin; ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ unsigned int apicid_value; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, cpumask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ tmp = TARGET_CPUS; ++ ++ cpus_and(cpumask, tmp, CPU_MASK_ALL); ++ ++ apicid_value = cpu_mask_to_apicid(cpumask); ++ /* Prepare to do the io_apic_write */ ++ apicid_value = apicid_value << 24; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ for (;;) { ++ pin = entry->pin; ++ if (pin == -1) ++ break; ++ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++ irq_desc[irq].affinity = cpumask; ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++#if defined(CONFIG_IRQBALANCE) ++# include /* kernel_thread() */ ++# include /* kstat */ ++# include /* kmalloc() */ ++# include ++ ++#define IRQBALANCE_CHECK_ARCH -999 ++#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) ++#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) ++#define BALANCED_IRQ_MORE_DELTA (HZ/10) ++#define BALANCED_IRQ_LESS_DELTA (HZ) ++ ++static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; ++static int physical_balance __read_mostly; ++static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; ++ ++static struct irq_cpu_info { ++ unsigned long * last_irq; ++ unsigned long * irq_delta; ++ unsigned long irq; ++} irq_cpu_data[NR_CPUS]; ++ ++#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) ++#define 
LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) ++#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) ++ ++#define IDLE_ENOUGH(cpu,now) \ ++ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) ++ ++#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) ++ ++#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) ++ ++static cpumask_t balance_irq_affinity[NR_IRQS] = { ++ [0 ... NR_IRQS-1] = CPU_MASK_ALL ++}; ++ ++void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ balance_irq_affinity[irq] = mask; ++} ++ ++static unsigned long move(int curr_cpu, cpumask_t allowed_mask, ++ unsigned long now, int direction) ++{ ++ int search_idle = 1; ++ int cpu = curr_cpu; ++ ++ goto inside; ++ ++ do { ++ if (unlikely(cpu == curr_cpu)) ++ search_idle = 0; ++inside: ++ if (direction == 1) { ++ cpu++; ++ if (cpu >= NR_CPUS) ++ cpu = 0; ++ } else { ++ cpu--; ++ if (cpu == -1) ++ cpu = NR_CPUS-1; ++ } ++ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || ++ (search_idle && !IDLE_ENOUGH(cpu,now))); ++ ++ return cpu; ++} ++ ++static inline void balance_irq(int cpu, int irq) ++{ ++ unsigned long now = jiffies; ++ cpumask_t allowed_mask; ++ unsigned int new_cpu; ++ ++ if (irqbalance_disabled) ++ return; ++ ++ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); ++ new_cpu = move(cpu, allowed_mask, now, 1); ++ if (cpu != new_cpu) { ++ set_pending_irq(irq, cpumask_of_cpu(new_cpu)); ++ } ++} ++ ++static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) ++{ ++ int i, j; ++ ++ for_each_online_cpu(i) { ++ for (j = 0; j < NR_IRQS; j++) { ++ if (!irq_desc[j].action) ++ continue; ++ /* Is it a significant load ? 
*/ ++ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < ++ useful_load_threshold) ++ continue; ++ balance_irq(i, j); ++ } ++ } ++ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, ++ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); ++ return; ++} ++ ++static void do_irq_balance(void) ++{ ++ int i, j; ++ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); ++ unsigned long move_this_load = 0; ++ int max_loaded = 0, min_loaded = 0; ++ int load; ++ unsigned long useful_load_threshold = balanced_irq_interval + 10; ++ int selected_irq; ++ int tmp_loaded, first_attempt = 1; ++ unsigned long tmp_cpu_irq; ++ unsigned long imbalance = 0; ++ cpumask_t allowed_mask, target_cpu_mask, tmp; ++ ++ for_each_possible_cpu(i) { ++ int package_index; ++ CPU_IRQ(i) = 0; ++ if (!cpu_online(i)) ++ continue; ++ package_index = CPU_TO_PACKAGEINDEX(i); ++ for (j = 0; j < NR_IRQS; j++) { ++ unsigned long value_now, delta; ++ /* Is this an active IRQ or balancing disabled ? */ ++ if (!irq_desc[j].action || irq_balancing_disabled(j)) ++ continue; ++ if ( package_index == i ) ++ IRQ_DELTA(package_index,j) = 0; ++ /* Determine the total count per processor per IRQ */ ++ value_now = (unsigned long) kstat_cpu(i).irqs[j]; ++ ++ /* Determine the activity per processor per IRQ */ ++ delta = value_now - LAST_CPU_IRQ(i,j); ++ ++ /* Update last_cpu_irq[][] for the next time */ ++ LAST_CPU_IRQ(i,j) = value_now; ++ ++ /* Ignore IRQs whose rate is less than the clock */ ++ if (delta < useful_load_threshold) ++ continue; ++ /* update the load for the processor or package total */ ++ IRQ_DELTA(package_index,j) += delta; ++ ++ /* Keep track of the higher numbered sibling as well */ ++ if (i != package_index) ++ CPU_IRQ(i) += delta; ++ /* ++ * We have sibling A and sibling B in the package ++ * ++ * cpu_irq[A] = load for cpu A + load for cpu B ++ * cpu_irq[B] = load for cpu B ++ */ ++ CPU_IRQ(package_index) += delta; ++ } ++ } ++ /* Find the least loaded processor package */ ++ for_each_online_cpu(i) { ++ if 
(i != CPU_TO_PACKAGEINDEX(i)) ++ continue; ++ if (min_cpu_irq > CPU_IRQ(i)) { ++ min_cpu_irq = CPU_IRQ(i); ++ min_loaded = i; ++ } ++ } ++ max_cpu_irq = ULONG_MAX; ++ ++tryanothercpu: ++ /* Look for heaviest loaded processor. ++ * We may come back to get the next heaviest loaded processor. ++ * Skip processors with trivial loads. ++ */ ++ tmp_cpu_irq = 0; ++ tmp_loaded = -1; ++ for_each_online_cpu(i) { ++ if (i != CPU_TO_PACKAGEINDEX(i)) ++ continue; ++ if (max_cpu_irq <= CPU_IRQ(i)) ++ continue; ++ if (tmp_cpu_irq < CPU_IRQ(i)) { ++ tmp_cpu_irq = CPU_IRQ(i); ++ tmp_loaded = i; ++ } ++ } ++ ++ if (tmp_loaded == -1) { ++ /* In the case of small number of heavy interrupt sources, ++ * loading some of the cpus too much. We use Ingo's original ++ * approach to rotate them around. ++ */ ++ if (!first_attempt && imbalance >= useful_load_threshold) { ++ rotate_irqs_among_cpus(useful_load_threshold); ++ return; ++ } ++ goto not_worth_the_effort; ++ } ++ ++ first_attempt = 0; /* heaviest search */ ++ max_cpu_irq = tmp_cpu_irq; /* load */ ++ max_loaded = tmp_loaded; /* processor */ ++ imbalance = (max_cpu_irq - min_cpu_irq) / 2; ++ ++ /* if imbalance is less than approx 10% of max load, then ++ * observe diminishing returns action. - quit ++ */ ++ if (imbalance < (max_cpu_irq >> 3)) ++ goto not_worth_the_effort; ++ ++tryanotherirq: ++ /* if we select an IRQ to move that can't go where we want, then ++ * see if there is another one to try. ++ */ ++ move_this_load = 0; ++ selected_irq = -1; ++ for (j = 0; j < NR_IRQS; j++) { ++ /* Is this an active IRQ? */ ++ if (!irq_desc[j].action) ++ continue; ++ if (imbalance <= IRQ_DELTA(max_loaded,j)) ++ continue; ++ /* Try to find the IRQ that is closest to the imbalance ++ * without going over. 
++ */ ++ if (move_this_load < IRQ_DELTA(max_loaded,j)) { ++ move_this_load = IRQ_DELTA(max_loaded,j); ++ selected_irq = j; ++ } ++ } ++ if (selected_irq == -1) { ++ goto tryanothercpu; ++ } ++ ++ imbalance = move_this_load; ++ ++ /* For physical_balance case, we accumulated both load ++ * values in the one of the siblings cpu_irq[], ++ * to use the same code for physical and logical processors ++ * as much as possible. ++ * ++ * NOTE: the cpu_irq[] array holds the sum of the load for ++ * sibling A and sibling B in the slot for the lowest numbered ++ * sibling (A), _AND_ the load for sibling B in the slot for ++ * the higher numbered sibling. ++ * ++ * We seek the least loaded sibling by making the comparison ++ * (A+B)/2 vs B ++ */ ++ load = CPU_IRQ(min_loaded) >> 1; ++ for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { ++ if (load > CPU_IRQ(j)) { ++ /* This won't change cpu_sibling_map[min_loaded] */ ++ load = CPU_IRQ(j); ++ min_loaded = j; ++ } ++ } ++ ++ cpus_and(allowed_mask, ++ cpu_online_map, ++ balance_irq_affinity[selected_irq]); ++ target_cpu_mask = cpumask_of_cpu(min_loaded); ++ cpus_and(tmp, target_cpu_mask, allowed_mask); ++ ++ if (!cpus_empty(tmp)) { ++ /* mark for change destination */ ++ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); ++ ++ /* Since we made a change, come back sooner to ++ * check for more variation. 
++ */ ++ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, ++ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); ++ return; ++ } ++ goto tryanotherirq; ++ ++not_worth_the_effort: ++ /* ++ * if we did not find an IRQ to move, then adjust the time interval ++ * upward ++ */ ++ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, ++ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); ++ return; ++} ++ ++static int balanced_irq(void *unused) ++{ ++ int i; ++ unsigned long prev_balance_time = jiffies; ++ long time_remaining = balanced_irq_interval; ++ ++ /* push everything to CPU 0 to give us a starting point. */ ++ for (i = 0 ; i < NR_IRQS ; i++) { ++ irq_desc[i].pending_mask = cpumask_of_cpu(0); ++ set_pending_irq(i, cpumask_of_cpu(0)); ++ } ++ ++ set_freezable(); ++ for ( ; ; ) { ++ time_remaining = schedule_timeout_interruptible(time_remaining); ++ try_to_freeze(); ++ if (time_after(jiffies, ++ prev_balance_time+balanced_irq_interval)) { ++ preempt_disable(); ++ do_irq_balance(); ++ prev_balance_time = jiffies; ++ time_remaining = balanced_irq_interval; ++ preempt_enable(); ++ } ++ } ++ return 0; ++} ++ ++static int __init balanced_irq_init(void) ++{ ++ int i; ++ struct cpuinfo_x86 *c; ++ cpumask_t tmp; ++ ++ cpus_shift_right(tmp, cpu_online_map, 2); ++ c = &boot_cpu_data; ++ /* When not overwritten by the command line ask subarchitecture. 
*/ ++ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) ++ irqbalance_disabled = NO_BALANCE_IRQ; ++ if (irqbalance_disabled) ++ return 0; ++ ++ /* disable irqbalance completely if there is only one processor online */ ++ if (num_online_cpus() < 2) { ++ irqbalance_disabled = 1; ++ return 0; ++ } ++ /* ++ * Enable physical balance only if more than 1 physical processor ++ * is present ++ */ ++ if (smp_num_siblings > 1 && !cpus_empty(tmp)) ++ physical_balance = 1; ++ ++ for_each_online_cpu(i) { ++ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); ++ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); ++ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { ++ printk(KERN_ERR "balanced_irq_init: out of memory"); ++ goto failed; ++ } ++ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); ++ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); ++ } ++ ++ printk(KERN_INFO "Starting balanced_irq\n"); ++ if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) ++ return 0; ++ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); ++failed: ++ for_each_possible_cpu(i) { ++ kfree(irq_cpu_data[i].irq_delta); ++ irq_cpu_data[i].irq_delta = NULL; ++ kfree(irq_cpu_data[i].last_irq); ++ irq_cpu_data[i].last_irq = NULL; ++ } ++ return 0; ++} ++ ++int __devinit irqbalance_disable(char *str) ++{ ++ irqbalance_disabled = 1; ++ return 1; ++} ++ ++__setup("noirqbalance", irqbalance_disable); ++ ++late_initcall(balanced_irq_init); ++#endif /* CONFIG_IRQBALANCE */ ++#endif /* CONFIG_SMP */ ++#endif ++ ++#ifndef CONFIG_SMP ++void send_IPI_self(int vector) ++{ ++#ifndef CONFIG_XEN ++ unsigned int cfg; ++ ++ /* ++ * Wait for idle. ++ */ ++ apic_wait_icr_idle(); ++ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; ++ /* ++ * Send the IPI. The write to APIC_ICR fires this off. 
++ */ ++ apic_write_around(APIC_ICR, cfg); ++#endif ++} ++#endif /* !CONFIG_SMP */ ++ ++ ++/* ++ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to ++ * specific CPU-side IRQs. ++ */ ++ ++#define MAX_PIRQS 8 ++static int pirq_entries [MAX_PIRQS]; ++static int pirqs_enabled; ++int skip_ioapic_setup; ++ ++static int __init ioapic_pirq_setup(char *str) ++{ ++ int i, max; ++ int ints[MAX_PIRQS+1]; ++ ++ get_options(str, ARRAY_SIZE(ints), ints); ++ ++ for (i = 0; i < MAX_PIRQS; i++) ++ pirq_entries[i] = -1; ++ ++ pirqs_enabled = 1; ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "PIRQ redirection, working around broken MP-BIOS.\n"); ++ max = MAX_PIRQS; ++ if (ints[0] < MAX_PIRQS) ++ max = ints[0]; ++ ++ for (i = 0; i < max; i++) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); ++ /* ++ * PIRQs are mapped upside down, usually. ++ */ ++ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; ++ } ++ return 1; ++} ++ ++__setup("pirq=", ioapic_pirq_setup); ++ ++/* ++ * Find the IRQ entry number of a certain pin. 
++ */ ++static int find_irq_entry(int apic, int pin, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].mpc_irqtype == type && ++ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || ++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && ++ mp_irqs[i].mpc_dstirq == pin) ++ return i; ++ ++ return -1; ++} ++ ++/* ++ * Find the pin to which IRQ[irq] (ISA) is connected ++ */ ++static int __init find_isa_irq_pin(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ if (test_bit(lbus, mp_bus_not_pci) && ++ (mp_irqs[i].mpc_irqtype == type) && ++ (mp_irqs[i].mpc_srcbusirq == irq)) ++ ++ return mp_irqs[i].mpc_dstirq; ++ } ++ return -1; ++} ++ ++static int __init find_isa_irq_apic(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ if (test_bit(lbus, mp_bus_not_pci) && ++ (mp_irqs[i].mpc_irqtype == type) && ++ (mp_irqs[i].mpc_srcbusirq == irq)) ++ break; ++ } ++ if (i < mp_irq_entries) { ++ int apic; ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) ++ return apic; ++ } ++ } ++ ++ return -1; ++} ++ ++/* ++ * Find a specific PCI IRQ entry. 
++ * Not an __init, possibly needed by modules ++ */ ++static int pin_2_irq(int idx, int apic, int pin); ++ ++int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) ++{ ++ int apic, i, best_guess = -1; ++ ++ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " ++ "slot:%d, pin:%d.\n", bus, slot, pin); ++ if (mp_bus_id_to_pci_bus[bus] == -1) { ++ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); ++ return -1; ++ } ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || ++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) ++ break; ++ ++ if (!test_bit(lbus, mp_bus_not_pci) && ++ !mp_irqs[i].mpc_irqtype && ++ (bus == lbus) && ++ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { ++ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); ++ ++ if (!(apic || IO_APIC_IRQ(irq))) ++ continue; ++ ++ if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) ++ return irq; ++ /* ++ * Use the first all-but-pin matching entry as a ++ * best-guess fuzzy result for broken mptables. 
++ */ ++ if (best_guess < 0) ++ best_guess = irq; ++ } ++ } ++ return best_guess; ++} ++EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); ++ ++/* ++ * This function currently is only a helper for the i386 smp boot process where ++ * we need to reprogram the ioredtbls to cater for the cpus which have come online ++ * so mask in all cases should simply be TARGET_CPUS ++ */ ++#ifdef CONFIG_SMP ++#ifndef CONFIG_XEN ++void __init setup_ioapic_dest(void) ++{ ++ int pin, ioapic, irq, irq_entry; ++ ++ if (skip_ioapic_setup == 1) ++ return; ++ ++ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { ++ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { ++ irq_entry = find_irq_entry(ioapic, pin, mp_INT); ++ if (irq_entry == -1) ++ continue; ++ irq = pin_2_irq(irq_entry, ioapic, pin); ++ set_ioapic_affinity_irq(irq, TARGET_CPUS); ++ } ++ ++ } ++} ++#endif /* !CONFIG_XEN */ ++#endif ++ ++#if defined(CONFIG_EISA) || defined(CONFIG_MCA) ++/* ++ * EISA Edge/Level control register, ELCR ++ */ ++static int EISA_ELCR(unsigned int irq) ++{ ++ if (irq < 16) { ++ unsigned int port = 0x4d0 + (irq >> 3); ++ return (inb(port) >> (irq & 7)) & 1; ++ } ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "Broken MPtable reports ISA irq %d\n", irq); ++ return 0; ++} ++#endif ++ ++/* ISA interrupts are always polarity zero edge triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_ISA_trigger(idx) (0) ++#define default_ISA_polarity(idx) (0) ++ ++/* EISA interrupts are always polarity zero and can be edge or level ++ * trigger depending on the ELCR value. If an interrupt is listed as ++ * EISA conforming in the MP table, that means its trigger type must ++ * be read in from the ELCR */ ++ ++#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) ++#define default_EISA_polarity(idx) default_ISA_polarity(idx) ++ ++/* PCI interrupts are always polarity one level triggered, ++ * when listed as conforming in the MP table. 
*/ ++ ++#define default_PCI_trigger(idx) (1) ++#define default_PCI_polarity(idx) (1) ++ ++/* MCA interrupts are always polarity zero level triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_MCA_trigger(idx) (1) ++#define default_MCA_polarity(idx) default_ISA_polarity(idx) ++ ++static int MPBIOS_polarity(int idx) ++{ ++ int bus = mp_irqs[idx].mpc_srcbus; ++ int polarity; ++ ++ /* ++ * Determine IRQ line polarity (high active or low active): ++ */ ++ switch (mp_irqs[idx].mpc_irqflag & 3) ++ { ++ case 0: /* conforms, ie. bus-type dependent polarity */ ++ { ++ polarity = test_bit(bus, mp_bus_not_pci)? ++ default_ISA_polarity(idx): ++ default_PCI_polarity(idx); ++ break; ++ } ++ case 1: /* high active */ ++ { ++ polarity = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ case 3: /* low active */ ++ { ++ polarity = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ } ++ return polarity; ++} ++ ++static int MPBIOS_trigger(int idx) ++{ ++ int bus = mp_irqs[idx].mpc_srcbus; ++ int trigger; ++ ++ /* ++ * Determine IRQ trigger mode (edge or level sensitive): ++ */ ++ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) ++ { ++ case 0: /* conforms, ie. bus-type dependent */ ++ { ++ trigger = test_bit(bus, mp_bus_not_pci)? 
++ default_ISA_trigger(idx): ++ default_PCI_trigger(idx); ++#if defined(CONFIG_EISA) || defined(CONFIG_MCA) ++ switch (mp_bus_id_to_type[bus]) ++ { ++ case MP_BUS_ISA: /* ISA pin */ ++ { ++ /* set before the switch */ ++ break; ++ } ++ case MP_BUS_EISA: /* EISA pin */ ++ { ++ trigger = default_EISA_trigger(idx); ++ break; ++ } ++ case MP_BUS_PCI: /* PCI pin */ ++ { ++ /* set before the switch */ ++ break; ++ } ++ case MP_BUS_MCA: /* MCA pin */ ++ { ++ trigger = default_MCA_trigger(idx); ++ break; ++ } ++ default: ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 1; ++ break; ++ } ++ } ++#endif ++ break; ++ } ++ case 1: /* edge */ ++ { ++ trigger = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 1; ++ break; ++ } ++ case 3: /* level */ ++ { ++ trigger = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 0; ++ break; ++ } ++ } ++ return trigger; ++} ++ ++static inline int irq_polarity(int idx) ++{ ++ return MPBIOS_polarity(idx); ++} ++ ++static inline int irq_trigger(int idx) ++{ ++ return MPBIOS_trigger(idx); ++} ++ ++static int pin_2_irq(int idx, int apic, int pin) ++{ ++ int irq, i; ++ int bus = mp_irqs[idx].mpc_srcbus; ++ ++ /* ++ * Debugging check, we are in big trouble if this message pops up! ++ */ ++ if (mp_irqs[idx].mpc_dstirq != pin) ++ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); ++ ++ if (test_bit(bus, mp_bus_not_pci)) ++ irq = mp_irqs[idx].mpc_srcbusirq; ++ else { ++ /* ++ * PCI IRQs are mapped in order ++ */ ++ i = irq = 0; ++ while (i < apic) ++ irq += nr_ioapic_registers[i++]; ++ irq += pin; ++ ++#ifndef CONFIG_XEN ++ /* ++ * For MPS mode, so far only needed by ES7000 platform ++ */ ++ if (ioapic_renumber_irq) ++ irq = ioapic_renumber_irq(apic, irq); ++#endif ++ } ++ ++ /* ++ * PCI IRQ command line redirection. Yes, limits are hardcoded. 
++ */ ++ if ((pin >= 16) && (pin <= 23)) { ++ if (pirq_entries[pin-16] != -1) { ++ if (!pirq_entries[pin-16]) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "disabling PIRQ%d\n", pin-16); ++ } else { ++ irq = pirq_entries[pin-16]; ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ "using PIRQ%d -> IRQ %d\n", ++ pin-16, irq); ++ } ++ } ++ } ++ return irq; ++} ++ ++static inline int IO_APIC_irq_trigger(int irq) ++{ ++ int apic, idx, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ idx = find_irq_entry(apic,pin,mp_INT); ++ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) ++ return irq_trigger(idx); ++ } ++ } ++ /* ++ * nonexistent IRQs are edge default ++ */ ++ return 0; ++} ++ ++/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ ++static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ ++ ++static int __assign_irq_vector(int irq) ++{ ++ int vector; ++ struct physdev_irq irq_op; ++ ++ BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); ++ ++ if (irq_vector[irq] > 0) ++ return irq_vector[irq]; ++ ++ irq_op.irq = irq; ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) ++ return -ENOSPC; ++ ++ vector = irq_op.vector; ++ irq_vector[irq] = vector; ++ ++ return vector; ++} ++ ++static int assign_irq_vector(int irq) ++{ ++ unsigned long flags; ++ int vector; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ vector = __assign_irq_vector(irq); ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ return vector; ++} ++ ++#ifndef CONFIG_XEN ++static struct irq_chip ioapic_chip; ++ ++#define IOAPIC_AUTO -1 ++#define IOAPIC_EDGE 0 ++#define IOAPIC_LEVEL 1 ++ ++static void ioapic_register_intr(int irq, int vector, unsigned long trigger) ++{ ++ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || ++ trigger == IOAPIC_LEVEL) { ++ irq_desc[irq].status |= IRQ_LEVEL; ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_fasteoi_irq, "fasteoi"); ++ } else { 
++ irq_desc[irq].status &= ~IRQ_LEVEL; ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_edge_irq, "edge"); ++ } ++ set_intr_gate(vector, interrupt[irq]); ++} ++#else ++#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) ++#endif ++ ++static void __init setup_IO_APIC_irqs(void) ++{ ++ struct IO_APIC_route_entry entry; ++ int apic, pin, idx, irq, first_notcon = 1, vector; ++ ++ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ ++ /* ++ * add it to the IO-APIC irq-routing table: ++ */ ++ memset(&entry,0,sizeof(entry)); ++ ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.dest_mode = INT_DEST_MODE; ++ entry.mask = 0; /* enable IRQ */ ++ entry.dest.logical.logical_dest = ++ cpu_mask_to_apicid(TARGET_CPUS); ++ ++ idx = find_irq_entry(apic,pin,mp_INT); ++ if (idx == -1) { ++ if (first_notcon) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG ++ " IO-APIC (apicid-pin) %d-%d", ++ mp_ioapics[apic].mpc_apicid, ++ pin); ++ first_notcon = 0; ++ } else ++ apic_printk(APIC_VERBOSE, ", %d-%d", ++ mp_ioapics[apic].mpc_apicid, pin); ++ continue; ++ } ++ ++ if (!first_notcon) { ++ apic_printk(APIC_VERBOSE, " not connected.\n"); ++ first_notcon = 1; ++ } ++ ++ entry.trigger = irq_trigger(idx); ++ entry.polarity = irq_polarity(idx); ++ ++ if (irq_trigger(idx)) { ++ entry.trigger = 1; ++ entry.mask = 1; ++ } ++ ++ irq = pin_2_irq(idx, apic, pin); ++ /* ++ * skip adding the timer int on secondary nodes, which causes ++ * a small but painful rift in the time-space continuum ++ */ ++ if (multi_timer_check(apic, irq)) ++ continue; ++ else ++ add_pin_to_irq(irq, apic, pin); ++ ++ if (/*!apic &&*/ !IO_APIC_IRQ(irq)) ++ continue; ++ ++ if (IO_APIC_IRQ(irq)) { ++ vector = assign_irq_vector(irq); ++ entry.vector = vector; ++ ioapic_register_intr(irq, vector, IOAPIC_AUTO); ++ ++ if (!apic && (irq < 16)) ++ disable_8259A_irq(irq); ++ } ++ 
ioapic_write_entry(apic, pin, entry); ++ } ++ } ++ ++ if (!first_notcon) ++ apic_printk(APIC_VERBOSE, " not connected.\n"); ++} ++ ++/* ++ * Set up the 8259A-master output pin: ++ */ ++#ifndef CONFIG_XEN ++static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ memset(&entry,0,sizeof(entry)); ++ ++ disable_8259A_irq(0); ++ ++ /* mask LVT0 */ ++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); ++ ++ /* ++ * We use logical delivery to get the timer IRQ ++ * to the first CPU. ++ */ ++ entry.dest_mode = INT_DEST_MODE; ++ entry.mask = 0; /* unmask IRQ now */ ++ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.polarity = 0; ++ entry.trigger = 0; ++ entry.vector = vector; ++ ++ /* ++ * The timer IRQ doesn't have to know that behind the ++ * scene we have a 8259A-master in AEOI mode ... ++ */ ++ irq_desc[0].chip = &ioapic_chip; ++ set_irq_handler(0, handle_edge_irq); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(apic, pin, entry); ++ ++ enable_8259A_irq(0); ++} ++ ++void __init print_IO_APIC(void) ++{ ++ int apic, i; ++ union IO_APIC_reg_00 reg_00; ++ union IO_APIC_reg_01 reg_01; ++ union IO_APIC_reg_02 reg_02; ++ union IO_APIC_reg_03 reg_03; ++ unsigned long flags; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); ++ for (i = 0; i < nr_ioapics; i++) ++ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", ++ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); ++ ++ /* ++ * We are a bit conservative about what we expect. We have to ++ * know about every hardware change ASAP. 
++ */ ++ printk(KERN_INFO "testing the IO APIC.......................\n"); ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ reg_01.raw = io_apic_read(apic, 1); ++ if (reg_01.bits.version >= 0x10) ++ reg_02.raw = io_apic_read(apic, 2); ++ if (reg_01.bits.version >= 0x20) ++ reg_03.raw = io_apic_read(apic, 3); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); ++ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); ++ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); ++ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); ++ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); ++ ++ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); ++ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); ++ ++ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); ++ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); ++ ++ /* ++ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, ++ * but the value of reg_02 is read as the previous read register ++ * value, so ignore it if reg_02 == reg_01. ++ */ ++ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { ++ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); ++ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); ++ } ++ ++ /* ++ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 ++ * or reg_03, but the value of reg_0[23] is read as the previous read ++ * register value, so ignore it if reg_03 == reg_0[12]. ++ */ ++ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && ++ reg_03.raw != reg_01.raw) { ++ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); ++ printk(KERN_DEBUG "....... 
: Boot DT : %X\n", reg_03.bits.boot_DT); ++ } ++ ++ printk(KERN_DEBUG ".... IRQ redirection table:\n"); ++ ++ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" ++ " Stat Dest Deli Vect: \n"); ++ ++ for (i = 0; i <= reg_01.bits.entries; i++) { ++ struct IO_APIC_route_entry entry; ++ ++ entry = ioapic_read_entry(apic, i); ++ ++ printk(KERN_DEBUG " %02x %03X %02X ", ++ i, ++ entry.dest.logical.logical_dest, ++ entry.dest.physical.physical_dest ++ ); ++ ++ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", ++ entry.mask, ++ entry.trigger, ++ entry.irr, ++ entry.polarity, ++ entry.delivery_status, ++ entry.dest_mode, ++ entry.delivery_mode, ++ entry.vector ++ ); ++ } ++ } ++ printk(KERN_DEBUG "IRQ to pin mappings:\n"); ++ for (i = 0; i < NR_IRQS; i++) { ++ struct irq_pin_list *entry = irq_2_pin + i; ++ if (entry->pin < 0) ++ continue; ++ printk(KERN_DEBUG "IRQ%d ", i); ++ for (;;) { ++ printk("-> %d:%d", entry->apic, entry->pin); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++ printk("\n"); ++ } ++ ++ printk(KERN_INFO ".................................... done.\n"); ++ ++ return; ++} ++ ++static void print_APIC_bitfield (int base) ++{ ++ unsigned int v; ++ int i, j; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); ++ for (i = 0; i < 8; i++) { ++ v = apic_read(base + i*0x10); ++ for (j = 0; j < 32; j++) { ++ if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ ++ apic_write(APIC_ESR, 0); ++ v = apic_read(APIC_ESR); ++ printk(KERN_DEBUG "... APIC ESR: %08x\n", v); ++ } ++ ++ v = apic_read(APIC_ICR); ++ printk(KERN_DEBUG "... APIC ICR: %08x\n", v); ++ v = apic_read(APIC_ICR2); ++ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); ++ ++ v = apic_read(APIC_LVTT); ++ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); ++ ++ if (maxlvt > 3) { /* PC is LVT#4. */ ++ v = apic_read(APIC_LVTPC); ++ printk(KERN_DEBUG "... 
APIC LVTPC: %08x\n", v); ++ } ++ v = apic_read(APIC_LVT0); ++ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); ++ v = apic_read(APIC_LVT1); ++ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); ++ ++ if (maxlvt > 2) { /* ERR is LVT#3. */ ++ v = apic_read(APIC_LVTERR); ++ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); ++ } ++ ++ v = apic_read(APIC_TMICT); ++ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); ++ v = apic_read(APIC_TMCCT); ++ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); ++ v = apic_read(APIC_TDCR); ++ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); ++ printk("\n"); ++} ++ ++void print_all_local_APICs (void) ++{ ++ on_each_cpu(print_local_APIC, NULL, 1, 1); ++} ++ ++void /*__init*/ print_PIC(void) ++{ ++ unsigned int v; ++ unsigned long flags; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "\nprinting PIC contents\n"); ++ ++ spin_lock_irqsave(&i8259A_lock, flags); ++ ++ v = inb(0xa1) << 8 | inb(0x21); ++ printk(KERN_DEBUG "... PIC IMR: %04x\n", v); ++ ++ v = inb(0xa0) << 8 | inb(0x20); ++ printk(KERN_DEBUG "... PIC IRR: %04x\n", v); ++ ++ outb(0x0b,0xa0); ++ outb(0x0b,0x20); ++ v = inb(0xa0) << 8 | inb(0x20); ++ outb(0x0a,0xa0); ++ outb(0x0a,0x20); ++ ++ spin_unlock_irqrestore(&i8259A_lock, flags); ++ ++ printk(KERN_DEBUG "... PIC ISR: %04x\n", v); ++ ++ v = inb(0x4d1) << 8 | inb(0x4d0); ++ printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); ++} ++#endif /* !CONFIG_XEN */ ++ ++static void __init enable_IO_APIC(void) ++{ ++ union IO_APIC_reg_01 reg_01; ++ int i8259_apic, i8259_pin; ++ int i, apic; ++ unsigned long flags; ++ ++ for (i = 0; i < PIN_MAP_SIZE; i++) { ++ irq_2_pin[i].pin = -1; ++ irq_2_pin[i].next = 0; ++ } ++ if (!pirqs_enabled) ++ for (i = 0; i < MAX_PIRQS; i++) ++ pirq_entries[i] = -1; ++ ++ /* ++ * The number of IO-APIC IRQ registers (== #pins): ++ */ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(apic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ nr_ioapic_registers[apic] = reg_01.bits.entries+1; ++ } ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ int pin; ++ /* See if any of the pins is in ExtINT mode */ ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ struct IO_APIC_route_entry entry; ++ entry = ioapic_read_entry(apic, pin); ++ ++ ++ /* If the interrupt line is enabled and in ExtInt mode ++ * I have found the pin where the i8259 is connected. ++ */ ++ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { ++ ioapic_i8259.apic = apic; ++ ioapic_i8259.pin = pin; ++ goto found_i8259; ++ } ++ } ++ } ++ found_i8259: ++ /* Look to see what if the MP table has reported the ExtINT */ ++ /* If we could not find the appropriate pin by looking at the ioapic ++ * the i8259 probably is not connected the ioapic but give the ++ * mptable a chance anyway. 
++ */ ++ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); ++ i8259_apic = find_isa_irq_apic(0, mp_ExtINT); ++ /* Trust the MP table if nothing is setup in the hardware */ ++ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { ++ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); ++ ioapic_i8259.pin = i8259_pin; ++ ioapic_i8259.apic = i8259_apic; ++ } ++ /* Complain if the MP table and the hardware disagree */ ++ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && ++ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) ++ { ++ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); ++ } ++ ++ /* ++ * Do not trust the IO-APIC being empty at bootup ++ */ ++ clear_IO_APIC(); ++} ++ ++/* ++ * Not an __init, needed by the reboot code ++ */ ++void disable_IO_APIC(void) ++{ ++ /* ++ * Clear the IO-APIC before rebooting: ++ */ ++ clear_IO_APIC(); ++ ++#ifndef CONFIG_XEN ++ /* ++ * If the i8259 is routed through an IOAPIC ++ * Put that IOAPIC in virtual wire mode ++ * so legacy interrupts can be delivered. ++ */ ++ if (ioapic_i8259.pin != -1) { ++ struct IO_APIC_route_entry entry; ++ ++ memset(&entry, 0, sizeof(entry)); ++ entry.mask = 0; /* Enabled */ ++ entry.trigger = 0; /* Edge */ ++ entry.irr = 0; ++ entry.polarity = 0; /* High */ ++ entry.delivery_status = 0; ++ entry.dest_mode = 0; /* Physical */ ++ entry.delivery_mode = dest_ExtINT; /* ExtInt */ ++ entry.vector = 0; ++ entry.dest.physical.physical_dest = ++ GET_APIC_ID(read_apic_id()); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); ++ } ++ disconnect_bsp_APIC(ioapic_i8259.pin != -1); ++#endif ++} ++ ++/* ++ * function to set the IO-APIC physical IDs based on the ++ * values stored in the MPC table. 
++ * ++ * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 ++ */ ++ ++#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) ++static void __init setup_ioapic_ids_from_mpc(void) ++{ ++ union IO_APIC_reg_00 reg_00; ++ physid_mask_t phys_id_present_map; ++ int apic; ++ int i; ++ unsigned char old_id; ++ unsigned long flags; ++ ++ /* ++ * Don't check I/O APIC IDs for xAPIC systems. They have ++ * no meaning without the serial APIC bus. ++ */ ++ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ++ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) ++ return; ++ /* ++ * This is broken; anything with a real cpu count has to ++ * circumvent this idiocy regardless. ++ */ ++ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); ++ ++ /* ++ * Set the IOAPIC ID to the value stored in the MPC table. ++ */ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ ++ /* Read the register 0 value */ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ old_id = mp_ioapics[apic].mpc_apicid; ++ ++ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { ++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", ++ apic, mp_ioapics[apic].mpc_apicid); ++ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", ++ reg_00.bits.ID); ++ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; ++ } ++ ++ /* ++ * Sanity check, is the ID really free? Every APIC in a ++ * system must have a unique ID or we get lots of nice ++ * 'stuck on smp_invalidate_needed IPI wait' messages. ++ */ ++ if (check_apicid_used(phys_id_present_map, ++ mp_ioapics[apic].mpc_apicid)) { ++ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", ++ apic, mp_ioapics[apic].mpc_apicid); ++ for (i = 0; i < get_physical_broadcast(); i++) ++ if (!physid_isset(i, phys_id_present_map)) ++ break; ++ if (i >= get_physical_broadcast()) ++ panic("Max APIC ID exceeded!\n"); ++ printk(KERN_ERR "... fixing up to %d. 
(tell your hw vendor)\n", ++ i); ++ physid_set(i, phys_id_present_map); ++ mp_ioapics[apic].mpc_apicid = i; ++ } else { ++ physid_mask_t tmp; ++ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); ++ apic_printk(APIC_VERBOSE, "Setting %d in the " ++ "phys_id_present_map\n", ++ mp_ioapics[apic].mpc_apicid); ++ physids_or(phys_id_present_map, phys_id_present_map, tmp); ++ } ++ ++ ++ /* ++ * We need to adjust the IRQ routing table ++ * if the ID changed. ++ */ ++ if (old_id != mp_ioapics[apic].mpc_apicid) ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].mpc_dstapic == old_id) ++ mp_irqs[i].mpc_dstapic ++ = mp_ioapics[apic].mpc_apicid; ++ ++ /* ++ * Read the right value from the MPC table and ++ * write it into the ID register. ++ */ ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "...changing IO-APIC physical APIC ID to %d ...", ++ mp_ioapics[apic].mpc_apicid); ++ ++ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(apic, 0, reg_00.raw); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ /* ++ * Sanity check ++ */ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) ++ printk("could not set ID!\n"); ++ else ++ apic_printk(APIC_VERBOSE, " ok.\n"); ++ } ++} ++#else ++static void __init setup_ioapic_ids_from_mpc(void) { } ++#endif ++ ++#ifndef CONFIG_XEN ++int no_timer_check __initdata; ++ ++static int __init notimercheck(char *s) ++{ ++ no_timer_check = 1; ++ return 1; ++} ++__setup("no_timer_check", notimercheck); ++ ++/* ++ * There is a nasty bug in some older SMP boards, their mptable lies ++ * about the timer IRQ. 
We do the following to work around the situation: ++ * ++ * - timer IRQ defaults to IO-APIC IRQ ++ * - if this function detects that timer IRQs are defunct, then we fall ++ * back to ISA timer IRQs ++ */ ++static int __init timer_irq_works(void) ++{ ++ unsigned long t1 = jiffies; ++ unsigned long flags; ++ ++ if (no_timer_check) ++ return 1; ++ ++ local_save_flags(flags); ++ local_irq_enable(); ++ /* Let ten ticks pass... */ ++ mdelay((10 * 1000) / HZ); ++ local_irq_restore(flags); ++ ++ /* ++ * Expect a few ticks at least, to be sure some possible ++ * glue logic does not lock up after one or two first ++ * ticks in a non-ExtINT mode. Also the local APIC ++ * might have cached one ExtINT interrupt. Finally, at ++ * least one tick may be lost due to delays. ++ */ ++ if (time_after(jiffies, t1 + 4)) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * In the SMP+IOAPIC case it might happen that there are an unspecified ++ * number of pending IRQ events unhandled. These cases are very rare, ++ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much ++ * better to do it this way as thus we do not have to be aware of ++ * 'pending' interrupts in the IRQ path, except at this point. ++ */ ++/* ++ * Edge triggered needs to resend any interrupt ++ * that was delayed but this is now handled in the device ++ * independent code. ++ */ ++ ++/* ++ * Startup quirk: ++ * ++ * Starting up a edge-triggered IO-APIC interrupt is ++ * nasty - we need to make sure that we get the edge. ++ * If it is already asserted for some reason, we need ++ * return 1 to indicate that is was pending. ++ * ++ * This is not complete - we should be able to fake ++ * an edge even if it isn't on the 8259A... ++ * ++ * (We do this for level-triggered IRQs too - it cannot hurt.) 
++ */ ++static unsigned int startup_ioapic_irq(unsigned int irq) ++{ ++ int was_pending = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ if (irq < 16) { ++ disable_8259A_irq(irq); ++ if (i8259A_irq_pending(irq)) ++ was_pending = 1; ++ } ++ __unmask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return was_pending; ++} ++ ++static void ack_ioapic_irq(unsigned int irq) ++{ ++ move_native_irq(irq); ++ ack_APIC_irq(); ++} ++ ++static void ack_ioapic_quirk_irq(unsigned int irq) ++{ ++ unsigned long v; ++ int i; ++ ++ move_native_irq(irq); ++/* ++ * It appears there is an erratum which affects at least version 0x11 ++ * of I/O APIC (that's the 82093AA and cores integrated into various ++ * chipsets). Under certain conditions a level-triggered interrupt is ++ * erroneously delivered as edge-triggered one but the respective IRR ++ * bit gets set nevertheless. As a result the I/O unit expects an EOI ++ * message but it will never arrive and further interrupts are blocked ++ * from the source. The exact reason is so far unknown, but the ++ * phenomenon was observed when two consecutive interrupt requests ++ * from a given source get delivered to the same CPU and the source is ++ * temporarily disabled in between. ++ * ++ * A workaround is to simulate an EOI message manually. We achieve it ++ * by setting the trigger mode to edge and then to level when the edge ++ * trigger mode gets detected in the TMR of a local APIC for a ++ * level-triggered interrupt. We mask the source for the time of the ++ * operation to prevent an edge-triggered interrupt escaping meanwhile. ++ * The idea is from Manfred Spraul. 
--macro ++ */ ++ i = irq_vector[irq]; ++ ++ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); ++ ++ ack_APIC_irq(); ++ ++ if (!(v & (1 << (i & 0x1f)))) { ++ atomic_inc(&irq_mis_count); ++ spin_lock(&ioapic_lock); ++ __mask_and_edge_IO_APIC_irq(irq); ++ __unmask_and_level_IO_APIC_irq(irq); ++ spin_unlock(&ioapic_lock); ++ } ++} ++ ++static int ioapic_retrigger_irq(unsigned int irq) ++{ ++ send_IPI_self(irq_vector[irq]); ++ ++ return 1; ++} ++ ++static struct irq_chip ioapic_chip __read_mostly = { ++ .name = "IO-APIC", ++ .startup = startup_ioapic_irq, ++ .mask = mask_IO_APIC_irq, ++ .unmask = unmask_IO_APIC_irq, ++ .ack = ack_ioapic_irq, ++ .eoi = ack_ioapic_quirk_irq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ioapic_affinity_irq, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++#endif /* !CONFIG_XEN */ ++ ++static inline void init_IO_APIC_traps(void) ++{ ++ int irq; ++ ++ /* ++ * NOTE! The local APIC isn't very good at handling ++ * multiple interrupts at the same interrupt level. ++ * As the interrupt level is determined by taking the ++ * vector number and shifting that right by 4, we ++ * want to spread these out a bit so that they don't ++ * all fall in the same interrupt level. ++ * ++ * Also, we've got to be careful not to trash gate ++ * 0x80, because int 0x80 is hm, kind of importantish. ;) ++ */ ++ for (irq = 0; irq < NR_IRQS ; irq++) { ++ if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { ++ /* ++ * Hmm.. We don't have an entry for this, ++ * so default to an old-fashioned 8259 ++ * interrupt if we can.. ++ */ ++ if (irq < 16) ++ make_8259A_irq(irq); ++#ifndef CONFIG_XEN ++ else ++ /* Strange. Oh, well.. 
*/ ++ irq_desc[irq].chip = &no_irq_chip; ++#endif ++ } ++ } ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * The local APIC irq-chip implementation: ++ */ ++ ++static void ack_apic(unsigned int irq) ++{ ++ ack_APIC_irq(); ++} ++ ++static void mask_lapic_irq (unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); ++} ++ ++static void unmask_lapic_irq (unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); ++} ++ ++static struct irq_chip lapic_chip __read_mostly = { ++ .name = "local-APIC-edge", ++ .mask = mask_lapic_irq, ++ .unmask = unmask_lapic_irq, ++ .eoi = ack_apic, ++}; ++ ++static void __init setup_nmi(void) ++{ ++ /* ++ * Dirty trick to enable the NMI watchdog ... ++ * We put the 8259A master into AEOI mode and ++ * unmask on all local APICs LVT0 as NMI. ++ * ++ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') ++ * is from Maciej W. Rozycki - so we do not have to EOI from ++ * the NMI handler or the timer interrupt. ++ */ ++ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); ++ ++ enable_NMI_through_LVT0(); ++ ++ apic_printk(APIC_VERBOSE, " done.\n"); ++} ++ ++/* ++ * This looks a bit hackish but it's about the only one way of sending ++ * a few INTA cycles to 8259As and any associated glue logic. ICR does ++ * not support the ExtINT mode, unfortunately. We need to send these ++ * cycles as some i82489DX-based boards have glue logic that keeps the ++ * 8259A interrupt line asserted until INTA. 
--macro ++ */ ++static inline void __init unlock_ExtINT_logic(void) ++{ ++ int apic, pin, i; ++ struct IO_APIC_route_entry entry0, entry1; ++ unsigned char save_control, save_freq_select; ++ ++ pin = find_isa_irq_pin(8, mp_INT); ++ if (pin == -1) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ apic = find_isa_irq_apic(8, mp_INT); ++ if (apic == -1) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ ++ entry0 = ioapic_read_entry(apic, pin); ++ clear_IO_APIC_pin(apic, pin); ++ ++ memset(&entry1, 0, sizeof(entry1)); ++ ++ entry1.dest_mode = 0; /* physical delivery */ ++ entry1.mask = 0; /* unmask IRQ now */ ++ entry1.dest.physical.physical_dest = hard_smp_processor_id(); ++ entry1.delivery_mode = dest_ExtINT; ++ entry1.polarity = entry0.polarity; ++ entry1.trigger = 0; ++ entry1.vector = 0; ++ ++ ioapic_write_entry(apic, pin, entry1); ++ ++ save_control = CMOS_READ(RTC_CONTROL); ++ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); ++ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, ++ RTC_FREQ_SELECT); ++ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); ++ ++ i = 100; ++ while (i-- > 0) { ++ mdelay(10); ++ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) ++ i -= 10; ++ } ++ ++ CMOS_WRITE(save_control, RTC_CONTROL); ++ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); ++ clear_IO_APIC_pin(apic, pin); ++ ++ ioapic_write_entry(apic, pin, entry0); ++} ++ ++/* ++ * This code may look a bit paranoid, but it's supposed to cooperate with ++ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ ++ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast ++ * fanatically on his truly buggy board. 
++ */ ++static inline void __init check_timer(void) ++{ ++ int apic1, pin1, apic2, pin2; ++ int vector; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ /* ++ * get/set the timer IRQ vector: ++ */ ++ disable_8259A_irq(0); ++ vector = assign_irq_vector(0); ++ set_intr_gate(vector, interrupt[0]); ++ ++ /* ++ * Subtle, code in do_timer_interrupt() expects an AEOI ++ * mode for the 8259A whenever interrupts are routed ++ * through I/O APICs. Also IRQ0 has to be enabled in ++ * the 8259A which implies the virtual wire has to be ++ * disabled in the local APIC. ++ */ ++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); ++ init_8259A(1); ++ timer_ack = 1; ++ if (timer_over_8254 > 0) ++ enable_8259A_irq(0); ++ ++ pin1 = find_isa_irq_pin(0, mp_INT); ++ apic1 = find_isa_irq_apic(0, mp_INT); ++ pin2 = ioapic_i8259.pin; ++ apic2 = ioapic_i8259.apic; ++ ++ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", ++ vector, apic1, pin1, apic2, pin2); ++ ++ if (pin1 != -1) { ++ /* ++ * Ok, does IRQ0 through the IOAPIC work? ++ */ ++ unmask_IO_APIC_irq(0); ++ if (timer_irq_works()) { ++ if (nmi_watchdog == NMI_IO_APIC) { ++ disable_8259A_irq(0); ++ setup_nmi(); ++ enable_8259A_irq(0); ++ } ++ if (disable_timer_pin_1 > 0) ++ clear_IO_APIC_pin(0, pin1); ++ goto out; ++ } ++ clear_IO_APIC_pin(apic1, pin1); ++ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " ++ "IO-APIC\n"); ++ } ++ ++ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); ++ if (pin2 != -1) { ++ printk("\n..... (found pin %d) ...", pin2); ++ /* ++ * legacy devices should be connected to IO APIC #0 ++ */ ++ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); ++ if (timer_irq_works()) { ++ printk("works.\n"); ++ if (pin1 != -1) ++ replace_pin_at_irq(0, apic1, pin1, apic2, pin2); ++ else ++ add_pin_to_irq(0, apic2, pin2); ++ if (nmi_watchdog == NMI_IO_APIC) { ++ setup_nmi(); ++ } ++ goto out; ++ } ++ /* ++ * Cleanup, just in case ... 
++ */ ++ clear_IO_APIC_pin(apic2, pin2); ++ } ++ printk(" failed.\n"); ++ ++ if (nmi_watchdog == NMI_IO_APIC) { ++ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); ++ nmi_watchdog = 0; ++ } ++ ++ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); ++ ++ disable_8259A_irq(0); ++ set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, ++ "fasteoi"); ++ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ ++ enable_8259A_irq(0); ++ ++ if (timer_irq_works()) { ++ printk(" works.\n"); ++ goto out; ++ } ++ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); ++ printk(" failed.\n"); ++ ++ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); ++ ++ timer_ack = 0; ++ init_8259A(0); ++ make_8259A_irq(0); ++ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); ++ ++ unlock_ExtINT_logic(); ++ ++ if (timer_irq_works()) { ++ printk(" works.\n"); ++ goto out; ++ } ++ printk(" failed :(.\n"); ++ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " ++ "report. Then try booting with the 'noapic' option"); ++out: ++ local_irq_restore(flags); ++} ++#else ++int timer_uses_ioapic_pin_0 = 0; ++#define check_timer() ((void)0) ++#endif ++ ++/* ++ * ++ * IRQ's that are handled by the PIC in the MPS IOAPIC case. ++ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. ++ * Linux doesn't really care, as it's not actually used ++ * for any interrupt handling anyway. ++ */ ++#define PIC_IRQS (1 << PIC_CASCADE_IR) ++ ++void __init setup_IO_APIC(void) ++{ ++#ifndef CONFIG_XEN ++ int i; ++ ++ /* Reserve all the system vectors. */ ++ for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) ++ set_bit(i, used_vectors); ++#endif ++ ++ enable_IO_APIC(); ++ ++ if (acpi_ioapic) ++ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ ++ else ++ io_apic_irqs = ~PIC_IRQS; ++ ++ printk("ENABLING IO-APIC IRQs\n"); ++ ++ /* ++ * Set up IO-APIC IRQ routing. 
++ */ ++ if (!acpi_ioapic) ++ setup_ioapic_ids_from_mpc(); ++#ifndef CONFIG_XEN ++ sync_Arb_IDs(); ++#endif ++ setup_IO_APIC_irqs(); ++ init_IO_APIC_traps(); ++ check_timer(); ++ if (!acpi_ioapic) ++ print_IO_APIC(); ++} ++ ++#ifndef CONFIG_XEN ++static int __init setup_disable_8254_timer(char *s) ++{ ++ timer_over_8254 = -1; ++ return 1; ++} ++static int __init setup_enable_8254_timer(char *s) ++{ ++ timer_over_8254 = 2; ++ return 1; ++} ++ ++__setup("disable_8254_timer", setup_disable_8254_timer); ++__setup("enable_8254_timer", setup_enable_8254_timer); ++#endif ++ ++/* ++ * Called after all the initialization is done. If we didnt find any ++ * APIC bugs then we can allow the modify fast path ++ */ ++ ++static int __init io_apic_bug_finalize(void) ++{ ++ if(sis_apic_bug == -1) ++ sis_apic_bug = 0; ++ if (is_initial_xendomain()) { ++ struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; ++ op.u.platform_quirk.quirk_id = sis_apic_bug ? ++ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; ++ VOID(HYPERVISOR_platform_op(&op)); ++ } ++ return 0; ++} ++ ++late_initcall(io_apic_bug_finalize); ++ ++struct sysfs_ioapic_data { ++ struct sys_device dev; ++ struct IO_APIC_route_entry entry[0]; ++}; ++static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; ++ ++static int ioapic_suspend(struct sys_device *dev, pm_message_t state) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) ++ entry[i] = ioapic_read_entry(dev->id, i); ++ ++ return 0; ++} ++ ++static int ioapic_resume(struct sys_device *dev) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ unsigned long flags; ++ union IO_APIC_reg_00 reg_00; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = 
io_apic_read(dev->id, 0); ++ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { ++ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; ++ io_apic_write(dev->id, 0, reg_00.raw); ++ } ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) ++ ioapic_write_entry(dev->id, i, entry[i]); ++ ++ return 0; ++} ++ ++static struct sysdev_class ioapic_sysdev_class = { ++ .name = "ioapic", ++ .suspend = ioapic_suspend, ++ .resume = ioapic_resume, ++}; ++ ++static int __init ioapic_init_sysfs(void) ++{ ++ struct sys_device * dev; ++ int i, size, error = 0; ++ ++ error = sysdev_class_register(&ioapic_sysdev_class); ++ if (error) ++ return error; ++ ++ for (i = 0; i < nr_ioapics; i++ ) { ++ size = sizeof(struct sys_device) + nr_ioapic_registers[i] ++ * sizeof(struct IO_APIC_route_entry); ++ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); ++ if (!mp_ioapic_data[i]) { ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ memset(mp_ioapic_data[i], 0, size); ++ dev = &mp_ioapic_data[i]->dev; ++ dev->id = i; ++ dev->cls = &ioapic_sysdev_class; ++ error = sysdev_register(dev); ++ if (error) { ++ kfree(mp_ioapic_data[i]); ++ mp_ioapic_data[i] = NULL; ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ } ++ ++ return 0; ++} ++ ++device_initcall(ioapic_init_sysfs); ++ ++#ifndef CONFIG_XEN ++/* ++ * Dynamic irq allocate and deallocation ++ */ ++int create_irq(void) ++{ ++ /* Allocate an unused irq */ ++ int irq, new, vector = 0; ++ unsigned long flags; ++ ++ irq = -ENOSPC; ++ spin_lock_irqsave(&vector_lock, flags); ++ for (new = (NR_IRQS - 1); new >= 0; new--) { ++ if (platform_legacy_irq(new)) ++ continue; ++ if (irq_vector[new] != 0) ++ continue; ++ vector = __assign_irq_vector(new); ++ if (likely(vector > 0)) ++ irq = new; ++ break; ++ } ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ if (irq >= 0) { ++ set_intr_gate(vector, interrupt[irq]); ++ dynamic_irq_init(irq); ++ } ++ return 
irq; ++} ++ ++void destroy_irq(unsigned int irq) ++{ ++ unsigned long flags; ++ ++ dynamic_irq_cleanup(irq); ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ clear_bit(irq_vector[irq], used_vectors); ++ irq_vector[irq] = 0; ++ spin_unlock_irqrestore(&vector_lock, flags); ++} ++#endif ++ ++/* ++ * MSI message composition ++ */ ++#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) ++static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) ++{ ++ int vector; ++ unsigned dest; ++ ++ vector = assign_irq_vector(irq); ++ if (vector >= 0) { ++ dest = cpu_mask_to_apicid(TARGET_CPUS); ++ ++ msg->address_hi = MSI_ADDR_BASE_HI; ++ msg->address_lo = ++ MSI_ADDR_BASE_LO | ++ ((INT_DEST_MODE == 0) ? ++ MSI_ADDR_DEST_MODE_PHYSICAL: ++ MSI_ADDR_DEST_MODE_LOGICAL) | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ MSI_ADDR_REDIRECTION_CPU: ++ MSI_ADDR_REDIRECTION_LOWPRI) | ++ MSI_ADDR_DEST_ID(dest); ++ ++ msg->data = ++ MSI_DATA_TRIGGER_EDGE | ++ MSI_DATA_LEVEL_ASSERT | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ MSI_DATA_DELIVERY_FIXED: ++ MSI_DATA_DELIVERY_LOWPRI) | ++ MSI_DATA_VECTOR(vector); ++ } ++ return vector; ++} ++ ++#ifdef CONFIG_SMP ++static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ struct msi_msg msg; ++ unsigned int dest; ++ cpumask_t tmp; ++ int vector; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ tmp = TARGET_CPUS; ++ ++ vector = assign_irq_vector(irq); ++ if (vector < 0) ++ return; ++ ++ dest = cpu_mask_to_apicid(mask); ++ ++ read_msi_msg(irq, &msg); ++ ++ msg.data &= ~MSI_DATA_VECTOR_MASK; ++ msg.data |= MSI_DATA_VECTOR(vector); ++ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++ msg.address_lo |= MSI_ADDR_DEST_ID(dest); ++ ++ write_msi_msg(irq, &msg); ++ irq_desc[irq].affinity = mask; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, ++ * which implement the MSI or MSI-X Capability Structure. 
++ */ ++static struct irq_chip msi_chip = { ++ .name = "PCI-MSI", ++ .unmask = unmask_msi_irq, ++ .mask = mask_msi_irq, ++ .ack = ack_ioapic_irq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_msi_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) ++{ ++ struct msi_msg msg; ++ int irq, ret; ++ irq = create_irq(); ++ if (irq < 0) ++ return irq; ++ ++ ret = msi_compose_msg(dev, irq, &msg); ++ if (ret < 0) { ++ destroy_irq(irq); ++ return ret; ++ } ++ ++ set_irq_msi(irq, desc); ++ write_msi_msg(irq, &msg); ++ ++ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, ++ "edge"); ++ ++ return 0; ++} ++ ++void arch_teardown_msi_irq(unsigned int irq) ++{ ++ destroy_irq(irq); ++} ++ ++#endif /* CONFIG_PCI_MSI */ ++ ++/* ++ * Hypertransport interrupt support ++ */ ++#ifdef CONFIG_HT_IRQ ++ ++#ifdef CONFIG_SMP ++ ++static void target_ht_irq(unsigned int irq, unsigned int dest) ++{ ++ struct ht_irq_msg msg; ++ fetch_ht_irq_msg(irq, &msg); ++ ++ msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); ++ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); ++ ++ msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); ++ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); ++ ++ write_ht_irq_msg(irq, &msg); ++} ++ ++static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ unsigned int dest; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ tmp = TARGET_CPUS; ++ ++ cpus_and(mask, tmp, CPU_MASK_ALL); ++ ++ dest = cpu_mask_to_apicid(mask); ++ ++ target_ht_irq(irq, dest); ++ irq_desc[irq].affinity = mask; ++} ++#endif ++ ++static struct irq_chip ht_irq_chip = { ++ .name = "PCI-HT", ++ .mask = mask_ht_irq, ++ .unmask = unmask_ht_irq, ++ .ack = ack_ioapic_irq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ht_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) ++{ ++ int vector; ++ ++ vector = 
assign_irq_vector(irq); ++ if (vector >= 0) { ++ struct ht_irq_msg msg; ++ unsigned dest; ++ cpumask_t tmp; ++ ++ cpus_clear(tmp); ++ cpu_set(vector >> 8, tmp); ++ dest = cpu_mask_to_apicid(tmp); ++ ++ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); ++ ++ msg.address_lo = ++ HT_IRQ_LOW_BASE | ++ HT_IRQ_LOW_DEST_ID(dest) | ++ HT_IRQ_LOW_VECTOR(vector) | ++ ((INT_DEST_MODE == 0) ? ++ HT_IRQ_LOW_DM_PHYSICAL : ++ HT_IRQ_LOW_DM_LOGICAL) | ++ HT_IRQ_LOW_RQEOI_EDGE | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ HT_IRQ_LOW_MT_FIXED : ++ HT_IRQ_LOW_MT_ARBITRATED) | ++ HT_IRQ_LOW_IRQ_MASKED; ++ ++ write_ht_irq_msg(irq, &msg); ++ ++ set_irq_chip_and_handler_name(irq, &ht_irq_chip, ++ handle_edge_irq, "edge"); ++ } ++ return vector; ++} ++#endif /* CONFIG_HT_IRQ */ ++ ++/* -------------------------------------------------------------------------- ++ ACPI-based IOAPIC Configuration ++ -------------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_ACPI ++ ++int __init io_apic_get_unique_id (int ioapic, int apic_id) ++{ ++#ifndef CONFIG_XEN ++ union IO_APIC_reg_00 reg_00; ++ static physid_mask_t apic_id_map = PHYSID_MASK_NONE; ++ physid_mask_t tmp; ++ unsigned long flags; ++ int i = 0; ++ ++ /* ++ * The P4 platform supports up to 256 APIC IDs on two separate APIC ++ * buses (one for LAPICs, one for IOAPICs), where predecessors only ++ * supports up to 16 on one shared APIC bus. ++ * ++ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full ++ * advantage of new APIC bus architecture. 
++ */ ++ ++ if (physids_empty(apic_id_map)) ++ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(ioapic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ if (apic_id >= get_physical_broadcast()) { ++ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " ++ "%d\n", ioapic, apic_id, reg_00.bits.ID); ++ apic_id = reg_00.bits.ID; ++ } ++ ++ /* ++ * Every APIC in a system must have a unique ID or we get lots of nice ++ * 'stuck on smp_invalidate_needed IPI wait' messages. ++ */ ++ if (check_apicid_used(apic_id_map, apic_id)) { ++ ++ for (i = 0; i < get_physical_broadcast(); i++) { ++ if (!check_apicid_used(apic_id_map, i)) ++ break; ++ } ++ ++ if (i == get_physical_broadcast()) ++ panic("Max apic_id exceeded!\n"); ++ ++ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " ++ "trying %d\n", ioapic, apic_id, i); ++ ++ apic_id = i; ++ } ++ ++ tmp = apicid_to_cpu_present(apic_id); ++ physids_or(apic_id_map, apic_id_map, tmp); ++ ++ if (reg_00.bits.ID != apic_id) { ++ reg_00.bits.ID = apic_id; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(ioapic, 0, reg_00.raw); ++ reg_00.raw = io_apic_read(ioapic, 0); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ /* Sanity check */ ++ if (reg_00.bits.ID != apic_id) { ++ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); ++ return -1; ++ } ++ } ++ ++ apic_printk(APIC_VERBOSE, KERN_INFO ++ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); ++#endif /* !CONFIG_XEN */ ++ ++ return apic_id; ++} ++ ++ ++int __init io_apic_get_version (int ioapic) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(ioapic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return reg_01.bits.version; ++} ++ ++ ++int __init io_apic_get_redir_entries (int ioapic) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ ++ 
spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(ioapic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return reg_01.bits.entries; ++} ++ ++ ++int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ if (!IO_APIC_IRQ(irq)) { ++ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ++ ioapic); ++ return -EINVAL; ++ } ++ ++ /* ++ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. ++ * Note that we mask (disable) IRQs now -- these get enabled when the ++ * corresponding device driver registers for this IRQ. ++ */ ++ ++ memset(&entry,0,sizeof(entry)); ++ ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.dest_mode = INT_DEST_MODE; ++ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); ++ entry.trigger = edge_level; ++ entry.polarity = active_high_low; ++ entry.mask = 1; ++ ++ /* ++ * IRQs < 16 are already in the irq_2_pin[] map ++ */ ++ if (irq >= 16) ++ add_pin_to_irq(irq, ioapic, pin); ++ ++ entry.vector = assign_irq_vector(irq); ++ ++ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " ++ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, ++ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, ++ edge_level, active_high_low); ++ ++ ioapic_register_intr(irq, entry.vector, edge_level); ++ ++ if (!ioapic && (irq < 16)) ++ disable_8259A_irq(irq); ++ ++ ioapic_write_entry(ioapic, pin, entry); ++ ++ return 0; ++} ++ ++int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) ++{ ++ int i; ++ ++ if (skip_ioapic_setup) ++ return -1; ++ ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].mpc_irqtype == mp_INT && ++ mp_irqs[i].mpc_srcbusirq == bus_irq) ++ break; ++ if (i >= mp_irq_entries) ++ return -1; ++ ++ *trigger = irq_trigger(i); ++ *polarity = irq_polarity(i); ++ return 0; ++} ++ ++#endif /* CONFIG_ACPI */ ++ ++#ifndef CONFIG_XEN ++static int __init 
parse_disable_timer_pin_1(char *arg) ++{ ++ disable_timer_pin_1 = 1; ++ return 0; ++} ++early_param("disable_timer_pin_1", parse_disable_timer_pin_1); ++ ++static int __init parse_enable_timer_pin_1(char *arg) ++{ ++ disable_timer_pin_1 = -1; ++ return 0; ++} ++early_param("enable_timer_pin_1", parse_enable_timer_pin_1); ++#endif ++ ++static int __init parse_noapic(char *arg) ++{ ++ /* disable IO-APIC */ ++ disable_ioapic_setup(); ++ return 0; ++} ++early_param("noapic", parse_noapic); +diff --git a/arch/x86/kernel/io_apic_64-xen.c b/arch/x86/kernel/io_apic_64-xen.c +new file mode 100644 +index 0000000..ad00f4d +--- /dev/null ++++ b/arch/x86/kernel/io_apic_64-xen.c +@@ -0,0 +1,2385 @@ ++/* ++ * Intel IO-APIC support for multi-Pentium hosts. ++ * ++ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo ++ * ++ * Many thanks to Stig Venaas for trying out countless experimental ++ * patches and reporting/debugging problems patiently! ++ * ++ * (c) 1999, Multiple IO-APIC support, developed by ++ * Ken-ichi Yaku and ++ * Hidemi Kishimoto , ++ * further tested and cleaned up by Zach Brown ++ * and Ingo Molnar ++ * ++ * Fixes ++ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; ++ * thanks to Eric Gilmore ++ * and Rolf G. Tews ++ * for testing these extensively ++ * Paul Diefenbaugh : Added full ACPI support ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_ACPI ++#include ++#endif ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++struct irq_cfg { ++#ifndef CONFIG_XEN ++ cpumask_t domain; ++ cpumask_t old_domain; ++#endif ++ unsigned move_cleanup_count; ++ u8 vector; ++ u8 move_in_progress : 1; ++}; ++ ++/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
*/ ++struct irq_cfg irq_cfg[NR_IRQS] __read_mostly; ++ ++static int assign_irq_vector(int irq, cpumask_t mask); ++ ++#define __apicdebuginit __init ++ ++int sis_apic_bug; /* not actually supported, dummy for compile */ ++ ++#ifdef CONFIG_XEN ++#include ++#include ++ ++/* Fake i8259 */ ++#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) ++#define disable_8259A_irq(_irq) ((void)0) ++#define i8259A_irq_pending(_irq) (0) ++ ++unsigned long io_apic_irqs; ++ ++#define clear_IO_APIC() ((void)0) ++#else ++static int no_timer_check; ++ ++static int disable_timer_pin_1 __initdata; ++ ++int timer_over_8254 __initdata = 1; ++ ++/* Where if anywhere is the i8259 connect in external int mode */ ++static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; ++#endif ++ ++static DEFINE_SPINLOCK(ioapic_lock); ++DEFINE_SPINLOCK(vector_lock); ++ ++/* ++ * # of IRQ routing registers ++ */ ++int nr_ioapic_registers[MAX_IO_APICS]; ++ ++/* I/O APIC entries */ ++struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; ++int nr_ioapics; ++ ++/* MP IRQ source entries */ ++struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; ++ ++/* # of MP IRQ source entries */ ++int mp_irq_entries; ++ ++/* ++ * Rough estimation of how many shared IRQs there are, can ++ * be changed anytime. ++ */ ++#define MAX_PLUS_SHARED_IRQS NR_IRQS ++#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) ++ ++/* ++ * This is performance-critical, we want to do it O(1) ++ * ++ * the indexing order of this array favors 1:1 mappings ++ * between pins and IRQs. 
++ */ ++ ++static struct irq_pin_list { ++ short apic, pin, next; ++} irq_2_pin[PIN_MAP_SIZE]; ++ ++#ifndef CONFIG_XEN ++struct io_apic { ++ unsigned int index; ++ unsigned int unused[3]; ++ unsigned int data; ++}; ++ ++static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) ++{ ++ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) ++ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); ++} ++#endif ++ ++static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) ++{ ++#ifndef CONFIG_XEN ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ return readl(&io_apic->data); ++#else ++ struct physdev_apic apic_op; ++ int ret; ++ ++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; ++ apic_op.reg = reg; ++ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); ++ if (ret) ++ return ret; ++ return apic_op.value; ++#endif ++} ++ ++static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) ++{ ++#ifndef CONFIG_XEN ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(reg, &io_apic->index); ++ writel(value, &io_apic->data); ++#else ++ struct physdev_apic apic_op; ++ ++ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; ++ apic_op.reg = reg; ++ apic_op.value = value; ++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); ++#endif ++} ++ ++#ifdef CONFIG_XEN ++#define io_apic_modify io_apic_write ++#else ++/* ++ * Re-write a value: to be used for read-modify-write ++ * cycles where the read already set up the index register. 
++ */ ++static inline void io_apic_modify(unsigned int apic, unsigned int value) ++{ ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ writel(value, &io_apic->data); ++} ++ ++static bool io_apic_level_ack_pending(unsigned int irq) ++{ ++ struct irq_pin_list *entry; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ entry = irq_2_pin + irq; ++ for (;;) { ++ unsigned int reg; ++ int pin; ++ ++ pin = entry->pin; ++ if (pin == -1) ++ break; ++ reg = io_apic_read(entry->apic, 0x10 + pin*2); ++ /* Is the remote IRR bit set? */ ++ if ((reg >> 14) & 1) { ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ return true; ++ } ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return false; ++} ++#endif ++ ++/* ++ * Synchronize the IO-APIC and the CPU by doing ++ * a dummy read from the IO-APIC ++ */ ++static inline void io_apic_sync(unsigned int apic) ++{ ++#ifndef CONFIG_XEN ++ struct io_apic __iomem *io_apic = io_apic_base(apic); ++ readl(&io_apic->data); ++#endif ++} ++ ++union entry_union { ++ struct { u32 w1, w2; }; ++ struct IO_APIC_route_entry entry; ++}; ++ ++static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) ++{ ++ union entry_union eu; ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); ++ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ return eu.entry; ++} ++ ++/* ++ * When we write a new IO APIC routing entry, we need to write the high ++ * word first! If the mask bit in the low word is clear, we will enable ++ * the interrupt, and we need to make sure the entry is fully populated ++ * before that happens. 
++ */ ++static void ++__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ union entry_union eu; ++ eu.entry = e; ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++} ++ ++static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) ++{ ++ unsigned long flags; ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __ioapic_write_entry(apic, pin, e); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * When we mask an IO APIC routing entry, we need to write the low ++ * word first, in order to set the mask bit before we change the ++ * high bits! ++ */ ++static void ioapic_mask_entry(int apic, int pin) ++{ ++ unsigned long flags; ++ union entry_union eu = { .entry.mask = 1 }; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ io_apic_write(apic, 0x10 + 2*pin, eu.w1); ++ io_apic_write(apic, 0x11 + 2*pin, eu.w2); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) ++{ ++ int apic, pin; ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ ++ BUG_ON(irq >= NR_IRQS); ++ for (;;) { ++ unsigned int reg; ++ apic = entry->apic; ++ pin = entry->pin; ++ if (pin == -1) ++ break; ++ io_apic_write(apic, 0x11 + pin*2, dest); ++ reg = io_apic_read(apic, 0x10 + pin*2); ++ reg &= ~0x000000ff; ++ reg |= vector; ++ io_apic_modify(apic, reg); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++} ++ ++static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ unsigned long flags; ++ unsigned int dest; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ return; ++ ++ if (assign_irq_vector(irq, mask)) ++ return; ++ ++ cpus_and(tmp, cfg->domain, mask); ++ dest = cpu_mask_to_apicid(tmp); ++ ++ /* ++ * Only the high 8 bits are valid. 
++ */ ++ dest = SET_APIC_LOGICAL_ID(dest); ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __target_IO_APIC_irq(irq, dest, cfg->vector); ++ irq_desc[irq].affinity = mask; ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++#endif ++#endif ++ ++/* ++ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are ++ * shared ISA-space IRQs, so we have to support them. We are super ++ * fast in the common case, and fast for shared ISA-space IRQs. ++ */ ++static void add_pin_to_irq(unsigned int irq, int apic, int pin) ++{ ++ static int first_free_entry = NR_IRQS; ++ struct irq_pin_list *entry = irq_2_pin + irq; ++ ++ BUG_ON(irq >= NR_IRQS); ++ while (entry->next) ++ entry = irq_2_pin + entry->next; ++ ++ if (entry->pin != -1) { ++ entry->next = first_free_entry; ++ entry = irq_2_pin + entry->next; ++ if (++first_free_entry >= PIN_MAP_SIZE) ++ panic("io_apic.c: ran out of irq_2_pin entries!"); ++ } ++ entry->apic = apic; ++ entry->pin = pin; ++} ++ ++#ifndef CONFIG_XEN ++#define __DO_ACTION(R, ACTION, FINAL) \ ++ \ ++{ \ ++ int pin; \ ++ struct irq_pin_list *entry = irq_2_pin + irq; \ ++ \ ++ BUG_ON(irq >= NR_IRQS); \ ++ for (;;) { \ ++ unsigned int reg; \ ++ pin = entry->pin; \ ++ if (pin == -1) \ ++ break; \ ++ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ ++ reg ACTION; \ ++ io_apic_modify(entry->apic, reg); \ ++ FINAL; \ ++ if (!entry->next) \ ++ break; \ ++ entry = irq_2_pin + entry->next; \ ++ } \ ++} ++ ++#define DO_ACTION(name,R,ACTION, FINAL) \ ++ \ ++ static void name##_IO_APIC_irq (unsigned int irq) \ ++ __DO_ACTION(R, ACTION, FINAL) ++ ++DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) ++ /* mask = 1 */ ++DO_ACTION( __unmask, 0, &= 0xfffeffff, ) ++ /* mask = 0 */ ++ ++static void mask_IO_APIC_irq (unsigned int irq) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __mask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void unmask_IO_APIC_irq (unsigned int irq) ++{ ++ 
unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ __unmask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++} ++ ++static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ /* Check delivery_mode to be sure we're not clearing an SMI pin */ ++ entry = ioapic_read_entry(apic, pin); ++ if (entry.delivery_mode == dest_SMI) ++ return; ++ /* ++ * Disable it in the IO-APIC irq-routing table: ++ */ ++ ioapic_mask_entry(apic, pin); ++} ++ ++static void clear_IO_APIC (void) ++{ ++ int apic, pin; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ++ clear_IO_APIC_pin(apic, pin); ++} ++ ++#endif /* !CONFIG_XEN */ ++ ++int skip_ioapic_setup; ++int ioapic_force; ++ ++static int __init parse_noapic(char *str) ++{ ++ disable_ioapic_setup(); ++ return 0; ++} ++early_param("noapic", parse_noapic); ++ ++#ifndef CONFIG_XEN ++/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ ++static int __init disable_timer_pin_setup(char *arg) ++{ ++ disable_timer_pin_1 = 1; ++ return 1; ++} ++__setup("disable_timer_pin_1", disable_timer_pin_setup); ++ ++static int __init setup_disable_8254_timer(char *s) ++{ ++ timer_over_8254 = -1; ++ return 1; ++} ++static int __init setup_enable_8254_timer(char *s) ++{ ++ timer_over_8254 = 2; ++ return 1; ++} ++ ++__setup("disable_8254_timer", setup_disable_8254_timer); ++__setup("enable_8254_timer", setup_enable_8254_timer); ++#endif /* !CONFIG_XEN */ ++ ++ ++/* ++ * Find the IRQ entry number of a certain pin. 
++ */ ++static int find_irq_entry(int apic, int pin, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].mpc_irqtype == type && ++ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || ++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && ++ mp_irqs[i].mpc_dstirq == pin) ++ return i; ++ ++ return -1; ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * Find the pin to which IRQ[irq] (ISA) is connected ++ */ ++static int __init find_isa_irq_pin(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ if (test_bit(lbus, mp_bus_not_pci) && ++ (mp_irqs[i].mpc_irqtype == type) && ++ (mp_irqs[i].mpc_srcbusirq == irq)) ++ ++ return mp_irqs[i].mpc_dstirq; ++ } ++ return -1; ++} ++ ++static int __init find_isa_irq_apic(int irq, int type) ++{ ++ int i; ++ ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ if (test_bit(lbus, mp_bus_not_pci) && ++ (mp_irqs[i].mpc_irqtype == type) && ++ (mp_irqs[i].mpc_srcbusirq == irq)) ++ break; ++ } ++ if (i < mp_irq_entries) { ++ int apic; ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) ++ return apic; ++ } ++ } ++ ++ return -1; ++} ++#endif ++ ++/* ++ * Find a specific PCI IRQ entry. 
++ * Not an __init, possibly needed by modules ++ */ ++static int pin_2_irq(int idx, int apic, int pin); ++ ++int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) ++{ ++ int apic, i, best_guess = -1; ++ ++ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", ++ bus, slot, pin); ++ if (mp_bus_id_to_pci_bus[bus] == -1) { ++ apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); ++ return -1; ++ } ++ for (i = 0; i < mp_irq_entries; i++) { ++ int lbus = mp_irqs[i].mpc_srcbus; ++ ++ for (apic = 0; apic < nr_ioapics; apic++) ++ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || ++ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) ++ break; ++ ++ if (!test_bit(lbus, mp_bus_not_pci) && ++ !mp_irqs[i].mpc_irqtype && ++ (bus == lbus) && ++ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { ++ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); ++ ++ if (!(apic || IO_APIC_IRQ(irq))) ++ continue; ++ ++ if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) ++ return irq; ++ /* ++ * Use the first all-but-pin matching entry as a ++ * best-guess fuzzy result for broken mptables. ++ */ ++ if (best_guess < 0) ++ best_guess = irq; ++ } ++ } ++ BUG_ON(best_guess >= NR_IRQS); ++ return best_guess; ++} ++ ++/* ISA interrupts are always polarity zero edge triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_ISA_trigger(idx) (0) ++#define default_ISA_polarity(idx) (0) ++ ++/* PCI interrupts are always polarity one level triggered, ++ * when listed as conforming in the MP table. */ ++ ++#define default_PCI_trigger(idx) (1) ++#define default_PCI_polarity(idx) (1) ++ ++static int MPBIOS_polarity(int idx) ++{ ++ int bus = mp_irqs[idx].mpc_srcbus; ++ int polarity; ++ ++ /* ++ * Determine IRQ line polarity (high active or low active): ++ */ ++ switch (mp_irqs[idx].mpc_irqflag & 3) ++ { ++ case 0: /* conforms, ie. 
bus-type dependent polarity */ ++ if (test_bit(bus, mp_bus_not_pci)) ++ polarity = default_ISA_polarity(idx); ++ else ++ polarity = default_PCI_polarity(idx); ++ break; ++ case 1: /* high active */ ++ { ++ polarity = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ case 3: /* low active */ ++ { ++ polarity = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ polarity = 1; ++ break; ++ } ++ } ++ return polarity; ++} ++ ++static int MPBIOS_trigger(int idx) ++{ ++ int bus = mp_irqs[idx].mpc_srcbus; ++ int trigger; ++ ++ /* ++ * Determine IRQ trigger mode (edge or level sensitive): ++ */ ++ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) ++ { ++ case 0: /* conforms, ie. bus-type dependent */ ++ if (test_bit(bus, mp_bus_not_pci)) ++ trigger = default_ISA_trigger(idx); ++ else ++ trigger = default_PCI_trigger(idx); ++ break; ++ case 1: /* edge */ ++ { ++ trigger = 0; ++ break; ++ } ++ case 2: /* reserved */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 1; ++ break; ++ } ++ case 3: /* level */ ++ { ++ trigger = 1; ++ break; ++ } ++ default: /* invalid */ ++ { ++ printk(KERN_WARNING "broken BIOS!!\n"); ++ trigger = 0; ++ break; ++ } ++ } ++ return trigger; ++} ++ ++static inline int irq_polarity(int idx) ++{ ++ return MPBIOS_polarity(idx); ++} ++ ++static inline int irq_trigger(int idx) ++{ ++ return MPBIOS_trigger(idx); ++} ++ ++static int pin_2_irq(int idx, int apic, int pin) ++{ ++ int irq, i; ++ int bus = mp_irqs[idx].mpc_srcbus; ++ ++ /* ++ * Debugging check, we are in big trouble if this message pops up! 
++ */ ++ if (mp_irqs[idx].mpc_dstirq != pin) ++ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); ++ ++ if (test_bit(bus, mp_bus_not_pci)) { ++ irq = mp_irqs[idx].mpc_srcbusirq; ++ } else { ++ /* ++ * PCI IRQs are mapped in order ++ */ ++ i = irq = 0; ++ while (i < apic) ++ irq += nr_ioapic_registers[i++]; ++ irq += pin; ++ } ++ BUG_ON(irq >= NR_IRQS); ++ return irq; ++} ++ ++static int __assign_irq_vector(int irq, cpumask_t mask) ++{ ++ struct physdev_irq irq_op; ++ struct irq_cfg *cfg; ++ ++ BUG_ON((unsigned)irq >= NR_IRQS); ++ cfg = &irq_cfg[irq]; ++ ++ if ((cfg->move_in_progress) || cfg->move_cleanup_count) ++ return -EBUSY; ++ ++ if (cfg->vector) ++ return 0; ++ ++ irq_op.irq = irq; ++ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) ++ return -ENOSPC; ++ ++ cfg->vector = irq_op.vector; ++ ++ return 0; ++} ++ ++static int assign_irq_vector(int irq, cpumask_t mask) ++{ ++ int err; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ err = __assign_irq_vector(irq, mask); ++ spin_unlock_irqrestore(&vector_lock, flags); ++ return err; ++} ++ ++#ifndef CONFIG_XEN ++static void __clear_irq_vector(int irq) ++{ ++ struct irq_cfg *cfg; ++ cpumask_t mask; ++ int cpu, vector; ++ ++ BUG_ON((unsigned)irq >= NR_IRQS); ++ cfg = &irq_cfg[irq]; ++ BUG_ON(!cfg->vector); ++ ++ vector = cfg->vector; ++ cpus_and(mask, cfg->domain, cpu_online_map); ++ for_each_cpu_mask(cpu, mask) ++ per_cpu(vector_irq, cpu)[vector] = -1; ++ ++ cfg->vector = 0; ++ cpus_clear(cfg->domain); ++} ++ ++void __setup_vector_irq(int cpu) ++{ ++ /* Initialize vector_irq on a new cpu */ ++ /* This function must be called with vector_lock held */ ++ int irq, vector; ++ ++ /* Mark the inuse vectors */ ++ for (irq = 0; irq < NR_IRQS; ++irq) { ++ if (!cpu_isset(cpu, irq_cfg[irq].domain)) ++ continue; ++ vector = irq_cfg[irq].vector; ++ per_cpu(vector_irq, cpu)[vector] = irq; ++ } ++ /* Mark the free vectors */ ++ for (vector = 0; vector < NR_VECTORS; ++vector) { ++ 
irq = per_cpu(vector_irq, cpu)[vector]; ++ if (irq < 0) ++ continue; ++ if (!cpu_isset(cpu, irq_cfg[irq].domain)) ++ per_cpu(vector_irq, cpu)[vector] = -1; ++ } ++} ++ ++static struct irq_chip ioapic_chip; ++ ++static void ioapic_register_intr(int irq, unsigned long trigger) ++{ ++ if (trigger) { ++ irq_desc[irq].status |= IRQ_LEVEL; ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_fasteoi_irq, "fasteoi"); ++ } else { ++ irq_desc[irq].status &= ~IRQ_LEVEL; ++ set_irq_chip_and_handler_name(irq, &ioapic_chip, ++ handle_edge_irq, "edge"); ++ } ++} ++#else ++#define ioapic_register_intr(irq,trigger) ((void)0) ++#endif /* !CONFIG_XEN */ ++ ++static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, ++ int trigger, int polarity) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ struct IO_APIC_route_entry entry; ++ cpumask_t mask; ++ ++ if (!IO_APIC_IRQ(irq)) ++ return; ++ ++ mask = TARGET_CPUS; ++ if (assign_irq_vector(irq, mask)) ++ return; ++ ++#ifndef CONFIG_XEN ++ cpus_and(mask, cfg->domain, mask); ++#endif ++ ++ apic_printk(APIC_VERBOSE,KERN_DEBUG ++ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " ++ "IRQ %d Mode:%i Active:%i)\n", ++ apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector, ++ irq, trigger, polarity); ++ ++ /* ++ * add it to the IO-APIC irq-routing table: ++ */ ++ memset(&entry,0,sizeof(entry)); ++ ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.dest_mode = INT_DEST_MODE; ++ entry.dest = cpu_mask_to_apicid(mask); ++ entry.mask = 0; /* enable IRQ */ ++ entry.trigger = trigger; ++ entry.polarity = polarity; ++ entry.vector = cfg->vector; ++ ++ /* Mask level triggered irqs. ++ * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
++ */ ++ if (trigger) ++ entry.mask = 1; ++ ++ ioapic_register_intr(irq, trigger); ++ if (irq < 16) ++ disable_8259A_irq(irq); ++ ++ ioapic_write_entry(apic, pin, entry); ++} ++ ++static void __init setup_IO_APIC_irqs(void) ++{ ++ int apic, pin, idx, irq, first_notcon = 1; ++ ++ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ ++ idx = find_irq_entry(apic,pin,mp_INT); ++ if (idx == -1) { ++ if (first_notcon) { ++ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); ++ first_notcon = 0; ++ } else ++ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); ++ continue; ++ } ++ if (!first_notcon) { ++ apic_printk(APIC_VERBOSE, " not connected.\n"); ++ first_notcon = 1; ++ } ++ ++ irq = pin_2_irq(idx, apic, pin); ++ add_pin_to_irq(irq, apic, pin); ++ ++ setup_IO_APIC_irq(apic, pin, irq, ++ irq_trigger(idx), irq_polarity(idx)); ++ } ++ } ++ ++ if (!first_notcon) ++ apic_printk(APIC_VERBOSE, " not connected.\n"); ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * Set up the 8259A-master output pin as broadcast to all ++ * CPUs. ++ */ ++static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) ++{ ++ struct IO_APIC_route_entry entry; ++ ++ memset(&entry, 0, sizeof(entry)); ++ ++ disable_8259A_irq(0); ++ ++ /* mask LVT0 */ ++ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); ++ ++ /* ++ * We use logical delivery to get the timer IRQ ++ * to the first CPU. ++ */ ++ entry.dest_mode = INT_DEST_MODE; ++ entry.mask = 0; /* unmask IRQ now */ ++ entry.dest = cpu_mask_to_apicid(TARGET_CPUS); ++ entry.delivery_mode = INT_DELIVERY_MODE; ++ entry.polarity = 0; ++ entry.trigger = 0; ++ entry.vector = vector; ++ ++ /* ++ * The timer IRQ doesn't have to know that behind the ++ * scene we have a 8259A-master in AEOI mode ... 
++ */ ++ set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(apic, pin, entry); ++ ++ enable_8259A_irq(0); ++} ++ ++void __apicdebuginit print_IO_APIC(void) ++{ ++ int apic, i; ++ union IO_APIC_reg_00 reg_00; ++ union IO_APIC_reg_01 reg_01; ++ union IO_APIC_reg_02 reg_02; ++ unsigned long flags; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); ++ for (i = 0; i < nr_ioapics; i++) ++ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", ++ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); ++ ++ /* ++ * We are a bit conservative about what we expect. We have to ++ * know about every hardware change ASAP. ++ */ ++ printk(KERN_INFO "testing the IO APIC.......................\n"); ++ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(apic, 0); ++ reg_01.raw = io_apic_read(apic, 1); ++ if (reg_01.bits.version >= 0x10) ++ reg_02.raw = io_apic_read(apic, 2); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ printk("\n"); ++ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); ++ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); ++ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); ++ ++ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); ++ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); ++ ++ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); ++ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); ++ ++ if (reg_01.bits.version >= 0x10) { ++ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); ++ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); ++ } ++ ++ printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); ++ ++ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" ++ " Stat Dmod Deli Vect: \n"); ++ ++ for (i = 0; i <= reg_01.bits.entries; i++) { ++ struct IO_APIC_route_entry entry; ++ ++ entry = ioapic_read_entry(apic, i); ++ ++ printk(KERN_DEBUG " %02x %03X ", ++ i, ++ entry.dest ++ ); ++ ++ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", ++ entry.mask, ++ entry.trigger, ++ entry.irr, ++ entry.polarity, ++ entry.delivery_status, ++ entry.dest_mode, ++ entry.delivery_mode, ++ entry.vector ++ ); ++ } ++ } ++ printk(KERN_DEBUG "IRQ to pin mappings:\n"); ++ for (i = 0; i < NR_IRQS; i++) { ++ struct irq_pin_list *entry = irq_2_pin + i; ++ if (entry->pin < 0) ++ continue; ++ printk(KERN_DEBUG "IRQ%d ", i); ++ for (;;) { ++ printk("-> %d:%d", entry->apic, entry->pin); ++ if (!entry->next) ++ break; ++ entry = irq_2_pin + entry->next; ++ } ++ printk("\n"); ++ } ++ ++ printk(KERN_INFO ".................................... done.\n"); ++ ++ return; ++} ++ ++static __apicdebuginit void print_APIC_bitfield (int base) ++{ ++ unsigned int v; ++ int i, j; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); ++ for (i = 0; i < 8; i++) { ++ v = apic_read(base + i*0x10); ++ for (j = 0; j < 32; j++) { ++ if (v & (1< 3) { /* PC is LVT#4. */ ++ v = apic_read(APIC_LVTPC); ++ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); ++ } ++ v = apic_read(APIC_LVT0); ++ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); ++ v = apic_read(APIC_LVT1); ++ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); ++ ++ if (maxlvt > 2) { /* ERR is LVT#3. */ ++ v = apic_read(APIC_LVTERR); ++ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); ++ } ++ ++ v = apic_read(APIC_TMICT); ++ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); ++ v = apic_read(APIC_TMCCT); ++ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); ++ v = apic_read(APIC_TDCR); ++ printk(KERN_DEBUG "... 
APIC TDCR: %08x\n", v); ++ printk("\n"); ++} ++ ++void print_all_local_APICs (void) ++{ ++ on_each_cpu(print_local_APIC, NULL, 1, 1); ++} ++ ++void __apicdebuginit print_PIC(void) ++{ ++ unsigned int v; ++ unsigned long flags; ++ ++ if (apic_verbosity == APIC_QUIET) ++ return; ++ ++ printk(KERN_DEBUG "\nprinting PIC contents\n"); ++ ++ spin_lock_irqsave(&i8259A_lock, flags); ++ ++ v = inb(0xa1) << 8 | inb(0x21); ++ printk(KERN_DEBUG "... PIC IMR: %04x\n", v); ++ ++ v = inb(0xa0) << 8 | inb(0x20); ++ printk(KERN_DEBUG "... PIC IRR: %04x\n", v); ++ ++ outb(0x0b,0xa0); ++ outb(0x0b,0x20); ++ v = inb(0xa0) << 8 | inb(0x20); ++ outb(0x0a,0xa0); ++ outb(0x0a,0x20); ++ ++ spin_unlock_irqrestore(&i8259A_lock, flags); ++ ++ printk(KERN_DEBUG "... PIC ISR: %04x\n", v); ++ ++ v = inb(0x4d1) << 8 | inb(0x4d0); ++ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); ++} ++#endif /* !CONFIG_XEN */ ++ ++void __init enable_IO_APIC(void) ++{ ++ union IO_APIC_reg_01 reg_01; ++#ifndef CONFIG_XEN ++ int i8259_apic, i8259_pin; ++#endif ++ int i, apic; ++ unsigned long flags; ++ ++ for (i = 0; i < PIN_MAP_SIZE; i++) { ++ irq_2_pin[i].pin = -1; ++ irq_2_pin[i].next = 0; ++ } ++ ++ /* ++ * The number of IO-APIC IRQ registers (== #pins): ++ */ ++ for (apic = 0; apic < nr_ioapics; apic++) { ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(apic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ nr_ioapic_registers[apic] = reg_01.bits.entries+1; ++ } ++#ifndef CONFIG_XEN ++ for(apic = 0; apic < nr_ioapics; apic++) { ++ int pin; ++ /* See if any of the pins is in ExtINT mode */ ++ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { ++ struct IO_APIC_route_entry entry; ++ entry = ioapic_read_entry(apic, pin); ++ ++ /* If the interrupt line is enabled and in ExtInt mode ++ * I have found the pin where the i8259 is connected. 
++ */ ++ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { ++ ioapic_i8259.apic = apic; ++ ioapic_i8259.pin = pin; ++ goto found_i8259; ++ } ++ } ++ } ++ found_i8259: ++ /* Look to see what if the MP table has reported the ExtINT */ ++ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); ++ i8259_apic = find_isa_irq_apic(0, mp_ExtINT); ++ /* Trust the MP table if nothing is setup in the hardware */ ++ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { ++ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); ++ ioapic_i8259.pin = i8259_pin; ++ ioapic_i8259.apic = i8259_apic; ++ } ++ /* Complain if the MP table and the hardware disagree */ ++ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && ++ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) ++ { ++ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); ++ } ++#endif ++ ++ /* ++ * Do not trust the IO-APIC being empty at bootup ++ */ ++ clear_IO_APIC(); ++} ++ ++/* ++ * Not an __init, needed by the reboot code ++ */ ++void disable_IO_APIC(void) ++{ ++ /* ++ * Clear the IO-APIC before rebooting: ++ */ ++ clear_IO_APIC(); ++ ++#ifndef CONFIG_XEN ++ /* ++ * If the i8259 is routed through an IOAPIC ++ * Put that IOAPIC in virtual wire mode ++ * so legacy interrupts can be delivered. 
++ */ ++ if (ioapic_i8259.pin != -1) { ++ struct IO_APIC_route_entry entry; ++ ++ memset(&entry, 0, sizeof(entry)); ++ entry.mask = 0; /* Enabled */ ++ entry.trigger = 0; /* Edge */ ++ entry.irr = 0; ++ entry.polarity = 0; /* High */ ++ entry.delivery_status = 0; ++ entry.dest_mode = 0; /* Physical */ ++ entry.delivery_mode = dest_ExtINT; /* ExtInt */ ++ entry.vector = 0; ++ entry.dest = GET_APIC_ID(read_apic_id()); ++ ++ /* ++ * Add it to the IO-APIC irq-routing table: ++ */ ++ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); ++ } ++ ++ disconnect_bsp_APIC(ioapic_i8259.pin != -1); ++#endif ++} ++ ++/* ++ * There is a nasty bug in some older SMP boards, their mptable lies ++ * about the timer IRQ. We do the following to work around the situation: ++ * ++ * - timer IRQ defaults to IO-APIC IRQ ++ * - if this function detects that timer IRQs are defunct, then we fall ++ * back to ISA timer IRQs ++ */ ++#ifndef CONFIG_XEN ++static int __init timer_irq_works(void) ++{ ++ unsigned long t1 = jiffies; ++ unsigned long flags; ++ ++ local_save_flags(flags); ++ local_irq_enable(); ++ /* Let ten ticks pass... */ ++ mdelay((10 * 1000) / HZ); ++ local_irq_restore(flags); ++ ++ /* ++ * Expect a few ticks at least, to be sure some possible ++ * glue logic does not lock up after one or two first ++ * ticks in a non-ExtINT mode. Also the local APIC ++ * might have cached one ExtINT interrupt. Finally, at ++ * least one tick may be lost due to delays. ++ */ ++ ++ /* jiffies wrap? */ ++ if (time_after(jiffies, t1 + 4)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * In the SMP+IOAPIC case it might happen that there are an unspecified ++ * number of pending IRQ events unhandled. These cases are very rare, ++ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much ++ * better to do it this way as thus we do not have to be aware of ++ * 'pending' interrupts in the IRQ path, except at this point. 
++ */ ++/* ++ * Edge triggered needs to resend any interrupt ++ * that was delayed but this is now handled in the device ++ * independent code. ++ */ ++ ++/* ++ * Starting up a edge-triggered IO-APIC interrupt is ++ * nasty - we need to make sure that we get the edge. ++ * If it is already asserted for some reason, we need ++ * return 1 to indicate that is was pending. ++ * ++ * This is not complete - we should be able to fake ++ * an edge even if it isn't on the 8259A... ++ */ ++ ++static unsigned int startup_ioapic_irq(unsigned int irq) ++{ ++ int was_pending = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ if (irq < 16) { ++ disable_8259A_irq(irq); ++ if (i8259A_irq_pending(irq)) ++ was_pending = 1; ++ } ++ __unmask_IO_APIC_irq(irq); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return was_pending; ++} ++ ++static int ioapic_retrigger_irq(unsigned int irq) ++{ ++ struct irq_cfg *cfg = &irq_cfg[irq]; ++ cpumask_t mask; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ mask = cpumask_of_cpu(first_cpu(cfg->domain)); ++ send_IPI_mask(mask, cfg->vector); ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ return 1; ++} ++ ++/* ++ * Level and edge triggered IO-APIC interrupts need different handling, ++ * so we use two separate IRQ descriptors. Edge triggered IRQs can be ++ * handled with the level-triggered descriptor, but that one has slightly ++ * more overhead. Level-triggered interrupts cannot be handled with the ++ * edge-triggered handler, without risking IRQ storms and other ugly ++ * races. 
++ */ ++ ++#ifdef CONFIG_SMP ++asmlinkage void smp_irq_move_cleanup_interrupt(void) ++{ ++ unsigned vector, me; ++ ack_APIC_irq(); ++ exit_idle(); ++ irq_enter(); ++ ++ me = smp_processor_id(); ++ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { ++ unsigned int irq; ++ struct irq_desc *desc; ++ struct irq_cfg *cfg; ++ irq = __get_cpu_var(vector_irq)[vector]; ++ if (irq >= NR_IRQS) ++ continue; ++ ++ desc = irq_desc + irq; ++ cfg = irq_cfg + irq; ++ spin_lock(&desc->lock); ++ if (!cfg->move_cleanup_count) ++ goto unlock; ++ ++ if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) ++ goto unlock; ++ ++ __get_cpu_var(vector_irq)[vector] = -1; ++ cfg->move_cleanup_count--; ++unlock: ++ spin_unlock(&desc->lock); ++ } ++ ++ irq_exit(); ++} ++ ++static void irq_complete_move(unsigned int irq) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ unsigned vector, me; ++ ++ if (likely(!cfg->move_in_progress)) ++ return; ++ ++ vector = ~get_irq_regs()->orig_ax; ++ me = smp_processor_id(); ++ if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { ++ cpumask_t cleanup_mask; ++ ++ cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); ++ cfg->move_cleanup_count = cpus_weight(cleanup_mask); ++ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); ++ cfg->move_in_progress = 0; ++ } ++} ++#else ++static inline void irq_complete_move(unsigned int irq) {} ++#endif ++ ++static void ack_apic_edge(unsigned int irq) ++{ ++ irq_complete_move(irq); ++ move_native_irq(irq); ++ ack_APIC_irq(); ++} ++ ++static void ack_apic_level(unsigned int irq) ++{ ++ int do_unmask_irq = 0; ++ ++ irq_complete_move(irq); ++#ifdef CONFIG_GENERIC_PENDING_IRQ ++ /* If we are moving the irq we need to mask it */ ++ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { ++ do_unmask_irq = 1; ++ mask_IO_APIC_irq(irq); ++ } ++#endif ++ ++ /* ++ * We must acknowledge the irq before we move it or the acknowledge will ++ * not propagate properly. 
++ */ ++ ack_APIC_irq(); ++ ++ /* Now we can move and renable the irq */ ++ if (unlikely(do_unmask_irq)) { ++ /* Only migrate the irq if the ack has been received. ++ * ++ * On rare occasions the broadcast level triggered ack gets ++ * delayed going to ioapics, and if we reprogram the ++ * vector while Remote IRR is still set the irq will never ++ * fire again. ++ * ++ * To prevent this scenario we read the Remote IRR bit ++ * of the ioapic. This has two effects. ++ * - On any sane system the read of the ioapic will ++ * flush writes (and acks) going to the ioapic from ++ * this cpu. ++ * - We get to see if the ACK has actually been delivered. ++ * ++ * Based on failed experiments of reprogramming the ++ * ioapic entry from outside of irq context starting ++ * with masking the ioapic entry and then polling until ++ * Remote IRR was clear before reprogramming the ++ * ioapic I don't trust the Remote IRR bit to be ++ * completey accurate. ++ * ++ * However there appears to be no other way to plug ++ * this race, so if the Remote IRR bit is not ++ * accurate and is causing problems then it is a hardware bug ++ * and you can go talk to the chipset vendor about it. ++ */ ++ if (!io_apic_level_ack_pending(irq)) ++ move_masked_irq(irq); ++ unmask_IO_APIC_irq(irq); ++ } ++} ++ ++static struct irq_chip ioapic_chip __read_mostly = { ++ .name = "IO-APIC", ++ .startup = startup_ioapic_irq, ++ .mask = mask_IO_APIC_irq, ++ .unmask = unmask_IO_APIC_irq, ++ .ack = ack_apic_edge, ++ .eoi = ack_apic_level, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ioapic_affinity_irq, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++#endif /* !CONFIG_XEN */ ++ ++static inline void init_IO_APIC_traps(void) ++{ ++ int irq; ++ ++ /* ++ * NOTE! The local APIC isn't very good at handling ++ * multiple interrupts at the same interrupt level. 
++ * As the interrupt level is determined by taking the ++ * vector number and shifting that right by 4, we ++ * want to spread these out a bit so that they don't ++ * all fall in the same interrupt level. ++ * ++ * Also, we've got to be careful not to trash gate ++ * 0x80, because int 0x80 is hm, kind of importantish. ;) ++ */ ++ for (irq = 0; irq < NR_IRQS ; irq++) { ++ if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { ++ /* ++ * Hmm.. We don't have an entry for this, ++ * so default to an old-fashioned 8259 ++ * interrupt if we can.. ++ */ ++ if (irq < 16) ++ make_8259A_irq(irq); ++#ifndef CONFIG_XEN ++ else ++ /* Strange. Oh, well.. */ ++ irq_desc[irq].chip = &no_irq_chip; ++#endif ++ } ++ } ++} ++ ++#ifndef CONFIG_XEN ++static void enable_lapic_irq (unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); ++} ++ ++static void disable_lapic_irq (unsigned int irq) ++{ ++ unsigned long v; ++ ++ v = apic_read(APIC_LVT0); ++ apic_write(APIC_LVT0, v | APIC_LVT_MASKED); ++} ++ ++static void ack_lapic_irq (unsigned int irq) ++{ ++ ack_APIC_irq(); ++} ++ ++static void end_lapic_irq (unsigned int i) { /* nothing */ } ++ ++static struct hw_interrupt_type lapic_irq_type __read_mostly = { ++ .name = "local-APIC", ++ .typename = "local-APIC-edge", ++ .startup = NULL, /* startup_irq() not used for IRQ0 */ ++ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ ++ .enable = enable_lapic_irq, ++ .disable = disable_lapic_irq, ++ .ack = ack_lapic_irq, ++ .end = end_lapic_irq, ++}; ++ ++static void __init setup_nmi(void) ++{ ++ /* ++ * Dirty trick to enable the NMI watchdog ... ++ * We put the 8259A master into AEOI mode and ++ * unmask on all local APICs LVT0 as NMI. ++ * ++ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') ++ * is from Maciej W. Rozycki - so we do not have to EOI from ++ * the NMI handler or the timer interrupt. 
++ */ ++ printk(KERN_INFO "activating NMI Watchdog ..."); ++ ++ enable_NMI_through_LVT0(); ++ ++ printk(" done.\n"); ++} ++ ++/* ++ * This looks a bit hackish but it's about the only one way of sending ++ * a few INTA cycles to 8259As and any associated glue logic. ICR does ++ * not support the ExtINT mode, unfortunately. We need to send these ++ * cycles as some i82489DX-based boards have glue logic that keeps the ++ * 8259A interrupt line asserted until INTA. --macro ++ */ ++static inline void __init unlock_ExtINT_logic(void) ++{ ++ int apic, pin, i; ++ struct IO_APIC_route_entry entry0, entry1; ++ unsigned char save_control, save_freq_select; ++ ++ pin = find_isa_irq_pin(8, mp_INT); ++ apic = find_isa_irq_apic(8, mp_INT); ++ if (pin == -1) ++ return; ++ ++ entry0 = ioapic_read_entry(apic, pin); ++ ++ clear_IO_APIC_pin(apic, pin); ++ ++ memset(&entry1, 0, sizeof(entry1)); ++ ++ entry1.dest_mode = 0; /* physical delivery */ ++ entry1.mask = 0; /* unmask IRQ now */ ++ entry1.dest = hard_smp_processor_id(); ++ entry1.delivery_mode = dest_ExtINT; ++ entry1.polarity = entry0.polarity; ++ entry1.trigger = 0; ++ entry1.vector = 0; ++ ++ ioapic_write_entry(apic, pin, entry1); ++ ++ save_control = CMOS_READ(RTC_CONTROL); ++ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); ++ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, ++ RTC_FREQ_SELECT); ++ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); ++ ++ i = 100; ++ while (i-- > 0) { ++ mdelay(10); ++ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) ++ i -= 10; ++ } ++ ++ CMOS_WRITE(save_control, RTC_CONTROL); ++ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); ++ clear_IO_APIC_pin(apic, pin); ++ ++ ioapic_write_entry(apic, pin, entry0); ++} ++ ++/* ++ * This code may look a bit paranoid, but it's supposed to cooperate with ++ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ ++ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast ++ * fanatically on his truly buggy board. 
++ * ++ * FIXME: really need to revamp this for modern platforms only. ++ */ ++static inline void __init check_timer(void) ++{ ++ struct irq_cfg *cfg = irq_cfg + 0; ++ int apic1, pin1, apic2, pin2; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ /* ++ * get/set the timer IRQ vector: ++ */ ++ disable_8259A_irq(0); ++ assign_irq_vector(0, TARGET_CPUS); ++ ++ /* ++ * Subtle, code in do_timer_interrupt() expects an AEOI ++ * mode for the 8259A whenever interrupts are routed ++ * through I/O APICs. Also IRQ0 has to be enabled in ++ * the 8259A which implies the virtual wire has to be ++ * disabled in the local APIC. ++ */ ++ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); ++ init_8259A(1); ++ if (timer_over_8254 > 0) ++ enable_8259A_irq(0); ++ ++ pin1 = find_isa_irq_pin(0, mp_INT); ++ apic1 = find_isa_irq_apic(0, mp_INT); ++ pin2 = ioapic_i8259.pin; ++ apic2 = ioapic_i8259.apic; ++ ++ apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", ++ cfg->vector, apic1, pin1, apic2, pin2); ++ ++ if (pin1 != -1) { ++ /* ++ * Ok, does IRQ0 through the IOAPIC work? ++ */ ++ unmask_IO_APIC_irq(0); ++ if (!no_timer_check && timer_irq_works()) { ++ nmi_watchdog_default(); ++ if (nmi_watchdog == NMI_IO_APIC) { ++ disable_8259A_irq(0); ++ setup_nmi(); ++ enable_8259A_irq(0); ++ } ++ if (disable_timer_pin_1 > 0) ++ clear_IO_APIC_pin(0, pin1); ++ goto out; ++ } ++ clear_IO_APIC_pin(apic1, pin1); ++ apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " ++ "connected to IO-APIC\n"); ++ } ++ ++ apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " ++ "through the 8259A ... "); ++ if (pin2 != -1) { ++ apic_printk(APIC_VERBOSE,"\n..... 
(found apic %d pin %d) ...", ++ apic2, pin2); ++ /* ++ * legacy devices should be connected to IO APIC #0 ++ */ ++ setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); ++ if (timer_irq_works()) { ++ apic_printk(APIC_VERBOSE," works.\n"); ++ nmi_watchdog_default(); ++ if (nmi_watchdog == NMI_IO_APIC) { ++ setup_nmi(); ++ } ++ goto out; ++ } ++ /* ++ * Cleanup, just in case ... ++ */ ++ clear_IO_APIC_pin(apic2, pin2); ++ } ++ apic_printk(APIC_VERBOSE," failed.\n"); ++ ++ if (nmi_watchdog == NMI_IO_APIC) { ++ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); ++ nmi_watchdog = 0; ++ } ++ ++ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); ++ ++ disable_8259A_irq(0); ++ irq_desc[0].chip = &lapic_irq_type; ++ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ ++ enable_8259A_irq(0); ++ ++ if (timer_irq_works()) { ++ apic_printk(APIC_VERBOSE," works.\n"); ++ goto out; ++ } ++ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); ++ apic_printk(APIC_VERBOSE," failed.\n"); ++ ++ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); ++ ++ init_8259A(0); ++ make_8259A_irq(0); ++ apic_write(APIC_LVT0, APIC_DM_EXTINT); ++ ++ unlock_ExtINT_logic(); ++ ++ if (timer_irq_works()) { ++ apic_printk(APIC_VERBOSE," works.\n"); ++ goto out; ++ } ++ apic_printk(APIC_VERBOSE," failed :(.\n"); ++ panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); ++out: ++ local_irq_restore(flags); ++} ++ ++static int __init notimercheck(char *s) ++{ ++ no_timer_check = 1; ++ return 1; ++} ++__setup("no_timer_check", notimercheck); ++#else ++#define check_timer() ((void)0) ++int timer_uses_ioapic_pin_0 = 0; ++#endif /* !CONFIG_XEN */ ++ ++/* ++ * ++ * IRQs that are handled by the PIC in the MPS IOAPIC case. ++ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. 
++ * Linux doesn't really care, as it's not actually used ++ * for any interrupt handling anyway. ++ */ ++#define PIC_IRQS (1<<2) ++ ++void __init setup_IO_APIC(void) ++{ ++ enable_IO_APIC(); ++ ++ if (acpi_ioapic) ++ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ ++ else ++ io_apic_irqs = ~PIC_IRQS; ++ ++ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); ++ ++#ifndef CONFIG_XEN ++ sync_Arb_IDs(); ++#endif /* !CONFIG_XEN */ ++ setup_IO_APIC_irqs(); ++ init_IO_APIC_traps(); ++ check_timer(); ++ if (!acpi_ioapic) ++ print_IO_APIC(); ++} ++ ++struct sysfs_ioapic_data { ++ struct sys_device dev; ++ struct IO_APIC_route_entry entry[0]; ++}; ++static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; ++ ++static int ioapic_suspend(struct sys_device *dev, pm_message_t state) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) ++ *entry = ioapic_read_entry(dev->id, i); ++ ++ return 0; ++} ++ ++static int ioapic_resume(struct sys_device *dev) ++{ ++ struct IO_APIC_route_entry *entry; ++ struct sysfs_ioapic_data *data; ++ unsigned long flags; ++ union IO_APIC_reg_00 reg_00; ++ int i; ++ ++ data = container_of(dev, struct sysfs_ioapic_data, dev); ++ entry = data->entry; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_00.raw = io_apic_read(dev->id, 0); ++ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { ++ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; ++ io_apic_write(dev->id, 0, reg_00.raw); ++ } ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ++ ioapic_write_entry(dev->id, i, entry[i]); ++ ++ return 0; ++} ++ ++static struct sysdev_class ioapic_sysdev_class = { ++ .name = "ioapic", ++ .suspend = ioapic_suspend, ++ .resume = ioapic_resume, ++}; ++ ++static int __init ioapic_init_sysfs(void) ++{ ++ struct 
sys_device * dev; ++ int i, size, error; ++ ++ error = sysdev_class_register(&ioapic_sysdev_class); ++ if (error) ++ return error; ++ ++ for (i = 0; i < nr_ioapics; i++ ) { ++ size = sizeof(struct sys_device) + nr_ioapic_registers[i] ++ * sizeof(struct IO_APIC_route_entry); ++ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); ++ if (!mp_ioapic_data[i]) { ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ dev = &mp_ioapic_data[i]->dev; ++ dev->id = i; ++ dev->cls = &ioapic_sysdev_class; ++ error = sysdev_register(dev); ++ if (error) { ++ kfree(mp_ioapic_data[i]); ++ mp_ioapic_data[i] = NULL; ++ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); ++ continue; ++ } ++ } ++ ++ return 0; ++} ++ ++device_initcall(ioapic_init_sysfs); ++ ++#ifndef CONFIG_XEN ++/* ++ * Dynamic irq allocate and deallocation ++ */ ++int create_irq(void) ++{ ++ /* Allocate an unused irq */ ++ int irq; ++ int new; ++ unsigned long flags; ++ ++ irq = -ENOSPC; ++ spin_lock_irqsave(&vector_lock, flags); ++ for (new = (NR_IRQS - 1); new >= 0; new--) { ++ if (platform_legacy_irq(new)) ++ continue; ++ if (irq_cfg[new].vector != 0) ++ continue; ++ if (__assign_irq_vector(new, TARGET_CPUS) == 0) ++ irq = new; ++ break; ++ } ++ spin_unlock_irqrestore(&vector_lock, flags); ++ ++ if (irq >= 0) { ++ dynamic_irq_init(irq); ++ } ++ return irq; ++} ++ ++void destroy_irq(unsigned int irq) ++{ ++ unsigned long flags; ++ ++ dynamic_irq_cleanup(irq); ++ ++ spin_lock_irqsave(&vector_lock, flags); ++ __clear_irq_vector(irq); ++ spin_unlock_irqrestore(&vector_lock, flags); ++} ++#endif ++ ++/* ++ * MSI message composition ++ */ ++#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) ++static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ int err; ++ unsigned dest; ++ cpumask_t tmp; ++ ++ tmp = TARGET_CPUS; ++ err = assign_irq_vector(irq, tmp); ++ if (!err) { ++ cpus_and(tmp, cfg->domain, tmp); ++ dest = 
cpu_mask_to_apicid(tmp); ++ ++ msg->address_hi = MSI_ADDR_BASE_HI; ++ msg->address_lo = ++ MSI_ADDR_BASE_LO | ++ ((INT_DEST_MODE == 0) ? ++ MSI_ADDR_DEST_MODE_PHYSICAL: ++ MSI_ADDR_DEST_MODE_LOGICAL) | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ MSI_ADDR_REDIRECTION_CPU: ++ MSI_ADDR_REDIRECTION_LOWPRI) | ++ MSI_ADDR_DEST_ID(dest); ++ ++ msg->data = ++ MSI_DATA_TRIGGER_EDGE | ++ MSI_DATA_LEVEL_ASSERT | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? ++ MSI_DATA_DELIVERY_FIXED: ++ MSI_DATA_DELIVERY_LOWPRI) | ++ MSI_DATA_VECTOR(cfg->vector); ++ } ++ return err; ++} ++ ++#ifdef CONFIG_SMP ++static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ struct msi_msg msg; ++ unsigned int dest; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ return; ++ ++ if (assign_irq_vector(irq, mask)) ++ return; ++ ++ cpus_and(tmp, cfg->domain, mask); ++ dest = cpu_mask_to_apicid(tmp); ++ ++ read_msi_msg(irq, &msg); ++ ++ msg.data &= ~MSI_DATA_VECTOR_MASK; ++ msg.data |= MSI_DATA_VECTOR(cfg->vector); ++ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++ msg.address_lo |= MSI_ADDR_DEST_ID(dest); ++ ++ write_msi_msg(irq, &msg); ++ irq_desc[irq].affinity = mask; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, ++ * which implement the MSI or MSI-X Capability Structure. 
++ */ ++static struct irq_chip msi_chip = { ++ .name = "PCI-MSI", ++ .unmask = unmask_msi_irq, ++ .mask = mask_msi_irq, ++ .ack = ack_apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = set_msi_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) ++{ ++ struct msi_msg msg; ++ int irq, ret; ++ irq = create_irq(); ++ if (irq < 0) ++ return irq; ++ ++ ret = msi_compose_msg(dev, irq, &msg); ++ if (ret < 0) { ++ destroy_irq(irq); ++ return ret; ++ } ++ ++ set_irq_msi(irq, desc); ++ write_msi_msg(irq, &msg); ++ ++ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); ++ ++ return 0; ++} ++ ++void arch_teardown_msi_irq(unsigned int irq) ++{ ++ destroy_irq(irq); ++} ++ ++#ifdef CONFIG_DMAR ++#ifdef CONFIG_SMP ++static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ struct msi_msg msg; ++ unsigned int dest; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ return; ++ ++ if (assign_irq_vector(irq, mask)) ++ return; ++ ++ cpus_and(tmp, cfg->domain, mask); ++ dest = cpu_mask_to_apicid(tmp); ++ ++ dmar_msi_read(irq, &msg); ++ ++ msg.data &= ~MSI_DATA_VECTOR_MASK; ++ msg.data |= MSI_DATA_VECTOR(cfg->vector); ++ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; ++ msg.address_lo |= MSI_ADDR_DEST_ID(dest); ++ ++ dmar_msi_write(irq, &msg); ++ irq_desc[irq].affinity = mask; ++} ++#endif /* CONFIG_SMP */ ++ ++struct irq_chip dmar_msi_type = { ++ .name = "DMAR_MSI", ++ .unmask = dmar_msi_unmask, ++ .mask = dmar_msi_mask, ++ .ack = ack_apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = dmar_msi_set_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_dmar_msi(unsigned int irq) ++{ ++ int ret; ++ struct msi_msg msg; ++ ++ ret = msi_compose_msg(NULL, irq, &msg); ++ if (ret < 0) ++ return ret; ++ dmar_msi_write(irq, &msg); ++ set_irq_chip_and_handler_name(irq, 
&dmar_msi_type, handle_edge_irq, ++ "edge"); ++ return 0; ++} ++#endif ++ ++#endif /* CONFIG_PCI_MSI */ ++/* ++ * Hypertransport interrupt support ++ */ ++#ifdef CONFIG_HT_IRQ ++ ++#ifdef CONFIG_SMP ++ ++static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) ++{ ++ struct ht_irq_msg msg; ++ fetch_ht_irq_msg(irq, &msg); ++ ++ msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); ++ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); ++ ++ msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); ++ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); ++ ++ write_ht_irq_msg(irq, &msg); ++} ++ ++static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ unsigned int dest; ++ cpumask_t tmp; ++ ++ cpus_and(tmp, mask, cpu_online_map); ++ if (cpus_empty(tmp)) ++ return; ++ ++ if (assign_irq_vector(irq, mask)) ++ return; ++ ++ cpus_and(tmp, cfg->domain, mask); ++ dest = cpu_mask_to_apicid(tmp); ++ ++ target_ht_irq(irq, dest, cfg->vector); ++ irq_desc[irq].affinity = mask; ++} ++#endif ++ ++static struct irq_chip ht_irq_chip = { ++ .name = "PCI-HT", ++ .mask = mask_ht_irq, ++ .unmask = unmask_ht_irq, ++ .ack = ack_apic_edge, ++#ifdef CONFIG_SMP ++ .set_affinity = set_ht_irq_affinity, ++#endif ++ .retrigger = ioapic_retrigger_irq, ++}; ++ ++int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) ++{ ++ struct irq_cfg *cfg = irq_cfg + irq; ++ int err; ++ cpumask_t tmp; ++ ++ tmp = TARGET_CPUS; ++ err = assign_irq_vector(irq, tmp); ++ if (!err) { ++ struct ht_irq_msg msg; ++ unsigned dest; ++ ++ cpus_and(tmp, cfg->domain, tmp); ++ dest = cpu_mask_to_apicid(tmp); ++ ++ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); ++ ++ msg.address_lo = ++ HT_IRQ_LOW_BASE | ++ HT_IRQ_LOW_DEST_ID(dest) | ++ HT_IRQ_LOW_VECTOR(cfg->vector) | ++ ((INT_DEST_MODE == 0) ? ++ HT_IRQ_LOW_DM_PHYSICAL : ++ HT_IRQ_LOW_DM_LOGICAL) | ++ HT_IRQ_LOW_RQEOI_EDGE | ++ ((INT_DELIVERY_MODE != dest_LowestPrio) ? 
++ HT_IRQ_LOW_MT_FIXED : ++ HT_IRQ_LOW_MT_ARBITRATED) | ++ HT_IRQ_LOW_IRQ_MASKED; ++ ++ write_ht_irq_msg(irq, &msg); ++ ++ set_irq_chip_and_handler_name(irq, &ht_irq_chip, ++ handle_edge_irq, "edge"); ++ } ++ return err; ++} ++#endif /* CONFIG_HT_IRQ */ ++ ++/* -------------------------------------------------------------------------- ++ ACPI-based IOAPIC Configuration ++ -------------------------------------------------------------------------- */ ++ ++#ifdef CONFIG_ACPI ++ ++#define IO_APIC_MAX_ID 0xFE ++ ++int __init io_apic_get_redir_entries (int ioapic) ++{ ++ union IO_APIC_reg_01 reg_01; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ioapic_lock, flags); ++ reg_01.raw = io_apic_read(ioapic, 1); ++ spin_unlock_irqrestore(&ioapic_lock, flags); ++ ++ return reg_01.bits.entries; ++} ++ ++ ++int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) ++{ ++ if (!IO_APIC_IRQ(irq)) { ++ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ++ ioapic); ++ return -EINVAL; ++ } ++ ++ /* ++ * IRQs < 16 are already in the irq_2_pin[] map ++ */ ++ if (irq >= 16) ++ add_pin_to_irq(irq, ioapic, pin); ++ ++ setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); ++ ++ return 0; ++} ++ ++ ++int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) ++{ ++ int i; ++ ++ if (skip_ioapic_setup) ++ return -1; ++ ++ for (i = 0; i < mp_irq_entries; i++) ++ if (mp_irqs[i].mpc_irqtype == mp_INT && ++ mp_irqs[i].mpc_srcbusirq == bus_irq) ++ break; ++ if (i >= mp_irq_entries) ++ return -1; ++ ++ *trigger = irq_trigger(i); ++ *polarity = irq_polarity(i); ++ return 0; ++} ++ ++#endif /* CONFIG_ACPI */ ++ ++#ifndef CONFIG_XEN ++/* ++ * This function currently is only a helper for the i386 smp boot process where ++ * we need to reprogram the ioredtbls to cater for the cpus which have come online ++ * so mask in all cases should simply be TARGET_CPUS ++ */ ++#ifdef CONFIG_SMP ++void __init setup_ioapic_dest(void) ++{ ++ int 
pin, ioapic, irq, irq_entry; ++ ++ if (skip_ioapic_setup == 1) ++ return; ++ ++ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { ++ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { ++ irq_entry = find_irq_entry(ioapic, pin, mp_INT); ++ if (irq_entry == -1) ++ continue; ++ irq = pin_2_irq(irq_entry, ioapic, pin); ++ ++ /* setup_IO_APIC_irqs could fail to get vector for some device ++ * when you have too many devices, because at that time only boot ++ * cpu is online. ++ */ ++ if (!irq_cfg[irq].vector) ++ setup_IO_APIC_irq(ioapic, pin, irq, ++ irq_trigger(irq_entry), ++ irq_polarity(irq_entry)); ++ else ++ set_ioapic_affinity_irq(irq, TARGET_CPUS); ++ } ++ ++ } ++} ++#endif ++ ++#define IOAPIC_RESOURCE_NAME_SIZE 11 ++ ++static struct resource *ioapic_resources; ++ ++static struct resource * __init ioapic_setup_resources(void) ++{ ++ unsigned long n; ++ struct resource *res; ++ char *mem; ++ int i; ++ ++ if (nr_ioapics <= 0) ++ return NULL; ++ ++ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); ++ n *= nr_ioapics; ++ ++ mem = alloc_bootmem(n); ++ res = (void *)mem; ++ ++ if (mem != NULL) { ++ mem += sizeof(struct resource) * nr_ioapics; ++ ++ for (i = 0; i < nr_ioapics; i++) { ++ res[i].name = mem; ++ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; ++ sprintf(mem, "IOAPIC %u", i); ++ mem += IOAPIC_RESOURCE_NAME_SIZE; ++ } ++ } ++ ++ ioapic_resources = res; ++ ++ return res; ++} ++ ++void __init ioapic_init_mappings(void) ++{ ++ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; ++ struct resource *ioapic_res; ++ int i; ++ ++ ioapic_res = ioapic_setup_resources(); ++ for (i = 0; i < nr_ioapics; i++) { ++ if (smp_found_config) { ++ ioapic_phys = mp_ioapics[i].mpc_apicaddr; ++ } else { ++ ioapic_phys = (unsigned long) ++ alloc_bootmem_pages(PAGE_SIZE); ++ ioapic_phys = __pa(ioapic_phys); ++ } ++ set_fixmap_nocache(idx, ioapic_phys); ++ apic_printk(APIC_VERBOSE, ++ "mapped IOAPIC to %016lx (%016lx)\n", ++ __fix_to_virt(idx), ioapic_phys); ++ idx++; ++ 
++ if (ioapic_res != NULL) { ++ ioapic_res->start = ioapic_phys; ++ ioapic_res->end = ioapic_phys + (4 * 1024) - 1; ++ ioapic_res++; ++ } ++ } ++} ++ ++static int __init ioapic_insert_resources(void) ++{ ++ int i; ++ struct resource *r = ioapic_resources; ++ ++ if (!r) { ++ printk(KERN_ERR ++ "IO APIC resources could be not be allocated.\n"); ++ return -1; ++ } ++ ++ for (i = 0; i < nr_ioapics; i++) { ++ insert_resource(&iomem_resource, r); ++ r++; ++ } ++ ++ return 0; ++} ++ ++/* Insert the IO APIC resources after PCI initialization has occured to handle ++ * IO APICS that are mapped in on a BAR in PCI space. */ ++late_initcall(ioapic_insert_resources); ++#endif /* !CONFIG_XEN */ +diff --git a/arch/x86/kernel/ioport-xen.c b/arch/x86/kernel/ioport-xen.c +new file mode 100644 +index 0000000..af83044 +--- /dev/null ++++ b/arch/x86/kernel/ioport-xen.c +@@ -0,0 +1,112 @@ ++/* ++ * This contains the io-permission bitmap code - written by obz, with changes ++ * by Linus. 32/64 bits code unification by Miguel BotĆ³n. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ ++static void set_bitmap(unsigned long *bitmap, unsigned int base, ++ unsigned int extent, int new_value) ++{ ++ unsigned int i; ++ ++ for (i = base; i < base + extent; i++) { ++ if (new_value) ++ __set_bit(i, bitmap); ++ else ++ __clear_bit(i, bitmap); ++ } ++} ++ ++/* ++ * this changes the io permissions bitmap in the current task. ++ */ ++asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) ++{ ++ struct thread_struct * t = ¤t->thread; ++ struct physdev_set_iobitmap set_iobitmap; ++ ++ if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) ++ return -EINVAL; ++ if (turn_on && !capable(CAP_SYS_RAWIO)) ++ return -EPERM; ++ ++ /* ++ * If it's the first ioperm() call in this thread's lifetime, set the ++ * IO bitmap up. 
ioperm() is much less timing critical than clone(), ++ * this is why we delay this operation until now: ++ */ ++ if (!t->io_bitmap_ptr) { ++ unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); ++ ++ if (!bitmap) ++ return -ENOMEM; ++ ++ memset(bitmap, 0xff, IO_BITMAP_BYTES); ++ t->io_bitmap_ptr = bitmap; ++ set_thread_flag(TIF_IO_BITMAP); ++ ++ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); ++ set_iobitmap.nr_ports = IO_BITMAP_BITS; ++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, ++ &set_iobitmap)); ++ } ++ ++ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); ++ ++ return 0; ++} ++ ++/* ++ * sys_iopl has to be used when you want to access the IO ports ++ * beyond the 0x3ff range: to get the full 65536 ports bitmapped ++ * you'd need 8kB of bitmaps/process, which is a bit excessive. ++ */ ++static int do_iopl(unsigned int level, struct thread_struct *t) ++{ ++ unsigned int old = t->iopl >> 12; ++ ++ if (level > 3) ++ return -EINVAL; ++ /* Trying to gain more privileges? 
*/ ++ if (level > old) { ++ if (!capable(CAP_SYS_RAWIO)) ++ return -EPERM; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86_32 ++asmlinkage long sys_iopl(unsigned long regsp) ++{ ++ struct pt_regs *regs = (struct pt_regs *)®sp; ++ unsigned int level = regs->bx; ++#else ++asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) ++{ ++#endif ++ struct thread_struct *t = ¤t->thread; ++ int rc; ++ ++ rc = do_iopl(level, t); ++ if (rc < 0) ++ goto out; ++ ++ t->iopl = level << 12; ++ set_iopl_mask(t->iopl); ++out: ++ return rc; ++} +diff --git a/arch/x86/kernel/ipi-xen.c b/arch/x86/kernel/ipi-xen.c +new file mode 100644 +index 0000000..6fb39f3 +--- /dev/null ++++ b/arch/x86/kernel/ipi-xen.c +@@ -0,0 +1,232 @@ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_X86_32 ++#ifndef CONFIG_XEN ++#include ++/* ++ * the following functions deal with sending IPIs between CPUs. ++ * ++ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. ++ */ ++ ++static inline int __prepare_ICR(unsigned int shortcut, int vector) ++{ ++ unsigned int icr = shortcut | APIC_DEST_LOGICAL; ++ ++ switch (vector) { ++ default: ++ icr |= APIC_DM_FIXED | vector; ++ break; ++ case NMI_VECTOR: ++ icr |= APIC_DM_NMI; ++ break; ++ } ++ return icr; ++} ++ ++static inline int __prepare_ICR2(unsigned int mask) ++{ ++ return SET_APIC_DEST_FIELD(mask); ++} ++#else ++#include ++ ++DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); ++ ++static inline void __send_IPI_one(unsigned int cpu, int vector) ++{ ++ int irq = per_cpu(ipi_to_irq, cpu)[vector]; ++ BUG_ON(irq < 0); ++ notify_remote_via_irq(irq); ++} ++#endif ++ ++void __send_IPI_shortcut(unsigned int shortcut, int vector) ++{ ++#ifndef CONFIG_XEN ++ /* ++ * Subtle. In the case of the 'never do double writes' workaround ++ * we have to lock out interrupts to be safe. 
As we don't care ++ * of the value read we use an atomic rmw access to avoid costly ++ * cli/sti. Otherwise we use an even cheaper single atomic write ++ * to the APIC. ++ */ ++ unsigned int cfg; ++ ++ /* ++ * Wait for idle. ++ */ ++ apic_wait_icr_idle(); ++ ++ /* ++ * No need to touch the target chip field ++ */ ++ cfg = __prepare_ICR(shortcut, vector); ++ ++ /* ++ * Send the IPI. The write to APIC_ICR fires this off. ++ */ ++ apic_write_around(APIC_ICR, cfg); ++#else ++ int cpu; ++ ++ switch (shortcut) { ++ case APIC_DEST_SELF: ++ __send_IPI_one(smp_processor_id(), vector); ++ break; ++ case APIC_DEST_ALLBUT: ++ for (cpu = 0; cpu < NR_CPUS; ++cpu) { ++ if (cpu == smp_processor_id()) ++ continue; ++ if (cpu_isset(cpu, cpu_online_map)) { ++ __send_IPI_one(cpu, vector); ++ } ++ } ++ break; ++ default: ++ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, ++ vector); ++ break; ++ } ++#endif ++} ++ ++void send_IPI_self(int vector) ++{ ++ __send_IPI_shortcut(APIC_DEST_SELF, vector); ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * This is used to send an IPI with no shorthand notation (the destination is ++ * specified in bits 56 to 63 of the ICR). ++ */ ++static inline void __send_IPI_dest_field(unsigned long mask, int vector) ++{ ++ unsigned long cfg; ++ ++ /* ++ * Wait for idle. ++ */ ++ if (unlikely(vector == NMI_VECTOR)) ++ safe_apic_wait_icr_idle(); ++ else ++ apic_wait_icr_idle(); ++ ++ /* ++ * prepare target chip field ++ */ ++ cfg = __prepare_ICR2(mask); ++ apic_write_around(APIC_ICR2, cfg); ++ ++ /* ++ * program the ICR ++ */ ++ cfg = __prepare_ICR(0, vector); ++ ++ /* ++ * Send the IPI. The write to APIC_ICR fires this off. ++ */ ++ apic_write_around(APIC_ICR, cfg); ++} ++#endif ++ ++/* ++ * This is only used on smaller machines. 
++ */ ++void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) ++{ ++ unsigned long mask = cpus_addr(cpumask)[0]; ++ unsigned long flags; ++#ifdef CONFIG_XEN ++ unsigned int cpu; ++#endif ++ ++ local_irq_save(flags); ++ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); ++#ifndef CONFIG_XEN ++ __send_IPI_dest_field(mask, vector); ++#else ++ for (cpu = 0; cpu < NR_CPUS; ++cpu) ++ if (cpu_isset(cpu, cpumask)) ++ __send_IPI_one(cpu, vector); ++#endif ++ local_irq_restore(flags); ++} ++ ++void send_IPI_mask_sequence(cpumask_t mask, int vector) ++{ ++#ifndef CONFIG_XEN ++ unsigned long flags; ++ unsigned int query_cpu; ++ ++ /* ++ * Hack. The clustered APIC addressing mode doesn't allow us to send ++ * to an arbitrary mask, so I do a unicasts to each CPU instead. This ++ * should be modified to do 1 message per cluster ID - mbligh ++ */ ++ ++ local_irq_save(flags); ++ for_each_possible_cpu(query_cpu) { ++ if (cpu_isset(query_cpu, mask)) { ++ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), ++ vector); ++ } ++ } ++ local_irq_restore(flags); ++#else ++ send_IPI_mask_bitmask(mask, vector); ++#endif ++} ++ ++/* must come after the send_IPI functions above for inlining */ ++#include ++ ++#ifndef CONFIG_XEN ++static int convert_apicid_to_cpu(int apic_id) ++{ ++ int i; ++ ++ for_each_possible_cpu(i) { ++ if (per_cpu(x86_cpu_to_apicid, i) == apic_id) ++ return i; ++ } ++ return -1; ++} ++ ++int safe_smp_processor_id(void) ++{ ++ int apicid, cpuid; ++ ++ if (!boot_cpu_has(X86_FEATURE_APIC)) ++ return 0; ++ ++ apicid = hard_smp_processor_id(); ++ if (apicid == BAD_APICID) ++ return 0; ++ ++ cpuid = convert_apicid_to_cpu(apicid); ++ ++ return cpuid >= 0 ? 
cpuid : 0; ++} ++#endif ++#endif +diff --git a/arch/x86/kernel/irq_32-xen.c b/arch/x86/kernel/irq_32-xen.c +new file mode 100644 +index 0000000..274718d +--- /dev/null ++++ b/arch/x86/kernel/irq_32-xen.c +@@ -0,0 +1,383 @@ ++/* ++ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar ++ * ++ * This file contains the lowest level x86-specific interrupt ++ * entry, irq-stacks and irq statistics code. All the remaining ++ * irq logic is done by the generic kernel/irq/ code and ++ * by the x86-specific irq controller code. (e.g. i8259.c and ++ * io_apic.c.) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); ++EXPORT_PER_CPU_SYMBOL(irq_stat); ++ ++DEFINE_PER_CPU(struct pt_regs *, irq_regs); ++EXPORT_PER_CPU_SYMBOL(irq_regs); ++ ++/* ++ * 'what should we do if we get a hw irq event on an illegal vector'. ++ * each architecture has to answer this themselves. ++ */ ++void ack_bad_irq(unsigned int irq) ++{ ++ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); ++ ++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) ++ /* ++ * Currently unexpected vectors happen only on SMP and APIC. ++ * We _must_ ack these because every local APIC has only N ++ * irq slots per priority level, and a 'hanging, unacked' IRQ ++ * holds up an irq slot - in excessive cases (when multiple ++ * unexpected vectors occur) that might lock up the APIC ++ * completely. 
++ * But only ack when the APIC is enabled -AK ++ */ ++ if (cpu_has_apic) ++ ack_APIC_irq(); ++#endif ++} ++ ++#ifdef CONFIG_4KSTACKS ++/* ++ * per-CPU IRQ handling contexts (thread information and stack) ++ */ ++union irq_ctx { ++ struct thread_info tinfo; ++ u32 stack[THREAD_SIZE/sizeof(u32)]; ++}; ++ ++static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; ++static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; ++#endif ++ ++/* ++ * do_IRQ handles all normal device IRQ's (the special ++ * SMP cross-CPU interrupts have their own specific ++ * handlers). ++ */ ++unsigned int do_IRQ(struct pt_regs *regs) ++{ ++ struct pt_regs *old_regs; ++ /* high bit used in ret_from_ code */ ++ int irq = ~regs->orig_ax; ++ struct irq_desc *desc = irq_desc + irq; ++#ifdef CONFIG_4KSTACKS ++ union irq_ctx *curctx, *irqctx; ++ u32 *isp; ++#endif ++ ++ if (unlikely((unsigned)irq >= NR_IRQS)) { ++ printk(KERN_EMERG "%s: cannot handle IRQ %d\n", ++ __func__, irq); ++ BUG(); ++ } ++ ++ old_regs = set_irq_regs(regs); ++ irq_enter(); ++#ifdef CONFIG_DEBUG_STACKOVERFLOW ++ /* Debugging check for stack overflow: is there less than 1KB free? */ ++ { ++ long sp; ++ ++ __asm__ __volatile__("andl %%esp,%0" : ++ "=r" (sp) : "0" (THREAD_SIZE - 1)); ++ if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { ++ printk("do_IRQ: stack overflow: %ld\n", ++ sp - sizeof(struct thread_info)); ++ dump_stack(); ++ } ++ } ++#endif ++ ++#ifdef CONFIG_4KSTACKS ++ ++ curctx = (union irq_ctx *) current_thread_info(); ++ irqctx = hardirq_ctx[smp_processor_id()]; ++ ++ /* ++ * this is where we switch to the IRQ stack. 
However, if we are ++ * already using the IRQ stack (because we interrupted a hardirq ++ * handler) we can't do that and just have to keep using the ++ * current stack (which is the irq stack already after all) ++ */ ++ if (curctx != irqctx) { ++ int arg1, arg2, bx; ++ ++ /* build the stack frame on the IRQ stack */ ++ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); ++ irqctx->tinfo.task = curctx->tinfo.task; ++ irqctx->tinfo.previous_esp = current_stack_pointer; ++ ++ /* ++ * Copy the softirq bits in preempt_count so that the ++ * softirq checks work in the hardirq context. ++ */ ++ irqctx->tinfo.preempt_count = ++ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | ++ (curctx->tinfo.preempt_count & SOFTIRQ_MASK); ++ ++ asm volatile( ++ " xchgl %%ebx,%%esp \n" ++ " call *%%edi \n" ++ " movl %%ebx,%%esp \n" ++ : "=a" (arg1), "=d" (arg2), "=b" (bx) ++ : "0" (irq), "1" (desc), "2" (isp), ++ "D" (desc->handle_irq) ++ : "memory", "cc", "ecx" ++ ); ++ } else ++#endif ++ desc->handle_irq(irq, desc); ++ ++ irq_exit(); ++ set_irq_regs(old_regs); ++ return 1; ++} ++ ++#ifdef CONFIG_4KSTACKS ++ ++static char softirq_stack[NR_CPUS * THREAD_SIZE] ++ __attribute__((__section__(".bss.page_aligned"))); ++ ++static char hardirq_stack[NR_CPUS * THREAD_SIZE] ++ __attribute__((__section__(".bss.page_aligned"))); ++ ++/* ++ * allocate per-cpu stacks for hardirq and for softirq processing ++ */ ++void irq_ctx_init(int cpu) ++{ ++ union irq_ctx *irqctx; ++ ++ if (hardirq_ctx[cpu]) ++ return; ++ ++ irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; ++ irqctx->tinfo.task = NULL; ++ irqctx->tinfo.exec_domain = NULL; ++ irqctx->tinfo.cpu = cpu; ++ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; ++ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); ++ ++ hardirq_ctx[cpu] = irqctx; ++ ++ irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; ++ irqctx->tinfo.task = NULL; ++ irqctx->tinfo.exec_domain = NULL; ++ irqctx->tinfo.cpu = cpu; ++ irqctx->tinfo.preempt_count = 0; ++ 
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); ++ ++ softirq_ctx[cpu] = irqctx; ++ ++ printk("CPU %u irqstacks, hard=%p soft=%p\n", ++ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); ++} ++ ++void irq_ctx_exit(int cpu) ++{ ++ hardirq_ctx[cpu] = NULL; ++} ++ ++asmlinkage void do_softirq(void) ++{ ++ unsigned long flags; ++ struct thread_info *curctx; ++ union irq_ctx *irqctx; ++ u32 *isp; ++ ++ if (in_interrupt()) ++ return; ++ ++ local_irq_save(flags); ++ ++ if (local_softirq_pending()) { ++ curctx = current_thread_info(); ++ irqctx = softirq_ctx[smp_processor_id()]; ++ irqctx->tinfo.task = curctx->task; ++ irqctx->tinfo.previous_esp = current_stack_pointer; ++ ++ /* build the stack frame on the softirq stack */ ++ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); ++ ++ asm volatile( ++ " xchgl %%ebx,%%esp \n" ++ " call __do_softirq \n" ++ " movl %%ebx,%%esp \n" ++ : "=b"(isp) ++ : "0"(isp) ++ : "memory", "cc", "edx", "ecx", "eax" ++ ); ++ /* ++ * Shouldnt happen, we returned above if in_interrupt(): ++ */ ++ WARN_ON_ONCE(softirq_count()); ++ } ++ ++ local_irq_restore(flags); ++} ++#endif ++ ++/* ++ * Interrupt statistics: ++ */ ++ ++#ifndef CONFIG_XEN ++atomic_t irq_err_count; ++#endif ++ ++/* ++ * /proc/interrupts printing: ++ */ ++ ++int show_interrupts(struct seq_file *p, void *v) ++{ ++ int i = *(loff_t *) v, j; ++ struct irqaction * action; ++ unsigned long flags; ++ ++ if (i == 0) { ++ seq_printf(p, " "); ++ for_each_online_cpu(j) ++ seq_printf(p, "CPU%-8d",j); ++ seq_putc(p, '\n'); ++ } ++ ++ if (i < NR_IRQS) { ++ unsigned any_count = 0; ++ ++ spin_lock_irqsave(&irq_desc[i].lock, flags); ++#ifndef CONFIG_SMP ++ any_count = kstat_irqs(i); ++#else ++ for_each_online_cpu(j) ++ any_count |= kstat_cpu(j).irqs[i]; ++#endif ++ action = irq_desc[i].action; ++ if (!action && !any_count) ++ goto skip; ++ seq_printf(p, "%3d: ",i); ++#ifndef CONFIG_SMP ++ seq_printf(p, "%10u ", kstat_irqs(i)); ++#else ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); 
++#endif ++ seq_printf(p, " %8s", irq_desc[i].chip->name); ++ seq_printf(p, "-%-8s", irq_desc[i].name); ++ ++ if (action) { ++ seq_printf(p, " %s", action->name); ++ while ((action = action->next) != NULL) ++ seq_printf(p, ", %s", action->name); ++ } ++ ++ seq_putc(p, '\n'); ++skip: ++ spin_unlock_irqrestore(&irq_desc[i].lock, flags); ++ } else if (i == NR_IRQS) { ++ seq_printf(p, "NMI: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", nmi_count(j)); ++ seq_printf(p, " Non-maskable interrupts\n"); ++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) ++ seq_printf(p, "LOC: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", ++ per_cpu(irq_stat,j).apic_timer_irqs); ++ seq_printf(p, " Local timer interrupts\n"); ++#endif ++#ifdef CONFIG_SMP ++ seq_printf(p, "RES: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", ++ per_cpu(irq_stat,j).irq_resched_count); ++ seq_printf(p, " Rescheduling interrupts\n"); ++ seq_printf(p, "CAL: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", ++ per_cpu(irq_stat,j).irq_call_count); ++ seq_printf(p, " function call interrupts\n"); ++#ifndef CONFIG_XEN ++ seq_printf(p, "TLB: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", ++ per_cpu(irq_stat,j).irq_tlb_count); ++ seq_printf(p, " TLB shootdowns\n"); ++#endif ++#endif ++#ifdef CONFIG_X86_MCE ++ seq_printf(p, "TRM: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", ++ per_cpu(irq_stat,j).irq_thermal_count); ++ seq_printf(p, " Thermal event interrupts\n"); ++#endif ++#ifndef CONFIG_XEN ++#ifdef CONFIG_X86_LOCAL_APIC ++ seq_printf(p, "SPU: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", ++ per_cpu(irq_stat,j).irq_spurious_count); ++ seq_printf(p, " Spurious interrupts\n"); ++#endif ++ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); ++#if defined(CONFIG_X86_IO_APIC) ++ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); ++#endif ++#endif ++ } ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ ++void fixup_irqs(cpumask_t map) 
++{ ++ unsigned int irq; ++ static int warned; ++ ++ for (irq = 0; irq < NR_IRQS; irq++) { ++ cpumask_t mask; ++ if (irq == 2) ++ continue; ++ ++ cpus_and(mask, irq_desc[irq].affinity, map); ++ if (any_online_cpu(mask) == NR_CPUS) { ++ /*printk("Breaking affinity for irq %i\n", irq);*/ ++ mask = map; ++ } ++ if (irq_desc[irq].chip->set_affinity) ++ irq_desc[irq].chip->set_affinity(irq, mask); ++ else if (irq_desc[irq].action && !(warned++)) ++ printk("Cannot set affinity for irq %i\n", irq); ++ } ++ ++#if 0 ++ barrier(); ++ /* Ingo Molnar says: "after the IO-APIC masks have been redirected ++ [note the nop - the interrupt-enable boundary on x86 is two ++ instructions from sti] - to flush out pending hardirqs and ++ IPIs. After this point nothing is supposed to reach this CPU." */ ++ __asm__ __volatile__("sti; nop; cli"); ++ barrier(); ++#else ++ /* That doesn't seem sufficient. Give it 1ms. */ ++ local_irq_enable(); ++ mdelay(1); ++ local_irq_disable(); ++#endif ++} ++#endif ++ +diff --git a/arch/x86/kernel/irq_64-xen.c b/arch/x86/kernel/irq_64-xen.c +new file mode 100644 +index 0000000..9e30794 +--- /dev/null ++++ b/arch/x86/kernel/irq_64-xen.c +@@ -0,0 +1,264 @@ ++/* ++ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar ++ * ++ * This file contains the lowest level x86_64-specific interrupt ++ * entry and irq statistics code. All the remaining irq logic is ++ * done by the generic kernel/irq/ code and in the ++ * x86_64-specific irq controller code. (e.g. i8259.c and ++ * io_apic.c.) ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifndef CONFIG_XEN ++atomic_t irq_err_count; ++#endif ++ ++/* ++ * 'what should we do if we get a hw irq event on an illegal vector'. ++ * each architecture has to answer this themselves. 
++ */ ++void ack_bad_irq(unsigned int irq) ++{ ++ printk(KERN_WARNING "unexpected IRQ trap at irq %02x\n", irq); ++} ++ ++#ifdef CONFIG_DEBUG_STACKOVERFLOW ++/* ++ * Probabilistic stack overflow check: ++ * ++ * Only check the stack in process context, because everything else ++ * runs on the big interrupt stacks. Checking reliably is too expensive, ++ * so we just check from interrupts. ++ */ ++static inline void stack_overflow_check(struct pt_regs *regs) ++{ ++ u64 curbase = (u64)task_stack_page(current); ++ static unsigned long warned = -60*HZ; ++ ++ if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && ++ regs->sp < curbase + sizeof(struct thread_info) + 128 && ++ time_after(jiffies, warned + 60*HZ)) { ++ printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", ++ current->comm, curbase, regs->sp); ++ show_stack(NULL,NULL); ++ warned = jiffies; ++ } ++} ++#endif ++ ++/* ++ * Generic, controller-independent functions: ++ */ ++ ++int show_interrupts(struct seq_file *p, void *v) ++{ ++ int i = *(loff_t *) v, j; ++ struct irqaction * action; ++ unsigned long flags; ++ ++ if (i == 0) { ++ seq_printf(p, " "); ++ for_each_online_cpu(j) ++ seq_printf(p, "CPU%-8d",j); ++ seq_putc(p, '\n'); ++ } ++ ++ if (i < NR_IRQS) { ++ unsigned any_count = 0; ++ ++ spin_lock_irqsave(&irq_desc[i].lock, flags); ++#ifndef CONFIG_SMP ++ any_count = kstat_irqs(i); ++#else ++ for_each_online_cpu(j) ++ any_count |= kstat_cpu(j).irqs[i]; ++#endif ++ action = irq_desc[i].action; ++ if (!action && !any_count) ++ goto skip; ++ seq_printf(p, "%3d: ",i); ++#ifndef CONFIG_SMP ++ seq_printf(p, "%10u ", kstat_irqs(i)); ++#else ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); ++#endif ++ seq_printf(p, " %8s", irq_desc[i].chip->name); ++ seq_printf(p, "-%-8s", irq_desc[i].name); ++ ++ if (action) { ++ seq_printf(p, " %s", action->name); ++ while ((action = action->next) != NULL) ++ seq_printf(p, ", %s", action->name); ++ } ++ seq_putc(p, '\n'); ++skip: ++ 
spin_unlock_irqrestore(&irq_desc[i].lock, flags); ++ } else if (i == NR_IRQS) { ++ seq_printf(p, "NMI: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); ++ seq_printf(p, " Non-maskable interrupts\n"); ++#ifndef CONFIG_XEN ++ seq_printf(p, "LOC: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); ++ seq_printf(p, " Local timer interrupts\n"); ++#endif ++#ifdef CONFIG_SMP ++ seq_printf(p, "RES: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); ++ seq_printf(p, " Rescheduling interrupts\n"); ++ seq_printf(p, "CAL: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); ++ seq_printf(p, " function call interrupts\n"); ++#ifndef CONFIG_XEN ++ seq_printf(p, "TLB: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); ++ seq_printf(p, " TLB shootdowns\n"); ++#endif ++#endif ++#ifdef CONFIG_X86_MCE ++ seq_printf(p, "TRM: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); ++ seq_printf(p, " Thermal event interrupts\n"); ++ seq_printf(p, "THR: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); ++ seq_printf(p, " Threshold APIC interrupts\n"); ++#endif ++#ifndef CONFIG_XEN ++ seq_printf(p, "SPU: "); ++ for_each_online_cpu(j) ++ seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); ++ seq_printf(p, " Spurious interrupts\n"); ++ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); ++#endif ++ } ++ return 0; ++} ++ ++/* ++ * do_IRQ handles all normal device IRQ's (the special ++ * SMP cross-CPU interrupts have their own specific ++ * handlers). 
++ */ ++asmlinkage unsigned int do_IRQ(struct pt_regs *regs) ++{ ++ struct pt_regs *old_regs = set_irq_regs(regs); ++ ++ /* high bit used in ret_from_ code */ ++ unsigned irq = ~regs->orig_ax; ++ ++ exit_idle(); ++ irq_enter(); ++ ++#ifdef CONFIG_DEBUG_STACKOVERFLOW ++ stack_overflow_check(regs); ++#endif ++ ++ if (likely(irq < NR_IRQS)) ++ generic_handle_irq(irq); ++ else { ++#ifndef CONFIG_XEN ++ if (!disable_apic) ++ ack_APIC_irq(); ++#endif ++ if (printk_ratelimit()) ++ printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n", ++ __func__, smp_processor_id(), irq); ++ } ++ ++ irq_exit(); ++ ++ set_irq_regs(old_regs); ++ return 1; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++void fixup_irqs(cpumask_t map) ++{ ++ unsigned int irq; ++ static int warned; ++ ++ for (irq = 0; irq < NR_IRQS; irq++) { ++ cpumask_t mask; ++ int break_affinity = 0; ++ int set_affinity = 1; ++ ++ if (irq == 2) ++ continue; ++ ++ /* interrupt's are disabled at this point */ ++ spin_lock(&irq_desc[irq].lock); ++ ++ if (!irq_has_action(irq) || ++ cpus_equal(irq_desc[irq].affinity, map)) { ++ spin_unlock(&irq_desc[irq].lock); ++ continue; ++ } ++ ++ cpus_and(mask, irq_desc[irq].affinity, map); ++ if (cpus_empty(mask)) { ++ break_affinity = 1; ++ mask = map; ++ } ++ ++ if (irq_desc[irq].chip->mask) ++ irq_desc[irq].chip->mask(irq); ++ ++ if (irq_desc[irq].chip->set_affinity) ++ irq_desc[irq].chip->set_affinity(irq, mask); ++ else if (!(warned++)) ++ set_affinity = 0; ++ ++ if (irq_desc[irq].chip->unmask) ++ irq_desc[irq].chip->unmask(irq); ++ ++ spin_unlock(&irq_desc[irq].lock); ++ ++ if (break_affinity && set_affinity) ++ /*printk("Broke affinity for irq %i\n", irq)*/; ++ else if (!set_affinity) ++ printk("Cannot set affinity for irq %i\n", irq); ++ } ++ ++ /* That doesn't seem sufficient. Give it 1ms. 
*/ ++ local_irq_enable(); ++ mdelay(1); ++ local_irq_disable(); ++} ++#endif ++ ++extern void call_softirq(void); ++ ++asmlinkage void do_softirq(void) ++{ ++ __u32 pending; ++ unsigned long flags; ++ ++ if (in_interrupt()) ++ return; ++ ++ local_irq_save(flags); ++ pending = local_softirq_pending(); ++ /* Switch to interrupt stack */ ++ if (pending) { ++ call_softirq(); ++ WARN_ON_ONCE(softirq_count()); ++ } ++ local_irq_restore(flags); ++} +diff --git a/arch/x86/kernel/ldt-xen.c b/arch/x86/kernel/ldt-xen.c +new file mode 100644 +index 0000000..33b6e3a +--- /dev/null ++++ b/arch/x86/kernel/ldt-xen.c +@@ -0,0 +1,272 @@ ++/* ++ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds ++ * Copyright (C) 1999 Ingo Molnar ++ * Copyright (C) 2002 Andi Kleen ++ * ++ * This handles calls from both 32bit and 64bit mode. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_SMP ++static void flush_ldt(void *null) ++{ ++ if (current->active_mm) ++ load_LDT(¤t->active_mm->context); ++} ++#endif ++ ++static int alloc_ldt(mm_context_t *pc, int mincount, int reload) ++{ ++ void *oldldt, *newldt; ++ int oldsize; ++ ++ if (mincount <= pc->size) ++ return 0; ++ oldsize = pc->size; ++ mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & ++ (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); ++ if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) ++ newldt = vmalloc(mincount * LDT_ENTRY_SIZE); ++ else ++ newldt = (void *)__get_free_page(GFP_KERNEL); ++ ++ if (!newldt) ++ return -ENOMEM; ++ ++ if (oldsize) ++ memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); ++ oldldt = pc->ldt; ++ memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, ++ (mincount - oldsize) * LDT_ENTRY_SIZE); ++ ++#ifdef CONFIG_X86_64 ++ /* CHECKME: Do we really need this ? 
*/ ++ wmb(); ++#endif ++ pc->ldt = newldt; ++ wmb(); ++ pc->size = mincount; ++ wmb(); ++ ++ if (reload) { ++#ifdef CONFIG_SMP ++ cpumask_t mask; ++ ++ preempt_disable(); ++#endif ++ make_pages_readonly(newldt, ++ (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ load_LDT(pc); ++#ifdef CONFIG_SMP ++ mask = cpumask_of_cpu(smp_processor_id()); ++ if (!cpus_equal(current->mm->cpu_vm_mask, mask)) ++ smp_call_function(flush_ldt, NULL, 1, 1); ++ preempt_enable(); ++#endif ++ } ++ if (oldsize) { ++ make_pages_writable(oldldt, ++ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(oldldt); ++ else ++ put_page(virt_to_page(oldldt)); ++ } ++ return 0; ++} ++ ++static inline int copy_ldt(mm_context_t *new, mm_context_t *old) ++{ ++ int err = alloc_ldt(new, old->size, 0); ++ ++ if (err < 0) ++ return err; ++ memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); ++ make_pages_readonly(new->ldt, ++ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ return 0; ++} ++ ++/* ++ * we do not have to muck with descriptors here, that is ++ * done in switch_mm() as needed. ++ */ ++int init_new_context(struct task_struct *tsk, struct mm_struct *mm) ++{ ++ struct mm_struct *old_mm; ++ int retval = 0; ++ ++ memset(&mm->context, 0, sizeof(mm->context)); ++ mutex_init(&mm->context.lock); ++ old_mm = current->mm; ++ if (old_mm) ++ mm->context.vdso = old_mm->context.vdso; ++ if (old_mm && old_mm->context.size > 0) { ++ mutex_lock(&old_mm->context.lock); ++ retval = copy_ldt(&mm->context, &old_mm->context); ++ mutex_unlock(&old_mm->context.lock); ++ } ++ return retval; ++} ++ ++/* ++ * No need to lock the MM as we are the last user ++ * ++ * 64bit: Don't touch the LDT register - we're already in the next thread. ++ */ ++void destroy_context(struct mm_struct *mm) ++{ ++ if (mm->context.size) { ++ /* CHECKME: Can this ever happen ? 
*/ ++ if (mm == current->active_mm) ++ clear_LDT(); ++ make_pages_writable(mm->context.ldt, ++ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, ++ XENFEAT_writable_descriptor_tables); ++ if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(mm->context.ldt); ++ else ++ put_page(virt_to_page(mm->context.ldt)); ++ mm->context.size = 0; ++ } ++} ++ ++static int read_ldt(void __user *ptr, unsigned long bytecount) ++{ ++ int err; ++ unsigned long size; ++ struct mm_struct *mm = current->mm; ++ ++ if (!mm->context.size) ++ return 0; ++ if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) ++ bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; ++ ++ mutex_lock(&mm->context.lock); ++ size = mm->context.size * LDT_ENTRY_SIZE; ++ if (size > bytecount) ++ size = bytecount; ++ ++ err = 0; ++ if (copy_to_user(ptr, mm->context.ldt, size)) ++ err = -EFAULT; ++ mutex_unlock(&mm->context.lock); ++ if (err < 0) ++ goto error_return; ++ if (size != bytecount) { ++ /* zero-fill the rest */ ++ if (clear_user(ptr + size, bytecount - size) != 0) { ++ err = -EFAULT; ++ goto error_return; ++ } ++ } ++ return bytecount; ++error_return: ++ return err; ++} ++ ++static int read_default_ldt(void __user *ptr, unsigned long bytecount) ++{ ++ /* CHECKME: Can we use _one_ random number ? 
*/ ++#ifdef CONFIG_X86_32 ++ unsigned long size = 5 * sizeof(struct desc_struct); ++#else ++ unsigned long size = 128; ++#endif ++ if (bytecount > size) ++ bytecount = size; ++ if (clear_user(ptr, bytecount)) ++ return -EFAULT; ++ return bytecount; ++} ++ ++static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ++{ ++ struct mm_struct *mm = current->mm; ++ struct desc_struct ldt; ++ int error; ++ struct user_desc ldt_info; ++ ++ error = -EINVAL; ++ if (bytecount != sizeof(ldt_info)) ++ goto out; ++ error = -EFAULT; ++ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) ++ goto out; ++ ++ error = -EINVAL; ++ if (ldt_info.entry_number >= LDT_ENTRIES) ++ goto out; ++ if (ldt_info.contents == 3) { ++ if (oldmode) ++ goto out; ++ if (ldt_info.seg_not_present == 0) ++ goto out; ++ } ++ ++ mutex_lock(&mm->context.lock); ++ if (ldt_info.entry_number >= mm->context.size) { ++ error = alloc_ldt(¤t->mm->context, ++ ldt_info.entry_number + 1, 1); ++ if (error < 0) ++ goto out_unlock; ++ } ++ ++ /* Allow LDTs to be cleared by the user. */ ++ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { ++ if (oldmode || LDT_empty(&ldt_info)) { ++ memset(&ldt, 0, sizeof(ldt)); ++ goto install; ++ } ++ } ++ ++ fill_ldt(&ldt, &ldt_info); ++ if (oldmode) ++ ldt.avl = 0; ++ ++ /* Install the new entry ... 
*/ ++install: ++ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); ++ ++out_unlock: ++ mutex_unlock(&mm->context.lock); ++out: ++ return error; ++} ++ ++asmlinkage int sys_modify_ldt(int func, void __user *ptr, ++ unsigned long bytecount) ++{ ++ int ret = -ENOSYS; ++ ++ switch (func) { ++ case 0: ++ ret = read_ldt(ptr, bytecount); ++ break; ++ case 1: ++ ret = write_ldt(ptr, bytecount, 1); ++ break; ++ case 2: ++ ret = read_default_ldt(ptr, bytecount); ++ break; ++ case 0x11: ++ ret = write_ldt(ptr, bytecount, 0); ++ break; ++ } ++ return ret; ++} +diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c +index d0b234c..c748fc8 100644 +--- a/arch/x86/kernel/machine_kexec_32.c ++++ b/arch/x86/kernel/machine_kexec_32.c +@@ -21,6 +21,10 @@ + #include + #include + ++#ifdef CONFIG_XEN ++#include ++#endif ++ + #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + static u32 kexec_pgd[1024] PAGE_ALIGNED; + #ifdef CONFIG_X86_PAE +@@ -30,48 +34,55 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED; + static u32 kexec_pte0[1024] PAGE_ALIGNED; + static u32 kexec_pte1[1024] PAGE_ALIGNED; + +-static void set_idt(void *newidt, __u16 limit) +-{ +- struct desc_ptr curidt; ++#ifdef CONFIG_XEN + +- /* ia32 supports unaliged loads & stores */ +- curidt.size = limit; +- curidt.address = (unsigned long)newidt; ++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +- load_idt(&curidt); +-}; ++#if PAGES_NR > KEXEC_XEN_NO_PAGES ++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break ++#endif + ++#if PA_CONTROL_PAGE != 0 ++#error PA_CONTROL_PAGE is non zero - Xen support will break ++#endif + +-static void set_gdt(void *newgdt, __u16 limit) ++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) + { +- struct desc_ptr curgdt; ++ void *control_page; + +- /* ia32 supports unaligned loads & stores */ +- curgdt.size = limit; +- curgdt.address = (unsigned long)newgdt; ++ 
memset(xki->page_list, 0, sizeof(xki->page_list)); + +- load_gdt(&curgdt); +-}; ++ control_page = page_address(image->control_code_page); ++ memcpy(control_page, relocate_kernel, PAGE_SIZE); + +-static void load_segments(void) ++ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); ++ xki->page_list[PA_PGD] = __ma(kexec_pgd); ++#ifdef CONFIG_X86_PAE ++ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); ++ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); ++#endif ++ xki->page_list[PA_PTE_0] = __ma(kexec_pte0); ++ xki->page_list[PA_PTE_1] = __ma(kexec_pte1); ++ ++} ++ ++int __init machine_kexec_setup_resources(struct resource *hypervisor, ++ struct resource *phys_cpus, ++ int nr_phys_cpus) + { +-#define __STR(X) #X +-#define STR(X) __STR(X) +- +- __asm__ __volatile__ ( +- "\tljmp $"STR(__KERNEL_CS)",$1f\n" +- "\t1:\n" +- "\tmovl $"STR(__KERNEL_DS)",%%eax\n" +- "\tmovl %%eax,%%ds\n" +- "\tmovl %%eax,%%es\n" +- "\tmovl %%eax,%%fs\n" +- "\tmovl %%eax,%%gs\n" +- "\tmovl %%eax,%%ss\n" +- ::: "eax", "memory"); +-#undef STR +-#undef __STR ++ int k; ++ ++ /* The per-cpu crash note resources belong to the hypervisor resource */ ++ for (k = 0; k < nr_phys_cpus; k++) ++ request_resource(hypervisor, phys_cpus + k); ++ ++ return 0; + } + ++void machine_kexec_register_resources(struct resource *res) { ; } ++ ++#endif /* CONFIG_XEN */ ++ + /* + * A architecture hook called to validate the + * proposed image and prepare the control pages +@@ -98,6 +109,7 @@ void machine_kexec_cleanup(struct kimage *image) + { + } + ++#ifndef CONFIG_XEN + /* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. +@@ -128,26 +140,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + +- /* The segment registers are funny things, they have both a +- * visible and an invisible part. 
Whenever the visible part is +- * set to a specific selector, the invisible part is loaded +- * with from a table in memory. At no other time is the +- * descriptor table in memory accessed. +- * +- * I take advantage of this here by force loading the +- * segments, before I zap the gdt with an invalid value. +- */ +- load_segments(); +- /* The gdt & idt are now invalid. +- * If you want to load them you must set up your own idt & gdt. +- */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); +- +- /* now call it */ + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start, cpu_has_pae); + } ++#endif + + void arch_crash_save_vmcoreinfo(void) + { +diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c +index 576a03d..99ebe85 100644 +--- a/arch/x86/kernel/machine_kexec_64.c ++++ b/arch/x86/kernel/machine_kexec_64.c +@@ -25,6 +25,117 @@ static u64 kexec_pud1[512] PAGE_ALIGNED; + static u64 kexec_pmd1[512] PAGE_ALIGNED; + static u64 kexec_pte1[512] PAGE_ALIGNED; + ++#ifdef CONFIG_XEN ++ ++/* In the case of Xen, override hypervisor functions to be able to create ++ * a regular identity mapping page table... 
++ */ ++ ++#include ++#include ++ ++#define x__pmd(x) ((pmd_t) { (x) } ) ++#define x__pud(x) ((pud_t) { (x) } ) ++#define x__pgd(x) ((pgd_t) { (x) } ) ++ ++#define x_pmd_val(x) ((x).pmd) ++#define x_pud_val(x) ((x).pud) ++#define x_pgd_val(x) ((x).pgd) ++ ++static inline void x_set_pmd(pmd_t *dst, pmd_t val) ++{ ++ x_pmd_val(*dst) = x_pmd_val(val); ++} ++ ++static inline void x_set_pud(pud_t *dst, pud_t val) ++{ ++ x_pud_val(*dst) = phys_to_machine(x_pud_val(val)); ++} ++ ++static inline void x_pud_clear (pud_t *pud) ++{ ++ x_pud_val(*pud) = 0; ++} ++ ++static inline void x_set_pgd(pgd_t *dst, pgd_t val) ++{ ++ x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); ++} ++ ++static inline void x_pgd_clear (pgd_t * pgd) ++{ ++ x_pgd_val(*pgd) = 0; ++} ++ ++#define X__PAGE_KERNEL_LARGE_EXEC \ ++ _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE ++#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY ++ ++#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) ++ ++#if PAGES_NR > KEXEC_XEN_NO_PAGES ++#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break ++#endif ++ ++#if PA_CONTROL_PAGE != 0 ++#error PA_CONTROL_PAGE is non zero - Xen support will break ++#endif ++ ++void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) ++{ ++ void *control_page; ++ void *table_page; ++ ++ memset(xki->page_list, 0, sizeof(xki->page_list)); ++ ++ control_page = page_address(image->control_code_page) + PAGE_SIZE; ++ memcpy(control_page, relocate_kernel, PAGE_SIZE); ++ ++ table_page = page_address(image->control_code_page); ++ ++ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); ++ xki->page_list[PA_TABLE_PAGE] = __ma(table_page); ++ ++ xki->page_list[PA_PGD] = __ma(kexec_pgd); ++ xki->page_list[PA_PUD_0] = __ma(kexec_pud0); ++ xki->page_list[PA_PUD_1] = __ma(kexec_pud1); ++ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); ++ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); ++ 
xki->page_list[PA_PTE_0] = __ma(kexec_pte0); ++ xki->page_list[PA_PTE_1] = __ma(kexec_pte1); ++} ++ ++int __init machine_kexec_setup_resources(struct resource *hypervisor, ++ struct resource *phys_cpus, ++ int nr_phys_cpus) ++{ ++ int k; ++ ++ /* The per-cpu crash note resources belong to the hypervisor resource */ ++ for (k = 0; k < nr_phys_cpus; k++) ++ request_resource(hypervisor, phys_cpus + k); ++ ++ return 0; ++} ++ ++#else /* CONFIG_XEN */ ++ ++#define x__pmd(x) __pmd(x) ++#define x__pud(x) __pud(x) ++#define x__pgd(x) __pgd(x) ++ ++#define x_set_pmd(x, y) set_pmd(x, y) ++#define x_set_pud(x, y) set_pud(x, y) ++#define x_set_pgd(x, y) set_pgd(x, y) ++ ++#define x_pud_clear(x) pud_clear(x) ++#define x_pgd_clear(x) pgd_clear(x) ++ ++#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC ++#define X_KERNPG_TABLE _KERNPG_TABLE ++ ++#endif /* CONFIG_XEN */ ++ + static void init_level2_page(pmd_t *level2p, unsigned long addr) + { + unsigned long end_addr; +@@ -32,7 +143,7 @@ static void init_level2_page(pmd_t *level2p, unsigned long addr) + addr &= PAGE_MASK; + end_addr = addr + PUD_SIZE; + while (addr < end_addr) { +- set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); ++ x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); + addr += PMD_SIZE; + } + } +@@ -57,12 +168,12 @@ static int init_level3_page(struct kimage *image, pud_t *level3p, + } + level2p = (pmd_t *)page_address(page); + init_level2_page(level2p, addr); +- set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); ++ x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE)); + addr += PUD_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { +- pud_clear(level3p++); ++ x_pud_clear(level3p++); + addr += PUD_SIZE; + } + out: +@@ -93,12 +204,12 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, + if (result) { + goto out; + } +- set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); ++ x_set_pgd(level4p++, x__pgd(__pa(level3p) | 
X_KERNPG_TABLE)); + addr += PGDIR_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { +- pgd_clear(level4p++); ++ x_pgd_clear(level4p++); + addr += PGDIR_SIZE; + } + out: +@@ -109,49 +220,14 @@ out: + static int init_pgtable(struct kimage *image, unsigned long start_pgtable) + { + pgd_t *level4p; +- level4p = (pgd_t *)__va(start_pgtable); +- return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +-} +- +-static void set_idt(void *newidt, u16 limit) +-{ +- struct desc_ptr curidt; +- +- /* x86-64 supports unaliged loads & stores */ +- curidt.size = limit; +- curidt.address = (unsigned long)newidt; +- +- __asm__ __volatile__ ( +- "lidtq %0\n" +- : : "m" (curidt) +- ); +-}; ++ unsigned long x_end_pfn = end_pfn; + ++#ifdef CONFIG_XEN ++ x_end_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); ++#endif + +-static void set_gdt(void *newgdt, u16 limit) +-{ +- struct desc_ptr curgdt; +- +- /* x86-64 supports unaligned loads & stores */ +- curgdt.size = limit; +- curgdt.address = (unsigned long)newgdt; +- +- __asm__ __volatile__ ( +- "lgdtq %0\n" +- : : "m" (curgdt) +- ); +-}; +- +-static void load_segments(void) +-{ +- __asm__ __volatile__ ( +- "\tmovl %0,%%ds\n" +- "\tmovl %0,%%es\n" +- "\tmovl %0,%%ss\n" +- "\tmovl %0,%%fs\n" +- "\tmovl %0,%%gs\n" +- : : "a" (__KERNEL_DS) : "memory" +- ); ++ level4p = (pgd_t *)__va(start_pgtable); ++ return init_level4_page(image, level4p, 0, x_end_pfn << PAGE_SHIFT); + } + + int machine_kexec_prepare(struct kimage *image) +@@ -175,6 +251,7 @@ void machine_kexec_cleanup(struct kimage *image) + return; + } + ++#ifndef CONFIG_XEN + /* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. 
+@@ -210,26 +287,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + +- /* The segment registers are funny things, they have both a +- * visible and an invisible part. Whenever the visible part is +- * set to a specific selector, the invisible part is loaded +- * with from a table in memory. At no other time is the +- * descriptor table in memory accessed. +- * +- * I take advantage of this here by force loading the +- * segments, before I zap the gdt with an invalid value. +- */ +- load_segments(); +- /* The gdt & idt are now invalid. +- * If you want to load them you must set up your own idt & gdt. +- */ +- set_gdt(phys_to_virt(0),0); +- set_idt(phys_to_virt(0),0); +- +- /* now call it */ + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); + } ++#endif + + void arch_crash_save_vmcoreinfo(void) + { +diff --git a/arch/x86/kernel/microcode-xen.c b/arch/x86/kernel/microcode-xen.c +new file mode 100644 +index 0000000..42c6517 +--- /dev/null ++++ b/arch/x86/kernel/microcode-xen.c +@@ -0,0 +1,210 @@ ++/* ++ * Intel CPU Microcode Update Driver for Linux ++ * ++ * Copyright (C) 2000-2006 Tigran Aivazian ++ * 2006 Shaohua Li ++ * ++ * This driver allows to upgrade microcode on Intel processors ++ * belonging to IA-32 family - PentiumPro, Pentium II, ++ * Pentium III, Xeon, Pentium 4, etc. ++ * ++ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, ++ * Order Number 245472 or free download from: ++ * ++ * http://developer.intel.com/design/pentium4/manuals/245472.htm ++ * ++ * For more information, go to http://www.urbanmyth.org/microcode ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ */ ++ ++//#define DEBUG /* pr_debug */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); ++MODULE_AUTHOR("Tigran Aivazian "); ++MODULE_LICENSE("GPL"); ++ ++static int verbose; ++module_param(verbose, int, 0644); ++ ++#define MICROCODE_VERSION "1.14a-xen" ++ ++#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ ++#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ ++#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ ++ ++/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ ++static DEFINE_MUTEX(microcode_mutex); ++ ++#ifdef CONFIG_MICROCODE_OLD_INTERFACE ++static int do_microcode_update (const void __user *ubuf, size_t len) ++{ ++ int err; ++ void *kbuf; ++ ++ kbuf = vmalloc(len); ++ if (!kbuf) ++ return -ENOMEM; ++ ++ if (copy_from_user(kbuf, ubuf, len) == 0) { ++ struct xen_platform_op op; ++ ++ op.cmd = XENPF_microcode_update; ++ set_xen_guest_handle(op.u.microcode.data, kbuf); ++ op.u.microcode.length = len; ++ err = HYPERVISOR_platform_op(&op); ++ } else ++ err = -EFAULT; ++ ++ vfree(kbuf); ++ ++ return err; ++} ++ ++static int microcode_open (struct inode *unused1, struct file *unused2) ++{ ++ return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; ++} ++ ++static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) ++{ ++ ssize_t ret; ++ ++ if (len < MC_HEADER_SIZE) { ++ printk(KERN_ERR "microcode: not enough data\n"); ++ return -EINVAL; ++ } ++ ++ mutex_lock(µcode_mutex); ++ ++ ret = do_microcode_update(buf, len); ++ if (!ret) ++ ret = (ssize_t)len; ++ ++ mutex_unlock(µcode_mutex); ++ ++ return ret; ++} ++ ++static const struct file_operations microcode_fops = { ++ .owner = THIS_MODULE, ++ .write = microcode_write, ++ .open = microcode_open, ++}; ++ ++static struct miscdevice microcode_dev = { ++ .minor = MICROCODE_MINOR, ++ .name = "microcode", ++ .fops = µcode_fops, ++}; ++ ++static int __init microcode_dev_init (void) ++{ ++ int error; ++ ++ error = misc_register(µcode_dev); ++ if (error) { ++ printk(KERN_ERR ++ "microcode: can't misc_register on minor=%d\n", ++ MICROCODE_MINOR); ++ return error; ++ } ++ ++ return 0; ++} ++ ++static void microcode_dev_exit (void) ++{ ++ misc_deregister(µcode_dev); ++} ++ ++MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); ++#else ++#define microcode_dev_init() 0 ++#define microcode_dev_exit() do { } while(0) ++#endif ++ ++/* fake device for request_firmware */ ++static struct platform_device *microcode_pdev; ++ ++static int request_microcode(void) ++{ ++ char name[30]; ++ const struct cpuinfo_x86 *c = &boot_cpu_data; ++ const struct firmware *firmware; ++ int error; ++ struct xen_platform_op op; ++ ++ sprintf(name,"intel-ucode/%02x-%02x-%02x", ++ c->x86, c->x86_model, c->x86_mask); ++ error = request_firmware(&firmware, name, µcode_pdev->dev); ++ if (error) { ++ pr_debug("microcode: ucode data file %s load failed\n", name); ++ return error; ++ } ++ ++ op.cmd = XENPF_microcode_update; ++ set_xen_guest_handle(op.u.microcode.data, firmware->data); ++ op.u.microcode.length = firmware->size; ++ error = HYPERVISOR_platform_op(&op); ++ ++ release_firmware(firmware); ++ ++ if (error) ++ pr_debug("ucode load failed\n"); ++ ++ return 
error; ++} ++ ++static int __init microcode_init (void) ++{ ++ int error; ++ ++ error = microcode_dev_init(); ++ if (error) ++ return error; ++ microcode_pdev = platform_device_register_simple("microcode", -1, ++ NULL, 0); ++ if (IS_ERR(microcode_pdev)) { ++ microcode_dev_exit(); ++ return PTR_ERR(microcode_pdev); ++ } ++ ++ request_microcode(); ++ ++ printk(KERN_INFO ++ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); ++ return 0; ++} ++ ++static void __exit microcode_exit (void) ++{ ++ microcode_dev_exit(); ++ platform_device_unregister(microcode_pdev); ++} ++ ++module_init(microcode_init) ++module_exit(microcode_exit) +diff --git a/arch/x86/kernel/mpparse-xen.c b/arch/x86/kernel/mpparse-xen.c +new file mode 100644 +index 0000000..abc8f05 +--- /dev/null ++++ b/arch/x86/kernel/mpparse-xen.c +@@ -0,0 +1,1111 @@ ++/* ++ * Intel Multiprocessor Specification 1.1 and 1.4 ++ * compliant MP-table parsing routines. ++ * ++ * (c) 1995 Alan Cox, Building #3 ++ * (c) 1998, 1999, 2000 Ingo Molnar ++ * (c) 2008 Alexey Starikovskiy ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#ifdef CONFIG_X86_32 ++#include ++#endif ++ ++/* Have we found an MP table */ ++int smp_found_config; ++ ++/* ++ * Various Linux-internal data structures created from the ++ * MP-table. ++ */ ++#if defined (CONFIG_MCA) || defined (CONFIG_EISA) ++int mp_bus_id_to_type[MAX_MP_BUSSES]; ++#endif ++ ++DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); ++int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 }; ++ ++static int mp_current_pci_id; ++ ++int pic_mode; ++ ++/* ++ * Intel MP BIOS table parsing routines: ++ */ ++ ++/* ++ * Checksum an MP configuration block. 
++ */ ++ ++static int __init mpf_checksum(unsigned char *mp, int len) ++{ ++ int sum = 0; ++ ++ while (len--) ++ sum += *mp++; ++ ++ return sum & 0xFF; ++} ++ ++#ifdef CONFIG_X86_NUMAQ ++/* ++ * Have to match translation table entries to main table entries by counter ++ * hence the mpc_record variable .... can't see a less disgusting way of ++ * doing this .... ++ */ ++ ++static int mpc_record; ++static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] ++ __cpuinitdata; ++#endif ++ ++#ifndef CONFIG_XEN ++static void __cpuinit MP_processor_info(struct mpc_config_processor *m) ++{ ++ int apicid; ++ char *bootup_cpu = ""; ++ ++ if (!(m->mpc_cpuflag & CPU_ENABLED)) { ++ disabled_cpus++; ++ return; ++ } ++#ifdef CONFIG_X86_NUMAQ ++ apicid = mpc_apic_id(m, translation_table[mpc_record]); ++#else ++ apicid = m->mpc_apicid; ++#endif ++ if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { ++ bootup_cpu = " (Bootup-CPU)"; ++ boot_cpu_physical_apicid = m->mpc_apicid; ++ } ++ ++ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); ++ generic_processor_info(apicid, m->mpc_apicver); ++} ++#else ++static void __cpuinit MP_processor_info (struct mpc_config_processor *m) ++{ ++ num_processors++; ++} ++#endif /* CONFIG_XEN */ ++ ++static void __init MP_bus_info(struct mpc_config_bus *m) ++{ ++ char str[7]; ++ ++ memcpy(str, m->mpc_bustype, 6); ++ str[6] = 0; ++ ++#ifdef CONFIG_X86_NUMAQ ++ mpc_oem_bus_info(m, str, translation_table[mpc_record]); ++#else ++ Dprintk("Bus #%d is %s\n", m->mpc_busid, str); ++#endif ++ ++#if MAX_MP_BUSSES < 256 ++ if (m->mpc_busid >= MAX_MP_BUSSES) { ++ printk(KERN_WARNING "MP table busid value (%d) for bustype %s " ++ " is too large, max. 
supported is %d\n", ++ m->mpc_busid, str, MAX_MP_BUSSES - 1); ++ return; ++ } ++#endif ++ ++ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { ++ set_bit(m->mpc_busid, mp_bus_not_pci); ++#if defined(CONFIG_EISA) || defined (CONFIG_MCA) ++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; ++#endif ++ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { ++#ifdef CONFIG_X86_NUMAQ ++ mpc_oem_pci_bus(m, translation_table[mpc_record]); ++#endif ++ clear_bit(m->mpc_busid, mp_bus_not_pci); ++ mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; ++ mp_current_pci_id++; ++#if defined(CONFIG_EISA) || defined (CONFIG_MCA) ++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; ++ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { ++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; ++ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { ++ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; ++#endif ++ } else ++ printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); ++} ++ ++#ifdef CONFIG_X86_IO_APIC ++ ++static int bad_ioapic(unsigned long address) ++{ ++ if (nr_ioapics >= MAX_IO_APICS) { ++ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " ++ "(found %d)\n", MAX_IO_APICS, nr_ioapics); ++ panic("Recompile kernel with bigger MAX_IO_APICS!\n"); ++ } ++ if (!address) { ++ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" ++ " found in table, skipping!\n"); ++ return 1; ++ } ++ return 0; ++} ++ ++static void __init MP_ioapic_info(struct mpc_config_ioapic *m) ++{ ++ if (!(m->mpc_flags & MPC_APIC_USABLE)) ++ return; ++ ++ printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", ++ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); ++ ++ if (bad_ioapic(m->mpc_apicaddr)) ++ return; ++ ++ mp_ioapics[nr_ioapics] = *m; ++ nr_ioapics++; ++} ++ ++static void __init MP_intsrc_info(struct mpc_config_intsrc *m) ++{ ++ mp_irqs[mp_irq_entries] = *m; ++ Dprintk("Int: type %d, pol %d, trig %d, bus %d," ++ " IRQ %02x, APIC ID 
%x, APIC INT %02x\n", ++ m->mpc_irqtype, m->mpc_irqflag & 3, ++ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, ++ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); ++ if (++mp_irq_entries == MAX_IRQ_SOURCES) ++ panic("Max # of irq sources exceeded!!\n"); ++} ++ ++#endif ++ ++static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) ++{ ++ Dprintk("Lint: type %d, pol %d, trig %d, bus %d," ++ " IRQ %02x, APIC ID %x, APIC LINT %02x\n", ++ m->mpc_irqtype, m->mpc_irqflag & 3, ++ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, ++ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); ++} ++ ++#ifdef CONFIG_X86_NUMAQ ++static void __init MP_translation_info(struct mpc_config_translation *m) ++{ ++ printk(KERN_INFO ++ "Translation: record %d, type %d, quad %d, global %d, local %d\n", ++ mpc_record, m->trans_type, m->trans_quad, m->trans_global, ++ m->trans_local); ++ ++ if (mpc_record >= MAX_MPC_ENTRY) ++ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); ++ else ++ translation_table[mpc_record] = m; /* stash this for later */ ++ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) ++ node_set_online(m->trans_quad); ++} ++ ++/* ++ * Read/parse the MPC oem tables ++ */ ++ ++static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, ++ unsigned short oemsize) ++{ ++ int count = sizeof(*oemtable); /* the header size */ ++ unsigned char *oemptr = ((unsigned char *)oemtable) + count; ++ ++ mpc_record = 0; ++ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... 
\n", ++ oemtable); ++ if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { ++ printk(KERN_WARNING ++ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", ++ oemtable->oem_signature[0], oemtable->oem_signature[1], ++ oemtable->oem_signature[2], oemtable->oem_signature[3]); ++ return; ++ } ++ if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { ++ printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); ++ return; ++ } ++ while (count < oemtable->oem_length) { ++ switch (*oemptr) { ++ case MP_TRANSLATION: ++ { ++ struct mpc_config_translation *m = ++ (struct mpc_config_translation *)oemptr; ++ MP_translation_info(m); ++ oemptr += sizeof(*m); ++ count += sizeof(*m); ++ ++mpc_record; ++ break; ++ } ++ default: ++ { ++ printk(KERN_WARNING ++ "Unrecognised OEM table entry type! - %d\n", ++ (int)*oemptr); ++ return; ++ } ++ } ++ } ++} ++ ++static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, ++ char *productid) ++{ ++ if (strncmp(oem, "IBM NUMA", 8)) ++ printk("Warning! 
May not be a NUMA-Q system!\n"); ++ if (mpc->mpc_oemptr) ++ smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, ++ mpc->mpc_oemsize); ++} ++#endif /* CONFIG_X86_NUMAQ */ ++ ++/* ++ * Read/parse the MPC ++ */ ++ ++static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) ++{ ++ char str[16]; ++ char oem[10]; ++ int count = sizeof(*mpc); ++ unsigned char *mpt = ((unsigned char *)mpc) + count; ++ ++ if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) { ++ printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", ++ mpc->mpc_signature[0], mpc->mpc_signature[1], ++ mpc->mpc_signature[2], mpc->mpc_signature[3]); ++ return 0; ++ } ++ if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) { ++ printk(KERN_ERR "MPTABLE: checksum error!\n"); ++ return 0; ++ } ++ if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) { ++ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", ++ mpc->mpc_spec); ++ return 0; ++ } ++ if (!mpc->mpc_lapic) { ++ printk(KERN_ERR "MPTABLE: null local APIC address!\n"); ++ return 0; ++ } ++ memcpy(oem, mpc->mpc_oem, 8); ++ oem[8] = 0; ++ printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem); ++ ++ memcpy(str, mpc->mpc_productid, 12); ++ str[12] = 0; ++ printk("Product ID: %s ", str); ++ ++#ifdef CONFIG_X86_32 ++ mps_oem_check(mpc, oem, str); ++#endif ++ printk(KERN_INFO "MPTABLE: Product ID: %s ", str); ++ ++#ifndef CONFIG_XEN ++ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic); ++ ++ /* save the local APIC address, it might be non-default */ ++ if (!acpi_lapic) ++ mp_lapic_addr = mpc->mpc_lapic; ++#endif ++ ++ if (early) ++ return 1; ++ ++ /* ++ * Now process the configuration blocks. 
++ */ ++#ifdef CONFIG_X86_NUMAQ ++ mpc_record = 0; ++#endif ++ while (count < mpc->mpc_length) { ++ switch (*mpt) { ++ case MP_PROCESSOR: ++ { ++ struct mpc_config_processor *m = ++ (struct mpc_config_processor *)mpt; ++ /* ACPI may have already provided this data */ ++ if (!acpi_lapic) ++ MP_processor_info(m); ++ mpt += sizeof(*m); ++ count += sizeof(*m); ++ break; ++ } ++ case MP_BUS: ++ { ++ struct mpc_config_bus *m = ++ (struct mpc_config_bus *)mpt; ++ MP_bus_info(m); ++ mpt += sizeof(*m); ++ count += sizeof(*m); ++ break; ++ } ++ case MP_IOAPIC: ++ { ++#ifdef CONFIG_X86_IO_APIC ++ struct mpc_config_ioapic *m = ++ (struct mpc_config_ioapic *)mpt; ++ MP_ioapic_info(m); ++#endif ++ mpt += sizeof(struct mpc_config_ioapic); ++ count += sizeof(struct mpc_config_ioapic); ++ break; ++ } ++ case MP_INTSRC: ++ { ++#ifdef CONFIG_X86_IO_APIC ++ struct mpc_config_intsrc *m = ++ (struct mpc_config_intsrc *)mpt; ++ ++ MP_intsrc_info(m); ++#endif ++ mpt += sizeof(struct mpc_config_intsrc); ++ count += sizeof(struct mpc_config_intsrc); ++ break; ++ } ++ case MP_LINTSRC: ++ { ++ struct mpc_config_lintsrc *m = ++ (struct mpc_config_lintsrc *)mpt; ++ MP_lintsrc_info(m); ++ mpt += sizeof(*m); ++ count += sizeof(*m); ++ break; ++ } ++ default: ++ /* wrong mptable */ ++ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"); ++ printk(KERN_ERR "type %x\n", *mpt); ++ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, ++ 1, mpc, mpc->mpc_length, 1); ++ count = mpc->mpc_length; ++ break; ++ } ++#ifdef CONFIG_X86_NUMAQ ++ ++mpc_record; ++#endif ++ } ++ setup_apic_routing(); ++ if (!num_processors) ++ printk(KERN_ERR "MPTABLE: no processors registered!\n"); ++ return num_processors; ++} ++ ++#ifdef CONFIG_X86_IO_APIC ++ ++static int __init ELCR_trigger(unsigned int irq) ++{ ++ unsigned int port; ++ ++ port = 0x4d0 + (irq >> 3); ++ return (inb(port) >> (irq & 7)) & 1; ++} ++ ++static void __init construct_default_ioirq_mptable(int mpc_default_type) ++{ ++ struct 
mpc_config_intsrc intsrc; ++ int i; ++ int ELCR_fallback = 0; ++ ++ intsrc.mpc_type = MP_INTSRC; ++ intsrc.mpc_irqflag = 0; /* conforming */ ++ intsrc.mpc_srcbus = 0; ++ intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; ++ ++ intsrc.mpc_irqtype = mp_INT; ++ ++ /* ++ * If true, we have an ISA/PCI system with no IRQ entries ++ * in the MP table. To prevent the PCI interrupts from being set up ++ * incorrectly, we try to use the ELCR. The sanity check to see if ++ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can ++ * never be level sensitive, so we simply see if the ELCR agrees. ++ * If it does, we assume it's valid. ++ */ ++ if (mpc_default_type == 5) { ++ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " ++ "falling back to ELCR\n"); ++ ++ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ++ ELCR_trigger(13)) ++ printk(KERN_ERR "ELCR contains invalid data... " ++ "not using ELCR\n"); ++ else { ++ printk(KERN_INFO ++ "Using ELCR to identify PCI interrupts\n"); ++ ELCR_fallback = 1; ++ } ++ } ++ ++ for (i = 0; i < 16; i++) { ++ switch (mpc_default_type) { ++ case 2: ++ if (i == 0 || i == 13) ++ continue; /* IRQ0 & IRQ13 not connected */ ++ /* fall through */ ++ default: ++ if (i == 2) ++ continue; /* IRQ2 is never connected */ ++ } ++ ++ if (ELCR_fallback) { ++ /* ++ * If the ELCR indicates a level-sensitive interrupt, we ++ * copy that information over to the MP table in the ++ * irqflag field (level sensitive, active high polarity). ++ */ ++ if (ELCR_trigger(i)) ++ intsrc.mpc_irqflag = 13; ++ else ++ intsrc.mpc_irqflag = 0; ++ } ++ ++ intsrc.mpc_srcbusirq = i; ++ intsrc.mpc_dstirq = i ? 
i : 2; /* IRQ0 to INTIN2 */ ++ MP_intsrc_info(&intsrc); ++ } ++ ++ intsrc.mpc_irqtype = mp_ExtINT; ++ intsrc.mpc_srcbusirq = 0; ++ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ ++ MP_intsrc_info(&intsrc); ++} ++ ++#endif ++ ++static inline void __init construct_default_ISA_mptable(int mpc_default_type) ++{ ++ struct mpc_config_processor processor; ++ struct mpc_config_bus bus; ++#ifdef CONFIG_X86_IO_APIC ++ struct mpc_config_ioapic ioapic; ++#endif ++ struct mpc_config_lintsrc lintsrc; ++ int linttypes[2] = { mp_ExtINT, mp_NMI }; ++ int i; ++ ++#ifndef CONFIG_XEN ++ /* ++ * local APIC has default address ++ */ ++ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; ++#endif ++ ++ /* ++ * 2 CPUs, numbered 0 & 1. ++ */ ++ processor.mpc_type = MP_PROCESSOR; ++ /* Either an integrated APIC or a discrete 82489DX. */ ++ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; ++ processor.mpc_cpuflag = CPU_ENABLED; ++ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | ++ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; ++ processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; ++ processor.mpc_reserved[0] = 0; ++ processor.mpc_reserved[1] = 0; ++ for (i = 0; i < 2; i++) { ++ processor.mpc_apicid = i; ++ MP_processor_info(&processor); ++ } ++ ++ bus.mpc_type = MP_BUS; ++ bus.mpc_busid = 0; ++ switch (mpc_default_type) { ++ default: ++ printk(KERN_ERR "???\nUnknown standard configuration %d\n", ++ mpc_default_type); ++ /* fall through */ ++ case 1: ++ case 5: ++ memcpy(bus.mpc_bustype, "ISA ", 6); ++ break; ++ case 2: ++ case 6: ++ case 3: ++ memcpy(bus.mpc_bustype, "EISA ", 6); ++ break; ++ case 4: ++ case 7: ++ memcpy(bus.mpc_bustype, "MCA ", 6); ++ } ++ MP_bus_info(&bus); ++ if (mpc_default_type > 4) { ++ bus.mpc_busid = 1; ++ memcpy(bus.mpc_bustype, "PCI ", 6); ++ MP_bus_info(&bus); ++ } ++ ++#ifdef CONFIG_X86_IO_APIC ++ ioapic.mpc_type = MP_IOAPIC; ++ ioapic.mpc_apicid = 2; ++ ioapic.mpc_apicver = mpc_default_type > 4 ? 
0x10 : 0x01; ++ ioapic.mpc_flags = MPC_APIC_USABLE; ++ ioapic.mpc_apicaddr = 0xFEC00000; ++ MP_ioapic_info(&ioapic); ++ ++ /* ++ * We set up most of the low 16 IO-APIC pins according to MPS rules. ++ */ ++ construct_default_ioirq_mptable(mpc_default_type); ++#endif ++ lintsrc.mpc_type = MP_LINTSRC; ++ lintsrc.mpc_irqflag = 0; /* conforming */ ++ lintsrc.mpc_srcbusid = 0; ++ lintsrc.mpc_srcbusirq = 0; ++ lintsrc.mpc_destapic = MP_APIC_ALL; ++ for (i = 0; i < 2; i++) { ++ lintsrc.mpc_irqtype = linttypes[i]; ++ lintsrc.mpc_destapiclint = i; ++ MP_lintsrc_info(&lintsrc); ++ } ++} ++ ++static struct intel_mp_floating *mpf_found; ++ ++/* ++ * Scan the memory blocks for an SMP configuration block. ++ */ ++static void __init __get_smp_config(unsigned early) ++{ ++ struct intel_mp_floating *mpf = mpf_found; ++ ++ if (acpi_lapic && early) ++ return; ++ /* ++ * ACPI supports both logical (e.g. Hyper-Threading) and physical ++ * processors, where MPS only supports physical. ++ */ ++ if (acpi_lapic && acpi_ioapic) { ++ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " ++ "information\n"); ++ return; ++ } else if (acpi_lapic) ++ printk(KERN_INFO "Using ACPI for processor (LAPIC) " ++ "configuration information\n"); ++ ++ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", ++ mpf->mpf_specification); ++#ifdef CONFIG_X86_32 ++ if (mpf->mpf_feature2 & (1 << 7)) { ++ printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); ++ pic_mode = 1; ++ } else { ++ printk(KERN_INFO " Virtual Wire compatibility mode.\n"); ++ pic_mode = 0; ++ } ++#endif ++ /* ++ * Now see if we need to read further. 
++ */ ++ if (mpf->mpf_feature1 != 0) { ++ if (early) { ++#ifndef CONFIG_XEN ++ /* ++ * local APIC has default address ++ */ ++ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; ++#endif ++ return; ++ } ++ ++ printk(KERN_INFO "Default MP configuration #%d\n", ++ mpf->mpf_feature1); ++ construct_default_ISA_mptable(mpf->mpf_feature1); ++ ++ } else if (mpf->mpf_physptr) { ++ ++ /* ++ * Read the physical hardware table. Anything here will ++ * override the defaults. ++ */ ++ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr), early)) { ++ smp_found_config = 0; ++ printk(KERN_ERR ++ "BIOS bug, MP table errors detected!...\n"); ++ printk(KERN_ERR "... disabling SMP support. " ++ "(tell your hw vendor)\n"); ++ return; ++ } ++ ++ if (early) ++ return; ++#ifdef CONFIG_X86_IO_APIC ++ /* ++ * If there are no explicit MP IRQ entries, then we are ++ * broken. We set up most of the low 16 IO-APIC pins to ++ * ISA defaults and hope it will work. ++ */ ++ if (!mp_irq_entries) { ++ struct mpc_config_bus bus; ++ ++ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " ++ "using default mptable. " ++ "(tell your hw vendor)\n"); ++ ++ bus.mpc_type = MP_BUS; ++ bus.mpc_busid = 0; ++ memcpy(bus.mpc_bustype, "ISA ", 6); ++ MP_bus_info(&bus); ++ ++ construct_default_ioirq_mptable(0); ++ } ++#endif ++ } else ++ BUG(); ++ ++ if (!early) ++ printk(KERN_INFO "Processors: %d\n", num_processors); ++ /* ++ * Only use the first configuration found. 
++ */ ++} ++ ++void __init early_get_smp_config(void) ++{ ++ __get_smp_config(1); ++} ++ ++void __init get_smp_config(void) ++{ ++ __get_smp_config(0); ++} ++ ++static int __init smp_scan_config(unsigned long base, unsigned long length, ++ unsigned reserve) ++{ ++ unsigned int *bp = isa_bus_to_virt(base); ++ struct intel_mp_floating *mpf; ++ ++ Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); ++ BUILD_BUG_ON(sizeof(*mpf) != 16); ++ ++ while (length > 0) { ++ mpf = (struct intel_mp_floating *)bp; ++ if ((*bp == SMP_MAGIC_IDENT) && ++ (mpf->mpf_length == 1) && ++ !mpf_checksum((unsigned char *)bp, 16) && ++ ((mpf->mpf_specification == 1) ++ || (mpf->mpf_specification == 4))) { ++ ++ smp_found_config = 1; ++ mpf_found = mpf; ++#ifdef CONFIG_X86_32 ++#ifndef CONFIG_XEN ++ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", ++ mpf, virt_to_phys(mpf)); ++ reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, ++ BOOTMEM_DEFAULT); ++ if (mpf->mpf_physptr) { ++ /* ++ * We cannot access to MPC table to compute ++ * table size yet, as only few megabytes from ++ * the bottom is mapped now. ++ * PC-9800's MPC table places on the very last ++ * of physical memory; so that simply reserving ++ * PAGE_SIZE from mpg->mpf_physptr yields BUG() ++ * in reserve_bootmem. 
++ */ ++ unsigned long size = PAGE_SIZE; ++ unsigned long end = max_low_pfn * PAGE_SIZE; ++ if (mpf->mpf_physptr + size > end) ++ size = end - mpf->mpf_physptr; ++ reserve_bootmem(mpf->mpf_physptr, size, ++ BOOTMEM_DEFAULT); ++ } ++#else ++ printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", ++ mpf, ((void *)bp - isa_bus_to_virt(base)) + base); ++#endif ++#elif !defined(CONFIG_XEN) ++ if (!reserve) ++ return 1; ++ ++ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); ++ if (mpf->mpf_physptr) ++ reserve_bootmem_generic(mpf->mpf_physptr, ++ PAGE_SIZE); ++#endif ++ return 1; ++ } ++ bp += 4; ++ length -= 16; ++ } ++ return 0; ++} ++ ++static void __init __find_smp_config(unsigned reserve) ++{ ++#ifndef CONFIG_XEN ++ unsigned int address; ++#endif ++ ++ /* ++ * FIXME: Linux assumes you have 640K of base ram.. ++ * this continues the error... ++ * ++ * 1) Scan the bottom 1K for a signature ++ * 2) Scan the top 1K of base RAM ++ * 3) Scan the 64K of bios ++ */ ++ if (smp_scan_config(0x0, 0x400, reserve) || ++ smp_scan_config(639 * 0x400, 0x400, reserve) || ++ smp_scan_config(0xF0000, 0x10000, reserve)) ++ return; ++ /* ++ * If it is an SMP machine we should know now, unless the ++ * configuration is in an EISA/MCA bus machine with an ++ * extended bios data area. ++ * ++ * there is a real-mode segmented pointer pointing to the ++ * 4K EBDA area at 0x40E, calculate and scan it here. ++ * ++ * NOTE! There are Linux loaders that will corrupt the EBDA ++ * area, and as such this kind of SMP config may be less ++ * trustworthy, simply because the SMP table may have been ++ * stomped on during early boot. These loaders are buggy and ++ * should be fixed. ++ * ++ * MP1.4 SPEC states to only scan first 1K of 4K EBDA. 
++ */ ++ ++#ifndef CONFIG_XEN ++ address = get_bios_ebda(); ++ if (address) ++ smp_scan_config(address, 0x400, reserve); ++#endif ++} ++ ++void __init early_find_smp_config(void) ++{ ++ __find_smp_config(0); ++} ++ ++void __init find_smp_config(void) ++{ ++ __find_smp_config(1); ++} ++ ++/* -------------------------------------------------------------------------- ++ ACPI-based MP Configuration ++ -------------------------------------------------------------------------- */ ++ ++/* ++ * Keep this outside and initialized to 0, for !CONFIG_ACPI builds: ++ */ ++int es7000_plat; ++ ++#ifdef CONFIG_ACPI ++ ++#ifdef CONFIG_X86_IO_APIC ++ ++#define MP_ISA_BUS 0 ++ ++extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; ++ ++static int mp_find_ioapic(int gsi) ++{ ++ int i = 0; ++ ++ /* Find the IOAPIC that manages this GSI. */ ++ for (i = 0; i < nr_ioapics; i++) { ++ if ((gsi >= mp_ioapic_routing[i].gsi_base) ++ && (gsi <= mp_ioapic_routing[i].gsi_end)) ++ return i; ++ } ++ ++ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); ++ return -1; ++} ++ ++static u8 __init uniq_ioapic_id(u8 id) ++{ ++#ifdef CONFIG_X86_32 ++#ifndef CONFIG_XEN ++ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && ++ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) ++ return io_apic_get_unique_id(nr_ioapics, id); ++ else ++#endif ++ return id; ++#else ++ int i; ++ DECLARE_BITMAP(used, 256); ++ bitmap_zero(used, 256); ++ for (i = 0; i < nr_ioapics; i++) { ++ struct mpc_config_ioapic *ia = &mp_ioapics[i]; ++ __set_bit(ia->mpc_apicid, used); ++ } ++ if (!test_bit(id, used)) ++ return id; ++ return find_first_zero_bit(used, 256); ++#endif ++} ++ ++void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) ++{ ++ int idx = 0; ++ ++ if (bad_ioapic(address)) ++ return; ++ ++ idx = nr_ioapics; ++ ++ mp_ioapics[idx].mpc_type = MP_IOAPIC; ++ mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; ++ mp_ioapics[idx].mpc_apicaddr = address; ++ ++#ifndef CONFIG_XEN ++ 
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); ++#endif ++ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); ++#ifdef CONFIG_X86_32 ++ mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); ++#else ++ mp_ioapics[idx].mpc_apicver = 0; ++#endif ++ /* ++ * Build basic GSI lookup table to facilitate gsi->io_apic lookups ++ * and to prevent reprogramming of IOAPIC pins (PCI GSIs). ++ */ ++ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; ++ mp_ioapic_routing[idx].gsi_base = gsi_base; ++ mp_ioapic_routing[idx].gsi_end = gsi_base + ++ io_apic_get_redir_entries(idx); ++ ++ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " ++ "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, ++ mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, ++ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); ++ ++ nr_ioapics++; ++} ++ ++void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) ++{ ++ struct mpc_config_intsrc intsrc; ++ int ioapic = -1; ++ int pin = -1; ++ ++ /* ++ * Convert 'gsi' to 'ioapic.pin'. ++ */ ++ ioapic = mp_find_ioapic(gsi); ++ if (ioapic < 0) ++ return; ++ pin = gsi - mp_ioapic_routing[ioapic].gsi_base; ++ ++ /* ++ * TBD: This check is for faulty timer entries, where the override ++ * erroneously sets the trigger to level, resulting in a HUGE ++ * increase of timer interrupts! 
++ */ ++ if ((bus_irq == 0) && (trigger == 3)) ++ trigger = 1; ++ ++ intsrc.mpc_type = MP_INTSRC; ++ intsrc.mpc_irqtype = mp_INT; ++ intsrc.mpc_irqflag = (trigger << 2) | polarity; ++ intsrc.mpc_srcbus = MP_ISA_BUS; ++ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ ++ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ ++ intsrc.mpc_dstirq = pin; /* INTIN# */ ++ ++ MP_intsrc_info(&intsrc); ++} ++ ++void __init mp_config_acpi_legacy_irqs(void) ++{ ++ struct mpc_config_intsrc intsrc; ++ int i = 0; ++ int ioapic = -1; ++ ++#if defined (CONFIG_MCA) || defined (CONFIG_EISA) ++ /* ++ * Fabricate the legacy ISA bus (bus #31). ++ */ ++ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; ++#endif ++ set_bit(MP_ISA_BUS, mp_bus_not_pci); ++ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); ++ ++ /* ++ * Older generations of ES7000 have no legacy identity mappings ++ */ ++ if (es7000_plat == 1) ++ return; ++ ++ /* ++ * Locate the IOAPIC that manages the ISA IRQs (0-15). ++ */ ++ ioapic = mp_find_ioapic(0); ++ if (ioapic < 0) ++ return; ++ ++ intsrc.mpc_type = MP_INTSRC; ++ intsrc.mpc_irqflag = 0; /* Conforming */ ++ intsrc.mpc_srcbus = MP_ISA_BUS; ++#ifdef CONFIG_X86_IO_APIC ++ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; ++#endif ++ /* ++ * Use the default configuration for the IRQs 0-15. Unless ++ * overridden by (MADT) interrupt source override entries. ++ */ ++ for (i = 0; i < 16; i++) { ++ int idx; ++ ++ for (idx = 0; idx < mp_irq_entries; idx++) { ++ struct mpc_config_intsrc *irq = mp_irqs + idx; ++ ++ /* Do we already have a mapping for this ISA IRQ? 
*/ ++ if (irq->mpc_srcbus == MP_ISA_BUS ++ && irq->mpc_srcbusirq == i) ++ break; ++ ++ /* Do we already have a mapping for this IOAPIC pin */ ++ if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && ++ (irq->mpc_dstirq == i)) ++ break; ++ } ++ ++ if (idx != mp_irq_entries) { ++ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); ++ continue; /* IRQ already used */ ++ } ++ ++ intsrc.mpc_irqtype = mp_INT; ++ intsrc.mpc_srcbusirq = i; /* Identity mapped */ ++ intsrc.mpc_dstirq = i; ++ ++ MP_intsrc_info(&intsrc); ++ } ++} ++ ++int mp_register_gsi(u32 gsi, int triggering, int polarity) ++{ ++ int ioapic; ++ int ioapic_pin; ++#ifdef CONFIG_X86_32 ++#define MAX_GSI_NUM 4096 ++#define IRQ_COMPRESSION_START 64 ++ ++ static int pci_irq = IRQ_COMPRESSION_START; ++ /* ++ * Mapping between Global System Interrupts, which ++ * represent all possible interrupts, and IRQs ++ * assigned to actual devices. ++ */ ++ static int gsi_to_irq[MAX_GSI_NUM]; ++#else ++ ++ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) ++ return gsi; ++#endif ++ ++ /* Don't set up the ACPI SCI because it's already set up */ ++ if (acpi_gbl_FADT.sci_interrupt == gsi) ++ return gsi; ++ ++ ioapic = mp_find_ioapic(gsi); ++ if (ioapic < 0) { ++ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); ++ return gsi; ++ } ++ ++ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; ++ ++#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) ++ if (ioapic_renumber_irq) ++ gsi = ioapic_renumber_irq(ioapic, gsi); ++#endif ++ ++ /* ++ * Avoid pin reprogramming. PRTs typically include entries ++ * with redundant pin->gsi mappings (but unique PCI devices); ++ * we only program the IOAPIC on the first. 
++ */ ++ if (ioapic_pin > MP_MAX_IOAPIC_PIN) { ++ printk(KERN_ERR "Invalid reference to IOAPIC pin " ++ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, ++ ioapic_pin); ++ return gsi; ++ } ++ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { ++ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", ++ mp_ioapic_routing[ioapic].apic_id, ioapic_pin); ++#ifdef CONFIG_X86_32 ++ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); ++#else ++ return gsi; ++#endif ++ } ++ ++ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); ++#ifdef CONFIG_X86_32 ++ /* ++ * For GSI >= 64, use IRQ compression ++ */ ++ if ((gsi >= IRQ_COMPRESSION_START) ++ && (triggering == ACPI_LEVEL_SENSITIVE)) { ++ /* ++ * For PCI devices assign IRQs in order, avoiding gaps ++ * due to unused I/O APIC pins. ++ */ ++ int irq = gsi; ++ if (gsi < MAX_GSI_NUM) { ++ /* ++ * Retain the VIA chipset work-around (gsi > 15), but ++ * avoid a problem where the 8254 timer (IRQ0) is setup ++ * via an override (so it's not on pin 0 of the ioapic), ++ * and at the same time, the pin 0 interrupt is a PCI ++ * type. The gsi > 15 test could cause these two pins ++ * to be shared as IRQ0, and they are not shareable. ++ * So test for this condition, and if necessary, avoid ++ * the pin collision. ++ */ ++ gsi = pci_irq++; ++ /* ++ * Don't assign IRQ used by ACPI SCI ++ */ ++ if (gsi == acpi_gbl_FADT.sci_interrupt) ++ gsi = pci_irq++; ++ gsi_to_irq[irq] = gsi; ++ } else { ++ printk(KERN_ERR "GSI %u is too high\n", gsi); ++ return gsi; ++ } ++ } ++#endif ++ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, ++ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, ++ polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); ++ return gsi; ++} ++ ++#endif /* CONFIG_X86_IO_APIC */ ++#endif /* CONFIG_ACPI */ +diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c +index 84160f7..60dbfd7 100644 +--- a/arch/x86/kernel/nmi_32.c ++++ b/arch/x86/kernel/nmi_32.c +@@ -29,7 +29,10 @@ + + #include "mach_traps.h" + +-int unknown_nmi_panic; ++extern void die_nmi(struct pt_regs *, const char *msg); ++ ++#ifndef CONFIG_XEN ++ + int nmi_watchdog_enabled; + + static cpumask_t backtrace_mask = CPU_MASK_NONE; +@@ -314,8 +317,6 @@ void touch_nmi_watchdog(void) + } + EXPORT_SYMBOL(touch_nmi_watchdog); + +-extern void die_nmi(struct pt_regs *, const char *msg); +- + notrace __kprobes int + nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) + { +@@ -388,8 +389,12 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) + return rc; + } + ++#endif /* CONFIG_XEN */ ++ + #ifdef CONFIG_SYSCTL + ++int unknown_nmi_panic; ++ + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) + { + unsigned char reason = get_nmi_reason(); +@@ -400,6 +405,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) + return 0; + } + ++#ifndef CONFIG_XEN + /* + * proc handler for /proc/sys/kernel/nmi + */ +@@ -438,6 +444,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + } + return 0; + } ++#endif + + #endif + +@@ -450,6 +457,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) + return 0; + } + ++#ifndef CONFIG_XEN + void __trigger_all_cpu_backtrace(void) + { + int i; +@@ -465,3 +473,4 @@ void __trigger_all_cpu_backtrace(void) + + EXPORT_SYMBOL(nmi_active); + EXPORT_SYMBOL(nmi_watchdog); ++#endif +diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c +index 5a29ded..ddd2120 100644 +--- a/arch/x86/kernel/nmi_64.c ++++ b/arch/x86/kernel/nmi_64.c +@@ -28,10 +28,12 @@ + + #include + +-int unknown_nmi_panic; +-int nmi_watchdog_enabled; + int panic_on_unrecovered_nmi; + ++#ifndef CONFIG_XEN ++ ++int nmi_watchdog_enabled; ++ + static 
cpumask_t backtrace_mask = CPU_MASK_NONE; + + /* nmi_active: +@@ -383,6 +385,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) + return rc; + } + ++#endif /* CONFIG_XEN */ ++ + static unsigned ignore_nmis; + + asmlinkage notrace __kprobes void +@@ -409,6 +413,8 @@ void restart_nmi(void) + + #ifdef CONFIG_SYSCTL + ++int unknown_nmi_panic; ++ + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) + { + unsigned char reason = get_nmi_reason(); +@@ -419,6 +425,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) + return 0; + } + ++#ifndef CONFIG_XEN + /* + * proc handler for /proc/sys/kernel/nmi + */ +@@ -453,6 +460,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + } + return 0; + } ++#endif + + #endif + +@@ -465,6 +473,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) + return 0; + } + ++#ifndef CONFIG_XEN + void __trigger_all_cpu_backtrace(void) + { + int i; +@@ -480,3 +489,4 @@ void __trigger_all_cpu_backtrace(void) + + EXPORT_SYMBOL(nmi_active); + EXPORT_SYMBOL(nmi_watchdog); ++#endif +diff --git a/arch/x86/kernel/pci-dma-xen.c b/arch/x86/kernel/pci-dma-xen.c +new file mode 100644 +index 0000000..ada8f02 +--- /dev/null ++++ b/arch/x86/kernel/pci-dma-xen.c +@@ -0,0 +1,599 @@ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++int forbid_dac __read_mostly; ++EXPORT_SYMBOL(forbid_dac); ++ ++const struct dma_mapping_ops *dma_ops; ++EXPORT_SYMBOL(dma_ops); ++ ++static int iommu_sac_force __read_mostly; ++ ++#ifdef CONFIG_IOMMU_DEBUG ++int panic_on_overflow __read_mostly = 1; ++int force_iommu __read_mostly = 1; ++#else ++int panic_on_overflow __read_mostly = 0; ++int force_iommu __read_mostly = 0; ++#endif ++ ++int iommu_merge __read_mostly = 0; ++ ++int no_iommu __read_mostly; ++/* Set this to 1 if there is a HW IOMMU in the system */ ++int iommu_detected __read_mostly = 0; ++ ++/* This tells the BIO block layer to assume merging. 
Default to off ++ because we cannot guarantee merging later. */ ++int iommu_bio_merge __read_mostly = 0; ++EXPORT_SYMBOL(iommu_bio_merge); ++ ++dma_addr_t bad_dma_address __read_mostly = 0; ++EXPORT_SYMBOL(bad_dma_address); ++ ++/* Dummy device used for NULL arguments (normally ISA). Better would ++ be probably a smaller DMA mask, but this is bug-to-bug compatible ++ to older i386. */ ++struct device fallback_dev = { ++ .bus_id = "fallback device", ++ .coherent_dma_mask = DMA_32BIT_MASK, ++ .dma_mask = &fallback_dev.coherent_dma_mask, ++}; ++ ++int dma_set_mask(struct device *dev, u64 mask) ++{ ++ if (!dev->dma_mask || !dma_supported(dev, mask)) ++ return -EIO; ++ ++ *dev->dma_mask = mask; ++ ++ return 0; ++} ++EXPORT_SYMBOL(dma_set_mask); ++ ++#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) ++static __initdata void *dma32_bootmem_ptr; ++static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); ++ ++static int __init parse_dma32_size_opt(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ dma32_bootmem_size = memparse(p, &p); ++ return 0; ++} ++early_param("dma32_size", parse_dma32_size_opt); ++ ++void __init dma32_reserve_bootmem(void) ++{ ++ unsigned long size, align; ++ if (end_pfn <= MAX_DMA32_PFN) ++ return; ++ ++ align = 64ULL<<20; ++ size = round_up(dma32_bootmem_size, align); ++ dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, ++ __pa(MAX_DMA_ADDRESS)); ++ if (dma32_bootmem_ptr) ++ dma32_bootmem_size = size; ++ else ++ dma32_bootmem_size = 0; ++} ++static void __init dma32_free_bootmem(void) ++{ ++ int node; ++ ++ if (end_pfn <= MAX_DMA32_PFN) ++ return; ++ ++ if (!dma32_bootmem_ptr) ++ return; ++ ++ for_each_online_node(node) ++ free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr), ++ dma32_bootmem_size); ++ ++ dma32_bootmem_ptr = NULL; ++ dma32_bootmem_size = 0; ++} ++#else ++#define dma32_free_bootmem() ((void)0) ++#endif ++ ++static const struct dma_mapping_ops swiotlb_dma_ops = { ++ .mapping_error = swiotlb_dma_mapping_error, ++ 
.map_single = swiotlb_map_single_phys, ++ .unmap_single = swiotlb_unmap_single, ++ .sync_single_for_cpu = swiotlb_sync_single_for_cpu, ++ .sync_single_for_device = swiotlb_sync_single_for_device, ++ .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, ++ .sync_single_range_for_device = swiotlb_sync_single_range_for_device, ++ .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, ++ .sync_sg_for_device = swiotlb_sync_sg_for_device, ++ .map_sg = swiotlb_map_sg, ++ .unmap_sg = swiotlb_unmap_sg, ++ .dma_supported = swiotlb_dma_supported ++}; ++ ++void __init pci_iommu_alloc(void) ++{ ++ /* free the range so iommu could get some range less than 4G */ ++ dma32_free_bootmem(); ++ /* ++ * The order of these functions is important for ++ * fall-back/fail-over reasons ++ */ ++#ifdef CONFIG_GART_IOMMU ++ gart_iommu_hole_init(); ++#endif ++ ++#ifdef CONFIG_CALGARY_IOMMU ++ detect_calgary(); ++#endif ++ ++ detect_intel_iommu(); ++ ++#ifdef CONFIG_SWIOTLB ++ swiotlb_init(); ++ if (swiotlb) { ++ printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); ++ dma_ops = &swiotlb_dma_ops; ++ } ++#endif ++} ++ ++/* ++ * See for the iommu kernel parameter ++ * documentation. 
++ */ ++static __init int iommu_setup(char *p) ++{ ++ iommu_merge = 1; ++ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "off", 3)) ++ no_iommu = 1; ++ /* gart_parse_options has more force support */ ++ if (!strncmp(p, "force", 5)) ++ force_iommu = 1; ++ if (!strncmp(p, "noforce", 7)) { ++ iommu_merge = 0; ++ force_iommu = 0; ++ } ++ ++ if (!strncmp(p, "biomerge", 8)) { ++ iommu_bio_merge = 4096; ++ iommu_merge = 1; ++ force_iommu = 1; ++ } ++ if (!strncmp(p, "panic", 5)) ++ panic_on_overflow = 1; ++ if (!strncmp(p, "nopanic", 7)) ++ panic_on_overflow = 0; ++ if (!strncmp(p, "merge", 5)) { ++ iommu_merge = 1; ++ force_iommu = 1; ++ } ++ if (!strncmp(p, "nomerge", 7)) ++ iommu_merge = 0; ++ if (!strncmp(p, "forcesac", 8)) ++ iommu_sac_force = 1; ++ if (!strncmp(p, "allowdac", 8)) ++ forbid_dac = 0; ++ if (!strncmp(p, "nodac", 5)) ++ forbid_dac = -1; ++ if (!strncmp(p, "usedac", 6)) { ++ forbid_dac = -1; ++ return 1; ++ } ++#ifdef CONFIG_SWIOTLB ++ if (!strncmp(p, "soft", 4)) ++ swiotlb = 1; ++#endif ++ ++#ifdef CONFIG_GART_IOMMU ++ gart_parse_options(p); ++#endif ++ ++#ifdef CONFIG_CALGARY_IOMMU ++ if (!strncmp(p, "calgary", 7)) ++ use_calgary = 1; ++#endif /* CONFIG_CALGARY_IOMMU */ ++ ++ p += strcspn(p, ","); ++ if (*p == ',') ++ ++p; ++ } ++ return 0; ++} ++early_param("iommu", iommu_setup); ++ ++static int check_pages_physically_contiguous(unsigned long pfn, ++ unsigned int offset, ++ size_t length) ++{ ++ unsigned long next_mfn; ++ int i; ++ int nr_pages; ++ ++ next_mfn = pfn_to_mfn(pfn); ++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; ++ ++ for (i = 1; i < nr_pages; i++) { ++ if (pfn_to_mfn(++pfn) != ++next_mfn) ++ return 0; ++ } ++ return 1; ++} ++ ++int range_straddles_page_boundary(paddr_t p, size_t size) ++{ ++ extern unsigned long *contiguous_bitmap; ++ unsigned long pfn = p >> PAGE_SHIFT; ++ unsigned int offset = p & ~PAGE_MASK; ++ ++ if (offset + size <= PAGE_SIZE) ++ return 0; ++ if (test_bit(pfn, contiguous_bitmap)) 
++ return 0; ++ if (check_pages_physically_contiguous(pfn, offset, size)) ++ return 0; ++ return 1; ++} ++ ++#ifdef CONFIG_X86_32 ++int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, ++ dma_addr_t device_addr, size_t size, int flags) ++{ ++ void __iomem *mem_base = NULL; ++ int pages = size >> PAGE_SHIFT; ++ int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); ++ ++ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) ++ goto out; ++ if (!size) ++ goto out; ++ if (dev->dma_mem) ++ goto out; ++ ++ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ ++ ++ mem_base = ioremap(bus_addr, size); ++ if (!mem_base) ++ goto out; ++ ++ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); ++ if (!dev->dma_mem) ++ goto out; ++ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); ++ if (!dev->dma_mem->bitmap) ++ goto free1_out; ++ ++ dev->dma_mem->virt_base = mem_base; ++ dev->dma_mem->device_base = device_addr; ++ dev->dma_mem->size = pages; ++ dev->dma_mem->flags = flags; ++ ++ if (flags & DMA_MEMORY_MAP) ++ return DMA_MEMORY_MAP; ++ ++ return DMA_MEMORY_IO; ++ ++ free1_out: ++ kfree(dev->dma_mem); ++ out: ++ if (mem_base) ++ iounmap(mem_base); ++ return 0; ++} ++EXPORT_SYMBOL(dma_declare_coherent_memory); ++ ++void dma_release_declared_memory(struct device *dev) ++{ ++ struct dma_coherent_mem *mem = dev->dma_mem; ++ ++ if (!mem) ++ return; ++ dev->dma_mem = NULL; ++ iounmap(mem->virt_base); ++ kfree(mem->bitmap); ++ kfree(mem); ++} ++EXPORT_SYMBOL(dma_release_declared_memory); ++ ++void *dma_mark_declared_memory_occupied(struct device *dev, ++ dma_addr_t device_addr, size_t size) ++{ ++ struct dma_coherent_mem *mem = dev->dma_mem; ++ int pos, err; ++ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); ++ ++ pages >>= PAGE_SHIFT; ++ ++ if (!mem) ++ return ERR_PTR(-EINVAL); ++ ++ pos = (device_addr - mem->device_base) >> PAGE_SHIFT; ++ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); ++ if 
(err != 0) ++ return ERR_PTR(err); ++ return mem->virt_base + (pos << PAGE_SHIFT); ++} ++EXPORT_SYMBOL(dma_mark_declared_memory_occupied); ++ ++static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, ++ dma_addr_t *dma_handle, void **ret) ++{ ++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; ++ int order = get_order(size); ++ ++ if (mem) { ++ int page = bitmap_find_free_region(mem->bitmap, mem->size, ++ order); ++ if (page >= 0) { ++ *dma_handle = mem->device_base + (page << PAGE_SHIFT); ++ *ret = mem->virt_base + (page << PAGE_SHIFT); ++ memset(*ret, 0, size); ++ } ++ if (mem->flags & DMA_MEMORY_EXCLUSIVE) ++ *ret = NULL; ++ } ++ return (mem != NULL); ++} ++ ++static int dma_release_coherent(struct device *dev, int order, void *vaddr) ++{ ++ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; ++ ++ if (mem && vaddr >= mem->virt_base && vaddr < ++ (mem->virt_base + (mem->size << PAGE_SHIFT))) { ++ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; ++ ++ bitmap_release_region(mem->bitmap, page, order); ++ return 1; ++ } ++ return 0; ++} ++#else ++#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) ++#define dma_release_coherent(dev, order, vaddr) (0) ++#endif /* CONFIG_X86_32 */ ++ ++int dma_supported(struct device *dev, u64 mask) ++{ ++#ifdef CONFIG_PCI ++ if (mask > 0xffffffff && forbid_dac > 0) { ++ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", ++ dev->bus_id); ++ return 0; ++ } ++#endif ++ ++ if (dma_ops->dma_supported) ++ return dma_ops->dma_supported(dev, mask); ++ ++ /* Copied from i386. Doesn't make much sense, because it will ++ only work for pci_alloc_coherent. ++ The caller just has to use GFP_DMA in this case. */ ++ if (mask < DMA_24BIT_MASK) ++ return 0; ++ ++ /* Tell the device to use SAC when IOMMU force is on. This ++ allows the driver to use cheaper accesses in some cases. 
++ ++ Problem with this is that if we overflow the IOMMU area and ++ return DAC as fallback address the device may not handle it ++ correctly. ++ ++ As a special case some controllers have a 39bit address ++ mode that is as efficient as 32bit (aic79xx). Don't force ++ SAC for these. Assume all masks <= 40 bits are of this ++ type. Normally this doesn't make any difference, but gives ++ more gentle handling of IOMMU overflow. */ ++ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { ++ printk(KERN_INFO "%s: Force SAC with mask %Lx\n", ++ dev->bus_id, mask); ++ return 0; ++ } ++ ++ return 1; ++} ++EXPORT_SYMBOL(dma_supported); ++ ++/* Allocate DMA memory on node near device */ ++static struct page * ++dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) ++{ ++ int node; ++ ++ node = dev_to_node(dev); ++ ++ return alloc_pages_node(node, gfp, order); ++} ++ ++/* ++ * Allocate memory for a coherent mapping. ++ */ ++void * ++dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, ++ gfp_t gfp) ++{ ++ void *memory = NULL; ++ struct page *page; ++ unsigned long dma_mask = 0; ++ int noretry = 0; ++ unsigned int order = get_order(size); ++ ++ /* ignore region specifiers */ ++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); ++ ++ if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) ++ return memory; ++ ++ if (!dev) { ++ dev = &fallback_dev; ++ gfp |= GFP_DMA; ++ } ++ dma_mask = dev->coherent_dma_mask; ++ if (dma_mask == 0) ++ dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK; ++ ++ /* Device not DMA able */ ++ if (dev->dma_mask == NULL) ++ return NULL; ++ ++ /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ ++ if (gfp & __GFP_DMA) ++ noretry = 1; ++ ++#ifdef CONFIG_XEN ++ gfp &= ~(__GFP_DMA | __GFP_DMA32); ++#else ++#ifdef CONFIG_X86_64 ++ /* Why <=? 
Even when the mask is smaller than 4GB it is often ++ larger than 16MB and in this case we have a chance of ++ finding fitting memory in the next higher zone first. If ++ not retry with true GFP_DMA. -AK */ ++ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { ++ gfp |= GFP_DMA32; ++#endif ++ ++ again: ++#endif ++ page = dma_alloc_pages(dev, ++ noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); ++ if (page == NULL) ++ return NULL; ++ ++#ifndef CONFIG_XEN ++ { ++ int high, mmu; ++ dma_addr_t bus = page_to_phys(page); ++ memory = page_address(page); ++ high = (bus + size) >= dma_mask; ++ mmu = high; ++ if (force_iommu && !(gfp & GFP_DMA)) ++ mmu = 1; ++ else if (high) { ++ free_pages((unsigned long)memory, order); ++ ++ /* Don't use the 16MB ZONE_DMA unless absolutely ++ needed. It's better to use remapping first. */ ++ if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { ++ gfp = (gfp & ~GFP_DMA32) | GFP_DMA; ++ goto again; ++ } ++ ++ /* Let low level make its own zone decisions */ ++ gfp &= ~(GFP_DMA32|GFP_DMA); ++ ++ if (dma_ops->alloc_coherent) ++ return dma_ops->alloc_coherent(dev, size, ++ dma_handle, gfp); ++ return NULL; ++ } ++ ++ memset(memory, 0, size); ++ if (!mmu) { ++ *dma_handle = bus; ++ return memory; ++ } ++ } ++ ++ if (dma_ops->alloc_coherent) { ++ free_pages((unsigned long)memory, order); ++ gfp &= ~(GFP_DMA|GFP_DMA32); ++ return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); ++ } ++ ++ if (dma_ops->map_simple) { ++ *dma_handle = dma_ops->map_simple(dev, virt_to_bus(memory), ++ size, ++ PCI_DMA_BIDIRECTIONAL); ++ if (*dma_handle != bad_dma_address) ++ return memory; ++ } ++#else ++ memory = page_address(page); ++ if (xen_create_contiguous_region((unsigned long)memory, order, ++ fls64(dma_mask)) == 0) { ++ memset(memory, 0, size); ++ *dma_handle = virt_to_bus(memory); ++ return memory; ++ } ++#endif ++ ++ if (panic_on_overflow) ++ panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", ++ (unsigned long)size); ++ 
free_pages((unsigned long)memory, get_order(size)); ++ return NULL; ++} ++EXPORT_SYMBOL(dma_alloc_coherent); ++ ++/* ++ * Unmap coherent memory. ++ * The caller must ensure that the device has finished accessing the mapping. ++ */ ++void dma_free_coherent(struct device *dev, size_t size, ++ void *vaddr, dma_addr_t bus) ++{ ++ int order = get_order(size); ++ WARN_ON(irqs_disabled()); /* for portability */ ++ if (dma_release_coherent(dev, order, vaddr)) ++ return; ++#ifndef CONFIG_XEN ++ if (dma_ops->unmap_single) ++ dma_ops->unmap_single(dev, bus, size, 0); ++#endif ++ xen_destroy_contiguous_region((unsigned long)vaddr, order); ++ free_pages((unsigned long)vaddr, order); ++} ++EXPORT_SYMBOL(dma_free_coherent); ++ ++static int __init pci_iommu_init(void) ++{ ++#ifdef CONFIG_CALGARY_IOMMU ++ calgary_iommu_init(); ++#endif ++ ++ intel_iommu_init(); ++ ++#ifdef CONFIG_GART_IOMMU ++ gart_iommu_init(); ++#endif ++ ++ no_iommu_init(); ++ return 0; ++} ++ ++void pci_iommu_shutdown(void) ++{ ++ gart_iommu_shutdown(); ++} ++/* Must execute after PCI subsystem */ ++fs_initcall(pci_iommu_init); ++ ++#ifdef CONFIG_PCI ++/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ ++ ++static __devinit void via_no_dac(struct pci_dev *dev) ++{ ++ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { ++ printk(KERN_INFO "PCI: VIA PCI bridge detected." ++ "Disabling DAC.\n"); ++ forbid_dac = 1; ++ } ++} ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); ++#endif +diff --git a/arch/x86/kernel/pci-nommu-xen.c b/arch/x86/kernel/pci-nommu-xen.c +new file mode 100644 +index 0000000..7e92955 +--- /dev/null ++++ b/arch/x86/kernel/pci-nommu-xen.c +@@ -0,0 +1,103 @@ ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define IOMMU_BUG_ON(test) \ ++do { \ ++ if (unlikely(test)) { \ ++ printk(KERN_ALERT "Fatal DMA error! 
" \ ++ "Please use 'swiotlb=force'\n"); \ ++ BUG(); \ ++ } \ ++} while (0) ++ ++static int ++gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, ++ int direction) ++{ ++ unsigned int i; ++ struct scatterlist *sg; ++ ++ WARN_ON(nents == 0 || sgl->length == 0); ++ ++ for_each_sg(sgl, sg, nents, i) { ++ BUG_ON(!sg_page(sg)); ++ sg->dma_address = ++ gnttab_dma_map_page(sg_page(sg)) + sg->offset; ++ sg->dma_length = sg->length; ++ IOMMU_BUG_ON(address_needs_mapping( ++ hwdev, sg->dma_address)); ++ IOMMU_BUG_ON(range_straddles_page_boundary( ++ page_to_pseudophys(sg_page(sg)) + sg->offset, ++ sg->length)); ++ } ++ ++ return nents; ++} ++ ++static void ++gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, ++ int direction) ++{ ++ unsigned int i; ++ struct scatterlist *sg; ++ ++ for_each_sg(sgl, sg, nents, i) ++ gnttab_dma_unmap_page(sg->dma_address); ++} ++ ++static dma_addr_t ++gnttab_map_single(struct device *dev, phys_addr_t paddr, size_t size, ++ int direction) ++{ ++ dma_addr_t dma; ++ ++ WARN_ON(size == 0); ++ ++ dma = gnttab_dma_map_page(pfn_to_page(paddr >> PAGE_SHIFT)) + ++ offset_in_page(paddr); ++ IOMMU_BUG_ON(range_straddles_page_boundary(paddr, size)); ++ IOMMU_BUG_ON(address_needs_mapping(dev, dma)); ++ ++ return dma; ++} ++ ++static void ++gnttab_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, ++ int direction) ++{ ++ gnttab_dma_unmap_page(dma_addr); ++} ++ ++static int nommu_mapping_error(dma_addr_t dma_addr) ++{ ++ return (dma_addr == bad_dma_address); ++} ++ ++static const struct dma_mapping_ops nommu_dma_ops = { ++ .map_single = gnttab_map_single, ++ .unmap_single = gnttab_unmap_single, ++ .map_sg = gnttab_map_sg, ++ .unmap_sg = gnttab_unmap_sg, ++ .dma_supported = swiotlb_dma_supported, ++ .mapping_error = nommu_mapping_error ++}; ++ ++void __init no_iommu_init(void) ++{ ++ if (dma_ops) ++ return; ++ ++ force_iommu = 0; /* no HW IOMMU */ ++ dma_ops = &nommu_dma_ops; ++} +diff --git 
a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c +index bc1f2d3..a2c342f 100644 +--- a/arch/x86/kernel/pcspeaker.c ++++ b/arch/x86/kernel/pcspeaker.c +@@ -7,6 +7,11 @@ static __init int add_pcspkr(void) + struct platform_device *pd; + int ret; + ++#ifdef CONFIG_XEN ++ if (!is_initial_xendomain()) ++ return 0; ++#endif ++ + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; +diff --git a/arch/x86/kernel/process-xen.c b/arch/x86/kernel/process-xen.c +new file mode 100644 +index 0000000..88f164b +--- /dev/null ++++ b/arch/x86/kernel/process-xen.c +@@ -0,0 +1,188 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct kmem_cache *task_xstate_cachep; ++ ++int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) ++{ ++ *dst = *src; ++ if (src->thread.xstate) { ++ dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, ++ GFP_KERNEL); ++ if (!dst->thread.xstate) ++ return -ENOMEM; ++ WARN_ON((unsigned long)dst->thread.xstate & 15); ++ memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); ++ } ++ return 0; ++} ++ ++void free_thread_xstate(struct task_struct *tsk) ++{ ++ if (tsk->thread.xstate) { ++ kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); ++ tsk->thread.xstate = NULL; ++ } ++} ++ ++void free_thread_info(struct thread_info *ti) ++{ ++ free_thread_xstate(ti->task); ++ free_pages((unsigned long)ti, get_order(THREAD_SIZE)); ++} ++ ++void arch_task_cache_init(void) ++{ ++ task_xstate_cachep = ++ kmem_cache_create("task_xstate", xstate_size, ++ __alignof__(union thread_xstate), ++ SLAB_PANIC, NULL); ++} ++ ++static void do_nothing(void *unused) ++{ ++} ++ ++/* ++ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of ++ * pm_idle and update to new pm_idle value. Required while changing pm_idle ++ * handler on SMP systems. ++ * ++ * Caller must have changed pm_idle to the new value before the call. 
Old ++ * pm_idle value will not be used by any CPU after the return of this function. ++ */ ++void cpu_idle_wait(void) ++{ ++ smp_mb(); ++ /* kick all the CPUs so that they exit out of pm_idle */ ++ smp_call_function(do_nothing, NULL, 0, 1); ++} ++EXPORT_SYMBOL_GPL(cpu_idle_wait); ++ ++#ifndef CONFIG_XEN ++/* ++ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, ++ * which can obviate IPI to trigger checking of need_resched. ++ * We execute MONITOR against need_resched and enter optimized wait state ++ * through MWAIT. Whenever someone changes need_resched, we would be woken ++ * up from MWAIT (without an IPI). ++ * ++ * New with Core Duo processors, MWAIT can take some hints based on CPU ++ * capability. ++ */ ++void mwait_idle_with_hints(unsigned long ax, unsigned long cx) ++{ ++ if (!need_resched()) { ++ __monitor((void *)¤t_thread_info()->flags, 0, 0); ++ smp_mb(); ++ if (!need_resched()) ++ __mwait(ax, cx); ++ } ++} ++ ++/* Default MONITOR/MWAIT with no hints, used for default C1 state */ ++static void mwait_idle(void) ++{ ++ if (!need_resched()) { ++ __monitor((void *)¤t_thread_info()->flags, 0, 0); ++ smp_mb(); ++ if (!need_resched()) ++ __sti_mwait(0, 0); ++ else ++ local_irq_enable(); ++ } else ++ local_irq_enable(); ++} ++#endif ++ ++/* ++ * On SMP it's slightly faster (but much more power-consuming!) ++ * to poll the ->work.need_resched flag instead of waiting for the ++ * cross-CPU IPI to arrive. Use this option with caution. ++ */ ++static void poll_idle(void) ++{ ++ local_irq_enable(); ++ cpu_relax(); ++} ++ ++#ifndef CONFIG_XEN ++/* ++ * mwait selection logic: ++ * ++ * It depends on the CPU. For AMD CPUs that support MWAIT this is ++ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings ++ * then depend on a clock divisor and current Pstate of the core. If ++ * all cores of a processor are in halt state (C1) the processor can ++ * enter the C1E (C1 enhanced) state. If mwait is used this will never ++ * happen. 
++ * ++ * idle=mwait overrides this decision and forces the usage of mwait. ++ */ ++static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) ++{ ++ if (force_mwait) ++ return 1; ++ ++ if (c->x86_vendor == X86_VENDOR_AMD) { ++ switch(c->x86) { ++ case 0x10: ++ case 0x11: ++ return 0; ++ } ++ } ++ return 1; ++} ++#endif ++ ++void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) ++{ ++#ifndef CONFIG_XEN ++ static int selected; ++ ++ if (selected) ++ return; ++#ifdef CONFIG_X86_SMP ++ if (pm_idle == poll_idle && smp_num_siblings > 1) { ++ printk(KERN_WARNING "WARNING: polling idle and HT enabled," ++ " performance may degrade.\n"); ++ } ++#endif ++ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { ++ /* ++ * Skip, if setup has overridden idle. ++ * One CPU supports mwait => All CPUs supports mwait ++ */ ++ if (!pm_idle) { ++ printk(KERN_INFO "using mwait in idle threads.\n"); ++ pm_idle = mwait_idle; ++ } ++ } ++ selected = 1; ++#endif ++} ++ ++static int __init idle_setup(char *str) ++{ ++ if (!strcmp(str, "poll")) { ++ printk("using polling idle threads.\n"); ++ pm_idle = poll_idle; ++ } ++#ifndef CONFIG_XEN ++ else if (!strcmp(str, "mwait")) ++ force_mwait = 1; ++#endif ++ else ++ return -1; ++ ++ boot_option_idle_override = 1; ++ return 0; ++} ++early_param("idle", idle_setup); ++ +diff --git a/arch/x86/kernel/process_32-xen.c b/arch/x86/kernel/process_32-xen.c +new file mode 100644 +index 0000000..86dc5ed +--- /dev/null ++++ b/arch/x86/kernel/process_32-xen.c +@@ -0,0 +1,775 @@ ++/* ++ * Copyright (C) 1995 Linus Torvalds ++ * ++ * Pentium III FXSR, SSE support ++ * Gareth Hughes , May 2000 ++ */ ++ ++/* ++ * This file handles the architecture-dependent parts of process handling.. 
++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_MATH_EMULATION ++#include ++#endif ++ ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++ ++asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); ++asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); ++ ++static int hlt_counter; ++ ++unsigned long boot_option_idle_override = 0; ++EXPORT_SYMBOL(boot_option_idle_override); ++ ++DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; ++EXPORT_PER_CPU_SYMBOL(current_task); ++ ++DEFINE_PER_CPU(int, cpu_number); ++EXPORT_PER_CPU_SYMBOL(cpu_number); ++ ++/* ++ * Return saved PC of a blocked thread. ++ */ ++unsigned long thread_saved_pc(struct task_struct *tsk) ++{ ++ return ((unsigned long *)tsk->thread.sp)[3]; ++} ++ ++/* ++ * Powermanagement idle function, if any.. 
++ */ ++void (*pm_idle)(void); ++EXPORT_SYMBOL(pm_idle); ++ ++void disable_hlt(void) ++{ ++ hlt_counter++; ++} ++ ++EXPORT_SYMBOL(disable_hlt); ++ ++void enable_hlt(void) ++{ ++ hlt_counter--; ++} ++ ++EXPORT_SYMBOL(enable_hlt); ++ ++static void xen_idle(void) ++{ ++ current_thread_info()->status &= ~TS_POLLING; ++ /* ++ * TS_POLLING-cleared state must be visible before we ++ * test NEED_RESCHED: ++ */ ++ smp_mb(); ++ ++ if (!need_resched()) ++ safe_halt(); /* enables interrupts racelessly */ ++ else ++ local_irq_enable(); ++ current_thread_info()->status |= TS_POLLING; ++} ++#ifdef CONFIG_APM_MODULE ++EXPORT_SYMBOL(default_idle); ++#endif ++ ++#ifdef CONFIG_HOTPLUG_CPU ++extern cpumask_t cpu_initialized; ++static inline void play_dead(void) ++{ ++ idle_task_exit(); ++ local_irq_disable(); ++ cpu_clear(smp_processor_id(), cpu_initialized); ++ preempt_enable_no_resched(); ++ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); ++ cpu_bringup(); ++} ++#else ++static inline void play_dead(void) ++{ ++ BUG(); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++/* ++ * The idle thread. 
There's no useful work to be ++ * done, so just try to conserve power and have a ++ * low exit latency (ie sit in a loop waiting for ++ * somebody to say that they'd like to reschedule) ++ */ ++void cpu_idle(void) ++{ ++ int cpu = smp_processor_id(); ++ ++ current_thread_info()->status |= TS_POLLING; ++ ++ /* endless idle loop with no priority at all */ ++ while (1) { ++ tick_nohz_stop_sched_tick(); ++ while (!need_resched()) { ++ void (*idle)(void); ++ ++ check_pgt_cache(); ++ rmb(); ++ idle = xen_idle; /* no alternatives */ ++ ++ if (rcu_pending(cpu)) ++ rcu_check_callbacks(cpu, 0); ++ ++ if (cpu_is_offline(cpu)) ++ play_dead(); ++ ++ local_irq_disable(); ++ __get_cpu_var(irq_stat).idle_timestamp = jiffies; ++ idle(); ++ } ++ tick_nohz_restart_sched_tick(); ++ preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++ } ++} ++ ++void __show_registers(struct pt_regs *regs, int all) ++{ ++ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; ++ unsigned long d0, d1, d2, d3, d6, d7; ++ unsigned long sp; ++ unsigned short ss, gs; ++ ++ if (user_mode_vm(regs)) { ++ sp = regs->sp; ++ ss = regs->ss & 0xffff; ++ savesegment(gs, gs); ++ } else { ++ sp = (unsigned long) (®s->sp); ++ savesegment(ss, ss); ++ savesegment(gs, gs); ++ } ++ ++ printk("\n"); ++ printk("Pid: %d, comm: %s %s (%s %.*s)\n", ++ task_pid_nr(current), current->comm, ++ print_tainted(), init_utsname()->release, ++ (int)strcspn(init_utsname()->version, " "), ++ init_utsname()->version); ++ ++ printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", ++ (u16)regs->cs, regs->ip, regs->flags, ++ smp_processor_id()); ++ print_symbol("EIP is at %s\n", regs->ip); ++ ++ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", ++ regs->ax, regs->bx, regs->cx, regs->dx); ++ printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", ++ regs->si, regs->di, regs->bp, sp); ++ printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", ++ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); ++ ++ if (!all) ++ return; 
++ ++ cr0 = read_cr0(); ++ cr2 = read_cr2(); ++ cr3 = read_cr3(); ++ cr4 = read_cr4_safe(); ++ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", ++ cr0, cr2, cr3, cr4); ++ ++ get_debugreg(d0, 0); ++ get_debugreg(d1, 1); ++ get_debugreg(d2, 2); ++ get_debugreg(d3, 3); ++ printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", ++ d0, d1, d2, d3); ++ ++ get_debugreg(d6, 6); ++ get_debugreg(d7, 7); ++ printk("DR6: %08lx DR7: %08lx\n", ++ d6, d7); ++} ++ ++void show_regs(struct pt_regs *regs) ++{ ++ __show_registers(regs, 1); ++ show_trace(NULL, regs, ®s->sp, regs->bp); ++} ++ ++/* ++ * This gets run with %bx containing the ++ * function to call, and %dx containing ++ * the "args". ++ */ ++extern void kernel_thread_helper(void); ++ ++/* ++ * Create a kernel thread ++ */ ++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) ++{ ++ struct pt_regs regs; ++ ++ memset(®s, 0, sizeof(regs)); ++ ++ regs.bx = (unsigned long) fn; ++ regs.dx = (unsigned long) arg; ++ ++ regs.ds = __USER_DS; ++ regs.es = __USER_DS; ++ regs.fs = __KERNEL_PERCPU; ++ regs.orig_ax = -1; ++ regs.ip = (unsigned long) kernel_thread_helper; ++ regs.cs = __KERNEL_CS | get_kernel_rpl(); ++ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; ++ ++ /* Ok, create the new process.. */ ++ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); ++} ++EXPORT_SYMBOL(kernel_thread); ++ ++/* ++ * Free current thread data structures etc.. ++ */ ++void exit_thread(void) ++{ ++ /* The process may have allocated an io port bitmap... nuke it. 
*/ ++ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { ++ struct task_struct *tsk = current; ++ struct thread_struct *t = &tsk->thread; ++ struct physdev_set_iobitmap set_iobitmap; ++ memset(&set_iobitmap, 0, sizeof(set_iobitmap)); ++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, ++ &set_iobitmap)); ++ kfree(t->io_bitmap_ptr); ++ t->io_bitmap_ptr = NULL; ++ clear_thread_flag(TIF_IO_BITMAP); ++ } ++} ++ ++void flush_thread(void) ++{ ++ struct task_struct *tsk = current; ++ ++ tsk->thread.debugreg0 = 0; ++ tsk->thread.debugreg1 = 0; ++ tsk->thread.debugreg2 = 0; ++ tsk->thread.debugreg3 = 0; ++ tsk->thread.debugreg6 = 0; ++ tsk->thread.debugreg7 = 0; ++ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); ++ clear_tsk_thread_flag(tsk, TIF_DEBUG); ++ /* ++ * Forget coprocessor state.. ++ */ ++ tsk->fpu_counter = 0; ++ clear_fpu(tsk); ++ clear_used_math(); ++} ++ ++void release_thread(struct task_struct *dead_task) ++{ ++ BUG_ON(dead_task->mm); ++ release_vm86_irqs(dead_task); ++} ++ ++/* ++ * This gets called before we allocate a new thread and copy ++ * the current task into it. 
++ */ ++void prepare_to_copy(struct task_struct *tsk) ++{ ++ unlazy_fpu(tsk); ++} ++ ++int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, ++ unsigned long unused, ++ struct task_struct * p, struct pt_regs * regs) ++{ ++ struct pt_regs * childregs; ++ struct task_struct *tsk; ++ int err; ++ ++ childregs = task_pt_regs(p); ++ *childregs = *regs; ++ childregs->ax = 0; ++ childregs->sp = sp; ++ ++ p->thread.sp = (unsigned long) childregs; ++ p->thread.sp0 = (unsigned long) (childregs+1); ++ ++ p->thread.ip = (unsigned long) ret_from_fork; ++ ++ savesegment(gs, p->thread.gs); ++ ++ tsk = current; ++ if (test_tsk_thread_flag(tsk, TIF_CSTAR)) ++ p->thread.ip = (unsigned long) cstar_ret_from_fork; ++ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { ++ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, ++ IO_BITMAP_BYTES, GFP_KERNEL); ++ if (!p->thread.io_bitmap_ptr) { ++ p->thread.io_bitmap_max = 0; ++ return -ENOMEM; ++ } ++ set_tsk_thread_flag(p, TIF_IO_BITMAP); ++ } ++ ++ err = 0; ++ ++ /* ++ * Set a new TLS for the child thread? 
++ */ ++ if (clone_flags & CLONE_SETTLS) ++ err = do_set_thread_area(p, -1, ++ (struct user_desc __user *)childregs->si, 0); ++ ++ p->thread.iopl = current->thread.iopl; ++ ++ if (err && p->thread.io_bitmap_ptr) { ++ kfree(p->thread.io_bitmap_ptr); ++ p->thread.io_bitmap_max = 0; ++ } ++ return err; ++} ++ ++void ++start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) ++{ ++ __asm__("movl %0, %%gs" :: "r"(0)); ++ regs->fs = 0; ++ set_fs(USER_DS); ++ regs->ds = __USER_DS; ++ regs->es = __USER_DS; ++ regs->ss = __USER_DS; ++ regs->cs = __USER_CS; ++ regs->ip = new_ip; ++ regs->sp = new_sp; ++ /* ++ * Free the old FP and other extended state ++ */ ++ free_thread_xstate(current); ++} ++EXPORT_SYMBOL_GPL(start_thread); ++ ++static void hard_disable_TSC(void) ++{ ++ write_cr4(read_cr4() | X86_CR4_TSD); ++} ++ ++void disable_TSC(void) ++{ ++#ifdef CONFIG_SECCOMP_DISABLE_TSC ++ preempt_disable(); ++ if (!test_and_set_thread_flag(TIF_NOTSC)) ++ /* ++ * Must flip the CPU state synchronously with ++ * TIF_NOTSC in the current running context. ++ */ ++ hard_disable_TSC(); ++ preempt_enable(); ++#endif ++} ++ ++static void hard_enable_TSC(void) ++{ ++ write_cr4(read_cr4() & ~X86_CR4_TSD); ++} ++ ++static void enable_TSC(void) ++{ ++ preempt_disable(); ++ if (test_and_clear_thread_flag(TIF_NOTSC)) ++ /* ++ * Must flip the CPU state synchronously with ++ * TIF_NOTSC in the current running context. 
++ */ ++ hard_enable_TSC(); ++ preempt_enable(); ++} ++ ++int get_tsc_mode(unsigned long adr) ++{ ++ unsigned int val; ++ ++ if (test_thread_flag(TIF_NOTSC)) ++ val = PR_TSC_SIGSEGV; ++ else ++ val = PR_TSC_ENABLE; ++ ++ return put_user(val, (unsigned int __user *)adr); ++} ++ ++int set_tsc_mode(unsigned int val) ++{ ++ if (val == PR_TSC_SIGSEGV) ++ disable_TSC(); ++ else if (val == PR_TSC_ENABLE) ++ enable_TSC(); ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static noinline void ++__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) ++{ ++ struct thread_struct *prev, *next; ++ unsigned long debugctl; ++ ++ prev = &prev_p->thread; ++ next = &next_p->thread; ++ ++ debugctl = prev->debugctlmsr; ++ if (next->ds_area_msr != prev->ds_area_msr) { ++ /* we clear debugctl to make sure DS ++ * is not in use when we change it */ ++ debugctl = 0; ++ update_debugctlmsr(0); ++ wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); ++ } ++ ++ if (next->debugctlmsr != debugctl) ++ update_debugctlmsr(next->debugctlmsr); ++ ++ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { ++ set_debugreg(next->debugreg0, 0); ++ set_debugreg(next->debugreg1, 1); ++ set_debugreg(next->debugreg2, 2); ++ set_debugreg(next->debugreg3, 3); ++ /* no 4 and 5 */ ++ set_debugreg(next->debugreg6, 6); ++ set_debugreg(next->debugreg7, 7); ++ } ++ ++ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ ++ test_tsk_thread_flag(next_p, TIF_NOTSC)) { ++ /* prev and next are different */ ++ if (test_tsk_thread_flag(next_p, TIF_NOTSC)) ++ hard_disable_TSC(); ++ else ++ hard_enable_TSC(); ++ } ++ ++#ifdef X86_BTS ++ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ++ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); ++ ++ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ++ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); ++#endif ++} ++ ++/* ++ * switch_to(x,yn) should switch tasks from x to y. 
++ * ++ * We fsave/fwait so that an exception goes off at the right time ++ * (as a call from the fsave or fwait in effect) rather than to ++ * the wrong process. Lazy FP saving no longer makes any sense ++ * with modern CPU's, and this simplifies a lot of things (SMP ++ * and UP become the same). ++ * ++ * NOTE! We used to use the x86 hardware context switching. The ++ * reason for not using it any more becomes apparent when you ++ * try to recover gracefully from saved state that is no longer ++ * valid (stale segment register values in particular). With the ++ * hardware task-switch, there is no way to fix up bad state in ++ * a reasonable manner. ++ * ++ * The fact that Intel documents the hardware task-switching to ++ * be slow is a fairly red herring - this code is not noticeably ++ * faster. However, there _is_ some room for improvement here, ++ * so the performance issues may eventually be a valid point. ++ * More important, however, is the fact that this allows us much ++ * more flexibility. ++ * ++ * The return value (in %ax) will be the "prev" task after ++ * the task-switch, and shows up in ret_from_fork in entry.S, ++ * for example. ++ */ ++struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) ++{ ++ struct thread_struct *prev = &prev_p->thread, ++ *next = &next_p->thread; ++ int cpu = smp_processor_id(); ++#ifndef CONFIG_X86_NO_TSS ++ struct tss_struct *tss = &per_cpu(init_tss, cpu); ++#endif ++#if CONFIG_XEN_COMPAT > 0x030002 ++ struct physdev_set_iopl iopl_op; ++ struct physdev_set_iobitmap iobmp_op; ++#else ++ struct physdev_op _pdo[2], *pdo = _pdo; ++#define iopl_op pdo->u.set_iopl ++#define iobmp_op pdo->u.set_iobitmap ++#endif ++ multicall_entry_t _mcl[8], *mcl = _mcl; ++ ++ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ ++ ++ /* ++ * This is basically '__unlazy_fpu', except that we queue a ++ * multicall to indicate FPU task switch, rather than ++ * synchronously trapping to Xen. 
++ */ ++ if (task_thread_info(prev_p)->status & TS_USEDFPU) { ++ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ ++ mcl->op = __HYPERVISOR_fpu_taskswitch; ++ mcl->args[0] = 1; ++ mcl++; ++ } ++#if 0 /* lazy fpu sanity check */ ++ else BUG_ON(!(read_cr0() & 8)); ++#endif ++ ++ /* ++ * Reload sp0. ++ * This is load_sp0(tss, next) with a multicall. ++ */ ++ mcl->op = __HYPERVISOR_stack_switch; ++ mcl->args[0] = __KERNEL_DS; ++ mcl->args[1] = next->sp0; ++ mcl++; ++ ++ /* ++ * Load the per-thread Thread-Local Storage descriptor. ++ * This is load_TLS(next, cpu) with multicalls. ++ */ ++#define C(i) do { \ ++ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ ++ next->tls_array[i].b != prev->tls_array[i].b)) { \ ++ mcl->op = __HYPERVISOR_update_descriptor; \ ++ *(u64 *)&mcl->args[0] = virt_to_machine( \ ++ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ ++ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ ++ mcl++; \ ++ } \ ++} while (0) ++ C(0); C(1); C(2); ++#undef C ++ ++ if (unlikely(prev->iopl != next->iopl)) { ++ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; ++#if CONFIG_XEN_COMPAT > 0x030002 ++ mcl->op = __HYPERVISOR_physdev_op; ++ mcl->args[0] = PHYSDEVOP_set_iopl; ++ mcl->args[1] = (unsigned long)&iopl_op; ++#else ++ mcl->op = __HYPERVISOR_physdev_op_compat; ++ pdo->cmd = PHYSDEVOP_set_iopl; ++ mcl->args[0] = (unsigned long)pdo++; ++#endif ++ mcl++; ++ } ++ ++ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { ++ set_xen_guest_handle(iobmp_op.bitmap, ++ (char *)next->io_bitmap_ptr); ++ iobmp_op.nr_ports = next->io_bitmap_ptr ? 
IO_BITMAP_BITS : 0; ++#if CONFIG_XEN_COMPAT > 0x030002 ++ mcl->op = __HYPERVISOR_physdev_op; ++ mcl->args[0] = PHYSDEVOP_set_iobitmap; ++ mcl->args[1] = (unsigned long)&iobmp_op; ++#else ++ mcl->op = __HYPERVISOR_physdev_op_compat; ++ pdo->cmd = PHYSDEVOP_set_iobitmap; ++ mcl->args[0] = (unsigned long)pdo++; ++#endif ++ mcl++; ++ } ++ ++#if CONFIG_XEN_COMPAT <= 0x030002 ++ BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); ++#endif ++ BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); ++ if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) ++ BUG(); ++ ++ /* we're going to use this soon, after a few expensive things */ ++ if (next_p->fpu_counter > 5) ++ prefetch(next->xstate); ++ ++ /* ++ * Now maybe handle debug registers ++ */ ++ if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || ++ task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) ++ __switch_to_xtra(prev_p, next_p); ++ ++ /* ++ * Leave lazy mode, flushing any hypercalls made here. ++ * This must be done before restoring TLS segments so ++ * the GDT and LDT are properly updated, and must be ++ * done before math_state_restore, so the TS bit is up ++ * to date. 
++ */ ++ arch_leave_lazy_cpu_mode(); ++ ++ /* If the task has used fpu the last 5 timeslices, just do a full ++ * restore of the math state immediately to avoid the trap; the ++ * chances of needing FPU soon are obviously high now ++ * ++ * tsk_used_math() checks prevent calling math_state_restore(), ++ * which can sleep in the case of !tsk_used_math() ++ */ ++ if (tsk_used_math(next_p) && next_p->fpu_counter > 5) ++ math_state_restore(); ++ ++ /* ++ * Restore %gs if needed (which is common) ++ */ ++ if (prev->gs | next->gs) ++ loadsegment(gs, next->gs); ++ ++ x86_write_percpu(current_task, next_p); ++ ++ return prev_p; ++} ++ ++asmlinkage int sys_fork(struct pt_regs regs) ++{ ++ return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); ++} ++ ++asmlinkage int sys_clone(struct pt_regs regs) ++{ ++ unsigned long clone_flags; ++ unsigned long newsp; ++ int __user *parent_tidptr, *child_tidptr; ++ ++ clone_flags = regs.bx; ++ newsp = regs.cx; ++ parent_tidptr = (int __user *)regs.dx; ++ child_tidptr = (int __user *)regs.di; ++ if (!newsp) ++ newsp = regs.sp; ++ return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); ++} ++ ++/* ++ * This is trivial, and on the face of it looks like it ++ * could equally well be done in user mode. ++ * ++ * Not so, for quite unobvious reasons - register pressure. ++ * In user mode vfork() cannot have a stack frame, and if ++ * done by calling the "clone()" system call directly, you ++ * do not have enough call-clobbered registers to hold all ++ * the information you need. ++ */ ++asmlinkage int sys_vfork(struct pt_regs regs) ++{ ++ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); ++} ++ ++/* ++ * sys_execve() executes a new program. 
++ */ ++asmlinkage int sys_execve(struct pt_regs regs) ++{ ++ int error; ++ char * filename; ++ ++ filename = getname((char __user *) regs.bx); ++ error = PTR_ERR(filename); ++ if (IS_ERR(filename)) ++ goto out; ++ error = do_execve(filename, ++ (char __user * __user *) regs.cx, ++ (char __user * __user *) regs.dx, ++ ®s); ++ if (error == 0) { ++ /* Make sure we don't return using sysenter.. */ ++ set_thread_flag(TIF_IRET); ++ } ++ putname(filename); ++out: ++ return error; ++} ++ ++#define top_esp (THREAD_SIZE - sizeof(unsigned long)) ++#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) ++ ++unsigned long get_wchan(struct task_struct *p) ++{ ++ unsigned long bp, sp, ip; ++ unsigned long stack_page; ++ int count = 0; ++ if (!p || p == current || p->state == TASK_RUNNING) ++ return 0; ++ stack_page = (unsigned long)task_stack_page(p); ++ sp = p->thread.sp; ++ if (!stack_page || sp < stack_page || sp > top_esp+stack_page) ++ return 0; ++ /* include/asm-i386/system.h:switch_to() pushes bp last. */ ++ bp = *(unsigned long *) sp; ++ do { ++ if (bp < stack_page || bp > top_ebp+stack_page) ++ return 0; ++ ip = *(unsigned long *) (bp+4); ++ if (!in_sched_functions(ip)) ++ return ip; ++ bp = *(unsigned long *) bp; ++ } while (count++ < 16); ++ return 0; ++} ++ ++unsigned long arch_align_stack(unsigned long sp) ++{ ++ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) ++ sp -= get_random_int() % 8192; ++ return sp & ~0xf; ++} ++ ++unsigned long arch_randomize_brk(struct mm_struct *mm) ++{ ++ unsigned long range_end = mm->brk + 0x02000000; ++ return randomize_range(mm->brk, range_end, 0) ? 
: mm->brk; ++} +diff --git a/arch/x86/kernel/process_64-xen.c b/arch/x86/kernel/process_64-xen.c +new file mode 100644 +index 0000000..893fb36 +--- /dev/null ++++ b/arch/x86/kernel/process_64-xen.c +@@ -0,0 +1,923 @@ ++/* ++ * Copyright (C) 1995 Linus Torvalds ++ * ++ * Pentium III FXSR, SSE support ++ * Gareth Hughes , May 2000 ++ * ++ * X86-64 port ++ * Andi Kleen. ++ * ++ * CPU hotplug support - ashok.raj@intel.com ++ * ++ * Jun Nakajima ++ * Modified for Xen ++ */ ++ ++/* ++ * This file handles the architecture-dependent parts of process handling.. ++ */ ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++asmlinkage extern void ret_from_fork(void); ++ ++unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; ++ ++unsigned long boot_option_idle_override = 0; ++EXPORT_SYMBOL(boot_option_idle_override); ++ ++/* ++ * Powermanagement idle function, if any.. 
++ */ ++void (*pm_idle)(void); ++EXPORT_SYMBOL(pm_idle); ++ ++static ATOMIC_NOTIFIER_HEAD(idle_notifier); ++ ++void idle_notifier_register(struct notifier_block *n) ++{ ++ atomic_notifier_chain_register(&idle_notifier, n); ++} ++ ++void enter_idle(void) ++{ ++ write_pda(isidle, 1); ++ atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); ++} ++ ++static void __exit_idle(void) ++{ ++ if (test_and_clear_bit_pda(0, isidle) == 0) ++ return; ++ atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); ++} ++ ++/* Called from interrupts to signify idle end */ ++void exit_idle(void) ++{ ++ /* idle loop has pid 0 */ ++ if (current->pid) ++ return; ++ __exit_idle(); ++} ++ ++static void xen_idle(void) ++{ ++ current_thread_info()->status &= ~TS_POLLING; ++ /* ++ * TS_POLLING-cleared state must be visible before we ++ * test NEED_RESCHED: ++ */ ++ smp_mb(); ++ if (!need_resched()) ++ safe_halt(); /* enables interrupts racelessly */ ++ else ++ local_irq_enable(); ++ current_thread_info()->status |= TS_POLLING; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static inline void play_dead(void) ++{ ++ idle_task_exit(); ++ local_irq_disable(); ++ cpu_clear(smp_processor_id(), cpu_initialized); ++ preempt_enable_no_resched(); ++ VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL)); ++ cpu_bringup(); ++} ++#else ++static inline void play_dead(void) ++{ ++ BUG(); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++/* ++ * The idle thread. 
There's no useful work to be ++ * done, so just try to conserve power and have a ++ * low exit latency (ie sit in a loop waiting for ++ * somebody to say that they'd like to reschedule) ++ */ ++void cpu_idle(void) ++{ ++ current_thread_info()->status |= TS_POLLING; ++ /* endless idle loop with no priority at all */ ++ while (1) { ++ tick_nohz_stop_sched_tick(); ++ while (!need_resched()) { ++ void (*idle)(void); ++ ++ rmb(); ++ idle = xen_idle; /* no alternatives */ ++ if (cpu_is_offline(smp_processor_id())) ++ play_dead(); ++ /* ++ * Idle routines should keep interrupts disabled ++ * from here on, until they go to idle. ++ * Otherwise, idle callbacks can misfire. ++ */ ++ local_irq_disable(); ++ enter_idle(); ++ idle(); ++ /* In many cases the interrupt that ended idle ++ has already called exit_idle. But some idle ++ loops can be woken up without interrupt. */ ++ __exit_idle(); ++ } ++ ++ tick_nohz_restart_sched_tick(); ++ preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++ } ++} ++ ++/* Prints also some state that isn't saved in the pt_regs */ ++void __show_regs(struct pt_regs * regs) ++{ ++ unsigned long fs, gs, shadowgs; ++ unsigned long d0, d1, d2, d3, d6, d7; ++ unsigned int fsindex, gsindex; ++ unsigned int ds, cs, es; ++ ++ printk("\n"); ++ print_modules(); ++ printk("Pid: %d, comm: %.20s %s %s %.*s\n", ++ current->pid, current->comm, print_tainted(), ++ init_utsname()->release, ++ (int)strcspn(init_utsname()->version, " "), ++ init_utsname()->version); ++ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); ++ printk_address(regs->ip, 1); ++ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, ++ regs->flags); ++ printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", ++ regs->ax, regs->bx, regs->cx); ++ printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", ++ regs->dx, regs->si, regs->di); ++ printk("RBP: %016lx R08: %016lx R09: %016lx\n", ++ regs->bp, regs->r8, regs->r9); ++ printk("R10: %016lx R11: %016lx R12: %016lx\n", ++ 
regs->r10, regs->r11, regs->r12); ++ printk("R13: %016lx R14: %016lx R15: %016lx\n", ++ regs->r13, regs->r14, regs->r15); ++ ++ asm("mov %%ds,%0" : "=r" (ds)); ++ asm("mov %%cs,%0" : "=r" (cs)); ++ asm("mov %%es,%0" : "=r" (es)); ++ asm("mov %%fs,%0" : "=r" (fsindex)); ++ asm("mov %%gs,%0" : "=r" (gsindex)); ++ ++ rdmsrl(MSR_FS_BASE, fs); ++ rdmsrl(MSR_GS_BASE, gs); ++ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); ++ ++ printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", ++ fs,fsindex,gs,gsindex,shadowgs); ++ printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es); ++ ++ get_debugreg(d0, 0); ++ get_debugreg(d1, 1); ++ get_debugreg(d2, 2); ++ printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); ++ get_debugreg(d3, 3); ++ get_debugreg(d6, 6); ++ get_debugreg(d7, 7); ++ printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); ++} ++ ++void show_regs(struct pt_regs *regs) ++{ ++ printk("CPU %d:", smp_processor_id()); ++ __show_regs(regs); ++ show_trace(NULL, regs, (void *)(regs + 1), regs->bp); ++} ++ ++/* ++ * Free current thread data structures etc.. 
++ */ ++void exit_thread(void) ++{ ++ struct task_struct *me = current; ++ struct thread_struct *t = &me->thread; ++ ++ if (me->thread.io_bitmap_ptr) { ++#ifndef CONFIG_X86_NO_TSS ++ struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); ++#endif ++#ifdef CONFIG_XEN ++ struct physdev_set_iobitmap iobmp_op; ++ memset(&iobmp_op, 0, sizeof(iobmp_op)); ++#endif ++ ++ kfree(t->io_bitmap_ptr); ++ t->io_bitmap_ptr = NULL; ++ clear_thread_flag(TIF_IO_BITMAP); ++ /* ++ * Careful, clear this in the TSS too: ++ */ ++#ifndef CONFIG_X86_NO_TSS ++ memset(tss->io_bitmap, 0xff, t->io_bitmap_max); ++ put_cpu(); ++#endif ++#ifdef CONFIG_XEN ++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, ++ &iobmp_op)); ++#endif ++ t->io_bitmap_max = 0; ++ } ++} ++ ++void load_gs_index(unsigned gs) ++{ ++ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); ++} ++ ++void flush_thread(void) ++{ ++ struct task_struct *tsk = current; ++ ++ if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { ++ clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); ++ if (test_tsk_thread_flag(tsk, TIF_IA32)) { ++ clear_tsk_thread_flag(tsk, TIF_IA32); ++ } else { ++ set_tsk_thread_flag(tsk, TIF_IA32); ++ current_thread_info()->status |= TS_COMPAT; ++ } ++ } ++ clear_tsk_thread_flag(tsk, TIF_DEBUG); ++ ++ tsk->thread.debugreg0 = 0; ++ tsk->thread.debugreg1 = 0; ++ tsk->thread.debugreg2 = 0; ++ tsk->thread.debugreg3 = 0; ++ tsk->thread.debugreg6 = 0; ++ tsk->thread.debugreg7 = 0; ++ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); ++ /* ++ * Forget coprocessor state.. ++ */ ++ tsk->fpu_counter = 0; ++ clear_fpu(tsk); ++ clear_used_math(); ++} ++ ++void release_thread(struct task_struct *dead_task) ++{ ++ if (dead_task->mm) { ++ if (dead_task->mm->context.size) { ++ printk("WARNING: dead process %8s still has LDT? 
<%p/%d>\n", ++ dead_task->comm, ++ dead_task->mm->context.ldt, ++ dead_task->mm->context.size); ++ BUG(); ++ } ++ } ++} ++ ++static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) ++{ ++ struct user_desc ud = { ++ .base_addr = addr, ++ .limit = 0xfffff, ++ .seg_32bit = 1, ++ .limit_in_pages = 1, ++ .useable = 1, ++ }; ++ struct desc_struct *desc = t->thread.tls_array; ++ desc += tls; ++ fill_ldt(desc, &ud); ++} ++ ++static inline u32 read_32bit_tls(struct task_struct *t, int tls) ++{ ++ return get_desc_base(&t->thread.tls_array[tls]); ++} ++ ++/* ++ * This gets called before we allocate a new thread and copy ++ * the current task into it. ++ */ ++void prepare_to_copy(struct task_struct *tsk) ++{ ++ unlazy_fpu(tsk); ++} ++ ++int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, ++ unsigned long unused, ++ struct task_struct * p, struct pt_regs * regs) ++{ ++ int err; ++ struct pt_regs * childregs; ++ struct task_struct *me = current; ++ ++ childregs = ((struct pt_regs *) ++ (THREAD_SIZE + task_stack_page(p))) - 1; ++ *childregs = *regs; ++ ++ childregs->ax = 0; ++ childregs->sp = sp; ++ if (sp == ~0UL) ++ childregs->sp = (unsigned long)childregs; ++ ++ p->thread.sp = (unsigned long) childregs; ++ p->thread.sp0 = (unsigned long) (childregs+1); ++ p->thread.usersp = me->thread.usersp; ++ ++ set_tsk_thread_flag(p, TIF_FORK); ++ ++ p->thread.fs = me->thread.fs; ++ p->thread.gs = me->thread.gs; ++ ++ asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); ++ asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); ++ asm("mov %%es,%0" : "=m" (p->thread.es)); ++ asm("mov %%ds,%0" : "=m" (p->thread.ds)); ++ ++ if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { ++ p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); ++ if (!p->thread.io_bitmap_ptr) { ++ p->thread.io_bitmap_max = 0; ++ return -ENOMEM; ++ } ++ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, ++ IO_BITMAP_BYTES); ++ set_tsk_thread_flag(p, TIF_IO_BITMAP); ++ } ++ ++ /* 
++ * Set a new TLS for the child thread? ++ */ ++ if (clone_flags & CLONE_SETTLS) { ++#ifdef CONFIG_IA32_EMULATION ++ if (test_thread_flag(TIF_IA32)) ++ err = do_set_thread_area(p, -1, ++ (struct user_desc __user *)childregs->si, 0); ++ else ++#endif ++ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); ++ if (err) ++ goto out; ++ } ++ p->thread.iopl = current->thread.iopl; ++ ++ err = 0; ++out: ++ if (err && p->thread.io_bitmap_ptr) { ++ kfree(p->thread.io_bitmap_ptr); ++ p->thread.io_bitmap_max = 0; ++ } ++ return err; ++} ++ ++void ++start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) ++{ ++ asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0)); ++ load_gs_index(0); ++ regs->ip = new_ip; ++ regs->sp = new_sp; ++ write_pda(oldrsp, new_sp); ++ regs->cs = __USER_CS; ++ regs->ss = __USER_DS; ++ regs->flags = 0x200; ++ set_fs(USER_DS); ++ /* ++ * Free the old FP and other extended state ++ */ ++ free_thread_xstate(current); ++} ++EXPORT_SYMBOL_GPL(start_thread); ++ ++static void hard_disable_TSC(void) ++{ ++ write_cr4(read_cr4() | X86_CR4_TSD); ++} ++ ++void disable_TSC(void) ++{ ++#ifdef CONFIG_SECCOMP_DISABLE_TSC ++ preempt_disable(); ++ if (!test_and_set_thread_flag(TIF_NOTSC)) ++ /* ++ * Must flip the CPU state synchronously with ++ * TIF_NOTSC in the current running context. ++ */ ++ hard_disable_TSC(); ++ preempt_enable(); ++#endif ++} ++ ++static void hard_enable_TSC(void) ++{ ++ write_cr4(read_cr4() & ~X86_CR4_TSD); ++} ++ ++static void enable_TSC(void) ++{ ++ preempt_disable(); ++ if (test_and_clear_thread_flag(TIF_NOTSC)) ++ /* ++ * Must flip the CPU state synchronously with ++ * TIF_NOTSC in the current running context. 
++ */ ++ hard_enable_TSC(); ++ preempt_enable(); ++} ++ ++int get_tsc_mode(unsigned long adr) ++{ ++ unsigned int val; ++ ++ if (test_thread_flag(TIF_NOTSC)) ++ val = PR_TSC_SIGSEGV; ++ else ++ val = PR_TSC_ENABLE; ++ ++ return put_user(val, (unsigned int __user *)adr); ++} ++ ++int set_tsc_mode(unsigned int val) ++{ ++ if (val == PR_TSC_SIGSEGV) ++ disable_TSC(); ++ else if (val == PR_TSC_ENABLE) ++ enable_TSC(); ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* ++ * This special macro can be used to load a debugging register ++ */ ++#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) ++ ++static inline void __switch_to_xtra(struct task_struct *prev_p, ++ struct task_struct *next_p) ++{ ++ struct thread_struct *prev, *next; ++ unsigned long debugctl; ++ ++ prev = &prev_p->thread, ++ next = &next_p->thread; ++ ++ debugctl = prev->debugctlmsr; ++ if (next->ds_area_msr != prev->ds_area_msr) { ++ /* we clear debugctl to make sure DS ++ * is not in use when we change it */ ++ debugctl = 0; ++ update_debugctlmsr(0); ++ wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); ++ } ++ ++ if (next->debugctlmsr != debugctl) ++ update_debugctlmsr(next->debugctlmsr); ++ ++ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { ++ loaddebug(next, 0); ++ loaddebug(next, 1); ++ loaddebug(next, 2); ++ loaddebug(next, 3); ++ /* no 4 and 5 */ ++ loaddebug(next, 6); ++ loaddebug(next, 7); ++ } ++ ++ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ ++ test_tsk_thread_flag(next_p, TIF_NOTSC)) { ++ /* prev and next are different */ ++ if (test_tsk_thread_flag(next_p, TIF_NOTSC)) ++ hard_disable_TSC(); ++ else ++ hard_enable_TSC(); ++ } ++ ++#ifdef X86_BTS ++ if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ++ ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); ++ ++ if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ++ ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); ++#endif ++} ++ ++/* ++ * switch_to(x,y) should switch tasks from x to y. 
++ * ++ * This could still be optimized: ++ * - fold all the options into a flag word and test it with a single test. ++ * - could test fs/gs bitsliced ++ * ++ * Kprobes not supported here. Set the probe on schedule instead. ++ */ ++struct task_struct * ++__switch_to(struct task_struct *prev_p, struct task_struct *next_p) ++{ ++ struct thread_struct *prev = &prev_p->thread, ++ *next = &next_p->thread; ++ int cpu = smp_processor_id(); ++#ifndef CONFIG_X86_NO_TSS ++ struct tss_struct *tss = &per_cpu(init_tss, cpu); ++#endif ++#if CONFIG_XEN_COMPAT > 0x030002 ++ struct physdev_set_iopl iopl_op; ++ struct physdev_set_iobitmap iobmp_op; ++#else ++ struct physdev_op _pdo[2], *pdo = _pdo; ++#define iopl_op pdo->u.set_iopl ++#define iobmp_op pdo->u.set_iobitmap ++#endif ++ multicall_entry_t _mcl[8], *mcl = _mcl; ++ ++ /* we're going to use this soon, after a few expensive things */ ++ if (next_p->fpu_counter>5) ++ prefetch(next->xstate); ++ ++ /* ++ * This is basically '__unlazy_fpu', except that we queue a ++ * multicall to indicate FPU task switch, rather than ++ * synchronously trapping to Xen. ++ * The AMD workaround requires it to be after DS reload, or ++ * after DS has been cleared, which we do in __prepare_arch_switch. ++ */ ++ if (task_thread_info(prev_p)->status & TS_USEDFPU) { ++ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ ++ mcl->op = __HYPERVISOR_fpu_taskswitch; ++ mcl->args[0] = 1; ++ mcl++; ++ } else ++ prev_p->fpu_counter = 0; ++ ++ /* ++ * Reload sp0. ++ * This is load_sp0(tss, next) with a multicall. ++ */ ++ mcl->op = __HYPERVISOR_stack_switch; ++ mcl->args[0] = __KERNEL_DS; ++ mcl->args[1] = next->sp0; ++ mcl++; ++ ++ /* ++ * Load the per-thread Thread-Local Storage descriptor. ++ * This is load_TLS(next, cpu) with multicalls. 
++ */ ++#define C(i) do { \ ++ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ ++ next->tls_array[i].b != prev->tls_array[i].b)) { \ ++ mcl->op = __HYPERVISOR_update_descriptor; \ ++ mcl->args[0] = virt_to_machine( \ ++ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ ++ mcl->args[1] = *(u64 *)&next->tls_array[i]; \ ++ mcl++; \ ++ } \ ++} while (0) ++ C(0); C(1); C(2); ++#undef C ++ ++ if (unlikely(prev->iopl != next->iopl)) { ++ iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; ++#if CONFIG_XEN_COMPAT > 0x030002 ++ mcl->op = __HYPERVISOR_physdev_op; ++ mcl->args[0] = PHYSDEVOP_set_iopl; ++ mcl->args[1] = (unsigned long)&iopl_op; ++#else ++ mcl->op = __HYPERVISOR_physdev_op_compat; ++ pdo->cmd = PHYSDEVOP_set_iopl; ++ mcl->args[0] = (unsigned long)pdo++; ++#endif ++ mcl++; ++ } ++ ++ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { ++ set_xen_guest_handle(iobmp_op.bitmap, ++ (char *)next->io_bitmap_ptr); ++ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; ++#if CONFIG_XEN_COMPAT > 0x030002 ++ mcl->op = __HYPERVISOR_physdev_op; ++ mcl->args[0] = PHYSDEVOP_set_iobitmap; ++ mcl->args[1] = (unsigned long)&iobmp_op; ++#else ++ mcl->op = __HYPERVISOR_physdev_op_compat; ++ pdo->cmd = PHYSDEVOP_set_iobitmap; ++ mcl->args[0] = (unsigned long)pdo++; ++#endif ++ mcl++; ++ } ++ ++#if CONFIG_XEN_COMPAT <= 0x030002 ++ BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); ++#endif ++ BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); ++ if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) ++ BUG(); ++ ++ /* ++ * Switch DS and ES. ++ * This won't pick up thread selector changes, but I guess that is ok. ++ */ ++ if (unlikely(next->es)) ++ loadsegment(es, next->es); ++ ++ if (unlikely(next->ds)) ++ loadsegment(ds, next->ds); ++ ++ /* ++ * Switch FS and GS. 
++ */ ++ if (unlikely(next->fsindex)) ++ loadsegment(fs, next->fsindex); ++ ++ if (next->fs) ++ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs)); ++ ++ if (unlikely(next->gsindex)) ++ load_gs_index(next->gsindex); ++ ++ if (next->gs) ++ WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs)); ++ ++ /* ++ * Switch the PDA context. ++ */ ++ prev->usersp = read_pda(oldrsp); ++ write_pda(oldrsp, next->usersp); ++ write_pda(pcurrent, next_p); ++ write_pda(kernelstack, ++ (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); ++#ifdef CONFIG_CC_STACKPROTECTOR ++ write_pda(stack_canary, next_p->stack_canary); ++ ++ /* ++ * Build time only check to make sure the stack_canary is at ++ * offset 40 in the pda; this is a gcc ABI requirement ++ */ ++ BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); ++#endif ++ ++ /* ++ * Now maybe reload the debug registers ++ */ ++ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || ++ task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) ++ __switch_to_xtra(prev_p, next_p); ++ ++ /* If the task has used fpu the last 5 timeslices, just do a full ++ * restore of the math state immediately to avoid the trap; the ++ * chances of needing FPU soon are obviously high now ++ * ++ * tsk_used_math() checks prevent calling math_state_restore(), ++ * which can sleep in the case of !tsk_used_math() ++ */ ++ if (tsk_used_math(next_p) && next_p->fpu_counter > 5) ++ math_state_restore(); ++ return prev_p; ++} ++ ++/* ++ * sys_execve() executes a new program. 
++ */ ++asmlinkage ++long sys_execve(char __user *name, char __user * __user *argv, ++ char __user * __user *envp, struct pt_regs *regs) ++{ ++ long error; ++ char * filename; ++ ++ filename = getname(name); ++ error = PTR_ERR(filename); ++ if (IS_ERR(filename)) ++ return error; ++ error = do_execve(filename, argv, envp, regs); ++ putname(filename); ++ return error; ++} ++ ++void set_personality_64bit(void) ++{ ++ /* inherit personality from parent */ ++ ++ /* Make sure to be in 64bit mode */ ++ clear_thread_flag(TIF_IA32); ++ ++ /* TBD: overwrites user setup. Should have two bits. ++ But 64bit processes have always behaved this way, ++ so it's not too bad. The main problem is just that ++ 32bit childs are affected again. */ ++ current->personality &= ~READ_IMPLIES_EXEC; ++} ++ ++asmlinkage long sys_fork(struct pt_regs *regs) ++{ ++ return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); ++} ++ ++asmlinkage long ++sys_clone(unsigned long clone_flags, unsigned long newsp, ++ void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) ++{ ++ if (!newsp) ++ newsp = regs->sp; ++ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); ++} ++ ++/* ++ * This is trivial, and on the face of it looks like it ++ * could equally well be done in user mode. ++ * ++ * Not so, for quite unobvious reasons - register pressure. ++ * In user mode vfork() cannot have a stack frame, and if ++ * done by calling the "clone()" system call directly, you ++ * do not have enough call-clobbered registers to hold all ++ * the information you need. 
++ */ ++asmlinkage long sys_vfork(struct pt_regs *regs) ++{ ++ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, ++ NULL, NULL); ++} ++ ++unsigned long get_wchan(struct task_struct *p) ++{ ++ unsigned long stack; ++ u64 fp,ip; ++ int count = 0; ++ ++ if (!p || p == current || p->state==TASK_RUNNING) ++ return 0; ++ stack = (unsigned long)task_stack_page(p); ++ if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) ++ return 0; ++ fp = *(u64 *)(p->thread.sp); ++ do { ++ if (fp < (unsigned long)stack || ++ fp > (unsigned long)stack+THREAD_SIZE) ++ return 0; ++ ip = *(u64 *)(fp+8); ++ if (!in_sched_functions(ip)) ++ return ip; ++ fp = *(u64 *)fp; ++ } while (count++ < 16); ++ return 0; ++} ++ ++long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) ++{ ++ int ret = 0; ++ int doit = task == current; ++ int cpu; ++ ++ switch (code) { ++ case ARCH_SET_GS: ++ if (addr >= TASK_SIZE_OF(task)) ++ return -EPERM; ++ cpu = get_cpu(); ++ /* handle small bases via the GDT because that's faster to ++ switch. */ ++ if (addr <= 0xffffffff) { ++ set_32bit_tls(task, GS_TLS, addr); ++ if (doit) { ++ load_TLS(&task->thread, cpu); ++ load_gs_index(GS_TLS_SEL); ++ } ++ task->thread.gsindex = GS_TLS_SEL; ++ task->thread.gs = 0; ++ } else { ++ task->thread.gsindex = 0; ++ task->thread.gs = addr; ++ if (doit) { ++ load_gs_index(0); ++ ret = HYPERVISOR_set_segment_base( ++ SEGBASE_GS_USER, addr); ++ } ++ } ++ put_cpu(); ++ break; ++ case ARCH_SET_FS: ++ /* Not strictly needed for fs, but do it for symmetry ++ with gs */ ++ if (addr >= TASK_SIZE_OF(task)) ++ return -EPERM; ++ cpu = get_cpu(); ++ /* handle small bases via the GDT because that's faster to ++ switch. 
*/ ++ if (addr <= 0xffffffff) { ++ set_32bit_tls(task, FS_TLS, addr); ++ if (doit) { ++ load_TLS(&task->thread, cpu); ++ asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); ++ } ++ task->thread.fsindex = FS_TLS_SEL; ++ task->thread.fs = 0; ++ } else { ++ task->thread.fsindex = 0; ++ task->thread.fs = addr; ++ if (doit) { ++ /* set the selector to 0 to not confuse ++ __switch_to */ ++ asm volatile("movl %0,%%fs" :: "r" (0)); ++ ret = HYPERVISOR_set_segment_base(SEGBASE_FS, ++ addr); ++ } ++ } ++ put_cpu(); ++ break; ++ case ARCH_GET_FS: { ++ unsigned long base; ++ if (task->thread.fsindex == FS_TLS_SEL) ++ base = read_32bit_tls(task, FS_TLS); ++ else if (doit) ++ rdmsrl(MSR_FS_BASE, base); ++ else ++ base = task->thread.fs; ++ ret = put_user(base, (unsigned long __user *)addr); ++ break; ++ } ++ case ARCH_GET_GS: { ++ unsigned long base; ++ unsigned gsindex; ++ if (task->thread.gsindex == GS_TLS_SEL) ++ base = read_32bit_tls(task, GS_TLS); ++ else if (doit) { ++ asm("movl %%gs,%0" : "=r" (gsindex)); ++ if (gsindex) ++ rdmsrl(MSR_KERNEL_GS_BASE, base); ++ else ++ base = task->thread.gs; ++ } ++ else ++ base = task->thread.gs; ++ ret = put_user(base, (unsigned long __user *)addr); ++ break; ++ } ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++long sys_arch_prctl(int code, unsigned long addr) ++{ ++ return do_arch_prctl(current, code, addr); ++} ++ ++unsigned long arch_align_stack(unsigned long sp) ++{ ++ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) ++ sp -= get_random_int() % 8192; ++ return sp & ~0xf; ++} ++ ++unsigned long arch_randomize_brk(struct mm_struct *mm) ++{ ++ unsigned long range_end = mm->brk + 0x02000000; ++ return randomize_range(mm->brk, range_end, 0) ? 
: mm->brk; ++} +diff --git a/arch/x86/kernel/quirks-xen.c b/arch/x86/kernel/quirks-xen.c +new file mode 100644 +index 0000000..fddd924 +--- /dev/null ++++ b/arch/x86/kernel/quirks-xen.c +@@ -0,0 +1,403 @@ ++/* ++ * This file contains work-arounds for x86 and x86_64 platform bugs. ++ */ ++#include ++#include ++ ++#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) ++ ++static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) ++{ ++ u8 config, rev; ++ u16 word; ++ ++ /* BIOS may enable hardware IRQ balancing for ++ * E7520/E7320/E7525(revision ID 0x9 and below) ++ * based platforms. ++ * Disable SW irqbalance/affinity on those platforms. ++ */ ++ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); ++ if (rev > 0x9) ++ return; ++ ++ /* enable access to config space*/ ++ pci_read_config_byte(dev, 0xf4, &config); ++ pci_write_config_byte(dev, 0xf4, config|0x2); ++ ++ /* ++ * read xTPR register. We may not have a pci_dev for device 8 ++ * because it might be hidden until the above write. 
++ */ ++ pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); ++ ++ if (!(word & (1 << 13))) { ++ struct xen_platform_op op; ++ ++ dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " ++ "disabling irq balancing and affinity\n"); ++ op.cmd = XENPF_platform_quirk; ++ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; ++ WARN_ON(HYPERVISOR_platform_op(&op)); ++ } ++ ++ /* put back the original value for config space*/ ++ if (!(config & 0x2)) ++ pci_write_config_byte(dev, 0xf4, config); ++} ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, ++ quirk_intel_irqbalance); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, ++ quirk_intel_irqbalance); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, ++ quirk_intel_irqbalance); ++#endif ++ ++#if defined(CONFIG_HPET_TIMER) ++#include ++ ++unsigned long force_hpet_address; ++ ++static enum { ++ NONE_FORCE_HPET_RESUME, ++ OLD_ICH_FORCE_HPET_RESUME, ++ ICH_FORCE_HPET_RESUME, ++ VT8237_FORCE_HPET_RESUME, ++ NVIDIA_FORCE_HPET_RESUME, ++} force_hpet_resume_type; ++ ++static void __iomem *rcba_base; ++ ++static void ich_force_hpet_resume(void) ++{ ++ u32 val; ++ ++ if (!force_hpet_address) ++ return; ++ ++ if (rcba_base == NULL) ++ BUG(); ++ ++ /* read the Function Disable register, dword mode only */ ++ val = readl(rcba_base + 0x3404); ++ if (!(val & 0x80)) { ++ /* HPET disabled in HPTC. 
Trying to enable */ ++ writel(val | 0x80, rcba_base + 0x3404); ++ } ++ ++ val = readl(rcba_base + 0x3404); ++ if (!(val & 0x80)) ++ BUG(); ++ else ++ printk(KERN_DEBUG "Force enabled HPET at resume\n"); ++ ++ return; ++} ++ ++static void ich_force_enable_hpet(struct pci_dev *dev) ++{ ++ u32 val; ++ u32 uninitialized_var(rcba); ++ int err = 0; ++ ++ if (hpet_address || force_hpet_address) ++ return; ++ ++ pci_read_config_dword(dev, 0xF0, &rcba); ++ rcba &= 0xFFFFC000; ++ if (rcba == 0) { ++ dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " ++ "cannot force enable HPET\n"); ++ return; ++ } ++ ++ /* use bits 31:14, 16 kB aligned */ ++ rcba_base = ioremap_nocache(rcba, 0x4000); ++ if (rcba_base == NULL) { ++ dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " ++ "cannot force enable HPET\n"); ++ return; ++ } ++ ++ /* read the Function Disable register, dword mode only */ ++ val = readl(rcba_base + 0x3404); ++ ++ if (val & 0x80) { ++ /* HPET is enabled in HPTC. Just not reported by BIOS */ ++ val = val & 0x3; ++ force_hpet_address = 0xFED00000 | (val << 12); ++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " ++ "0x%lx\n", force_hpet_address); ++ iounmap(rcba_base); ++ return; ++ } ++ ++ /* HPET disabled in HPTC. 
Trying to enable */ ++ writel(val | 0x80, rcba_base + 0x3404); ++ ++ val = readl(rcba_base + 0x3404); ++ if (!(val & 0x80)) { ++ err = 1; ++ } else { ++ val = val & 0x3; ++ force_hpet_address = 0xFED00000 | (val << 12); ++ } ++ ++ if (err) { ++ force_hpet_address = 0; ++ iounmap(rcba_base); ++ dev_printk(KERN_DEBUG, &dev->dev, ++ "Failed to force enable HPET\n"); ++ } else { ++ force_hpet_resume_type = ICH_FORCE_HPET_RESUME; ++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " ++ "0x%lx\n", force_hpet_address); ++ } ++} ++ ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, ++ ich_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, ++ ich_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, ++ ich_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, ++ ich_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ++ ich_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ++ ich_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ++ ich_force_enable_hpet); ++ ++ ++static struct pci_dev *cached_dev; ++ ++static void old_ich_force_hpet_resume(void) ++{ ++ u32 val; ++ u32 uninitialized_var(gen_cntl); ++ ++ if (!force_hpet_address || !cached_dev) ++ return; ++ ++ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); ++ gen_cntl &= (~(0x7 << 15)); ++ gen_cntl |= (0x4 << 15); ++ ++ pci_write_config_dword(cached_dev, 0xD0, gen_cntl); ++ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); ++ val = gen_cntl >> 15; ++ val &= 0x7; ++ if (val == 0x4) ++ printk(KERN_DEBUG "Force enabled HPET at resume\n"); ++ else ++ BUG(); ++} ++ ++static void old_ich_force_enable_hpet(struct pci_dev *dev) ++{ ++ u32 val; ++ u32 uninitialized_var(gen_cntl); ++ ++ if (hpet_address || force_hpet_address) ++ 
return; ++ ++ pci_read_config_dword(dev, 0xD0, &gen_cntl); ++ /* ++ * Bit 17 is HPET enable bit. ++ * Bit 16:15 control the HPET base address. ++ */ ++ val = gen_cntl >> 15; ++ val &= 0x7; ++ if (val & 0x4) { ++ val &= 0x3; ++ force_hpet_address = 0xFED00000 | (val << 12); ++ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", ++ force_hpet_address); ++ return; ++ } ++ ++ /* ++ * HPET is disabled. Trying enabling at FED00000 and check ++ * whether it sticks ++ */ ++ gen_cntl &= (~(0x7 << 15)); ++ gen_cntl |= (0x4 << 15); ++ pci_write_config_dword(dev, 0xD0, gen_cntl); ++ ++ pci_read_config_dword(dev, 0xD0, &gen_cntl); ++ ++ val = gen_cntl >> 15; ++ val &= 0x7; ++ if (val & 0x4) { ++ /* HPET is enabled in HPTC. Just not reported by BIOS */ ++ val &= 0x3; ++ force_hpet_address = 0xFED00000 | (val << 12); ++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " ++ "0x%lx\n", force_hpet_address); ++ cached_dev = dev; ++ force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; ++ return; ++ } ++ ++ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); ++} ++ ++/* ++ * Undocumented chipset features. Make sure that the user enforced ++ * this. 
++ */ ++static void old_ich_force_enable_hpet_user(struct pci_dev *dev) ++{ ++ if (hpet_force_user) ++ old_ich_force_enable_hpet(dev); ++} ++ ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, ++ old_ich_force_enable_hpet_user); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, ++ old_ich_force_enable_hpet_user); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, ++ old_ich_force_enable_hpet_user); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, ++ old_ich_force_enable_hpet_user); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, ++ old_ich_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, ++ old_ich_force_enable_hpet); ++ ++ ++static void vt8237_force_hpet_resume(void) ++{ ++ u32 val; ++ ++ if (!force_hpet_address || !cached_dev) ++ return; ++ ++ val = 0xfed00000 | 0x80; ++ pci_write_config_dword(cached_dev, 0x68, val); ++ ++ pci_read_config_dword(cached_dev, 0x68, &val); ++ if (val & 0x80) ++ printk(KERN_DEBUG "Force enabled HPET at resume\n"); ++ else ++ BUG(); ++} ++ ++static void vt8237_force_enable_hpet(struct pci_dev *dev) ++{ ++ u32 uninitialized_var(val); ++ ++ if (!hpet_force_user || hpet_address || force_hpet_address) ++ return; ++ ++ pci_read_config_dword(dev, 0x68, &val); ++ /* ++ * Bit 7 is HPET enable bit. ++ * Bit 31:10 is HPET base address (contrary to what datasheet claims) ++ */ ++ if (val & 0x80) { ++ force_hpet_address = (val & ~0x3ff); ++ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", ++ force_hpet_address); ++ return; ++ } ++ ++ /* ++ * HPET is disabled. 
Trying enabling at FED00000 and check ++ * whether it sticks ++ */ ++ val = 0xfed00000 | 0x80; ++ pci_write_config_dword(dev, 0x68, val); ++ ++ pci_read_config_dword(dev, 0x68, &val); ++ if (val & 0x80) { ++ force_hpet_address = (val & ~0x3ff); ++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " ++ "0x%lx\n", force_hpet_address); ++ cached_dev = dev; ++ force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; ++ return; ++ } ++ ++ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); ++} ++ ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, ++ vt8237_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, ++ vt8237_force_enable_hpet); ++ ++/* ++ * Undocumented chipset feature taken from LinuxBIOS. ++ */ ++static void nvidia_force_hpet_resume(void) ++{ ++ pci_write_config_dword(cached_dev, 0x44, 0xfed00001); ++ printk(KERN_DEBUG "Force enabled HPET at resume\n"); ++} ++ ++static void nvidia_force_enable_hpet(struct pci_dev *dev) ++{ ++ u32 uninitialized_var(val); ++ ++ if (!hpet_force_user || hpet_address || force_hpet_address) ++ return; ++ ++ pci_write_config_dword(dev, 0x44, 0xfed00001); ++ pci_read_config_dword(dev, 0x44, &val); ++ force_hpet_address = val & 0xfffffffe; ++ force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; ++ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", ++ force_hpet_address); ++ cached_dev = dev; ++ return; ++} ++ ++/* ISA Bridges */ ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051, ++ nvidia_force_enable_hpet); ++ ++/* LPC bridges */ ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362, ++ 
nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366, ++ nvidia_force_enable_hpet); ++DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, ++ nvidia_force_enable_hpet); ++ ++void force_hpet_resume(void) ++{ ++ switch (force_hpet_resume_type) { ++ case ICH_FORCE_HPET_RESUME: ++ ich_force_hpet_resume(); ++ return; ++ case OLD_ICH_FORCE_HPET_RESUME: ++ old_ich_force_hpet_resume(); ++ return; ++ case VT8237_FORCE_HPET_RESUME: ++ vt8237_force_hpet_resume(); ++ return; ++ case NVIDIA_FORCE_HPET_RESUME: ++ nvidia_force_hpet_resume(); ++ return; ++ default: ++ break; ++ } ++} ++ ++#endif +diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S +index c30fe25..64beabd 100644 +--- a/arch/x86/kernel/relocate_kernel_32.S ++++ b/arch/x86/kernel/relocate_kernel_32.S +@@ -155,14 +155,45 @@ relocate_new_kernel: + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + ++ /* setup idt */ ++ movl %edi, %eax ++ addl $(idt_48 - relocate_kernel), %eax ++ lidtl (%eax) ++ ++ /* setup gdt */ ++ movl %edi, %eax ++ addl $(gdt - relocate_kernel), %eax ++ movl %edi, %esi ++ addl $((gdt_48 - relocate_kernel) + 2), %esi ++ movl %eax, (%esi) ++ ++ movl %edi, %eax ++ addl $(gdt_48 - relocate_kernel), %eax ++ lgdtl (%eax) ++ ++ /* setup data segment registers */ ++ mov $(gdt_ds - gdt), %eax ++ mov %eax, %ds ++ mov %eax, %es ++ mov %eax, %fs ++ mov %eax, %gs ++ mov %eax, %ss ++ + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%edi), %esp + +- /* jump to identity mapped page */ +- movl %edi, %eax +- addl $(identity_mapped - relocate_kernel), %eax +- pushl %eax +- ret ++ /* load new code segment and jump to identity mapped page */ ++ movl %edi, %esi ++ 
xorl %eax, %eax ++ pushl %eax ++ pushl %esi ++ pushl %eax ++ movl $(gdt_cs - gdt), %eax ++ pushl %eax ++ movl %edi, %eax ++ addl $(identity_mapped - relocate_kernel),%eax ++ pushl %eax ++ iretl + + identity_mapped: + /* store the start address on the stack */ +@@ -250,3 +281,20 @@ identity_mapped: + xorl %edi, %edi + xorl %ebp, %ebp + ret ++ ++ .align 16 ++gdt: ++ .quad 0x0000000000000000 /* NULL descriptor */ ++gdt_cs: ++ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ ++gdt_ds: ++ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ ++gdt_end: ++ ++gdt_48: ++ .word gdt_end - gdt - 1 /* limit */ ++ .long 0 /* base - filled in by code above */ ++ ++idt_48: ++ .word 0 /* limit */ ++ .long 0 /* base */ +diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S +index f5afe66..4261d0e 100644 +--- a/arch/x86/kernel/relocate_kernel_64.S ++++ b/arch/x86/kernel/relocate_kernel_64.S +@@ -160,13 +160,39 @@ relocate_new_kernel: + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + ++ /* setup idt */ ++ movq %r8, %rax ++ addq $(idt_80 - relocate_kernel), %rax ++ lidtq (%rax) ++ ++ /* setup gdt */ ++ movq %r8, %rax ++ addq $(gdt - relocate_kernel), %rax ++ movq %r8, %r9 ++ addq $((gdt_80 - relocate_kernel) + 2), %r9 ++ movq %rax, (%r9) ++ ++ movq %r8, %rax ++ addq $(gdt_80 - relocate_kernel), %rax ++ lgdtq (%rax) ++ ++ /* setup data segment registers */ ++ xorl %eax, %eax ++ movl %eax, %ds ++ movl %eax, %es ++ movl %eax, %fs ++ movl %eax, %gs ++ movl %eax, %ss ++ + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%r8), %rsp + +- /* jump to identity mapped page */ +- addq $(identity_mapped - relocate_kernel), %r8 +- pushq %r8 +- ret ++ /* load new code segment and jump to identity mapped page */ ++ movq %r8, %rax ++ addq $(identity_mapped - relocate_kernel), %rax ++ pushq $(gdt_cs - gdt) ++ pushq %rax ++ lretq + + identity_mapped: + /* store the start address on the stack */ +@@ -262,5 +288,19 @@ 
identity_mapped: + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 +- + ret ++ ++ .align 16 ++gdt: ++ .quad 0x0000000000000000 /* NULL descriptor */ ++gdt_cs: ++ .quad 0x00af9a000000ffff ++gdt_end: ++ ++gdt_80: ++ .word gdt_end - gdt - 1 /* limit */ ++ .quad 0 /* base - filled in by code above */ ++ ++idt_80: ++ .word 0 /* limit */ ++ .quad 0 /* base */ +diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c +index 05191bb..6dec56f 100644 +--- a/arch/x86/kernel/rtc.c ++++ b/arch/x86/kernel/rtc.c +@@ -181,6 +181,10 @@ unsigned long read_persistent_clock(void) + { + unsigned long retval, flags; + ++#ifdef CONFIG_XEN ++ if (!is_initial_xendomain()) ++ return xen_read_persistent_clock(); ++#endif + spin_lock_irqsave(&rtc_lock, flags); + retval = get_wallclock(); + spin_unlock_irqrestore(&rtc_lock, flags); +@@ -190,6 +194,10 @@ unsigned long read_persistent_clock(void) + + int update_persistent_clock(struct timespec now) + { ++#ifdef CONFIG_XEN ++ if (xen_update_persistent_clock() < 0 || xen_independent_wallclock()) ++ return 0; ++#endif + return set_rtc_mmss(now.tv_sec); + } + +diff --git a/arch/x86/kernel/setup-xen.c b/arch/x86/kernel/setup-xen.c +new file mode 100644 +index 0000000..22cdc5d +--- /dev/null ++++ b/arch/x86/kernel/setup-xen.c +@@ -0,0 +1,141 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_X86_LOCAL_APIC ++unsigned int num_processors; ++unsigned disabled_cpus __cpuinitdata; ++/* Processor that is doing the boot up */ ++unsigned int boot_cpu_physical_apicid = -1U; ++EXPORT_SYMBOL(boot_cpu_physical_apicid); ++ ++DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; ++EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); ++ ++/* Bitmask of physically existing CPUs */ ++physid_mask_t phys_cpu_present_map; ++#endif ++ ++#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) ++/* ++ * Copy data used in early init routines from the 
initial arrays to the ++ * per cpu data areas. These arrays then become expendable and the ++ * *_early_ptr's are zeroed indicating that the static arrays are gone. ++ */ ++static void __init setup_per_cpu_maps(void) ++{ ++#ifndef CONFIG_XEN ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu]; ++ per_cpu(x86_bios_cpu_apicid, cpu) = ++ x86_bios_cpu_apicid_init[cpu]; ++#ifdef CONFIG_NUMA ++ per_cpu(x86_cpu_to_node_map, cpu) = ++ x86_cpu_to_node_map_init[cpu]; ++#endif ++ } ++ ++ /* indicate the early static arrays will soon be gone */ ++ x86_cpu_to_apicid_early_ptr = NULL; ++ x86_bios_cpu_apicid_early_ptr = NULL; ++#ifdef CONFIG_NUMA ++ x86_cpu_to_node_map_early_ptr = NULL; ++#endif ++#endif ++} ++ ++#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP ++cpumask_t *cpumask_of_cpu_map __read_mostly; ++EXPORT_SYMBOL(cpumask_of_cpu_map); ++ ++/* requires nr_cpu_ids to be initialized */ ++static void __init setup_cpumask_of_cpu(void) ++{ ++ int i; ++ ++ /* alloc_bootmem zeroes memory */ ++ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); ++ for (i = 0; i < nr_cpu_ids; i++) ++ cpu_set(i, cpumask_of_cpu_map[i]); ++} ++#else ++static inline void setup_cpumask_of_cpu(void) { } ++#endif ++ ++#ifdef CONFIG_X86_32 ++/* ++ * Great future not-so-futuristic plan: make i386 and x86_64 do it ++ * the same way ++ */ ++unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; ++EXPORT_SYMBOL(__per_cpu_offset); ++#endif ++ ++/* ++ * Great future plan: ++ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 
++ * Always point %gs to its beginning ++ */ ++void __init setup_per_cpu_areas(void) ++{ ++ int i, highest_cpu = 0; ++ unsigned long size; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ prefill_possible_map(); ++#endif ++ ++ /* Copy section for each CPU (we discard the original) */ ++ size = PERCPU_ENOUGH_ROOM; ++ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", ++ size); ++ ++ for_each_possible_cpu(i) { ++ char *ptr; ++#ifndef CONFIG_NEED_MULTIPLE_NODES ++ ptr = alloc_bootmem_pages(size); ++#else ++ int node = early_cpu_to_node(i); ++ if (!node_online(node) || !NODE_DATA(node)) { ++ ptr = alloc_bootmem_pages(size); ++ printk(KERN_INFO ++ "cpu %d has no node or node-local memory\n", i); ++ } ++ else ++ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); ++#endif ++ if (!ptr) ++ panic("Cannot allocate cpu data for CPU %d\n", i); ++#ifdef CONFIG_X86_64 ++ cpu_pda(i)->data_offset = ptr - __per_cpu_start; ++#else ++ __per_cpu_offset[i] = ptr - __per_cpu_start; ++#endif ++ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); ++ ++ highest_cpu = i; ++ } ++ ++ nr_cpu_ids = highest_cpu + 1; ++ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); ++ ++ /* Setup percpu data maps */ ++ setup_per_cpu_maps(); ++ ++ /* Setup cpumask_of_cpu map */ ++ setup_cpumask_of_cpu(); ++} ++ ++#endif +diff --git a/arch/x86/kernel/setup64-xen.c b/arch/x86/kernel/setup64-xen.c +new file mode 100644 +index 0000000..589ccbd +--- /dev/null ++++ b/arch/x86/kernel/setup64-xen.c +@@ -0,0 +1,368 @@ ++/* ++ * X86-64 specific CPU setup. ++ * Copyright (C) 1995 Linus Torvalds ++ * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. ++ * See setup.c for older changelog. 
++ * ++ * Jun Nakajima ++ * Modified for Xen ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_XEN ++#include ++#endif ++ ++#ifndef CONFIG_DEBUG_BOOT_PARAMS ++struct boot_params __initdata boot_params; ++#else ++struct boot_params boot_params; ++#endif ++ ++cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; ++ ++struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; ++EXPORT_SYMBOL(_cpu_pda); ++struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; ++ ++#ifndef CONFIG_X86_NO_IDT ++struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; ++#endif ++ ++char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); ++ ++unsigned long __supported_pte_mask __read_mostly = ~0UL; ++EXPORT_SYMBOL(__supported_pte_mask); ++ ++static int do_not_nx __cpuinitdata = 0; ++ ++/* noexec=on|off ++Control non executable mappings for 64bit processes. ++ ++on Enable(default) ++off Disable ++*/ ++static int __init nonx_setup(char *str) ++{ ++ if (!str) ++ return -EINVAL; ++ if (!strncmp(str, "on", 2)) { ++ __supported_pte_mask |= _PAGE_NX; ++ do_not_nx = 0; ++ } else if (!strncmp(str, "off", 3)) { ++ do_not_nx = 1; ++ __supported_pte_mask &= ~_PAGE_NX; ++ } ++ return 0; ++} ++early_param("noexec", nonx_setup); ++ ++int force_personality32 = 0; ++ ++/* noexec32=on|off ++Control non executable heap for 32bit processes. 
++To control the stack too use noexec=off ++ ++on PROT_READ does not imply PROT_EXEC for 32bit processes (default) ++off PROT_READ implies PROT_EXEC ++*/ ++static int __init nonx32_setup(char *str) ++{ ++ if (!strcmp(str, "on")) ++ force_personality32 &= ~READ_IMPLIES_EXEC; ++ else if (!strcmp(str, "off")) ++ force_personality32 |= READ_IMPLIES_EXEC; ++ return 1; ++} ++__setup("noexec32=", nonx32_setup); ++ ++#ifdef CONFIG_XEN ++static void __init_refok switch_pt(int cpu) ++{ ++ if (cpu == 0) ++ xen_init_pt(); ++ xen_pt_switch(__pa_symbol(init_level4_pgt)); ++ xen_new_user_pt(__pa_symbol(__user_pgd(init_level4_pgt))); ++} ++#define switch_pt() switch_pt(cpu) ++ ++static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr) ++{ ++ unsigned long frames[16]; ++ unsigned long va; ++ int f; ++ ++ for (va = gdt_descr->address, f = 0; ++ va < gdt_descr->address + gdt_descr->size; ++ va += PAGE_SIZE, f++) { ++ frames[f] = virt_to_mfn(va); ++ make_page_readonly( ++ (void *)va, XENFEAT_writable_descriptor_tables); ++ } ++ if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / ++ sizeof (struct desc_struct))) ++ BUG(); ++} ++#else ++static void switch_pt(void) ++{ ++ asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); ++} ++ ++static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr) ++{ ++ load_gdt(gdt_descr); ++ load_idt(idt_descr); ++} ++#endif ++ ++void pda_init(int cpu) ++{ ++ struct x8664_pda *pda = cpu_pda(cpu); ++ ++ /* Setup up data that may be needed in __get_free_pages early */ ++ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); ++#ifndef CONFIG_XEN ++ /* Memory clobbers used to order PDA accessed */ ++ mb(); ++ wrmsrl(MSR_GS_BASE, pda); ++ mb(); ++#else ++ if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, ++ (unsigned long)pda)) ++ BUG(); ++#endif ++ pda->cpunumber = cpu; ++ pda->irqcount = -1; ++ pda->kernelstack = ++ (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; ++ pda->active_mm = &init_mm; ++ 
pda->mmu_state = 0; ++ ++ if (cpu == 0) { ++ /* others are initialized in smpboot.c */ ++ pda->pcurrent = &init_task; ++ pda->irqstackptr = boot_cpu_stack; ++ } else { ++ pda->irqstackptr = (char *) ++ __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); ++ if (!pda->irqstackptr) ++ panic("cannot allocate irqstack for cpu %d", cpu); ++ } ++ ++ switch_pt(); ++ ++ pda->irqstackptr += IRQSTACKSIZE-64; ++} ++ ++#ifndef CONFIG_X86_NO_TSS ++char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ] ++__attribute__((section(".bss.page_aligned"))); ++#endif ++ ++extern asmlinkage void ignore_sysret(void); ++ ++/* May not be marked __init: used by software suspend */ ++void syscall_init(void) ++{ ++#ifndef CONFIG_XEN ++ /* ++ * LSTAR and STAR live in a bit strange symbiosis. ++ * They both write to the same internal register. STAR allows to set CS/DS ++ * but only a 32bit target. LSTAR sets the 64bit rip. ++ */ ++ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); ++ wrmsrl(MSR_LSTAR, system_call); ++ wrmsrl(MSR_CSTAR, ignore_sysret); ++ ++ /* Flags to clear on syscall */ ++ wrmsrl(MSR_SYSCALL_MASK, ++ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); ++#endif ++#ifdef CONFIG_IA32_EMULATION ++ syscall32_cpu_init (); ++#else ++ { ++ static const struct callback_register cstar = { ++ .type = CALLBACKTYPE_syscall32, ++ .address = (unsigned long)ignore_sysret ++ }; ++ if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar)) ++ printk(KERN_WARN "Unable to register CSTAR callback\n"); ++ } ++#endif ++} ++ ++void __cpuinit check_efer(void) ++{ ++ unsigned long efer; ++ ++ rdmsrl(MSR_EFER, efer); ++ if (!(efer & EFER_NX) || do_not_nx) { ++ __supported_pte_mask &= ~_PAGE_NX; ++ } ++} ++ ++unsigned long kernel_eflags; ++ ++#ifndef CONFIG_X86_NO_TSS ++/* ++ * Copies of the original ist values from the tss are only accessed during ++ * debugging, no special alignment required. 
++ */ ++DEFINE_PER_CPU(struct orig_ist, orig_ist); ++#endif ++ ++/* ++ * cpu_init() initializes state that is per-CPU. Some data is already ++ * initialized (naturally) in the bootstrap process, such as the GDT ++ * and IDT. We reload them nevertheless, this function acts as a ++ * 'CPU state barrier', nothing should get across. ++ * A lot of state is already set up in PDA init. ++ */ ++void __cpuinit cpu_init (void) ++{ ++ int cpu = stack_smp_processor_id(); ++#ifndef CONFIG_X86_NO_TSS ++ struct tss_struct *t = &per_cpu(init_tss, cpu); ++ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); ++ unsigned long v; ++ char *estacks = NULL; ++ unsigned i; ++#endif ++ struct task_struct *me; ++ ++ /* CPU 0 is initialised in head64.c */ ++ if (cpu != 0) { ++ pda_init(cpu); ++ } ++#ifndef CONFIG_X86_NO_TSS ++ else ++ estacks = boot_exception_stacks; ++#endif ++ ++ me = current; ++ ++ if (cpu_test_and_set(cpu, cpu_initialized)) ++ panic("CPU#%d already initialized!\n", cpu); ++ ++ printk("Initializing CPU#%d\n", cpu); ++ ++ clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); ++ ++ /* ++ * Initialize the per-CPU GDT with the boot GDT, ++ * and set up the GDT descriptor: ++ */ ++#ifndef CONFIG_XEN ++ if (cpu) ++ memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); ++#endif ++ ++ cpu_gdt_descr[cpu].size = GDT_SIZE; ++ cpu_gdt_init(&cpu_gdt_descr[cpu]); ++ ++ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); ++ syscall_init(); ++ ++ wrmsrl(MSR_FS_BASE, 0); ++ wrmsrl(MSR_KERNEL_GS_BASE, 0); ++ barrier(); ++ ++ check_efer(); ++ ++#ifndef CONFIG_X86_NO_TSS ++ /* ++ * set up and load the per-CPU TSS ++ */ ++ for (v = 0; v < N_EXCEPTION_STACKS; v++) { ++ static const unsigned int order[N_EXCEPTION_STACKS] = { ++ [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, ++ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER ++ }; ++ if (cpu) { ++ estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); ++ if (!estacks) ++ panic("Cannot allocate exception stack %ld %d\n", ++ v, cpu); ++ } ++ estacks += PAGE_SIZE << order[v]; ++ orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; ++ } ++ ++ t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); ++ /* ++ * <= is required because the CPU will access up to ++ * 8 bits beyond the end of the IO permission bitmap. ++ */ ++ for (i = 0; i <= IO_BITMAP_LONGS; i++) ++ t->io_bitmap[i] = ~0UL; ++#endif ++ ++ atomic_inc(&init_mm.mm_count); ++ me->active_mm = &init_mm; ++ if (me->mm) ++ BUG(); ++ enter_lazy_tlb(&init_mm, me); ++ ++#ifndef CONFIG_X86_NO_TSS ++ set_tss_desc(cpu, t); ++#endif ++#ifndef CONFIG_XEN ++ load_TR_desc(); ++#endif ++ load_LDT(&init_mm.context); ++ ++#ifdef CONFIG_KGDB ++ /* ++ * If the kgdb is connected no debug regs should be altered. This ++ * is only applicable when KGDB and a KGDB I/O module are built ++ * into the kernel and you are using early debugging with ++ * kgdbwait. KGDB will control the kernel HW breakpoint registers. ++ */ ++ if (kgdb_connected && arch_kgdb_ops.correct_hw_break) ++ arch_kgdb_ops.correct_hw_break(); ++ else { ++#endif ++ /* ++ * Clear all 6 debug registers: ++ */ ++ ++ set_debugreg(0UL, 0); ++ set_debugreg(0UL, 1); ++ set_debugreg(0UL, 2); ++ set_debugreg(0UL, 3); ++ set_debugreg(0UL, 6); ++ set_debugreg(0UL, 7); ++#ifdef CONFIG_KGDB ++ /* If the kgdb is connected no debug regs should be altered. 
*/ ++ } ++#endif ++ ++ fpu_init(); ++ ++ raw_local_save_flags(kernel_eflags); ++ ++ if (is_uv_system()) ++ uv_cpu_init(); ++} +diff --git a/arch/x86/kernel/setup_32-xen.c b/arch/x86/kernel/setup_32-xen.c +new file mode 100644 +index 0000000..e7a488a +--- /dev/null ++++ b/arch/x86/kernel/setup_32-xen.c +@@ -0,0 +1,1154 @@ ++/* ++ * Copyright (C) 1995 Linus Torvalds ++ * ++ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 ++ * ++ * Memory region support ++ * David Parsons , July-August 1999 ++ * ++ * Added E820 sanitization routine (removes overlapping memory regions); ++ * Brian Moyle , February 2001 ++ * ++ * Moved CPU detection code to cpu/${cpu}.c ++ * Patrick Mochel , March 2002 ++ * ++ * Provisions for empty E820 memory regions (reported by certain BIOSes). ++ * Alex Achenbach , December 2002. ++ * ++ */ ++ ++/* ++ * This file handles the architecture-dependent parts of initialization ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include