linux/debian/patches/features/all/openvz/openvz.patch


commit 5fd638726a6999e334e5e2c0635a03a447adc0d1
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Jun 17 20:45:46 2010 +0400
OpenVZ kernel 2.6.32-budarin released
Named after Nikolai Mikhailovich Budarin - a Russian cosmonaut
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 25ded1908b24b122325003fe56afa5bc78511aad
Merge: 7e99ed1 c5f1e1a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Jun 17 20:40:20 2010 +0400
Merged linux-2.6.32.15
Conflicts:
Makefile
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7e99ed1bc34b60ed42eb2008edbb4f98684edb0a
Author: Stanislav Kinsbursky <skinsbursky@openvz.org>
Date: Thu Jun 17 20:25:43 2010 +0400
CPT: Replace legacy net statistics with netns one
http://bugzilla.openvz.org/show_bug.cgi?id=1543
Signed-off-by: Stanislav Kinsbursky <skinsbursky@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ad21fa5e96fc66a63364f210aebf4f4f95345ee4
Author: Stanislav Kinsbursky <skinsbursky@openvz.org>
Date: Thu Jun 17 20:24:21 2010 +0400
NET: register net sysfs kobject inside container
Adding the net kobject was suppressed if the network namespace was not
"init_net". The check for "init_net" is removed.
http://bugzilla.openvz.org/show_bug.cgi?id=1534
Signed-off-by: Stanislav Kinsbursky <skinsbursky@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 9783d9e288e8ca754cabf77e7eb68099a75a5292
Author: Andrey Vagin <avagin@openvz.org>
Date: Fri Jun 11 19:48:34 2010 +0400
cpt: fix refcounting of tty and pid on setting tty to process
http://bugzilla.openvz.org/show_bug.cgi?id=1544
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b2fecd288e7d0457df18e57f11685e007cd09ad6
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Jun 11 18:36:51 2010 +0400
mm: Fix oops in do_wp_page
http://bugzilla.openvz.org/show_bug.cgi?id=1541
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit e7399c239fadcc813adcf4f947b00ec199d6a11b
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu May 27 20:07:25 2010 +0400
OpenVZ kernel 2.6.32-belyayev released
Named after Pavel Ivanovich Belyayev - a Russian cosmonaut
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f11aece8614b06328cfbb7283622d0c1f392a783
Author: Kir Kolyshkin <kir@openvz.org>
Date: Thu May 27 20:05:53 2010 +0400
Fix/enlarge description of CONFIG_SYSFS_DEPRECATED_DYN
Commit f40134386 adds this option; unfortunately, the description
has a high ratio of typos per word, plus it is incomplete.
Fix both issues.
Signed-off-by: Kir Kolyshkin <kir@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 537027d8abbb3f78c9c80b7574a12c920c7af4f6
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue Apr 20 23:14:04 2010 +0400
mm: SLUB -- implement show_slab_info
Note that we had to introduce the cache_chain_lock spinlock,
otherwise we could touch an entity being removed.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0eb474428f7f5fcae74048682be692033060f21e
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue Apr 20 21:14:24 2010 +0400
mm: SLAB -- use static cache_chain/_lock initializers
There is no need to initialize the cache_chain list and spinlock dynamically,
which saves some CPU cycles on kernel startup (see the sketch after this entry).
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
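A minimal sketch of the static initialization described in the two entries above, assuming the cache_chain list and cache_chain_lock spinlock they name; the actual declarations in the patch may differ:

/* Sketch: static initializers replace INIT_LIST_HEAD()/spin_lock_init()
 * calls that would otherwise have to run during kmem_cache_init(). */
static LIST_HEAD(cache_chain);            /* chain of all kmem_cache structures */
static DEFINE_SPINLOCK(cache_chain_lock); /* protects walkers such as show_slab_info() */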
commit 9bab271973963a03bb1e0d5a1ecaee1e3810907c
Merge: 509eb1f 7b7a917
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu May 27 17:57:10 2010 +0400
Merged linux-2.6.32.14
Conflicts:
Makefile
fs/notify/inotify/inotify_fsnotify.c
fs/notify/inotify/inotify_user.c
fs/proc/array.c
include/linux/sched.h
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 509eb1f29c4301126a0ccda8e001dfd0af0d56d2
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon May 24 14:27:05 2010 +0400
OpenVZ kernel 2.6.32-balandin released
Named after Aleksandr Nikolayevich Balandin - a Russian cosmonaut.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit eb28ec67376e267760e72c96ca3d54346d39a56f
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon May 24 15:10:31 2010 +0400
sysctl: Compilation fix after merge of sysctl fixes
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0bb7a0e0615e134b7ae9f7e2e2737be5ff76881b
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon May 24 14:23:28 2010 +0400
fs: Don't list non-VE fs in /proc/filesystems
This is due to the lack of a virtualized filesystems filter.
Implement it.
http://bugzilla.openvz.org/show_bug.cgi?id=1504
Reported-by: Kir Kolyshkin <kir@openvz.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 866f4866b2d988c1ac1222f0397efd1e6e64d443
Author: Andrey Vagin <avagin@openvz.org>
Date: Mon May 24 13:14:58 2010 +0400
Fix sysctl warnings about unknown sysctl binary
Switch this entry over to use CTL_UNNUMBERED, because
nobody uses it via sys_sysctl.
http://bugzilla.openvz.org/show_bug.cgi?id=1463
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 2412f2cf0853b5303af7740000c99179eeece3e4
Author: Andrey Vagin <avagin@openvz.org>
Date: Mon May 24 13:15:37 2010 +0400
sysctl: Add sysctl_data_ve helper
This helper is analogous to proc_dointvec_ve.
Add a generic method for sys_sysctl access to per-VE values.
The extra1 field of ctl_table contains the data field's offset from the
beginning of ve_struct; without CONFIG_VE the address from the .data field
is used (see the sketch after this entry).
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
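A sketch of the lookup convention the entry above describes, assuming get_exec_env() returns the current ve_struct; the resolve_ve_data() helper below is hypothetical, only the extra1-as-offset convention is taken from the commit:

/* Illustrative only -- not the actual sysctl_data_ve strategy routine. */
static void *resolve_ve_data(struct ctl_table *table)
{
#ifdef CONFIG_VE
	/* extra1 holds the offset of the value from the start of ve_struct */
	unsigned long off = (unsigned long)table->extra1;

	return (char *)get_exec_env() + off;	/* current VE's private copy */
#else
	return table->data;			/* no VE support: global value */
#endif
}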
commit 39f9a055139faf313a1ad823b145e535d5485f5c
Author: Andrey Vagin <avagin@openvz.org>
Date: Mon May 24 13:16:11 2010 +0400
Fix sysctl warnings about missing strategy for randomize_va_space
http://bugzilla.openvz.org/show_bug.cgi?id=1463
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit de3a7aab2eeab095a81f414d0e5e855da1d99c61
Author: Andrey Vagin <avagin@openvz.ru>
Date: Mon May 24 13:13:36 2010 +0400
cpt: use shmem_file to dump the inode content of shm
Files with shm_file_operations keep a link in private_data to
the file with shmem_file_operations. For dumping the inode content
we use read from shmem_file_operations, but pass the file with
shm_file_operations.
shmem_file_operations uses do_sync_read, which uses file->f_op->aio_read,
but that is absent in shm_file_operations (see the sketch after this entry).
do_read
do_sync_read(*f, ...)
f->f_op->aio_read -> Oops
http://bugzilla.openvz.org/show_bug.cgi?id=1500
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
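A sketch of the failure and of the fix pattern described above: do_sync_read() dereferences f_op->aio_read, which shm_file_operations does not provide, so the dump must go through the shmem file reachable via private_data. The helper is hypothetical and simplified (in the kernel, private_data points to a small shm-specific struct that holds the shmem file):

/* do_sync_read() essentially does:
 *	filp->f_op->aio_read(&kiocb, &iov, 1, pos);   <- NULL for shm files: Oops
 * so dump the inode content through the underlying shmem file instead. */
static ssize_t cpt_dump_shm_contents(struct file *shm_file, char *buf,
				     size_t count, loff_t *pos)
{
	struct file *shmem_file = shm_file->private_data; /* simplified: the shmem file */

	return shmem_file->f_op->read(shmem_file, buf, count, pos);
}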
commit 1c4eba47b2d5d3d26c186485de8adf8ef293ebb5
Author: Stanislav Kinsbursky <skinsbursky@openvz.org>
Date: Mon May 24 14:05:44 2010 +0400
tun: device_create_file omitted if net level is not init_net
device_create_file() calls are omitted in tun_set_iff() if the net is inside a container.
Use the same condition check as in netdev_register_kobject().
http://bugzilla.openvz.org/show_bug.cgi?id=1497
Signed-off-by: Stanislav Kinsbursky <skinsbursky@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 98447fa5c37746da0699b9f8d8bbd59d8147d9bc
Author: Kir Kolyshkin <kir@openvz.org>
Date: Mon May 24 13:04:17 2010 +0400
Revert "mm mmap zero length kludge"
This kludge was made for really old rpm versions which have since been
fixed (see references to RH bugzilla in OpenVZ bug #893). Moreover,
it now makes rpm itself segfault in our templates when a locale is set;
details are in OpenVZ bug #1502. So remove it and hope for the best.
http://bugzilla.openvz.org/1502
http://bugzilla.openvz.org/893
This reverts commit d252a93b32d6d251fcc73863b75b91edaa801b95.
Signed-off-by: Kir Kolyshkin <kir@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 57358efc0e639282309d8b6aea8efb8ae3d6d9ad
Merge: 42a0a10 1cd8211
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon May 24 12:59:24 2010 +0400
Merged linux-2.6.32.13
Conflicts:
Makefile
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 42a0a1071d3872af254373c1cc07085b9bf24d3a
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon May 24 12:56:47 2010 +0400
ioprio: Make it possible to set ve ioprio finally
Add ioprio compat call for blk-cgroup. Simulate the old ioprio with
the new blk-cgroup weight.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a4452f1cc33f6e4f7d8f58abab818ede313cdfbc
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon May 24 12:55:43 2010 +0400
cgroup-lite: Set task css properly
Fix task moving between cgroups at ve create and enter.
Add a helper to attach a task to a cgroup set (based on the
cgroup_attach_task).
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 983bb0952f838b55130f20a9486a04c92ae5826b
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon May 24 12:54:09 2010 +0400
cgroup-lite: add cgroup-id for blk-cgroups
Use one id for all subsystems in one cgroup. Store the id right
on the cgroup struct instead of hacking around css_id structures.
Plus add other cgroup tree related functions required by blk-cgroup.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f54f5b3e0a014f3bb5c530b4c13d443a2fc92b52
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon May 24 12:50:31 2010 +0400
cgroup-lite: fix subsys state refcnt
Add missed __css_put and fix refcnt initial state: for alive css refcnt
starts from 1, see the init_cgroup_css and the cgroup_clear_css_refs.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit dd480cee5d48b5fd88f4f074743b542fab6d9e70
Author: Shaohua Li <shaohua.li@intel.com>
Date: Tue Apr 27 16:52:01 2010 +0400
cfq-iosched: split seeky coop queues after one slice
Currently we split seeky coop queues after 1s, which is too big. The patch below
marks the seeky coop queue's split_coop flag after one slice. After that, if new
requests come in, the queues will be split. The patch was suggested by Corrado.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Corrado Zoccolo <czoccolo@gmail.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 187231a1fad899839137f76c08dd016a81245abb
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:51:57 2010 +0400
cfq-iosched: Do not idle on async queues
A few weeks back, Shaohua Li had posted a similar patch. I am reposting it
with more test results.
This patch does two things.
- Do not idle on async queues.
- It also changes the write queue depth CFQ drives (cfq_may_dispatch()).
Currently, we seem to be driving a queue depth of 1 always for WRITES. This is
true even if there is only one write queue in the system, and all the logic
of infinite queue depth in case of a single busy queue as well as slowly
increasing queue depth based on the last delayed sync request does not seem to
be kicking in at all.
This patch will allow deeper WRITE queue depths (subject to the other
WRITE queue depth constraints like cfq_quantum and the last delayed sync
request).
Shaohua Li had reported getting more out of his SSD. For me, I have got
one Lun exported from an HP EVA and when pure buffered writes are on, I
can get more out of the system. Following are test results of pure
buffered writes (with end_fsync=1) with vanilla and patched kernel. These
results are average of 3 sets of run with increasing number of threads.
AVERAGE[bufwfs][vanilla]
-------
job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
--- --- -- ------------ ----------- ------------- -----------
bufwfs 3 1 0 0 95349 474141
bufwfs 3 2 0 0 100282 806926
bufwfs 3 4 0 0 109989 2.7301e+06
bufwfs 3 8 0 0 116642 3762231
bufwfs 3 16 0 0 118230 6902970
AVERAGE[bufwfs] [patched kernel]
-------
bufwfs 3 1 0 0 270722 404352
bufwfs 3 2 0 0 206770 1.06552e+06
bufwfs 3 4 0 0 195277 1.62283e+06
bufwfs 3 8 0 0 260960 2.62979e+06
bufwfs 3 16 0 0 299260 1.70731e+06
I also ran buffered writes along with some sequential reads and some
buffered reads going on in the system on a SATA disk because the potential
risk could be that we should not be driving queue depth higher in presence
of sync IO going to keep the max clat low.
With some random and sequential reads going on in the system on one SATA
disk I did not see any significant increase in max clat. So it looks like
other WRITE queue depth control logic is doing its job. Here are the
results.
AVERAGE[brr, bsr, bufw together] [vanilla]
-------
job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
--- --- -- ------------ ----------- ------------- -----------
brr 3 1 850 546345 0 0
bsr 3 1 14650 729543 0 0
bufw 3 1 0 0 23908 8274517
brr 3 2 981.333 579395 0 0
bsr 3 2 14149.7 1175689 0 0
bufw 3 2 0 0 21921 1.28108e+07
brr 3 4 898.333 1.75527e+06 0 0
bsr 3 4 12230.7 1.40072e+06 0 0
bufw 3 4 0 0 19722.3 2.4901e+07
brr 3 8 900 3160594 0 0
bsr 3 8 9282.33 1.91314e+06 0 0
bufw 3 8 0 0 18789.3 23890622
AVERAGE[brr, bsr, bufw mixed] [patched kernel]
-------
job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us)
--- --- -- ------------ ----------- ------------- -----------
brr 3 1 837 417973 0 0
bsr 3 1 14357.7 591275 0 0
bufw 3 1 0 0 24869.7 8910662
brr 3 2 1038.33 543434 0 0
bsr 3 2 13351.3 1205858 0 0
bufw 3 2 0 0 18626.3 13280370
brr 3 4 913 1.86861e+06 0 0
bsr 3 4 12652.3 1430974 0 0
bufw 3 4 0 0 15343.3 2.81305e+07
brr 3 8 890 2.92695e+06 0 0
bsr 3 8 9635.33 1.90244e+06 0 0
bufw 3 8 0 0 17200.3 24424392
So looks like it might make sense to include this patch.
Thanks
Vivek
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 9027160e254ff7ea55338a1857843144445d57aa
Author: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue Apr 27 16:51:53 2010 +0400
blk-cgroup: Fix potential deadlock in blk-cgroup
I triggered a lockdep warning as following.
=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.33-rc2 #1
-------------------------------------------------------
test_io_control/7357 is trying to acquire lock:
(blkio_list_lock){+.+...}, at: [<c053a990>] blkiocg_weight_write+0x82/0x9e
but task is already holding lock:
(&(&blkcg->lock)->rlock){......}, at: [<c053a949>] blkiocg_weight_write+0x3b/0x9e
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&(&blkcg->lock)->rlock){......}:
[<c04583b7>] validate_chain+0x8bc/0xb9c
[<c0458dba>] __lock_acquire+0x723/0x789
[<c0458eb0>] lock_acquire+0x90/0xa7
[<c0692b0a>] _raw_spin_lock_irqsave+0x27/0x5a
[<c053a4e1>] blkiocg_add_blkio_group+0x1a/0x6d
[<c053cac7>] cfq_get_queue+0x225/0x3de
[<c053eec2>] cfq_set_request+0x217/0x42d
[<c052c8a6>] elv_set_request+0x17/0x26
[<c0532a0f>] get_request+0x203/0x2c5
[<c0532ae9>] get_request_wait+0x18/0x10e
[<c0533470>] __make_request+0x2ba/0x375
[<c0531985>] generic_make_request+0x28d/0x30f
[<c0532da7>] submit_bio+0x8a/0x8f
[<c04d827a>] submit_bh+0xf0/0x10f
[<c04d91d2>] ll_rw_block+0xc0/0xf9
[<f86e9705>] ext3_find_entry+0x319/0x544 [ext3]
[<f86eae58>] ext3_lookup+0x2c/0xb9 [ext3]
[<c04c3e1b>] do_lookup+0xd3/0x172
[<c04c56c8>] link_path_walk+0x5fb/0x95c
[<c04c5a65>] path_walk+0x3c/0x81
[<c04c5b63>] do_path_lookup+0x21/0x8a
[<c04c66cc>] do_filp_open+0xf0/0x978
[<c04c0c7e>] open_exec+0x1b/0xb7
[<c04c1436>] do_execve+0xbb/0x266
[<c04081a9>] sys_execve+0x24/0x4a
[<c04028a2>] ptregs_execve+0x12/0x18
-> #1 (&(&q->__queue_lock)->rlock){..-.-.}:
[<c04583b7>] validate_chain+0x8bc/0xb9c
[<c0458dba>] __lock_acquire+0x723/0x789
[<c0458eb0>] lock_acquire+0x90/0xa7
[<c0692b0a>] _raw_spin_lock_irqsave+0x27/0x5a
[<c053dd2a>] cfq_unlink_blkio_group+0x17/0x41
[<c053a6eb>] blkiocg_destroy+0x72/0xc7
[<c0467df0>] cgroup_diput+0x4a/0xb2
[<c04ca473>] dentry_iput+0x93/0xb7
[<c04ca4b3>] d_kill+0x1c/0x36
[<c04cb5c5>] dput+0xf5/0xfe
[<c04c6084>] do_rmdir+0x95/0xbe
[<c04c60ec>] sys_rmdir+0x10/0x12
[<c04027cc>] sysenter_do_call+0x12/0x32
-> #0 (blkio_list_lock){+.+...}:
[<c0458117>] validate_chain+0x61c/0xb9c
[<c0458dba>] __lock_acquire+0x723/0x789
[<c0458eb0>] lock_acquire+0x90/0xa7
[<c06929fd>] _raw_spin_lock+0x1e/0x4e
[<c053a990>] blkiocg_weight_write+0x82/0x9e
[<c0467f1e>] cgroup_file_write+0xc6/0x1c0
[<c04bd2f3>] vfs_write+0x8c/0x116
[<c04bd7c6>] sys_write+0x3b/0x60
[<c04027cc>] sysenter_do_call+0x12/0x32
other info that might help us debug this:
1 lock held by test_io_control/7357:
#0: (&(&blkcg->lock)->rlock){......}, at: [<c053a949>] blkiocg_weight_write+0x3b/0x9e
stack backtrace:
Pid: 7357, comm: test_io_control Not tainted 2.6.33-rc2 #1
Call Trace:
[<c045754f>] print_circular_bug+0x91/0x9d
[<c0458117>] validate_chain+0x61c/0xb9c
[<c0458dba>] __lock_acquire+0x723/0x789
[<c0458eb0>] lock_acquire+0x90/0xa7
[<c053a990>] ? blkiocg_weight_write+0x82/0x9e
[<c06929fd>] _raw_spin_lock+0x1e/0x4e
[<c053a990>] ? blkiocg_weight_write+0x82/0x9e
[<c053a990>] blkiocg_weight_write+0x82/0x9e
[<c0467f1e>] cgroup_file_write+0xc6/0x1c0
[<c0454df5>] ? trace_hardirqs_off+0xb/0xd
[<c044d93a>] ? cpu_clock+0x2e/0x44
[<c050e6ec>] ? security_file_permission+0xf/0x11
[<c04bcdda>] ? rw_verify_area+0x8a/0xad
[<c0467e58>] ? cgroup_file_write+0x0/0x1c0
[<c04bd2f3>] vfs_write+0x8c/0x116
[<c04bd7c6>] sys_write+0x3b/0x60
[<c04027cc>] sysenter_do_call+0x12/0x32
To prevent deadlock, we should take locks in the following sequence:
blkio_list_lock -> queue_lock -> blkcg_lock.
The following patch should fix this bug (a lock-ordering sketch follows this entry).
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
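A minimal sketch of the ordering rule stated above; the function is hypothetical and only shows the three locks being taken in the documented order, so every code path builds the same lockdep chain:

/* Hypothetical example of the documented lock order; not the actual fix. */
static void update_weights_example(struct request_queue *q, struct blkio_cgroup *blkcg)
{
	unsigned long flags;

	spin_lock(&blkio_list_lock);			/* 1: outermost */
	spin_lock_irqsave(q->queue_lock, flags);	/* 2: queue lock */
	spin_lock(&blkcg->lock);			/* 3: innermost  */

	/* ... touch per-policy and per-cgroup state here ... */

	spin_unlock(&blkcg->lock);
	spin_unlock_irqrestore(q->queue_lock, flags);
	spin_unlock(&blkio_list_lock);
}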
commit 0460ada9ec82e679632588772a3084652c1db996
Author: Divyesh Shah <dpshah@google.com>
Date: Tue Apr 27 16:51:48 2010 +0400
cfq-iosched: Respect ioprio_class when preempting
In cfq_should_preempt(), we currently allow some cases where a non-RT request
can preempt an ongoing RT cfqq timeslice. This should not happen.
Examples include:
o A sync_noidle wl type non-RT request pre-empting a sync_noidle wl type cfqq
on which we are idling.
o Once we have per-cgroup async queues, a non-RT sync request pre-empting a RT
async cfqq.
Signed-off-by: Divyesh Shah<dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 58244fb9adfe3f58b17be18c9f27d59dbf4977fe
Author: Shaohua Li <shaohua.li@intel.com>
Date: Tue Apr 27 16:51:44 2010 +0400
cfq-iosched: don't regard requests with long distance as close
seek_mean can sometimes be very big; using it as a closeness criterion is meaningless
as it doesn't improve any performance. So if it's big, let's fall back to the
default value.
Reviewed-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 875add11b7efa93199cd179e17786c8c83cf77ea
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:51:39 2010 +0400
cfq-iosched: Remove prio_change logic for workload selection
o CFQ now internally divides cfq queues into three workload categories: sync-idle,
sync-noidle and async. Which workload to run depends primarily on the rb_key
offset across the three service trees, which is a combination of multiple things,
including the time the queue got queued on the service tree.
There is one exception though. That is if we switched the prio class, say
we served some RT tasks and again started serving BE class, then with-in
BE class we always started with sync-noidle workload irrespective of rb_key
offset in service trees.
This can provide better latencies for sync-noidle workload in the presence
of RT tasks.
o This patch gets rid of that exception, so which workload to run with-in a
class always depends on the lowest rb_key across service trees. The reason
being that now we have multiple BE class groups and if we always switch
to sync-noidle workload with-in group, we can potentially starve a sync-idle
workload with-in group. Same is true for async workload which will be in
root group. Also the workload-switching with-in group will become very
unpredictable as it now depends whether some RT workload was running in
the system or not.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Acked-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 98a3d07b1fe96e53a15cbab963ea26b68b573194
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:51:35 2010 +0400
cfq-iosched: Get rid of nr_groups
o Currently code does not seem to be using cfqd->nr_groups. Get rid of it.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit af90feaf148382f0f79b9411fc50d88bd861710a
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:51:31 2010 +0400
cfq-iosched: Remove the check for same cfq group from allow_merge
o allow_merge() already checks if submitting task is pointing to same cfqq
as rq has been queued in. If everything is fine, we should not be having
a task in one cgroup and having a pointer to cfqq in other cgroup.
Well I guess in some situations it can happen and that is, when a random
IO queue has been moved into root cgroup for group_isolation=0. In
this case, the task's cgroup/group is different from where the cfqq actually is,
but this is intentional and in this case merging should be allowed.
The second situation is where due to close cooperator patches, multiple
processes can be sharing a cfqq. If everything implemented right, we should
not end up in a situation where tasks from different processes in different
groups are sharing the same cfqq as we allow merging of cooperating queues
only if they are in same group.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 76160ce0edc2aeeaa4df9292700aecdd0c4c36cb
Author: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue Apr 27 16:51:27 2010 +0400
cfq: set workload as expired if it doesn't have any slice left
When a group is resumed, if it doesn't have workload slice left,
we should set workload_expires as expired. Otherwise, we might
erroneously start from where we left off in the previous group.
Thanks to Corrado for the idea.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 6a78ef2e36ba6a63c5617326b38e268820cdd893
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:51:23 2010 +0400
Fix a CFQ crash in "for-2.6.33" branch of block tree
I think my previous patch introduced a bug which can lead to CFQ hitting
BUG_ON().
The offending commit in for-2.6.33 branch is.
commit 7667aa0630407bc07dc38dcc79d29cc0a65553c1
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Dec 8 17:52:58 2009 -0500
cfq-iosched: Take care of corner cases of group losing share due to deletion
While doing some stress testing on my box, I encountered the following.
login: [ 3165.148841] BUG: scheduling while
atomic: swapper/0/0x10000100
[ 3165.149821] Modules linked in: cfq_iosched dm_multipath qla2xxx igb
scsi_transport_fc dm_snapshot [last unloaded: scsi_wait_scan]
[ 3165.149821] Pid: 0, comm: swapper Not tainted
2.6.32-block-for-33-merged-new #3
[ 3165.149821] Call Trace:
[ 3165.149821] <IRQ> [<ffffffff8103fab8>] __schedule_bug+0x5c/0x60
[ 3165.149821] [<ffffffff8103afd7>] ? __wake_up+0x44/0x4d
[ 3165.149821] [<ffffffff8153a979>] schedule+0xe3/0x7bc
[ 3165.149821] [<ffffffff8103a796>] ? cpumask_next+0x1d/0x1f
[ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
[cfq_iosched]
[ 3165.149821] [<ffffffff810422d8>] __cond_resched+0x2a/0x35
[ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
[cfq_iosched]
[ 3165.149821] [<ffffffff8153b1ee>] _cond_resched+0x2c/0x37
[ 3165.149821] [<ffffffff8100e2db>] is_valid_bugaddr+0x16/0x2f
[ 3165.149821] [<ffffffff811e4161>] report_bug+0x18/0xac
[ 3165.149821] [<ffffffff8100f1fc>] die+0x39/0x63
[ 3165.149821] [<ffffffff8153cde1>] do_trap+0x11a/0x129
[ 3165.149821] [<ffffffff8100d470>] do_invalid_op+0x96/0x9f
[ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
[cfq_iosched]
[ 3165.149821] [<ffffffff81034b4d>] ? enqueue_task+0x5c/0x67
[ 3165.149821] [<ffffffff8103ae83>] ? task_rq_unlock+0x11/0x13
[ 3165.149821] [<ffffffff81041aae>] ? try_to_wake_up+0x292/0x2a4
[ 3165.149821] [<ffffffff8100c935>] invalid_op+0x15/0x20
[ 3165.149821] [<ffffffffa000b21d>] ? cfq_dispatch_requests+0x6ba/0x93e
[cfq_iosched]
[ 3165.149821] [<ffffffff810df5a6>] ? virt_to_head_page+0xe/0x2f
[ 3165.149821] [<ffffffff811d8c2a>] blk_peek_request+0x191/0x1a7
[ 3165.149821] [<ffffffff811e5b8d>] ? kobject_get+0x1a/0x21
[ 3165.149821] [<ffffffff812c8d4c>] scsi_request_fn+0x82/0x3df
[ 3165.149821] [<ffffffff8110b2de>] ? bio_fs_destructor+0x15/0x17
[ 3165.149821] [<ffffffff810df5a6>] ? virt_to_head_page+0xe/0x2f
[ 3165.149821] [<ffffffff811d931f>] __blk_run_queue+0x42/0x71
[ 3165.149821] [<ffffffff811d9403>] blk_run_queue+0x26/0x3a
[ 3165.149821] [<ffffffff812c8761>] scsi_run_queue+0x2de/0x375
[ 3165.149821] [<ffffffff812b60ac>] ? put_device+0x17/0x19
[ 3165.149821] [<ffffffff812c92d7>] scsi_next_command+0x3b/0x4b
[ 3165.149821] [<ffffffff812c9b9f>] scsi_io_completion+0x1c9/0x3f5
[ 3165.149821] [<ffffffff812c3c36>] scsi_finish_command+0xb5/0xbe
I think I have hit the following BUG_ON() in cfq_dispatch_request().
BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
Please find attached the patch to fix it. I have done some stress testing
with it and have not seen it happening again.
o We should wait on a queue even after slice expiry only if it is empty. If
queue is not empty then continue to expire it.
o If we decide to keep the queue then make cfqq=NULL. Otherwise select_queue()
will return a valid cfqq and cfq_dispatch_request() can hit following
BUG_ON().
BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list))
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 086fcfd4a9aec3209a9a8b2c591734850bbca097
Author: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue Apr 27 16:51:18 2010 +0400
cfq: Remove wait_request flag when idle time is being deleted
Remove wait_request flag when idle time is being deleted, otherwise
it'll hit this path every time a request is enqueued.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 9714cf0030da3ceaea312be05cc056d4b36fe118
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:51:14 2010 +0400
cfq-iosched: commenting non-obvious initialization
Added a comment to explain the initialization of last_delayed_sync.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 75e3bc83c0d1f9c909bd0bce56ac377623c22807
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:51:10 2010 +0400
cfq-iosched: Take care of corner cases of group losing share due to deletion
If there is a sequential reader running in a group, we wait for next request
to come in that group after slice expiry and once new request is in, we expire
the queue. Otherwise we delete the group from the service tree and the group loses
its fair share.
So far I was marking a queue as wait_busy if it had consumed its slice and
it was the last queue in the group. But this condition did not cover the following
two cases.
1.If a request completed and slice has not expired yet. Next request comes
in and is dispatched to disk. Now select_queue() hits and slice has expired.
This group will be deleted. Because request is still in the disk, this queue
will never get a chance to wait_busy.
2.If request completed and slice has not expired yet. Before next request
comes in (delay due to think time), select_queue() hits and expires the
queue hence group. This queue never got a chance to wait busy.
Gui was hitting the boundary condition 1 and not getting fairness numbers
proportional to weight.
This patch puts the checks for above two conditions and improves the fairness
numbers for sequential workload on rotational media. Check in select_queue()
takes care of case 1 and additional check in should_wait_busy() takes care
of case 2.
Reported-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 6c866a0686a169f5098da254fb6b0f8812318469
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:51:06 2010 +0400
cfq-iosched: Get rid of cfqq wait_busy_done flag
o Get rid of the wait_busy_done flag. This flag only tells us that we were doing wait
busy on a queue and that queue got a request, so expire it. That information
can easily be obtained by (cfq_cfqq_wait_busy() && queue_is_not_empty). So
remove this flag and keep the code simple (see the sketch after this entry).
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
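A sketch of the derived check the entry above refers to, assuming the standard cfq queue flags and sort list; illustrative only:

/* "wait-busy is done" can be derived instead of being tracked in a flag:
 * the queue was marked wait_busy and now has a request queued. */
static inline bool cfqq_wait_busy_done(struct cfq_queue *cfqq)
{
	return cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list);
}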
commit 44c156f1191391dddb02f1abff022a61c2f94a17
Author: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue Apr 27 16:51:02 2010 +0400
cfq: Optimization for close cooperating queue searching
It doesn't make any sense to try to find a close cooperating
queue if the current cfqq is the only one in the group.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit bd4386b49b4ba2c012dc22c7a80512681a5ade15
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:50:58 2010 +0400
cfq-iosched: reduce write depth only if sync was delayed
The introduction of ramp-up formula for async queue depths has
slowed down dirty page reclaim, by reducing async write performance.
This patch makes sure the formula kicks in only when sync request
was recently delayed.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 433c9d47f26fcb9141f1a1c3f15245a8391c5a08
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:50:52 2010 +0400
cfq-iosched: Do not access cfqq after freeing it
Fix a crash during boot reported by Jeff Moyer. Fix the issue of accessing
cfqq after freeing it.
Reported-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@carl.(none)>
commit 21e7ec5499dfae1930bc103e1f2430b262ac0c61
Author: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue Apr 27 16:50:48 2010 +0400
block: include linux/err.h to use ERR_PTR
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit ba750bcbce0558bfe7ea2fd4a9b9ca74e1eac70f
Author: Jens Axboe <jens.axboe@oracle.com>
Date: Tue Apr 27 16:50:44 2010 +0400
cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit
After the merge of the IO controller patches, booting on my megaraid
box ran much slower. Vivek Goyal traced it down to megaraid discovery
creating tons of devices, each suffering a grace period when they later
kill that queue (if no device is found).
So lets use call_rcu() to batch these deferred frees, instead of taking
the grace period hit for each one.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 291282276037c26045453190e5dd441ff03e319a
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:50:40 2010 +0400
blkio: Implement dynamic io controlling policy registration
o One of the goals of the block IO controller is that it should be able to
support multiple IO control policies, some of which may be operational at a
higher level in the storage hierarchy.
o To begin with, we had one io controlling policy implemented by CFQ, and
I hard coded the CFQ functions called by blkio. This created issues when
CFQ is compiled as module.
o This patch implements a basic dynamic io controlling policy registration
functionality in blkio. This is similar to elevator functionality where
ioschedulers register the functions dynamically.
o Now in the future, when more IO controlling policies are implemented, these
can dynamically register with the block IO controller.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 7701338499b73355707c41ae27358a4dd5bc4b84
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:50:36 2010 +0400
blkio: Export some symbols from blkio as its user CFQ can be a module
o blkio controller is inside the kernel and cfq makes use of interfaces
exported by blkio. CFQ can be a module too, hence export symbols used
by CFQ.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 8dfe981d81c7a967b6040d73fcae9780ef1519ae
Author: Shaohua Li <shaohua.li@intel.com>
Date: Tue Apr 27 16:50:31 2010 +0400
cfq-iosched: make nonrot check logic consistent
cfq_arm_slice_timer() has logic to disable idle window for SSD device. The same
thing should be done at cfq_select_queue() too, otherwise we will still see
idle window. This makes the nonrot check logic consistent in cfq.
Tests on an Intel SSD with the low_latency knob off show that the patch below can triple disk
throughput for multi-threaded sequential reads.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit e1853aca5799c76d0dd8ff97c5bed8c2e6059fa2
Author: Jens Axboe <jens.axboe@oracle.com>
Date: Tue Apr 27 16:50:28 2010 +0400
cfq-iosched: move IO controller declarations to a header file
They should not be declared inside some other file that's not related
to CFQ.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 16ca6c55c9c1961dbd748a5c94883ab1d65bb04f
Author: Jens Axboe <jens.axboe@oracle.com>
Date: Tue Apr 27 16:50:24 2010 +0400
cfq-iosched: fix compile problem with !CONFIG_CGROUP
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 5260a89b72023fcad7242552059312e31a864bf2
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:50:19 2010 +0400
blkio: Wait on sync-noidle queue even if rq_noidle = 1
o rq_noidle() is supposed to tell cfq not to expect a request after this
one, hence don't idle. But this does not seem to work very well. For example,
for direct random readers, rq_noidle = 1 but there is a next request coming
after this one. Not idling leads to a group not getting its share even if
group_isolation=1.
o The right solution for this issue is to scan the higher layers and set
right flag (WRITE_SYNC or WRITE_ODIRECT). For the time being, this single
line fix helps. This should not have any significant impact when we are
not using cgroups. I will later figure out IO paths in higher layer and
fix it.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 3647d976033973a4502696fb45a980baa8cf1350
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:50:15 2010 +0400
blkio: Implement group_isolation tunable
o If a group is running only a random reader, then it will not have enough
traffic to keep disk busy and we will reduce overall throughput. This
should result in better latencies for random reader though. If we don't
idle on random reader service tree, then this random reader will experience
large latencies if there are other groups present in system with sequential
readers running in these.
o One solution suggested by corrado is that by default keep the random readers
or sync-noidle workload in root group so that during one dispatch round
we idle only once on sync-noidle tree. This means that all the sync-idle
workload queues will be in their respective group and we will see service
differentiation in those but not on sync-noidle workload.
o Provide a tunable group_isolation. If set, this will make sure that even
sync-noidle queues go in their respective group and we wait on these. This
provides stronger isolation between groups but at the expense of throughput
if group does not have enough traffic to keep the disk busy.
o By default group_isolation = 0
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit d7d266e74623a5ff4a196c9ba35edb33d844078d
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:50:09 2010 +0400
blkio: Determine async workload length based on total number of queues
o Async queues are not per group. Instead these are system wide and maintained
in root group. Hence their workload slice length should be calculated
based on total number of queues in the system and not just queues in the
root group.
o As root group's default weight is 1000, make sure to charge async queue
more in terms of vtime so that it does not get more time on disk because
root group has higher weight.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 853b022fdecf1394bc6f56ed4391acfcdac76a77
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:50:06 2010 +0400
blkio: Wait for cfq queue to get backlogged if group is empty
o If a queue consumes its slice and then gets deleted from service tree, its
associated group will also get deleted from service tree if this was the
only queue in the group. That will make the group lose its share.
o For the queues on which we have idling on, and if these have used their
slice, wait a bit for these queues to get backlogged again and then
expire these queues so that the group does not lose its share.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 6b1099c5bbc770dc0e00e447c91cc2c70abfcd4d
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:55 2010 +0400
blkio: Propagate cgroup weight updates to cfq groups
o Propagate blkio cgroup weight updates to the associated cfq groups.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit b8e49f6ef8a5b19dcc3596a957b10ff7783ca8e3
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:51 2010 +0400
blkio: Drop the reference to queue once the task changes cgroup
o If a task changes cgroup, drop reference to the cfqq associated with io
context and set cfqq pointer stored in ioc to NULL so that upon next request
arrival we will allocate a new queue in new group.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit f0939f2fb5a93f52e4c38c96dd403a20412635ac
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:47 2010 +0400
blkio: Provide some isolation between groups
o Do not allow following three operations across groups for isolation.
- selection of co-operating queues
- preemtpions across groups
- request merging across groups.
o Async queues are currently global and not per group. Allow preemption of
an async queue if a sync queue in other group gets backlogged.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 3e5835588e20983417074286dc9c46aeff4bdcb5
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:43 2010 +0400
blkio: Export disk time and sectors used by a group to user space
o Export disk time and sector used by a group to user space through cgroup
interface.
o Also export a "dequeue" interface to cgroup which keeps track of how many
times a group was deleted from the service tree. Helps in debugging.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 5050a2e923c23fee20e5d20350da94328c028ea7
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:38 2010 +0400
blkio: Some debugging aids for CFQ
o Some debugging aids for CFQ.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 32227ad5a49cdf40d128fff9f573e770326fb2a1
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:33 2010 +0400
blkio: Take care of cgroup deletion and cfq group reference counting
o One can choose to change elevator or delete a cgroup. Implement group
reference counting so that both elevator exit and cgroup deletion can
take place gracefully.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Nauman Rafique <nauman@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit c80d513227c069c5f15e1722ef3d63096aa2652b
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:29 2010 +0400
blkio: Dynamic cfq group creation based on cgroup tasks belongs to
o Determine the cgroup IO submitting task belongs to and create the cfq
group if it does not exist already.
o Also link cfqq and associated cfq group.
o Currently all async IO is mapped to root group.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit e890b41384a11cd0eaaf4901d72de44cd21e2b65
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:25 2010 +0400
blkio: Group time used accounting and workload context save restore
o This patch introduces the functionality to do the accounting of group time
when a queue expires. This time used decides which is the group to go
next.
o Also introduce the functionality to save and restore the workload type
context with-in group. It might happen that once we expire the cfq queue
and group, a different group will schedule in and we will lose the context
of the workload type. Hence save and restore it upon queue expiry.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit fb6067d930baa1b510aba82153ddad866aa0cf65
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:21 2010 +0400
blkio: Implement per cfq group latency target and busy queue avg
o So far we had 300ms soft target latency system wide. Now with the
introduction of cfq groups, divide that latency by number of groups so
that one can come up with group target latency which will be helpful
in determining the workload slice with-in the group and also the dynamic
slice length of the cfq queue (a sketch of the arithmetic follows this entry).
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
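A sketch of that division, with illustrative names and numbers only:

/* Illustrative: split the system-wide soft latency target across groups. */
static unsigned int group_target_latency_ms(unsigned int busy_groups)
{
	const unsigned int target_latency_ms = 300;	/* system-wide soft target */

	if (!busy_groups)
		busy_groups = 1;
	return target_latency_ms / busy_groups;		/* e.g. 3 groups -> 100 ms each */
}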
commit 0fee1302172d62ee9eb34c37d792ac05e30fe2d7
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:17 2010 +0400
blkio: Introduce per cfq group weights and vdisktime calculations
o Bring in the per cfq group weight and how vdisktime is calculated for the
group. Also bring in the functionality of updating the min_vdisktime of
the group service tree.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit a31a7a44995ded913fd031f922cffa9e457b2a83
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:12 2010 +0400
blkio: Introduce blkio controller cgroup interface
o This is basic implementation of blkio controller cgroup interface. This is
the common interface visible to user space and should be used by different
IO control policies as we implement those.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 82041001ee5b7a662d488238f46b8912cc440160
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:06 2010 +0400
blkio: Introduce the root service tree for cfq groups
o So far we just had one cfq_group in cfq_data. To create space for more than
one cfq_group, we need to have a service tree of groups where all the groups
can be queued if they have active cfq queues backlogged in these.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 1b290883254f64d396f11a071b74598d97e1b3d3
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:49:02 2010 +0400
blkio: Keep queue on service tree until we expire it
o Currently cfq deletes a queue from the service tree if it is empty (even if
we might idle on the queue). This patch keeps the queue on service tree
hence associated group remains on the service tree until we decide that
we are not going to idle on the queue and expire it.
o This just helps in time accounting for queue/group and in implementation
of rest of the patches.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit d0d70b93083a4fc811bd3bfed1df04870102d538
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:48:58 2010 +0400
blkio: Implement macro to traverse each service tree in group
o Implement a macro to traverse each service tree in the group. This avoids
usage of a double for loop and a special condition for the idle tree 4 times.
o The macro is a little twisted because of the special handling of the idle class
service tree.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 4fea5fccf125349a109304569acbeda86c9ab67f
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:48:54 2010 +0400
blkio: Introduce the notion of cfq groups
o This patch introduces the notion of cfq groups. Soon we will be able to have multiple
groups of different weights in the system.
o Various service trees (prioclass and workload type trees), will become per
cfq group. So hierarchy looks as follows.
cfq_groups
|
workload type
|
cfq queue
o When a scheduling decision has to be taken, first we select the cfq group
then workload with-in the group and then cfq queue with-in the workload
type.
o This patch just makes various workload service tree per cfq group and
introduce the function to be able to choose a group for scheduling.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 14d52ec9524545c8eb9c13d05925c53f1bd2b3ff
Author: Vivek Goyal <vgoyal@redhat.com>
Date: Tue Apr 27 16:48:49 2010 +0400
blkio: Set must_dispatch only if we decided to not dispatch the request
o must_dispatch flag should be set only if we decided not to run the queue
and dispatch the request.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit a6a0574d5ab33877885943183de7645e157ed16e
Author: Shaohua Li <shaohua.li@intel.com>
Date: Tue Apr 27 16:48:46 2010 +0400
cfq-iosched: no dispatch limit for single queue
Since commit 2f5cb7381b737e24c8046fd4aeab571fb71315f5, each queue can send
up to 4 * 4 requests if only one queue exists. I wonder why we have such a limit.
A device supporting tagging can send more requests. For example, AHCI can send 31
requests. A test (direct aio randread) shows the limit reduces disk
throughput by about 4%.
On the other hand, since we send one request at a time, if another queue
pops up while the current one is sending more than cfq_quantum requests, the current queue will
stop sending requests soon after one request, so it sounds like there is no big latency.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 886ef3fce890295b04063e286c1a82c97574b737
Author: Jens Axboe <jens.axboe@oracle.com>
Date: Tue Apr 27 16:48:41 2010 +0400
Revert "cfq: Make use of service count to estimate the rb_key offset"
This reverts commit 3586e917f2c7df769d173c4ec99554cb40a911e5.
Corrado Zoccolo <czoccolo@gmail.com> correctly points out, that we need
consistency of rb_key offset across groups. This means we cannot properly
use the per-service_tree service count. Revert this change.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 80216a50226739cd997445d5ff2335a4c944fba7
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:48:36 2010 +0400
cfq-iosched: fix corner cases in idling logic
Idling logic was disabled in some corner cases, leading to unfair share
for noidle queues.
* the idle timer was not armed if there were other requests in the
driver. Unfortunately, those requests could come from other workloads,
or queues for which we don't enable idling. So we will check only
pending requests from the active queue
* rq_noidle check on no-idle queue could disable the end of tree idle if
the last completed request was rq_noidle. Now, we will disable that
idle only if all the queues served in the no-idle tree had rq_noidle
requests.
Reported-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit fed0ad86edd704970417ce78b1a130b1951f7bb8
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:48:32 2010 +0400
cfq-iosched: idling on deep seeky sync queues
Seeky sync queues with large depth can gain unfairly big share of disk
time, at the expense of other seeky queues. This patch ensures that
idling will be enabled for queues with I/O depth at least 4, and small
think time. The decision to enable idling is sticky, until an idle
window times out without seeing a new request.
The reasoning behind the decision is that, if an application is using a
large I/O depth, it is already optimized to make full utilization of
the hardware, and therefore we reserve a slice of exclusive use for it
(see the sketch after this entry).
Reported-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
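A sketch of the decision described above, assuming cfq's "deep" queue flag and per-context think-time samples; the threshold follows the commit text, not the exact patch:

/* Illustrative only. */
static bool cfqq_idle_despite_seeky(struct cfq_data *cfqd, struct cfq_queue *cfqq,
				    struct cfq_io_context *cic)
{
	/* A seeky queue that keeps at least 4 requests in flight is "deep";
	 * the flag is sticky until an idle window expires with no new request. */
	if (cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] >= 4)
		cfq_mark_cfqq_deep(cfqq);

	/* Idle for deep queues with a small think time, even though they are seeky. */
	return cfq_cfqq_deep(cfqq) &&
	       sample_valid(cic->ttime_samples) &&
	       cic->ttime_mean <= cfqd->cfq_slice_idle;
}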
commit 989d070f4d3594f485df16fa5b5786db8188e837
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:48:28 2010 +0400
cfq-iosched: fix no-idle preemption logic
An incoming no-idle queue should preempt the active no-idle queue
only if the active queue is idling due to service tree empty.
Previous code was buggy in two ways:
* it relied on service_tree field to be set on the active queue, while
it is not set when the code is idling for a new request
* it didn't check for the service tree empty condition, so could lead to
LIFO behaviour if multiple queues with depth > 1 were preempting each
other on a non-NCQ device.
Reported-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 1baaab33a240924a5542eeb7a275d2915dc09518
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:48:23 2010 +0400
cfq-iosched: fix ncq detection code
CFQ's detection of queueing devices initially assumes a queuing device
and detects if the queue depth reaches a certain threshold.
However, it will reconsider this choice periodically.
Unfortunately, if the device is considered not queuing, CFQ will force a
unit queue depth for some workloads, thus defeating the detection logic.
This leads to poor performance on queuing hardware,
since the idle window remains enabled.
Given this premise, switching to hw_tag = 0 after we have proved at
least once that the device is NCQ capable is not a good choice.
The new detection code starts in an indeterminate state, in which CFQ behaves
as if hw_tag = 1, and then, if for a long observation period we never saw
large depth, we switch to hw_tag = 0, otherwise we stick to hw_tag = 1,
without reconsidering it again.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 43090d5ccb1b6adcd28b2d4d54cc8ddf6c96a212
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:48:19 2010 +0400
cfq-iosched: cleanup unreachable code
cfq_should_idle returns false for no-idle queues that are not the last,
so the control flow will never reach the removed code in a state that
satisfies the if condition.
The unreachable code was added to emulate previous cfq behaviour for
non-NCQ rotational devices. My tests show that even without it, the
performances and fairness are comparable with previous cfq, thanks to
the fact that all seeky queues are grouped together, and that we idle at
the end of the tree.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit ea4004872f1e7a3a3651319fd5df6df17e9c7e66
Author: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue Apr 27 16:48:15 2010 +0400
cfq: Make use of service count to estimate the rb_key offset
For the moment, different workload cfq queues are put into different
service trees. But CFQ still uses "busy_queues" to estimate rb_key
offset when inserting a cfq queue into a service tree. I think this
isn't appropriate, and it should make use of service tree count to do
this estimation. This patch is for the for-2.6.33 branch.
Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 4c49bbef74b78184ecdc8d4c14c6d531f9edea42
Author: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue Apr 27 16:46:51 2010 +0400
block: jiffies fixes
Use HZ-independent calculation of milliseconds.
Add jiffies.h where it was missing since functions or macros
from it are used.
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit f96f26aeb96cc338693fe5c2d48ab04e799f0187
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:46:46 2010 +0400
cfq-iosched: fix next_rq computation
Cfq has a bug in computation of next_rq, that affects transition
between multiple sequential request streams in a single queue
(e.g.: two sequential buffered writers of the same priority),
causing the alternation between the two streams for a transient period.
8,0 1 18737 0.260400660 5312 D W 141653311 + 256
8,0 1 20839 0.273239461 5400 D W 141653567 + 256
8,0 1 20841 0.276343885 5394 D W 142803919 + 256
8,0 1 20843 0.279490878 5394 D W 141668927 + 256
8,0 1 20845 0.292459993 5400 D W 142804175 + 256
8,0 1 20847 0.295537247 5400 D W 141668671 + 256
8,0 1 20849 0.298656337 5400 D W 142804431 + 256
8,0 1 20851 0.311481148 5394 D W 141668415 + 256
8,0 1 20853 0.314421305 5394 D W 142804687 + 256
8,0 1 20855 0.318960112 5400 D W 142804943 + 256
The fix makes sure that the next_rq is computed from the last
dispatched request, and not affected by merging.
8,0 1 37776 4.305161306 0 D W 141738087 + 256
8,0 1 37778 4.308298091 0 D W 141738343 + 256
8,0 1 37780 4.312885190 0 D W 141738599 + 256
8,0 1 37782 4.315933291 0 D W 141738855 + 256
8,0 1 37784 4.319064459 0 D W 141739111 + 256
8,0 1 37786 4.331918431 5672 D W 142803007 + 256
8,0 1 37788 4.334930332 5672 D W 142803263 + 256
8,0 1 37790 4.337902723 5672 D W 142803519 + 256
8,0 1 37792 4.342359774 5672 D W 142803775 + 256
8,0 1 37794 4.345318286 0 D W 142804031 + 256
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit b12c0189dd602b89f3c6d82e050a7579f5813a09
Author: Jens Axboe <jens.axboe@oracle.com>
Date: Tue Apr 27 16:44:33 2010 +0400
cfq-iosched: get rid of the coop_preempt flag
We need to rework this logic post the cooperating cfq_queue merging,
for now just get rid of it and Jeff Moyer will fix the fall out.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 738f35df496b0c4a214f08b356f1a08d6f87b70e
Author: Jens Axboe <jens.axboe@oracle.com>
Date: Tue Apr 27 16:44:28 2010 +0400
cfq-iosched: fix merge error
We ended up with testing the same condition twice, pretty
pointless. Remove that first if.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit d70f9c5005fd87d2d9bcfe5a1dd831e119d497b5
Author: Jens Axboe <jens.axboe@oracle.com>
Date: Tue Apr 27 16:42:34 2010 +0400
cfq-iosched: fix style issue in cfq_get_avg_queues()
Line breaks and bad brace placement.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 1ad5fcfc2beacbe333bd947a6a95acb9ee810891
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:42:30 2010 +0400
cfq-iosched: fairness for sync no-idle queues
Currently no-idle queues in cfq are not serviced fairly:
even if they can only dispatch a small number of requests at a time,
they have to compete with idling queues to be serviced, experiencing
large latencies.
We should notice, instead, that no-idle queues are the ones that would
benefit most from having low latency, in fact they are any of:
* processes with large think times (e.g. interactive ones like file
managers)
* seeky (e.g. programs faulting in their code at startup)
* or marked as no-idle from upper levels, to improve latencies of those
requests.
This patch improves the fairness and latency for those queues, by:
* separating sync idle, sync no-idle and async queues in separate
service_trees, for each priority
* service all no-idle queues together
* and idling when the last no-idle queue has been serviced, to
anticipate for more no-idle work
* the timeslices allotted for idle and no-idle service_trees are
computed proportionally to the number of processes in each set.
Servicing all no-idle queues together should have a performance boost
for NCQ-capable drives, without compromising fairness.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit e2d27033102f717078e4bfdc9229ef84dbd8088c
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:42:03 2010 +0400
cfq-iosched: enable idling for last queue on priority class
cfq can disable idling for queues in various circumstances.
When workloads of different priorities are competing, if the higher
priority queue has idling disabled, lower priority queues may steal
its disk share. For example, in a scenario with an RT process
performing seeky reads vs a BE process performing sequential reads,
on NCQ-enabled hardware, with low_latency unset,
the RT process will dispatch only the few pending requests every full
slice of service for the BE process.
The patch solves this issue by always idling on the last
queue at a given priority class > idle. If the same process, or one
that can pre-empt it (so at the same priority or higher), submits a
new request within the idle window, the lower priority queue won't
dispatch, saving the disk bandwidth for higher priority ones.
Note: this doesn't touch the non_rotational + NCQ case (no hardware
to test if this is a benefit in that case).
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit ddc6295b4d6c3461a02f98ba75cbfe900a087ee4
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:41:59 2010 +0400
cfq-iosched: reimplement priorities using different service trees
We use different service trees for different priority classes.
This allows a simplification in the service tree insertion code, that no
longer has to consider priority while walking the tree.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit b1ca547aa679a0605bf9cfbc2ee8c4d0f9738e90
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:41:55 2010 +0400
cfq-iosched: preparation to handle multiple service trees
We embed a pointer to the service tree in each queue, to handle multiple
service trees easily.
Service trees are enriched with a counter.
cfq_add_rq_rb is invoked after putting the rq in the fifo, to ensure
that all fields in rq are properly initialized.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 72c938338cfb00497c498fd05901c23f2fa9e6ce
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:41:50 2010 +0400
cfq-iosched: adapt slice to number of processes doing I/O
When the number of processes performing I/O concurrently increases,
a fixed time slice per process will cause large latencies.
This patch, if low_latency mode is enabled, will scale the time slice
assigned to each process according to a 300ms target latency.
In order to keep fairness among processes:
* The number of active processes is computed using a special form of
running average that quickly follows sudden increases (to keep latency low)
and decreases slowly (to have fairness in spite of rapid decreases of this
value).
To safeguard sequential bandwidth, we impose a minimum time slice
(computed using 2*cfq_slice_idle as base, adjusted according to priority
and async-ness).
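For illustration, a minimal standalone sketch of this kind of scaling (the
names below are illustrative, not the actual cfq-iosched identifiers):

#include <stdio.h>

/* Sketch only: shrink the per-queue slice so that one full rotation over
 * all busy queues stays near a 300ms target latency, but never go below
 * a floor derived from 2*cfq_slice_idle. */
static unsigned int scale_slice(unsigned int base_slice_ms,
                                unsigned int busy_queues,
                                unsigned int min_slice_ms)
{
        const unsigned int target_latency_ms = 300;
        unsigned int slice = base_slice_ms;

        if (busy_queues > 1 && target_latency_ms / busy_queues < slice)
                slice = target_latency_ms / busy_queues;
        return slice < min_slice_ms ? min_slice_ms : slice;
}

int main(void)
{
        /* with 10 busy queues a 100ms base slice shrinks to a 30ms share,
         * with 50 queues it is clamped to the 16ms floor */
        printf("%u %u\n", scale_slice(100, 10, 16), scale_slice(100, 50, 16));
        return 0;
}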
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit ca34f4ef05e2b5abcb60af65a69a367ea9f5148e
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date: Tue Apr 27 16:41:46 2010 +0400
cfq-iosched: simplify prio-unboost code
Eliminate redundant checks.
Signed-off-by: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit ab7d66cd0bd0aff8fe977d03cd20afd1ff3a5dfd
Author: Shaohua Li <shaohua.li@intel.com>
Date: Tue Apr 27 16:41:42 2010 +0400
cfq-iosched: improve hw_tag detection
If the active queue doesn't have enough requests and the idle window opens, cfq will
not dispatch sufficient requests to the hardware. In this situation the current code
will zero hw_tag. But this happens because cfq doesn't dispatch enough requests,
not because the hardware queue doesn't work. Don't zero hw_tag in such a case.
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 00b99100690429e98f3a8efe7f59fe124814bc67
Author: Jeff Moyer <jmoyer@redhat.com>
Date: Tue Apr 27 16:41:38 2010 +0400
cfq: break apart merged cfqqs if they stop cooperating
cfq_queues are merged if they are issuing requests within the mean seek
distance of one another. This patch detects when the cooperation stops and
breaks the queues back up.
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit 9186d4378bed803bf7cca93c1abc4d74adab2ed2
Author: Jeff Moyer <jmoyer@redhat.com>
Date: Tue Apr 27 16:32:26 2010 +0400
cfq: change the meaning of the cfqq_coop flag
The flag used to indicate that a cfqq was allowed to jump ahead in the
scheduling order due to submitting a request close to the queue that
just executed. Since closely cooperating queues are now merged, the flag
holds little meaning. Change it to indicate that multiple queues were
merged. This will later be used to allow the breaking up of merged queues
when they are no longer cooperating.
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit de85cbb1eaf76b988bbe96f89b4761352adf4614
Author: Jeff Moyer <jmoyer@redhat.com>
Date: Tue Apr 27 16:32:20 2010 +0400
cfq: merge cooperating cfq_queues
When cooperating cfq_queues are detected currently, they are allowed to
skip ahead in the scheduling order. It is much more efficient to
automatically share the cfq_queue data structure between cooperating processes.
Performance of the read-test2 benchmark (which is written to emulate the
dump(8) utility) went from 12MB/s to 90MB/s on my SATA disk. NFS servers
with multiple nfsd threads also saw performance increases.
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit e09d12221f4d1c7fcb00fd687ae6e759c39054c6
Author: Jeff Moyer <jmoyer@redhat.com>
Date: Tue Apr 27 16:18:17 2010 +0400
cfq: calculate the seek_mean per cfq_queue not per cfq_io_context
async cfq_queue's are already shared between processes within the same
priority, and forthcoming patches will change the mapping of cic to sync
cfq_queue from 1:1 to 1:N. So, calculate the seekiness of a process
based on the cfq_queue instead of the cfq_io_context.
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
commit c05f95fcb04e896c898218d12a8f37c43d2f9cc6
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Apr 27 15:10:13 2010 +0400
OpenVZ kernel 2.6.32-avdeyev released
Named after Sergei Vasilyevich Avdeyev - a Russian cosmonaut.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b4a419d9abd11e3efd02e9fccd4a14180866cf99
Merge: 455792e 5bf3475
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Apr 27 14:01:27 2010 +0400
Merged linux-2.6.32.12
Conflicts:
Makefile
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 455792e7712fac15bba7ca187c244f30c9d0e825
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Thu Apr 22 19:08:13 2010 +0400
ipv6: fix sysctl unregistering order
Call addrconf_ifdown for loopback at the last IPv6 address delete with how=0
to fix the sysctl table unregistering order: all other interfaces attach their
sysctl paths to lo's, so unregister lo's sysctl tables only at namespace destroy.
https://bugzilla.sw.ru/show_bug.cgi?id=473430
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fa86dba2b6213e770f102d1e688f6527d759aecf
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Apr 5 15:43:18 2010 +0400
ve: fix ve task state percpu counters
Counter overflow detection for ve tasks in the running/uninterruptible/iowait states
was broken due to a type mismatch:
nr_{running/uninterruptible/iowait}_ve() use _long_ for summing _int_ percpu counters.
As a result, ve loadavg calculation broke after the first int overflow.
This patch expands all these percpu counters to unsigned long.
http://bugzilla.openvz.org/show_bug.cgi?id=1396
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b484e22d951a02bd7ce25aaac396742766142790
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Apr 5 15:41:30 2010 +0400
check flags on parsed structure
http://bugzilla.openvz.org/show_bug.cgi?id=1464
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d8a86ef5a6c747ddb2896696269c0feef5d6fe1e
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Apr 5 15:38:29 2010 +0400
CPT: check signal curr_target at restore
Set signal curr_target to current if the right task was not found.
This fixes an oops after a broken restore.
"curr_target" controls round-robin signal target balancing over process
threads; there is no reason to care about migration accuracy.
http://bugzilla.openvz.org/show_bug.cgi?id=1467
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 61845b781db7d86180977270c73f6ea3885485f3
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Apr 5 15:35:58 2010 +0400
cpt: Don't mind the tsk->splice_pipe cache at cpt time
This field is just a cache for the sendfile system call. It can be dropped
safely during migration - the first sendfile after restore will create
it back.
http://bugzilla.openvz.org/show_bug.cgi?id=881
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fcd86ff706b309999e526dc4a37e9de88ec051fb
Author: Peter Volkov <pva@gentoo.org>
Date: Sun Mar 28 18:04:44 2010 +0400
Fix /proc/kmsg permissions with capabilities active
Whenever an application sets cap_sys_admin=ep it is unable to read
/proc/kmsg, getting EPERM. This patch makes /proc/kmsg readable on the HN.
http://bugzilla.openvz.org/show_bug.cgi?id=1360
Signed-off-by: Peter Volkov <pva@gentoo.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8c6af363b89ebf94d3982d786dd21c64fb41528f
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Fri Mar 12 15:58:35 2010 +0300
quota: fix compilation of 32-bit compat quota, remove size checks.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 26aeb82fc7ef70e83a4e0640fcb77c7b6f31d81b
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Fri Mar 12 15:58:34 2010 +0300
x86: fix compilation for 32-bit kernel
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 92875e3c49a15885ffbf40cbb0f2bd82cf423e43
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Mar 1 13:03:59 2010 +0300
CPT: update image version to CPT_VERSION_27_3
sync cpt minor version with rhel5 branch
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f7dd75ba9debbd60b12eec93128a5742d6876d28
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Mar 1 12:56:27 2010 +0300
CPT: ignore deleted linked chr blk fifo nodes
Ignore unlinked but referenced pipes, character and block device nodes.
The restore process will create them itself.
Bug #455855
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d7c68b191825cbbf6c7a40a75d38d09330b3abca
Author: Pavel Emelianov <xemul@openvz.org>
Date: Mon Mar 1 12:55:36 2010 +0300
CPT: Dump fake hardlinks on inotify watch's inodes
When a watch is attached to an unlinked and closed file it
will not be restored, since the inode will not be in the image.
To fix this, the proposal is to create a fake link to the
inode in a temp dir and dump it.
Bug #454944
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7cf74bdd35d9559c671362cf8ce7016bb51aedaa
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Mar 1 12:52:42 2010 +0300
CPT: Open hardlinked files only if 'hardlinked_on' is set
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 52c2eb6da3f09f44d652eb7156a793b5f50e8e08
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Mar 1 12:52:09 2010 +0300
CPT: Add ioctl CPT_HARDLNK_ON for rst
vzctl has to call the CPT_HARDLNK_ON ioctl to let the kernel open hardlinked
files during restore.
This protection is needed to prevent mixing a new kernel with an old vzctl
(which doesn't do cleaning). In other words, it prevents creating/opening files
which will never be removed, an issue that could otherwise lead to a
security problem.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 72dfa44429c57c924ec4ac4d25d9ef6a343ddade
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Mar 1 12:51:39 2010 +0300
CPT: Add CPT_DENTRY_HARDLINKED flag to cpt_file_image
This flag tells that the file was hardlinked.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 80d2ce353aa41820eca28c15abd6c1421d537736
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Mar 1 12:49:48 2010 +0300
CPT: Create hard links to "deleted but referenced" during checkpoint
For "deleted but referenced" files, kernel creates hard link in
directory (that was set via CPT_LINKDIR_ADD) in format:
.cpt_hardlink.xxxxxxxx
x - digit, from 0 to 9
Note - this policy is used only when no other ways of dumping unlined
file helped.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c24ab545f53ae07a2bfb3a6df100b56d49b57281
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Mar 1 12:47:30 2010 +0300
CPT: Add ioctl CPT_LINKDIR_ADD for cpt
vzctl has to call the CPT_LINKDIR_ADD ioctl to tell the kernel where to
create hardlinked files during checkpoint. Without this ioctl the
kernel assumes that creating hardlinked files is off.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d4ef97ff64464126b459ef8d9a0adbb95fb9dc09
Author: Konstantin Khorenko <khorenko@openvz.org>
Date: Sat Feb 27 16:58:11 2010 +0300
CPT: stop the migration if shm restoration failed
Bug #268163
Signed-off-by: Konstantin Khorenko <khorenko@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 089c01a6503ec6fc1ce66841d049bb65aa3c212c
Author: Marat Stanichenko <mstanichenko@openvz.org>
Date: Sat Feb 27 16:58:11 2010 +0300
CPT: restart local_kernel_thread in case of -ERESTARTNOINTR
This is essential in case of migration to SLM node.
We can bump into a situation when SLM refuses to fork during the
undumping process because it thinks that the subgroup's resources
are to be redistributed. When this happens, fork is delayed with
the -ERESTARTNOINTR error and the undumping process fails.
As Den (den@) noticed, userspace is not intended to see the
-ERESTARTNOINTR error, so we should handle this situation in the
kernel. According to the logic in the do_signal() function the
interrupted system call is immediately restarted in case of the
-ERESTARTNOINTR error.
We borrow this policy and apply it to the local_kernel_thread()
cpt helper function.
[ xemul: this is quite a rare case, so simple cond_resched()
is OK here all the more so the redistribution should
happen in a timer ]
Bug #116787
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8551a850a459df659d7b14a66dfc8cf6da5065d6
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:11 2010 +0300
CPT: save/restore only classic task flags
Task flags were restored as they were saved in the image. That is not correct, as
the flags differ between the 2.6.9, 2.6.16 and 2.6.18 kernels.
Actually we just need to save/restore only the classic flags (PF_EXITING, PF_DEAD,
PF_FORKNOEXEC, PF_SUPERPRIV, PF_DUMPCORE and PF_SIGNALED).
The problems can occur because during migration from a 2.6.9 to a 2.6.18 kernel
the PF_USED_MATH flag was not restored in tsk->flags correctly.
In the 2.6.9 kernel there was a field tsk->used_math for this purpose; in the 2.6.18
kernel it is transformed into one of the tsk->flags.
And it was a bug that, after restoring the fpu state and the PF_USED_MATH flag, it
was cleared by "tsk->flags = ti->cpt_flags & ~PF_FROZEN", as the old cpt_flags do
not contain the PF_USED_MATH flag.
Bugs #115977 #115980 #115982
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 75f2abfa9f92fc7ac512a8ed9a34c2df0edd133d
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:11 2010 +0300
CPT: udp sockets restore fix
Some applications (like ntpd) set sk_reuse to 1 on udp sockets, so any other
application can bind to the same port. During restore we must skip this
check and restore and bind all sockets. On IPv6 we must also force the DAD
(Duplicate Address Detection) procedure to be sure that the IFA_F_TENTATIVE flag
will be cleared on the IPv6 address and a socket can be bound to it.
http://bugzilla.openvz.org/show_bug.cgi?id=784
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ba94d3fa2bb8636a7dceaa01fbf6fecdb8edacd5
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:11 2010 +0300
CPT: screw up udev bindmounts knot
Ubuntu's udev on boot does:
if ! mountpoint -q /dev; then
# initramfs didn't mount /dev, so we'll need to do that
mount -n --bind /dev /etc/udev
mount -n -t tmpfs -o mode=0755 udev /dev
mkdir -m 0700 -p /dev/.static/dev
mount -n --move /etc/udev /dev/.static/dev
fi
So, the workaround is to dump "/dev" as the bind mount's source.
Bug #120852
http://bugzilla.openvz.org/show_bug.cgi?id=1198
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit faa9a6dd94c072b38c8f963ce314fc1d6ff69ddf
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:10 2010 +0300
CPT: restore dead tasks proc files
If some process opened /proc/<pid>/<somefile> and the process with <pid> dies
after some time, then checkpointing fails with the error:
Can not dump VE: Invalid argument
Error: d_path cannot be looked up /proc/125/cmdline
The fix is to catch this situation at dump time, mark the image accordingly
and restore a fake file on restore.
http://bugzilla.openvz.org/show_bug.cgi?id=1047
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 977418edceabb4705f5012e562d4e5e04a19f138
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:10 2010 +0300
CPT: adjust vfsmounts restore order
The idea is: dump a parent before dumping its children.
This order is needed during checkpoint/restore:
mount /A /B -o bind
mount none /C -t tmpfs
mkdir /C/D
mount /B /C/D --move
After this, checkpoint (w/o this patch) will dump the vfsmounts in this order:
- vfsmount, bind to /A, mounted to /C/D
- vfsmount, mounted to /C (tmpfs)
and will restore in the same order, which causes an error.
Bug #132951
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c42b985195cc8e7c2bbeb644e92d98a066aacc18
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:10 2010 +0300
CPT: dont cpt requiresdev fs
Don't allow checkpointing a VE with mounted ext2/ext3, etc. filesystems.
Allow checkpointing only for mounted nodev and "external" filesystems.
This check protects from an error on restore:
CPT ERR: ffff810007113000,102 :-2 mounting /root/some_dir ext3 40000000
since do_one_mount() doesn't pass mntdev to mount().
[xemul: actually, the reason we don't support filesystems other than
virtual and tmpfs is because we simply can't (easily) get the
mount options for them to cpt and restore ]
Bug #131737
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a1d028ce2f1e87b5d64fb9fb7ed46740c1d73ed2
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:10 2010 +0300
CPT: Restore information about tcp listening sockets
Not all options are important. Only a missing ipv6only can cause an
error if another application wants to listen on the same port for the IPv4 any address.
tp->XXX fields are inherited by children (noticed by Alexey Kuznetsov), so we also need
to restore these options.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Comment from Alexey:
It [everything before] was not OK. The features which are broken are important,
but not actually critical, except for ipv6only.
F.e. DEFER_ACCEPT is broken -> but nobody will notice, it just will not
be deferred.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 6364b5498e48bcb600472bb2fafb865206f35068
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:10 2010 +0300
CPT: put 'expect' after insert to the 'conntrack'
During conntrack restore, we need to put the expect after allocating
ip_conntrack_expect and doing something with it. The expect will be
freed either immediately (if nobody holds this expect) or during the cleanup/timer
hooks. Otherwise the expect will never be freed.
Note: Approaches for kernels 2.6.18 and 2.6.9 are different. For example
see help() in "net/ipv4/netfilter/ip_conntrack_netbios_ns.c"
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b3d4348ca6322edad5a0a0d56b15d1eb8db718bd
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:09 2010 +0300
CPT: Fix ip_conntrack_ftp usage counter leak
The function ip_conntrack_helper_find_get() grabs the module counter. So put the
conntrack after inserting it into the hash and handling the conntrack's expect
list.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 74e373eeb5e71b1c8253c04bee92250e5f6640cf
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:58:08 2010 +0300
CPT: dump and restore global snmp statistics
Per-device stats exist for ipv6 only and are probably not used now, but
anyway - I'll do them later.
This patch adds new section CPT_SECT_SNMP_STATS that is populated
with CPT_OBJ_BITS set of objects - one for each type of statistics.
Objects have variable length. Stats are stored as a plain array of
__u32 numbers and thus the order in which stats types are stored is
implicitly hard-coded.
If IPv6 is not turned on, all ipv6 stats are dumped
as CPT_OBJ_BITS/CPT_CONTENT_VOID and are skipped on restore.
When we restore from an image with more stats of any type, the
unsupported ones are dropped with a warning.
Stats add 28K to the image file.
Bug #113930
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3b0f4b2e0503c157d596d7426ffcba01e30e930f
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:08 2010 +0300
CPT: Fix memory corruption if cpt_family is wrong.
During restore, if the parent socket is AF_INET but cpt_family is
wrong (not initialized, see bug #95113), then treating the request as
related to AF_INET6 is not right and leads to memory corruption.
As there are a lot of buggy images, we can't check only for the values
AF_INET and AF_INET6.
Decision:
- Check the request for AF_INET6 first, and consider the
request AF_INET by default.
- Additionally check an AF_INET6 request (protect from a
random cpt_family == AF_INET6 value).
Bug #118912
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Acked-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 4a7ddd3db9a8030d514d120341bffd904ef57315
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:58:07 2010 +0300
CPT: fix restoring of /dev/null opened early by init
The problem is the following:
* init from fc9 starts and opens /dev/null for its stdin, stdout
and stderr
* udev starts and overmounts /dev with tmpfs
After this cpt cannot dump this ve, since one process holds a file
that is inaccessible from the ve root.
The proposed solution is the following:
1. allow /dev/null to be over-mounted
2. restore init's files in two stages:
stage1: *before* we restore mounts, restore init's 0, 1 and
2 file descriptors, since most likely (in the fc9 case - definitely)
init opened them before any other manipulations with the fs;
stage2: restore the rest of the files later, at the usual time, to make
sure that e.g. sockets etc are restored properly.
Comment from Alexey:
ACK.
Though this is really ugly, it really produces 100% correct result
for this particular situation.
Bug #116261
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 937a5462e54d42a70ca0a66c7d3147d02ff40767
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:58:07 2010 +0300
CPT: lock sock before restoring its synwait queue
This new socket already has all the necessary TCP timers armed,
so tcp_keepalive_timer can fire during rst_restore_synwait_queue
and (since the latter is lockless) can spoil the queue.
Bug #118912
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c5d30bd0194b026df7684e08f1b6e8e77d06305c
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:58:07 2010 +0300
CPT: sysctl randomize_va_space
Implement checkpointing for the virtualized sysctl kernel.randomize_va_space.
Reuse the existing unused pad1 field in cpt_veinfo_image.
0 -> image without rnd_va_space virtualization (default value is used)
1 -> rnd = 0
2 -> rnd = 1
etc...
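A minimal sketch of this encoding (helper names are illustrative, not the
ones used in the patch):

/* 0 means "value not stored, use the default"; otherwise the stored
 * value is rnd_va_space + 1, so old images stay compatible. */
static unsigned int encode_rnd_va_space(int rnd)
{
        return (unsigned int)rnd + 1;
}

static int decode_rnd_va_space(unsigned int stored, int def)
{
        return stored ? (int)stored - 1 : def;
}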
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit bbdcbaadf794e4a6c579cdac4c92ecc278d7606c
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:07 2010 +0300
CPT: add check for presence of module slm_dmprst if SLM is enabled
Add a check in "checks" for presence of module slm_dmprst if SLM is enabled.
The check will be performed for both source and destination nodes. Changes in
vzmigrate are not needed.
Bug #114312
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 04c139f6c20e5c80a19db1439f8cd2f7e2715b4e
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:07 2010 +0300
CPT: add diagnostics in case of iptables-restore fail
It is not clear right now what is wrong if iptables-restore fails.
Add some diagnostics in case of error.
Bug #95952
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f06677625bf53b6aad0a3742b5f01d1376715e1d
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:58:06 2010 +0300
CPT: Check that VE is not running on restore.
Bug #99679
Signed-off-by: Denis V. Lunev <den@parallels.com>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit dcda94043007a5d005e92c2df31ba63eeb1b8a70
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:06 2010 +0300
CPT: fix check in decode_tuple()
The tuple structure can be used as a mask and protonum can be 0xffff in the 2.6.9
kernel. In the 2.6.18 kernel all masks for protonum are 0xff and 0xffff will
be shrunk to 0xff.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5a889e32263292bec6e2d4c2710ee41985f35716
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:06 2010 +0300
CPT: fix restore of conntrack expect timer
One more fix of the conntrack restore procedure.
Following code:
if (ct->helper->timeout && !del_timer(&exp->timeout)) {
...
}
can lead to oops, as exp->timeout is not initialized at this point.
Actually this optimization is not needed at all.
If expectation is dying, then we will let it die by its own death.
Also in ip_conntrack_expect_insert() there is an initialization of
exp->timeout. And we can't just do add_timer() after that (as in add_timer()
we have BUG_ON(timer_pending(timer))), we must do mod_timer() instead.
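An illustrative fragment of the re-arming described above (the timeout
value here is hypothetical):

/* ip_conntrack_expect_insert() already initialized and armed exp->timeout,
 * so re-arm it with mod_timer(); add_timer() would hit
 * BUG_ON(timer_pending(timer)) on an already pending timer. */
mod_timer(&exp->timeout, jiffies + image_timeout);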
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 19dce010faff8960e80b1778afa9f4ad07dd365f
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:06 2010 +0300
CPT: restore mark value on conntracks
Restore mark value in conntracks as it is needed for connmark module.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7ec63fdedf332db285f71d857cf395da8cf674d5
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:06 2010 +0300
CPT: convert conntrack tuple from 2.6.9 kernel image
Add conversion for conntrack tuple from 2.6.9 kernel image.
Check for correct value is added in decode_tuple().
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c34d6367f6cc5ee7f60fdee828c41de7b633a779
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:06 2010 +0300
CPT: convert conntrack image from 2.6.9 to 2.6.18
CPT structure in image file for conntracks is different in 2.6.9 and 2.6.18
kernels (array cpt_help_data was enlarged in the middle of the structure), so
conntracks from 2.6.9 kernel are restored incorrectly on 2.6.18 kernel and
lead to kernel oops.
A simple conversion from 2.6.9 to 2.6.18 is introduced to restore conntracks
correctly on 2.6.18 kernel.
Bug #113290
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 21644501b4651df2c7f271cae528f1996fc23a8d
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:05 2010 +0300
CPT: create kernel threads in VE0 context
In the current implementation the master process which performs checkpointing has
owner_env set to VE0 and exec_env set to VE. All auxiliary kernel threads
are created with exec_env set to VE and owner_env set to VE0, so after
do_fork_pid() we have the following:
* new thread has owner_env == ve0, exec env == ve
* its pid belongs to ve (pid->veid != 0)
That is why if ve_enter() in thread fails, then we hit BUG_ON in
release_task -> detach_pid -> free_pid
sequence, since task owner env != pid's veid.
When enter succeeds the task's owner env becomes ve and this BUG_ON
is not triggered.
To solve this problem exec_env is switched to VE before kernel thread
creation and switched back after. The veid is passed to the kernel via args. All
kernel threads are created with CLONE_VFORK to be sure that the parent
process will not exit before doing exec() in the thread.
Bug #97124
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 686bb3916a1247b46893078f8d87b8df6b1e305a
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:05 2010 +0300
CPT: restore rlimits correctly during 32bit-64bit migration
During 32bit to 64bit migration rlimits were restored incorrectly due to the
different size of long on 32bit and 64bit archs. Now a simple conversion is
introduced in case of 32bit-64bit migration. Infinity values are restored as
infinity values. An error is returned if a value greater than RLIM_INFINITY32 is
found in the dump during restore on a 32bit arch.
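For illustration, a sketch of such a conversion (RLIM_INFINITY32 is an
assumed 0xffffffff constant here, not necessarily the value used by the patch):

#include <stdint.h>

#define RLIM_INFINITY32 0xffffffffUL
#define RLIM_INFINITY64 (~0ULL)

/* Widen a 32-bit rlimit, mapping infinity to infinity. */
static uint64_t rlim32_to_64(uint32_t v)
{
        return (v == RLIM_INFINITY32) ? RLIM_INFINITY64 : (uint64_t)v;
}

/* Narrow a 64-bit rlimit; a finite value above RLIM_INFINITY32 cannot be
 * represented and is reported as an error, as described above. */
static int rlim64_to_32(uint64_t v, uint32_t *out)
{
        if (v == RLIM_INFINITY64) {
                *out = RLIM_INFINITY32;
                return 0;
        }
        if (v > RLIM_INFINITY32)
                return -1;
        *out = (uint32_t)v;
        return 0;
}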
Bug #111965
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c3e4a29b420b871a6543955728b1f8a5de75e955
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:05 2010 +0300
CPT: restore packet control block from kernels with and without IPv6
A more generic mechanism for restoring packet control blocks. Unfortunately we
do not save the length of the control block in the dump and we can only try to calculate
it during restore. This method is based on the knowledge that the flags value in
the TCP control block is not zero for all packets in the queue.
Since this image version the TCP control block will be saved in IPv6 form
regardless of the IPv6 config option.
Restore of the control block is split in 4 ways for any IPv6 and non-IPv6
kernel combinations.
A check is added to be sure that all control blocks were restored in the same
way. If it is found that some control blocks were restored incorrectly,
then the undump process will be terminated.
Bug #111370.
Merged 4 patches sent earlier:
1. Increase image version.
2. Save TCP control block regardless to IPv6 config option.
3. Restore of control block is splitted in 4 ways...
4. Add appropriate comment on TCP control block restore procedure.
[xemul:
Added do { } while (0) around macro body
Mention Alexey in comment about skb_cb->flags being non-zero
]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 1f218bb8d606af3b95cd089b68b44800f91ac7d1
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:05 2010 +0300
CPT: add binfmt_misc fs in supported list
Just add binfmt_misc to the list of supported file systems. With this small
quick fix migration will be allowed, but all binfmt_misc entries will
be dropped during migration.
This fix is only for the time being. Later a generic mechanism for
checkpointing/restore of external modules will be implemented, and this quick
fix will be replaced with full support for binfmt_misc in CPT.
Bugs #100709, #101061
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 85da0ddab187bb9e6000ba6c98b7454095055799
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:05 2010 +0300
CPT: relax check for several bind mounts on the same mount point
Relax the check for special bind mounts which are mounted several times on the same
mount point. We need to check only the dentry; the mount check can be skipped in this
case.
We can't remove the mount check completely as there exist cases when we need
to check the mnt too. E.g. /dev is mounted with NODEV over /dev and some file is
opened from the underlying mount. If the mount check is removed, then we will be able
to checkpoint such a state, but we will not be able to restore it.
The correct solution would be to dump/restore the whole mount tree with overmounts,
but we can't implement this right now for a number of reasons.
Bug #84310
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit bc4769bb4acc7547f4e537b23a093019e78652d7
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:04 2010 +0300
CPT: fix reopen dentries procedure
Dentries were not reopened correctly during checkpointing and restore.
Two bugs fixed:
1. In case of huge files (more than 2Gb) dentry_open() returns -EFBIG if
O_LARGEFILE flag is not set. This flag should be used for temporary files
used during checkpointing and restore process.
Bug #99544
https://bugzilla.sw.ru/show_bug.cgi?id=99544
2. In dump_content_regular() we have the following code:
file = dentry_open(dget(file->f_dentry),
mntget(file->f_vfsmnt), O_RDONLY);
if (IS_ERR(file)) {
cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(file));
return PTR_ERR(file);
}
This results in a kernel oops if dentry_open() returns an error
(e.g. -EFBIG because of bug #99544)
Bug #99542
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 08b8f8ba476ec8e67b2eac74028fa5f4a3586c2f
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:04 2010 +0300
CPT: fix save/restore of open requests
Open requests were saved and restored sometimes incorrectly:
1. Family of open request was not saved (commented out)
2. Restore was broken, would crash because rsk_ops was cleared by memset.
3. And finally, all the coded restoring open requests was skipped.
Tested with http_load.
Bug #95113
http://bugzilla.openvz.org/show_bug.cgi?id=784
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0a6789976c6ff602e11a4f00123ae70b62738f21
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:04 2010 +0300
cpt: add lost dcache_lock protection around __d_path()
Protect __d_path() call with dcache_lock spinlock.
Protect other checks with env->op_sem semaphore.
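A minimal fragment showing the locking pattern described above (2.6.32-era
API; the root and buffer variables are illustrative):

/* __d_path() walks the dentry tree and must be called under dcache_lock;
 * without it a concurrent rename/umount can corrupt the walk. */
char *p;

spin_lock(&dcache_lock);
p = __d_path(&file->f_path, &root_path, buf, PAGE_SIZE);
spin_unlock(&dcache_lock);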
Bug #98833
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 22c792c3605e5d0f916308678319e25eb18cf4a6
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:58:04 2010 +0300
cpt: fix restore of inotify on symlink
Inside a VE the file /etc/mtab is a symlink to /proc/mounts.
The FreeNX server with KDE creates an inotify watch on the /etc/mtab file.
To restore such an inotify watch we need to obtain the dentry with path_lookup() and
restore the watch on it.
Bug #96464
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 66a6c3e51c35096b204b8866ee50afe0b1d13d59
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:58:04 2010 +0300
quota: compat layer for compat quota
This patch implements compatibility quotactls for old quota tools.
replace:
diff-fs-quotcompat-ia32emul-fix-20050921
diff-fs-quotcompat-comp-fix-20080710
diff-fs-quotcompat-xencomp-fix-20080806
diff-fs-quota-compat-proper-split-20081027
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 1b04f79cb59f8cd8fb1ca26e19a6a4e8295a088f
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:58:03 2010 +0300
ve: Don't check for CAP_SETVEID - use more ... imagination
This patch:
The proposed check correctly detects the root in ve0.
However, we lose the ability to create containers with
some fancy tool, that has the CAP_SETVEID capability
*only*, but we don't have such.
The cap itself is declared to be obsoleted, but there's
no need in rewriting vzctl in a rush - things will still
work. If we'll want to manipulate audit caps from the
vzctl we'll make it via features.
Overall history:
Don't ban CAP_AUDIT_XXX capabilities in container to make the
dbus-daemon work.
After two (maybe three) days of brainstorming me and Den finally
gave birth to this solution. So...
First of all AUDIT will be banned in container. Since dbus refused
not to set audit caps we don't want it to mess with it in any case.
Next step is to note, that CAP_AUDIT_CONTROL coincides with the
CAP_VE_ADMIN, which is not that bad (besides, dbus doesn't try to
set this one up) and we leave one alone.
And finally - the CAP_AUDIT_WRITE, which coincides with the most
delicate one - CAP_SETVEID. The latter one is explicitly dropped
on container start and there's no way to set one (dbus tries this
and fails) back. Simple "don't clear it" solution is too dangerous.
To handle *this* case we
1. replace all checks to capable(CAP_SETVEID) to more complicated,
but still matching ve0's root only;
2. don't ban the CAP_SETVEID (== CAP_AUDIT_WRITE == the_one_dbus_needs);
3. remember, that this capability is present on ve startup and thus
we automatically have the CAP_AUDIT_WRITE required by dbus;
4. carefully handle the case, when we enter container in do_env_create
and try to call fairsched system calls.
That's it. No fraud, just manual dexterity ;)
Bug #117448
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 153eca7d4bf56bd34e7c5957b1ff8ec331713a0b
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:58:03 2010 +0300
fairsched: Sanitize fairsched manipulations on ve startup
First of all we won't be able to call them after we fix
capability checks. Second, taking the fairsched
mutex 4 times on startup is overkill.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit e2fb9c79fd348a0603c4b881c4e1f179945b55b5
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:58:03 2010 +0300
ms: lutime lchmod syscalls
Add possibility to change owner/permissions on symbolic links
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0b7042d24abe59baba84a78e37b95a88624f9308
Author: Konstantin Khorenko <khorenko@openvz.org>
Date: Sat Feb 27 16:58:02 2010 +0300
ve-net: permit changing of netdev's tx_queue_len from inside a CT
In particular it makes OpenVPN happy.
Bug #457318
Signed-off-by: Konstantin Khorenko <khorenko@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit eb3139203f525babc452556dd5071c73382050dd
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:58:02 2010 +0300
venet: Core support for external ip filtering
Allow a VE to emit packets with a configured source IP address.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 98ec6de33c046e4f053c6b21152d3e07bead7804
Author: Marat Stanichenko <mstanichenko@openvz.org>
Date: Sat Feb 27 16:58:01 2010 +0300
vzethdev: stat tx dropped account
Veth get_stats() should return the number of tx_dropped packets
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 57a5848f98e677abefa203f9ad5f1b4bf3d28ace
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:58:01 2010 +0300
venet: add TSO support in venet and vzethdev
venet and veth support the checksumming and scatter-gather features, but TSO
support still wasn't added.
TSO increases bandwidth by up to 50% or appreciably decreases CPU usage.
Approach is the same as for checksumming:
1. TSO is off by default
2. For veth: tso can be enabled/disabled in VE or VE0 for
pair {veth in VE, veth in VE0}
3. For venet: tso can be enabled/disabled only in VE0 (for
all venet devices at once)
To use this feature just enable:
1. Tx checksumming: ethtool -K DEVNAME tx on
2. Scatter-gather: ethtool -K DEVNAME sg on
3. TSO: ethtool -K DEVNAME tso on
Some performance info (tested via netperf):
1. Traffic VE->VE0 (via venet), TCP STREAM test, message size 32K, socket size 256K:
TSO off 2300 10^6 bits/s
TSO on 5600 10^6 bits/s
Notes:
Admins need to set TSO on {venet,veth} only if the physical ethernet device supports TSO.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f0fe2ba7ff9d91a2bfef1ec95fddbeada5be14d3
Author: Vasily Averin <vvs@openvz.org>
Date: Sat Feb 27 16:58:01 2010 +0300
ve: Kill not-yet-closed TCP sockets on VE stop harder
Idea proposed by Alexey Kuznetsov <alexey@openvz.org>
tcp_v4_kill_ve_sockets() can hang in a loop because NFS can hold some sockets in
the host node rpciod/nfsdiod queues.
This patch resets such sockets if possible or delays their cleanup.
Changes in 20090429: fixed wrong locking and addressed other xemul@ notes.
Bug #429296
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5ad4c74a16b2f9812a1d79287bba724243454ecc
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:58:00 2010 +0300
bc: compat system calls for bc and fairsched
correct UB_MAXVALUE conversion and wire compat syscalls
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 20fd4dd54736b40a815ad07d34c4339d5c627f7e
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:58:00 2010 +0300
ub-dcache: sleep in dput
ub: dentry->dentry_bc.d_ub is unreliable after the sleep
d_kill can sleep inside. In this case the dentry->dentry_bc.d_ub saved before
is unreliable, as a dcache accounting-on event can happen during the sleep. In that
case we'll have saved ub == NULL and an OOPS/leak inside dcache_uncharge.
Another problem here is that we should decrement the inuse count on the
dentry appropriately.
Bug #116095
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 76038f85b0523d4d2a48b20b5443a81dee3531e4
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat Feb 27 16:58:00 2010 +0300
ve-fs: implement "ve-xattr-policy" sysctl entry
"ve-xattr-policy" sysctl entry allows to control how to react on xattr
change from inside of a container.
There are three options allowed:
0 - accept any xattr modifications (VE0 always and VE by default)
1 - ignore
2 - reject
Note that any other value assigned to "ve-xattr-policy"
leads to the "accept" policy being applied without any warning.
The sysctl is placed at /proc/sys/fs/ve-xattr-policy on HW node.
http://bugzilla.openvz.org/show_bug.cgi?id=1050
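For illustration only, a sketch of how such a three-way policy can be
applied (the names below are hypothetical, not the identifiers used by the
patch):

#include <errno.h>

enum { VE_XATTR_ACCEPT = 0, VE_XATTR_IGNORE = 1, VE_XATTR_REJECT = 2 };

/* Convention used in this sketch: return 1 to let the xattr operation
 * proceed (accept), 0 to silently drop the change (ignore), and -EPERM
 * to refuse it (reject). Any unknown value falls back to accept. */
static int ve_xattr_policy_filter(int policy)
{
        switch (policy) {
        case VE_XATTR_IGNORE:
                return 0;
        case VE_XATTR_REJECT:
                return -EPERM;
        default:
                return 1;
        }
}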
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5cab8bf42b5da73a02d5288951aeeec8fd8b4716
Author: Marat Stanichenko <mstanichenko@openvz.org>
Date: Sat Feb 27 16:57:59 2010 +0300
ve-kmsg: printk va copy add
Copy the args variable in the ve_printk() function.
x64 can corrupt the va_list after returning from the called function.
Bug #440939
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b55fc66f70948758037a4639e8a63663792ec1f5
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:57:59 2010 +0300
ve-kmsg: printk lockdep fixup
printk: fix lockdep warnings if the kernel is compiled with CONFIG_LOCKDEP.
vprintk() to a VE causes:
=====================================
[ BUG: lock held at task exit time! ]
-------------------------------------
iptables/8203 is exiting with locks still held!
1 lock held by iptables/8203:
#0: (sk_lock-AF_INET){--..}, at: [<ffffffff81213341>] ip_setsockopt+0x61/0xa0
stack backtrace:
Call Trace:
[<ffffffff8100b78a>] show_trace+0xca/0x3b0
[<ffffffff8100ba85>] dump_stack+0x15/0x20
[<ffffffff8105e469>] debug_check_no_locks_held+0x89/0xa0
[<ffffffff8103aa7e>] do_exit+0xe2e/0xe80
[<ffffffff8103aba0>] sys_exit_group+0x0/0x20
[<0000000000000001>]
Note: to reproduce this you can type in VE:
iptables -A INPUT -m tcp --dport 22 -j DROP
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 84ac295d2315ecf649e3910735d81e8d217396c3
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:58 2010 +0300
ve-proc: mangle mounts devname harder
mounts: show /dev/xxx devices near ve root mounts, rather than just xxx
Required for fixing autofs in a rhel5 container.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 454ad87b41380655cb31a85f682ddb8289e8e1f9
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:58 2010 +0300
ve-sysctl: randomize_va_space
virtualize sysctl kernel.randomize_va_space
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a44c3498bcf70065a85236b7daa77fe0320313f2
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:58 2010 +0300
ve-sysctl: add proc_dointvec_ve helper
Add a generic method for proc access to per-VE int values.
The extra1 field of ctl_table contains the offset of the data field from the
beginning of ve_struct. Without CONFIG_VE the address from the .data field is used.
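A minimal sketch of the offset trick described above (get_exec_env() and the
ctl_table fields come from the OpenVZ tree; the helper itself is illustrative):

/* Resolve the per-VE address of the value: extra1 carries the offset of
 * the field inside ve_struct; without CONFIG_VE the plain .data pointer
 * is used instead. */
static void *ve_ctl_data(struct ctl_table *table)
{
#ifdef CONFIG_VE
        return (char *)get_exec_env() + (unsigned long)table->extra1;
#else
        return table->data;
#endif
}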
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 34e6684b531637ad4fd34502d32f6e3c74e2dac6
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:57 2010 +0300
ve: drop oom immunity at enter
At CT enter, switch to the default OOM adjustment level if the task is OOM-immune.
It is a very bad idea to have OOM-unkillable tasks inside a container,
because all forked tasks inherit this setting.
The proc interface for changing the OOM adjustment (/proc/<pid>/oom_adj) is
already restricted in a CT by diff-ve-oom-adjust-20070604.
On some systems sshd gets OOM protection at start and does not drop it after fork.
(example: ssh root@HN -> vzctl enter -> restart apache -- apache now OOM immune)
(example from xemul@: ssh root@HN vzctl start - VE is now OOM immune)
http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=480020
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c7cf5c388378abf4d6e8e2e18c6c815eccab4fd7
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:57:57 2010 +0300
ms: ext4 use get host
Force ext4 page fault handlers to use the ->get_host callbacks.
This is required so as not to use the vzfs file in the ->page_mkwrite callback.
Bug #454968
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a7de88181858ae8f9ec51cee11ae7f955e76430d
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:57:57 2010 +0300
nfs: disable nfs-v2
nfs: disable NFSv2 as it is broken
According to Alexey: "who is going to turn v2 on, having
a v3, which works better, nearby?"
Bug #114720
Signed-off-by: Denis V. Lunev <den@parallels.com>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7805f36534f20e530fb84e83a360993ec78f3bb6
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:57:56 2010 +0300
ve: vfs sillyrename
The i_nlink count on private inodes after silly rename is 1. So, virtual inodes
gain i_nlink == 1 and remain in the unused_list instead of being cleaned.
Bug #114672 #112999
Signed-off-by: Denis V. Lunev <den@parallels.com>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d252a93b32d6d251fcc73863b75b91edaa801b95
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:57:56 2010 +0300
mm mmap zero length kludge
Return -EINVAL in case of a zero-length file to all applications except
rpm. For (legacy) rpm an address will be returned.
Such a hack is introduced just not to break compatibility with old
tools, sorry :(
Bug #74964
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 437d113149802cb91254246f29134e3ade55e411
Author: Alexey Kuznetsov <alexey@openvz.org>
Date: Sat Feb 27 16:57:56 2010 +0300
nfs: use file private macro
Minor fix to nfs which allows using vzfs over nfs mounts.
It survives the fsstress test. I think normal vzfs tests can be started
asap to catch the points of possible misbehaviour.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3c07eb700d9bbe7fd6b7dcf52103faf58ef4a035
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:55 2010 +0300
vzdq: cleanup fake qmblk destroy
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8d622018ad2a3d025576578c0838c18ebfd3fdab
Author: Konstantin Ozerkov <kozerkov@openvz.org>
Date: Sat Feb 27 16:57:55 2010 +0300
vzdq: qmblk dq_sem to mutex
vzquota: replace quota master block semaphore with mutex
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 769b3bbe8d7859d168b42daa35720f12372e10db
Author: Konstantin Ozerkov <kozerkov@openvz.org>
Date: Sat Feb 27 16:57:54 2010 +0300
vzdq: vz_quota sem to mutex
vzquota: replace master lock semaphore with mutex
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 085883fb2366ae47c84fb18aa50f832e93ab56aa
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:54 2010 +0300
vzdq: vzaquota proc nlink
Produce a correct nlink count for /proc/vz/vzaquota.
Use the count of mountpoints accessible from the VE as an upper estimate for
the count of subdirectories inside /proc/vz/vzaquota.
Concept stolen from vzdq_aquotd_readdir.
Disable enumeration in VE0 for performance reasons (like in _readdir and _lookup).
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b9a8ce596cba9f5161769ca0408c71f8e6a059c7
Author: Alexey Kuznetsov <alexey@openvz.org>
Date: Sat Feb 27 16:57:54 2010 +0300
vzdq: swap noquota
swap_inode did not do anything for inodes not covered by vzquota,
which was wrong. E.g. mkdir, which creates an inode with i_blocks != 0,
triggered the message "detached inode not in creation".
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 20d11fba2ae882456b343ae78f466e27cc19d000
Author: Alexey Kuznetsov <alexey@openvz.org>
Date: Sat Feb 27 16:57:54 2010 +0300
vzdq: nfs support
It works differently and requires a different interface.
Block accounting and quota check are separate now, we account
without checks and check for space in places, where an operation
could allocate more space.
Chunk-by-chunk:
1. Added a new operation - swap_inode. Normally, a virtual inode
is created/accounted/checked simultaneously. This is impossible for NFS.
So, each operation creating a new inode starts by allocating
space in quota using a dummy inode. If the operation succeeds and the real
inode is created, we swap the quota accounting information.
TODO: optimize out dummy inode. All that we need is qlnk.
2. DQUOT_CHECK_SPACE() to check that quota is not full.
3. DQUOT_SYNC_BLOCKS() to resync i_blocks obtained from NFS server
with our accounting.
4. is_nfs_root(). NFS does not have a root inode. Instead each mount
has a pointer to a disconnected inode. vzquota has to understand this.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fd4f6b28860495f939f10abfaec8f255797a4fe8
Author: Alexey Kuznetsov <alexey@openvz.org>
Date: Sat Feb 27 16:57:53 2010 +0300
vzdq: fix oops in inode_drop_call
I suppose this happens when vzcache moves to a template a file
which was not under vzquota.
Bug #97782
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 71208971e69657168517194564e045781b054526
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:57:53 2010 +0300
simfs: statfs on root
Do not use the s_root dentry of the underlying fs for statfs.
The real problem is that s_root on the NFS super block is crap.
Unfortunately, the original dentry (which is asked to be statfs-ed)
is not available at this point. The only visible solution for this
is to use the dentry which simfs points to.
Signed-off-by: Denis V. Lunev <den@parallels.com>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 11d902b2933c3292b8e1305e38e37c6419cb9cf2
Author: Konstantin Khorenko <khorenko@openvz.org>
Date: Sat Feb 27 16:57:52 2010 +0300
virtinfo hook in daemonize
#427726
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 95a5273372efb164d0b3a4ab6eefca8b671d13e4
Author: Andrey Mirkin <major@openvz.org>
Date: Sat Feb 27 16:57:52 2010 +0300
virtinfo add cpttest
Add VIRTINFO_SCP_TEST event to virtinfo calls
This will be responsible for checking CPT features
during checkpoint/restore process.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit e2e5984d43c91b3aa674123af73849e9643bffb3
Author: Konstantin Khorenko <khorenko@openvz.org>
Date: Sat Feb 27 16:57:52 2010 +0300
ve-proc: fake sysrq trigger
Add dummy /proc/sysrq-trigger file inside a Container
Oracle 11g Release 1 RAC tries to open it and refuses to start on failure.
Writing to the file inside a CT leads to nothing; the first 10 writes are logged.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fc17c7e942ccbcf6909ef9fdb7c4f170acaf1d72
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Sat Feb 27 16:57:51 2010 +0300
ve-proc: add devices
Proc: add empty /proc/devices to CT
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3cfd7ac2a553a88af0053a59ac9870f1ce82760f
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:57:51 2010 +0300
ve: decrease ve_struct size in case of huge nr_cpus
kstat_lat_pcpu_struct contains an array of NR_CPUS elements.
Replace it with alloc_percpu data, which helps to keep ve_struct
relatively small and prevents huge-order allocation failures.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 010370ec6b62618648c8b8882d3887e5e4073fc8
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Apr 26 17:22:10 2010 +0400
percpu: Return ve0/ub0 percpu-s back
With the DEFINE_PER_CPU and init-s made in proper place we can
use them as alloc_percpu-ed ones.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 541c4b4da4f9c522593f3fd622e5d20fa6a6b294
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:51 2010 +0300
ve: fix fs umount at ct stop
Don't umount the same mount multiple times on ct stop.
The umount_tree kill argument must be an empty list,
otherwise it can detach each vfsmount multiple times and
produce a negative d_mounted count on the mountpoint dentry.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 543578c2947332cda5aea3b195c4d6a80a3d317b
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:50 2010 +0300
ve: ptys idr mem leak
Plug minor memory leak in idr_layer_cache slab on ve start-stop
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 965adae71aaa774796aeac8087806b77bbb0709f
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:50 2010 +0300
ve: tmpfs virtualize default size
set default size to half of physpages from meminfo
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 79c0a2ab51af39b665f7e8162c26c5573eca1872
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:57:50 2010 +0300
ve: meminfo dont use subub
Get parent UB instead of sub-group one to calculate usage
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 223f044cc32146df3a5f6dc61aab2bd053277de8
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:50 2010 +0300
ve: move veinfo to vzmon
Since some people wish to run openvz w/o the venet device, but
the vzlist tool relies on the presence of the /proc/vz/veinfo file, the vzmon
module is a better place for this file.
http://bugzilla.openvz.org/show_bug.cgi?id=394
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f267ef18a62f50bd5293a876e43b89467c8253f4
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:57:49 2010 +0300
ve: virtualize binfmt-misc
Nothing special. The SUN JDK complains since it can't use binfmt.
Not serious, and java surely works fine w/o it, but just to
make it and its users happy let's virtualize binfmt_misc.
Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 1ff4faada1dabfdc4592e2824ce53a357373c83e
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:49 2010 +0300
bc: pb hash cookie
add random hash cookie to ub to use in pb_hash instead of non-random ub_uid
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 31f588463c8294df47ff6357829b286abd580782
Author: Marat Stanichenko <mstanichenko@openvz.org>
Date: Sat Feb 27 16:57:49 2010 +0300
bc: uncharge files harder
There is a chance that we do not start uncharging because
ub_barrier_farnr() is not hit for UB_NUMFILE and ub_barrier_farsz()
is not hit for UB_KMEMSIZE (SLM, for example, sets the ubc barrier to a
huge value).
This can lead us to a situation when two tasks are able
to consume all of UB_NUMFILE and UB_KMEMSIZE despite closing
their opened files.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 76cd7c1686940c2eeef94926e978b8893f9bb9e2
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:48 2010 +0300
ve: show proc swaps in ct
Fill the size/used values with the ones from the meminfo virtinfo notifier.
Show one fake swap partition (/dev/null) with the same size/used as in
/proc/meminfo. If --meminfo == none, show the overall swap statistics from the HN.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit bf8c54dbd1c7b09abdab952da58e1f2c8f439ea4
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:48 2010 +0300
ve: mangle swapinfo
Fill swap size/usage with data from UB_SWAPPAGES in meminfo notifier.
Don't show swap if the limit is unlimited (default state).
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 1c2b5b4b1cbaafa707cb56da94dd5099dbdcc73d
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:48 2010 +0300
cpt: bc resources array
Restore only the BC resources actually present in the cpt image.
Store UB_RESOURCES in cpt_beancounter_image while checkpointing
(leave all newly added resources with the default limits filled at BC alloc).
Change cpt_content of cpt_beancounter_image to CPT_CONTENT_ARRAY to detect
the structure version without bumping the cpt image version, because in old
images the __cpt_pad field (reused for cpt_ub_resources) is uninitialized.
Add the missing error handling inside rst_undump_ubc -- propagate errors
from restore_one_bc to the higher level.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7b8bbb51527e58abadcd0eeb3e7103ba4048a57f
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:47 2010 +0300
bc-swap: add swappages bc resource
The limit value will be used as the configured CT swap size to show
in /proc/swaps and /proc/meminfo. The default is UB_MAXVALUE.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit e7416bee163fb262076d9b7dfa93c0dbf304891d
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:57:47 2010 +0300
bc-rss: show how much page beancounters each bc has
Essentially, this is the per-UB rss value calculated
(unlike physpages and privvmpages) without taking sharing
into account.
With this statistic (shown via /proc/bc/XXX/vmaux:rss)
we can evaluate the portion of pages that are shared
across beancounters (i.e. CTs) like this:
(\sum (bc.rss + bc.tmpfs_respages) - \sum (bc.physpages)) /
(\sum (bc.rss + bc.tmpfs_respages))
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
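As a side note for readers of the formula above, here is a tiny standalone C sketch of the same arithmetic with made-up per-CT numbers; the struct and field names are illustrative, not the kernel's.

/* Hypothetical per-beancounter samples, illustrating the shared-pages fraction. */
#include <stdio.h>

struct bc_stat {
	unsigned long rss;             /* vmaux:rss                  */
	unsigned long tmpfs_respages;  /* tmpfs resident pages       */
	unsigned long physpages;       /* physical pages of this BC  */
};

int main(void)
{
	struct bc_stat bc[] = {        /* made-up values for two CTs */
		{ .rss = 50000, .tmpfs_respages = 1000, .physpages = 40000 },
		{ .rss = 30000, .tmpfs_respages =  500, .physpages = 25000 },
	};
	unsigned long mapped = 0, phys = 0;
	size_t i;

	for (i = 0; i < sizeof(bc) / sizeof(bc[0]); i++) {
		mapped += bc[i].rss + bc[i].tmpfs_respages;
		phys   += bc[i].physpages;
	}
	/* shared portion = (sum of mappings - sum of physpages) / sum of mappings */
	printf("shared fraction: %.2f\n", (double)(mapped - phys) / mapped);
	return 0;
}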
commit b03577fcbea66508aca033f9c9c78bc060c02c24
Author: Denis Lunev <den@openvz.org>
Date: Sat Feb 27 16:57:47 2010 +0300
bc-ioacct: define page_io_mark in right place
fix compilation without CONFIG_BC_IO_ACCOUNTING
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 35fe6d0b31e36227f572550dff53154491760fb1
Author: Marat Stanichenko <mstanichenko@openvz.org>
Date: Sat Feb 27 16:57:47 2010 +0300
bc-ioprio: sys_ioprio_set lost unlock
sys_ioprio_set() may exit without releasing tasklist_lock. Fix it.
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 2cba7730c015206352563731d9f25cd027bd88f5
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:45 2010 +0300
ve-proc: fix root entry nlink
* Add entries from the local tree, similarly to proc_getattr;
* Use the per-VE process count for the VE's root, rather than the
total number of processes in the system.
All of the above is an upper estimate, which is perfectly
fine for the 'find' utility.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a2a22de6b8939570239c99973d3be7fb2eb4e70a
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:45 2010 +0300
ve-proc: fix nlink in getattr
Fix the nlink correction in proc_getattr
and change it right in the stat buffer instead of the inode nlink.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f665309226859e081bcae5c0c7fd3a3bdd9ecfbc
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:45 2010 +0300
bc-proc: bc nlink count
Override getattr callback on /proc/bc and ubc entries to get correct nlink.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 85051b1c71ad37949ef448ff8ddb342b75d706b0
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:45 2010 +0300
bc-proc: add bc and sub-bc counters
Add a counter of UBCs, protected by ub_hash_lock.
Needed for correct proc nlink calculation.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d5ee7014d3f4995249cdadf3d00d1be778a3b10a
Author: Pavel Emelianov <xemul@openvz.org>
Date: Sat Feb 27 16:57:44 2010 +0300
bc-proc: fix sub-bc inode number
fix subbeancounter inode number calculations in /proc/bc
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b93ef081a586e08e226273599bcf7800907c731b
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:44 2010 +0300
simfs: compilation without quota
fix simfs compilation if CONFIG_QUOTA=n
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 4fa1e482478bcde0552e9a97db1ddca620ebbe05
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Sat Feb 27 16:57:43 2010 +0300
sysrq: smp nmi show regs v2
Rework NMI show-regs; make it clean and tolerant to NMI IPI losses.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit cab0d970b18692b61e62e2095392e63c5097bf29
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Apr 26 15:09:43 2010 +0400
sysrq: revert nmi ipi callback
The next patch will implement this in a less intrusive manner,
and without deadlocks on NMI IPI loss.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 6b5607eeec54fcef60c25fa7a72bc30f69446933
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Apr 16 12:34:01 2010 +0400
timers: Don't take task from the pid field of timer
http://bugzilla.openvz.org/show_bug.cgi?id=1461
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8893da4d5000630819ce4f5edf5bb71c8f65c01c
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Apr 15 17:25:09 2010 +0400
netfilter: Add headers missed by the previous commit
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f360aa43e0782543a6e8fece20b71ec25ef36568
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon Apr 12 23:41:46 2010 +0400
netfilter: Restore match/target support for revision 0
Revision 0 is still used in our VE templates (iptables-1.3.5)
so we just can't drop this kind of support. Bring it back.
What is restored:
- target "CONNMARK"
- target "TOS"
- target "MARK"
- match "connmark"
- match "tos"
- match "iprange"
- match "mark"
- match "owner"
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 81959d6fac075a62150d9e726f27e09254b3cf0e
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Apr 8 21:28:03 2010 +0400
OpenVZ kernel 2.6.32-atkov released
Named after Oleg Yur'yevich At'kov - a Russian cosmonaut.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b97a0293d29009398eb7abd2e9887ab64741b98d
Merge: 44e953b 1b6e168
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Apr 8 21:24:36 2010 +0400
Merged 2.6.32.11
Conflicts:
Makefile
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 44e953b5f69888d88ceaa236ddb41086404a9dbd
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu Apr 8 18:59:26 2010 +0400
netfilter: xtables: Return xt_conntrack v0 back
In commit 9e05ec4b1804a1ba51f61fe169aef9b86edcd3f7
revision 0 of xt_conntrack was dropped, which made
iptables-1.3.5 non-functional in a VE.
Bring the compatibility back.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 451c01314a07c01e55e79a8134e0adc55a584465
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon Mar 29 01:50:51 2010 +0400
iptables: Tables should be checked for permission via mask only
The sequence of module loading is not controllable by the kernel anymore
(due to the KSYM removal). As a result we may fail when testing whether a dependent
module is already loaded (allowed for usage via config) at the moment of granting
the module permission to run in a particular VE.
Instead, the _MOD bits are used as flags pointing out that the netns is borrowing
some resources and we need to release them at exit (we can't just fail in the
netns init/fini routines, otherwise the VE would not start at all).
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ea5540ba9694531c6a74092c29f5e61e393b9ac8
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon Mar 29 01:15:52 2010 +0400
netfilter: Introduce per-net stubs for l3proto_ipv4
Prepare the ground for future l3proto_ipv4 virtualization.
This snippet only sets up the pernet ops without any serious actions.
We still need to make the sysctls netns-compatible.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 2cf066b2d96b58e1594965d102d48cd3eb6d6168
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun Mar 28 03:24:36 2010 +0400
netfilter: Do not create NAT rules if not allowed
In case NAT is not allowed in a particular VE we just skip the creation
of tuples for such a VE (this way dropping this functionality).
Note that there is no need to set up VE_NF_CONNTRACK_MOD in
nf_conntrack_net_init. We are going to get rid of module dependency checking
by completely switching to the net-namespace functionality.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8d9ba8b69ff82a2c335f2cf860265b5240460228
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Apr 6 20:09:50 2010 +0400
meminfo: Initialize mi.si before passing it to virtinfo
Otherwise the listener (vecalls.c) will see garbage and may produce
bogus values in the VE's /proc/meminfo.
http://bugzilla.openvz.org/show_bug.cgi?id=1487
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3e16fd3fd9101ef59d38d60b989b62cb9a11df2b
Author: Sven-Haegar Koch <mail-openvz@sdinet.de>
Date: Tue Apr 6 15:39:02 2010 +0400
Fix compile error in mm/slab.c
Commit f385db6d4 (MM: Kmemsize accounting bits) introduced a syntax
error into mm/slab.c:
CC mm/slab.o
mm/slab.c: In function 'alloc_slabmgmt':
mm/slab.c:2687: error: too few arguments to function 'kmem_cache_alloc_node
mm/slab.c:2687: warning: left-hand operand of comma expression has no effec
mm/slab.c:2687: warning: statement with no effect
mm/slab.c:2687: error: expected ';' before ')' token
mm/slab.c:2687: error: expected statement before ')' token
make[1]: *** [mm/slab.o] Error 1
make: *** [mm] Error 2
http://bugzilla.openvz.org/show_bug.cgi?id=1486
Signed-off-by: Sven-Haegar Koch <mail-openvz@sdinet.de>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 2a40c7f120c9239ffa4cf7b151974351c2f54796
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun Mar 28 01:44:02 2010 +0300
iptables, nat: Add missing flag that namespace is used
Otherwise we may leak the namespace resource.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5186a0a6d011e4b8c2bbd2ca4cf1aa40507e2078
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Apr 5 14:28:45 2010 +0400
cpt: Save and restore task_user_gs().
Without it a 32-bit container stops right after the restore
due to a #GP.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 14a9729fab679c9c9f15e2ff44070806247b62c5
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Apr 2 23:00:10 2010 +0400
inotify: Return lost mntputs for inotify mnts
Otherwise we leak the mountpoint and sometimes much more.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0b1c1e340b9008360e0a0d4083702ec8546ec1bd
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Apr 2 22:59:12 2010 +0400
cpt: Fix 32bit version of cpt kernel thread creation
http://bugzilla.openvz.org/show_bug.cgi?id=1482
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 760a36fb468201cb4964c7a542172bda941fe755
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Apr 2 16:21:41 2010 +0400
devcg: Check for device permissions for DEV_ALL rules
Thanks to cd500819 commit :\
http://bugzilla.openvz.org/show_bug.cgi?id=1478
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit e6ad27ee6e87614512d44074d841df822453dda7
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu Mar 25 19:47:32 2010 +0300
iptables: Restore NAT functionality for node
During the migration to the new iptables management code
VE0 lost NAT table access. Restore it. Moreover,
VE0 gets the highest privilege -- i.e. its iptables
functionality is not filtered.
http://bugzilla.openvz.org/show_bug.cgi?id=1473
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 796e80e5b2edff524f40a608be511143b4c8c828
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Mar 19 11:17:45 2010 +0300
OpenVZ kernel 2.6.32-afanasyev released
Named after Viktor Mikhailovich Afanasyev - a Russian cosmonaut.
commit 97ae2f923c11e4cc83436878cdf5262781ae17d1
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Mar 22 17:14:57 2010 +0300
net: Virtualize tcp orphan count back
This oopsed before the previous fix.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5ae2ad90d7bbe29ea299955fad28491fedf10468
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Mar 22 17:14:23 2010 +0300
bc: Don't re-initialize BC after orphan count allocation
And do a small code rework.
http://bugzilla.openvz.org/show_bug.cgi?id=1471
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 1a8072e88d55070bd78fe960c6db36e5af61ea85
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Mar 22 17:13:48 2010 +0300
mm: Don't charge a failed allocation
I.e. don't oops :\
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f27cf7344f66277fc44462d2880d8a43c17f59ac
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Mar 19 12:06:09 2010 +0300
Compilation fix for ftrace
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 026c0e96af23e154b741645fe2d61a0278451268
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 18 19:30:31 2010 +0300
cpt: Add support for sit, ipip and ipgre tunnels.
These patches were dropped during the port to 2.6.26 (!) and now
it's time to resurrect them.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit db266821b6c7f7ee39b97fe58fd336b04d7d689f
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 18 18:35:01 2010 +0300
ve: Get rid of ksyms completely
The last user of it (do_env_free) is easy to handle since module refcounting
is kinda working properly ;)
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 69309430a023aa75a73c8ebd5b1952e733cd1c6a
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu Mar 18 18:05:01 2010 +0300
iptables: rework module management in VE v3
Most of the xt_ tables are per-net compatible, so there is no need
to put an access restriction on them from a VE.
Typically we need to restrict access to certain netfilter tables:
"nat", "forward", "mangle" and "conntrack".
Note that "conntrack" is not covered by this patch yet, i.e. it's
allowed in a VE regardless of the VE configuration.
Technical details of the patch:
1) nfcalls.h is almost completely unneeded anymore,
but we still have one user of it: ve.c + vecalls.c
2) the VE_IPT_CMP macro is wrapped with the inline helper mask_ipt_allow,
which allows checking the types passed in at compile time
3) net_ipt_module_set code beautification
4) the ve_xt_table_forbidden inline helper is introduced. It wraps the open
coded NULL pointer check for the sake of "grep"ability (pointing
out that the check is not mainline code)
5) vziptable_defs.h has been re-made for the sake of easier reading.
Backward compatibility is preserved.
6) VE_IP_ALL is introduced to be used instead of the open-coded "all allowed"
mask.
7) the no_module global variable is no longer needed; we don't use the legacy
symbol resolution technique. Net namespaces do all the work for us on their
own.
8) Because vzmon is a module we need do_env_free_hook (a side effect
of removing KSYMs) and the vzmon_mod pointer as well.
9) We no longer need init_ve_iptables, do_ve_iptables and fini_ve_iptables.
The reason is the same as in (7).
10) And finally the "nat", "forward" and "mangle" tables are controlled via
the VE configuration.
TODO
1) Control the "conntrack" table via the supplied VE configuration.
2) Get rid of KSYM completely
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d95919fda88c144bac75019f692c3f2f8e2b9d2e
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Mar 17 15:56:18 2010 +0300
config: Disable DEVTMPFS
http://bugzilla.openvz.org/show_bug.cgi?id=1469
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f492a5013944b559cd809565250df2027dbb2c51
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 16 14:34:19 2010 +0300
printk: Don't forget to reset printk_cpu
http://bugzilla.openvz.org/show_bug.cgi?id=1460
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit e3b97ad1f30de13c1a752522b4b75966ef515bab
Merge: b3e334e dd49f62
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 16 13:51:38 2010 +0300
Merge 2.6.32.10 from git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.32.y
Conflicts:
Makefile
fs/namei.c
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b3e334e592eeb6880b31ee327eab438328753949
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Mar 15 16:09:54 2010 +0300
cpt: Don't check for rtnl reply len on error
This kernel reports a cumulative len (error + original) when returning
an error, and thus this check is wrong.
OTOH, checking the message type is enough, so this check is also unneeded :)
http://bugzilla.openvz.org/show_bug.cgi?id=1456
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 727b3d18ef931d2f84c3e4588c6d6fa6a0ac7e51
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Mar 15 15:28:50 2010 +0300
sched: Fix loadaverage statistics
The nr_unint count only ever increased, which resulted in broken
loadavg statistics for the container.
http://bugzilla.openvz.org/show_bug.cgi?id=1459
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f6e9bb6efeb9b6d9e527dac472111ba313a0f2c5
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Mar 15 14:44:26 2010 +0300
ioctl: Print compat errors to the VE's log
http://bugzilla.openvz.org/show_bug.cgi?id=1455
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 83b44fc83ece6763cfc53c05e7a37cabc69a085b
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon Mar 15 11:58:08 2010 +0300
xt_recent: Fix missing member build bug
proc_old_dir was accidentally deleted, which leads
to a build failure. Restore it.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fc9b8cc0d03647d97224af6eaa587aebc9f2c07a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Mar 12 17:14:08 2010 +0300
sysfs: Fix error in sysfs deprecation
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0af39b9f83a556aae4a78744416a9d4af16109ba
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Mar 12 17:02:26 2010 +0300
vzdq: Moved vzdq to fs/quota/vzdquota dir
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f40134386ccead91baafb8c48690463bf6dd7845
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Mar 12 16:17:07 2010 +0300
sysfs: Make dynamic sysfs deprecation
sysfs has two layouts controlled by the CONFIG_SYSFS_DEPRECATED_V2
build option, but a build-time choice is not an option for OpenVZ - it should
work on both old and new distros.
The old_sysfs boot option allows selecting the sysfs layout to
be used on the host system.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 36550b06fe97dc87b4ea91c27139016adca6f1c2
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Mar 12 13:48:10 2010 +0300
BC: Fix lockedpages discrepancy
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7f92d5fa22a7564acd73a5b97b69a420ff1373ee
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Mar 12 13:21:28 2010 +0300
conf: Ban freezer and cpuacct cgroups.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 32ee7c243edd57598550dbb02ca96b212810cd9e
Author: Kir Kolyshkin <kir@openvz.org>
Date: Fri Mar 12 13:10:22 2010 +0300
mm/swap_state.c: fix a compiler warning
Fix the following warning introduced by commit 5198e6ea:
mm/swap_state.c: In function "add_to_swap":
mm/swap_state.c:161: warning: ISO C90 forbids mixed declarations and code
Signed-off-by: Kir Kolyshkin <kir@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 6d267858ed5a189626b4c85fdb30642c01f8aa49
Author: Maximilian Attems <max@stro.at>
Date: Thu Mar 11 20:44:18 2010 +0100
memory failure: fix up openvz compilation
due to 801183f17814b0f99fa907ca73c7908d623e3beb
s/for_each_process/for_each_process_all/
Signed-off-by: Maximilian Attems <max@stro.at>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7794fd2a2669b1f311df5b34908c71048181aa34
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 11 21:12:47 2010 +0300
inotify: Remove debugging printk-s from previous patch
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5fdc2442ff4cdc5c414d8444bad3901ebb57ff9d
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 11 21:10:37 2010 +0300
CPT: Reimplement inotify support
The approach stays the same - inotify watch object is tagged
with the path to the respective inode. No new tricks actually.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 2b3eae114ee37e6a106802856ffc86be13fff6cc
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 11 21:09:26 2010 +0300
BC: Create debugfs dentries in ub0 context
They are created for (e.g.) block devices' internal objects. Thus,
if some container opens one, the corresponding dentry will pin the
respective beancounter.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a8e93f6687f7b3ee6e72c994750c52866322ef89
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 11 21:07:56 2010 +0300
BC: Mark the fasync helpers cache as accountable
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 01dbf0ef9b8bb3c85c41a1227d89d9bc6a750446
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 11 15:31:50 2010 +0300
inotify: Removed lost debugging that broke compilation
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 58dffccd8590ddc83fe857c0f34b29bcf96846e3
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu Mar 11 13:57:55 2010 +0300
cpt: dump_one_process -- get rid of _TIF_ABI_PENDING
It was removed by 05d43ed8a89c159ff641d472f970e3f1baa66318
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 002fdeea3c2cff663452bb92035074bb8bbb84ac
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 11 13:47:52 2010 +0300
inotify: Don't leak user struct on inotify release
inotify_new_group receives a get_uid-ed user_struct and saves the
reference in group->inotify_data.user. The problem is that free_uid
is never called on it.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
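The underlying rule is ordinary reference counting: a stored get_uid()-ed reference must be paired with a free_uid() on the teardown path. Below is a minimal userspace model of that pairing, with hypothetical names; it is not the kernel code.

/* Minimal model of the get_uid()/free_uid() pairing; names are illustrative. */
#include <stdio.h>
#include <stdlib.h>

struct user_struct { int refcnt; };

static struct user_struct *get_uid(struct user_struct *u) { u->refcnt++; return u; }

static void free_uid(struct user_struct *u)
{
	if (--u->refcnt == 0)
		printf("user_struct freed\n");
}

struct inotify_group { struct user_struct *user; };

static struct inotify_group *group_alloc(struct user_struct *u)
{
	struct inotify_group *g = malloc(sizeof(*g));
	g->user = get_uid(u);          /* reference taken here ...               */
	return g;
}

static void group_release(struct inotify_group *g)
{
	free_uid(g->user);             /* ... must be dropped here, or it leaks  */
	free(g);
}

int main(void)
{
	struct user_struct u = { .refcnt = 1 };

	group_release(group_alloc(&u));
	free_uid(&u);                  /* drop the initial reference */
	return 0;
}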
commit fed090200cfbc5d409eb9d29d0dc40e1c249a046
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Mar 11 12:37:54 2010 +0300
BC: Uncharge locked memory at task exit
Now the locked vma-s are unlocked before unmap-at-exit.
Take this into account in order not to leak lockedpages.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 96524bb0f3f624341dd8b70404ae209277c701ae
Merge: 3beb2c5 7f5e918
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Mar 10 14:52:57 2010 +0300
Merge 2.6.32.9 from git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.32.y
Conflicts:
Makefile
fs/exec.c
include/linux/quota.h
include/net/netfilter/ipv6/nf_conntrack_ipv6.h
init/calibrate.c
kernel/futex.c
mm/mremap.c
net/ipv4/netfilter/nf_nat_core.c
net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
net/ipv6/netfilter/nf_conntrack_reasm.c
net/netfilter/nf_conntrack_core.c
commit 3beb2c56e32523a7954ab3f5ce246112354302a5
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 20:55:56 2010 +0300
BC: Don't uncharge orphaned SKBs
They are about to get their skparents soon.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fee24e3bcb3ef22212e29e9ea66dbea5adc4934f
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 00:32:56 2010 +0300
devpts: Virtualize devpts
Don't use -o newinstance since it's an idiotic approach - no
distributions are ready for the ptmx placed in devpts.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 33edc3590e81cc7266e6dc350bef93d0bd097c27
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 00:31:46 2010 +0300
nsproxy: Clone task's nsproxy pid namespace
During VE creation we need to clone the net namespace
later than the others (a proc issue), but by that time we don't
have the task's pids moved to the new pid space.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ef3d6f4fd66d324a46dde365bdeb0d82ae6299dd
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 00:30:12 2010 +0300
UB: Fix per-UB orphan count
First - initialize the ub0 one.
Second - temporarily ban the per-UB one since it oopses for a yet unknown reason.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d1f6b41859d20465ff2bc8fd8e2baf64dfb3a7b2
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 00:29:34 2010 +0300
autofs: Switch dev ioctl to handle new pid reference
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 51417889da6dff28c32ffac459b2fdcbce609615
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 00:29:13 2010 +0300
ve: Fix oops in ve percpu stats
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0cfcfacd9aa0f0f25a834e74cb34479e568f209d
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 00:28:41 2010 +0300
user: Minor fixes for per-ve user_namespace
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ace19e2fda3b73ab37bb8bed75c4281866dc1b9e
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Mar 9 00:27:04 2010 +0300
tcp: Switch tcp_v4_kill_ve_sockets to new nulls hash
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b95472e0485a55c1dd13b00c40f8b6f9726b5348
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:34:57 2010 +0300
CPT: New netdev cpt/rst scheme
This is not only a "new cool design" but also a compilation fix.
The new scheme is done in rhel5 kernel for all the devices we
have. This one doesn't affect tunnels, but we'll do them later.
Only compilation is checked, since I don't have a good box atm.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 983d7dc16bb875915dac21ca3066e785ebdf0cb6
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:34:01 2010 +0300
CPT: Misc compilation bits
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 804d171fcbec07112bf438ce2d4f9366fae69cbb
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:32:35 2010 +0300
CPT: Trivial networking cpt/rst compilation
Just switch to new variable names, new locking, etc.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit dadcd97c96e33796462a31e00c3a84c74a02425a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:27:17 2010 +0300
CPT: New rules of task_struct and Co manipulation
This includes:
- tsk->cred object
- signal->timers changes
- task's pgrp/session restoration (signal struct doesn't have
these IDs any longer)
The last item deserves a note: according to Alexey we restored
the task's gid and sid in two steps - first restore_one_signal_struct
fixed up orphan pgrps and sessions and later rst_process_linkage
did the rest. This is shitty. rst_process_linkage can do all
the work on its own. Hopefully this implementation will work.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3d64a26d56c453967de03a7d59bebfca53baace4
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:26:04 2010 +0300
CPT: New rules for VFS object manipulations
- file cred
- dentry_open args
- fs_struct refcount
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 263541a0557e7c193da15c511dbe5f493fd5fd85
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:24:46 2010 +0300
CPT: Mm AIO list has changed
Rework CPT accordingly.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c07bbe3184398421818e45e8b15338414f9b6407
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:24:10 2010 +0300
CPT: Turn off inotifies support
The inotifies themselves have changed.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0bc314b75d7d7ed7506e2a8a190d66587b398b76
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:18:26 2010 +0300
Export symbols for CPT
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 38a9b2d73336ddd8eb3a95f71bd14e83efb2933a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:16:29 2010 +0300
vzdq: Minor compilation fix
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 6775ea38231c3adefb37c8804dc1f92d157dbbf7
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sun Feb 28 20:15:11 2010 +0300
vecalls: User namespace is now created separately
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f2e0ec87c21362673bb087dbd35a2cb80657a995
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sat Feb 27 19:54:51 2010 +0300
Temporarily ban percpus for ub0 and ve0
The percpu allocator doesn't work that early.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8ae771d6a8110dca4edb4a0b1fba91fcee5d67bb
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Sat Feb 27 19:54:14 2010 +0300
Fix class_kset access for ve0
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 696a26c8a79a54d7f7a1adf37aabe974c8ec5ce7
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 20:09:49 2010 +0300
Compilation: One more header for x86_64
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 44709fd26a8187ec6ea098ef89bd09b2ffdb5d44
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:39:59 2010 +0300
Compilation: Headers fixes
Missed, changed or misplaced.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7e3e2676d338551af81c4c5e1820c40898df4341
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:39:27 2010 +0300
Compilation: Cover the 2.6.32 API changes
New functions, new constants, etc.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 770ec7ef472f909386cf622e14fe77a50e407c5c
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:34:52 2010 +0300
Compilation: Trivial mistakes performed while porting patches
Misprints, missed prototypes, missed function arguments, etc.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 39f8db35fe7b88cded819a5cf87dba5a45a02400
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:27:58 2010 +0300
VE: Remove ipv6mibs kludges
We have them netns-ized already.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 84b7cad3268230c36539998e2293f71ca21622b5
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:27:08 2010 +0300
VE: Remove devpts kludges
Will need to rework them.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ac2d5d623a6ff183139643e5baa85438ffca3c5c
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:26:19 2010 +0300
misc: Remove unused code fetched with port
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit bad80e078f40bb43d6c7161f8cd4a7042562f9da
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:24:25 2010 +0300
VE: Switch ve's init caps according to new creds api
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5dd5ebdd7e1e1b0176e7c681697a81c219b24835
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:22:36 2010 +0300
VZDQ: New sb->flags usage semantics
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ed915c647d05d3bdddb4c96c42565f64b9af9dca
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:21:10 2010 +0300
venet: Don't reset skb dst in xmit callback
This is done automatically now.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 60e08953c7505aec749b62d36f42d293be014884
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:19:56 2010 +0300
IPC: Use existing shmem_lock fn
And fix the one to charge pages properly.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 944ca7eb47c57b230877eeda36fb569de9390d8e
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:16:54 2010 +0300
venet: Switch venet and vzethdev to using net_device_ops
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d680fa424510524993ba1016aa4dd27b05817884
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:15:13 2010 +0300
UB: Turn ub_orphan_count into percpu_counter
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 456684681e42f257ca1614be9871a4c23f17b2f8
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:13:35 2010 +0300
Get rid of our static percpu kludges
We neither love them nor can implement them with the new percpu engine.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit bfda2fd34a32b692d130a68abf474c4965eb7898
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:11:37 2010 +0300
BC: Get rid of ioprio finally
We don't have it working :(
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 1c32aca49a66448c44c2c525b91322e796d955ef
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:10:20 2010 +0300
VZ: Turn vzmond thread into kthread API
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fcaea413672b6069ea397a708b2133c903282eff
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 26 18:07:33 2010 +0300
tun: Don't use sock_alloc_send_pskb
We don't have pages accounting fixed properly for it.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 6c0f99507e5dd78ac699c4e6266c85a6cbd2faa6
Author: Vitaly Gusev <vgusev@openvz.org>
Date: Tue Feb 9 20:22:01 2010 +0300
cpt: add signalfd support
http://bugzilla.openvz.org/show_bug.cgi?id=1424
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit abddb6ad5d23542ae2880d2a0b211ae77657224f
Author: Marat Stanichenko <mstanichenko@openvz.org>
Date: Wed Feb 3 20:22:40 2010 +0300
printk_cpu has to be "cleared" in __vprintk (v2)
http://bugzilla.openvz.org/show_bug.cgi?id=1284
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3aad60a7d9451a78d4b44c6539cae85d6ed799a5
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed Jan 20 21:18:47 2010 +0300
net,bridge: add support of VE_FEATURE_BRIDGE
Bridge should be available by default only on the host node
and in VEs with the proper feature set.
Note: we reserve VE_FEATURE_IPGRE for now.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7b3da8cc50f4abe02e8bd672e52015e5b186de50
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed Jan 20 21:04:11 2010 +0300
net,bridge: Allow bridged packets to cross VEX-VE0 bound
In the case of bridged packets we should allow packets
with spoofed MACs to cross the VE0-VEX boundary and back.
WARNING: as soon as vethX becomes a bridge port it's
no longer secure against sending/receiving packets with a
spoofed MAC address to/from the HW node (i.e. VE0). So please
be very careful when providing a VE with the "bridge" feature
turned on.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5d26df93201509a12f890fb83559736196b98c9a
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun Jul 19 00:31:44 2009 +0400
net,vzethdev - reset bridge mark
If a packet is emitted by a bridge we should
clear the BR_ALREADY_SEEN mark so it can be
handled by the recipient.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ef8abb270578b9ec9362e6c77c055b16380ecedd
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun Jul 19 00:29:13 2009 +0400
net,vzethdev - don't assign skb->dev twice
skb->dev is assigned via eth_type_trans anyway
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 41f7aa140793e8382ee2b4eeb9f2279bc2e0e973
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun Jul 19 00:26:31 2009 +0400
net,veth - reset skb->brmark in veth_xmit
If an skb is marked as BR_ALREADY_SEEN and is going through
a VE (a transit packet), then on the way out of the VE this packet will
still be marked as BR_ALREADY_SEEN.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 56329ced6087f1611246fda8aa56366f1e126188
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Fri Jan 22 19:09:10 2010 +0300
CPT: new tap filter support
Tap filtering was reworked in mainstream commit f271b2cc.
This patch adds support for it and a converter from the old image.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f841999e14c44a19f73ede27b083fa0e9439e53b
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:14:18 2008 +0400
cpt: tun device has advanced filtering logic now
The old PROMISC-vs-addr-and-mac filter is now rewritten.
Right now it's OK not to deal with it (hardly anyone uses
it so early).
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f6cc5e3de3217e2a580a41edac90f5a67aad33d7
Author: Vitaly Gusev <vgusev@openvz.org>
Date: Fri Jan 22 18:55:08 2010 +0300
Fix bad throughput of TCP connection after live migration
After migration sk->sk_gso_type is set to 0. Due to this, sk_can_gso()
returns 1, and the tcp layer thinks that the socket supports GSO.
As a result, some outgoing packets are TSO packets. If the physical ethernet
device doesn't support TSO, then the big ethernet packet will be dropped.
Notes:
A. The big ethernet packet is dropped on the non-TSO real physical ethernet
device, because:
1. The packet will not be split into several small packets, as the GSO layer
doesn't handle this packet properly (skb_gso_ok(skb) returns 1).
2. The total packet length > MTU (for instance: MTU 1500, length 2962).
B. UDP sockets don't use sk->sk_gso_type.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
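A toy model of the failure mode described above; the actual fix lives on the restore path, and the struct and helper names below are illustrative, not the kernel's.

/* Toy model of why a zeroed sk_gso_type misleads the stack; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define MODEL_GSO_TCPV4 1        /* illustrative constant, not the kernel's */

struct sock_model {
	int gso_type;
	int mtu;
};

/* Mimics the spirit of sk_can_gso(): a zeroed type must not count as capable. */
static bool model_can_gso(const struct sock_model *sk)
{
	return sk->gso_type != 0;
}

int main(void)
{
	struct sock_model sk = { .gso_type = 0, .mtu = 1500 };  /* as left by restore */
	int payload = 2962;       /* the size quoted in the commit message */

	if (!model_can_gso(&sk))
		printf("gso_type == 0: must segment %d bytes down to %d\n",
		       payload, sk.mtu);

	sk.gso_type = MODEL_GSO_TCPV4;   /* what a correct restore re-establishes */
	if (model_can_gso(&sk))
		printf("gso_type restored: large sends may be offloaded again\n");
	return 0;
}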
commit 728cfff28426ab443602fc3166886a9cf8ec6226
Author: Denis Lunev <den@openvz.org>
Date: Fri Jan 22 18:48:26 2010 +0300
ub: incorrect skb is charged in tcp_send_synack
New one should be charged rather than old.
http://bugzilla.openvz.org/show_bug.cgi?id=987
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 77b6f2c55d0b122e9b538002a4579f956cfebcde
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon Jan 11 18:44:43 2010 +0300
sysrq.c -- trivial cleanup
Use tabs instead of spaces.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 178b81eb4c92bd9de1e16b6957f79ba8697e3e46
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon Jan 11 18:42:24 2010 +0300
SysRq debugger -- trivial misprint fixup
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fbdbae997bde9dabd6d1eab2592bc9c929108618
Author: Vitaily Gusev <vgusev@openvz.org>
Date: Mon Oct 12 18:53:30 2009 +0400
cpt: dump inode content for shm_file_operations
If file->f_op == shm_file_operations then the cpt code doesn't
dump the inode content; it dumps it only for f_op == shmem_file_operations.
Bug http://bugzilla.openvz.org/show_bug.cgi?id=1342
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 4ebe9f29dab354b80d5f0ba26365543cdf060031
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu Jul 16 19:22:21 2009 +0400
slab: kmem_cache - fixup members and use helpers
For the sake of SLAB memory manager usability we need
to add beancounter members and use the proper helpers.
[ bug http://bugzilla.openvz.org/show_bug.cgi?id=1232 ]
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 096e829b9ac73cec435e23fe0ab703d836ff4239
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu Jul 16 21:14:29 2009 +0400
net,tcp - CONFIG_USER_RESOURCE doesn't exist anymore
CONFIG_USER_RESOURCE is deprecated and we should use
CONFIG_BEANCOUNTERS instead.
http://bugzilla.openvz.org/show_bug.cgi?id=1275
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ca151e1d2ada3cd195a1e5b582360b8e4355cc33
Author: Kir Kolyshkin <kir@openvz.org>
Date: Mon Mar 23 19:53:59 2009 +0300
cpt: enable only for supported arches
Since checkpointing is only supported on x86 (both 32 and 64) and IA64,
it makes no sense to enable it by default on all arches. This
also makes 'make defconfig' uncompilable.
The fix is easy: make VZ_CHECKPOINT depend upon the supported arches.
Signed-off-by: Kir Kolyshkin <kir@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 39ee2ebdcd179700a04d878ab68a44192896401b
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Wed Jun 3 16:34:08 2009 +0400
pidns: pi-futex pid check fixup
fix WARN_ON condition
port rh5 patch from Stanichenko Marat <mstanichenko@openvz.org>
http://bugzilla.openvz.org/show_bug.cgi?id=1262
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5f5143937566b52a8ce3d040c7d885ef0a417c38
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue Apr 7 21:59:25 2009 +0400
net: bridge - set_via_phys_dev_state should return int value
While playing with bridge code I found the following
compiler warning
| CC net/bridge/br_sysfs_br.o
|net/bridge/br_sysfs_br.c: In function store_via_phys_dev_state:
|net/bridge/br_sysfs_br.c:199: warning: passing argument 4 of
|store_bridge_parm from incompatible pointer type
This is due to store_bridge_parm using an int-returning callback.
Fix it.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 9a3ddc21651d5f77616f56f0bbec2303c1265250
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue Apr 21 16:00:07 2009 +0400
net: bridge - process skbs that have already been substituted due to via_phys_dev
When via_phys_dev is enabled we substitute skb->dev with
master_dev and pass it back to the bridge code. Instead of
dropping such an skb we should pass it up to the network stack
to process.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 13d8afaafce0229e06244fb36ad93464435af77b
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Apr 10 15:22:09 2009 +0400
namespaces: ban netns creation even for ve0's root
Net namespaces do not work with sysfs yet, so creating one
can be fatal.
http://bugzilla.openvz.org/show_bug.cgi?id=1234
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 797b5ff880ec074424d5d7e2758553d21a00030f
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Apr 3 17:59:56 2009 +0400
ve: show task's vpid and veid even inside a container
Getting task real virtual :) pid is tricky in 2.6.26 and above...
http://bugzilla.openvz.org/show_bug.cgi?id=1223
http://bugzilla.openvz.org/show_bug.cgi?id=1224
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c4acecd46e5ec5cac30070469a2d65b05922f468
Author: Denis V. Lunev <den@openvz.org>
Date: Mon Jul 14 11:04:29 2008 +0400
ubc: uncharging too much for TCPSNDBUF
It is not allowed to go to the wait_for_memory label with chargesize != 0
when this space has already been placed into the skb.
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0940e60191e93a0948bf451ef0d87f9a7ac59fbb
Author: Denis V. Lunev <den@openvz.org>
Date: Mon Jun 30 11:05:14 2008 +0400
Endless loop in __sk_stream_wait_memory.
The loop in __sk_stream_wait_memory when tcp_sendmsg asks to wait for
TCPSNDBUF space is endless when the timeout is not specified. The only way
out is to queue a signal for that process.
Let's return a status flag from ub_sock_snd_queue_add saying that UB space is
available. This is enough to make a correct decision to leave the loop.
Signed-off-by: Denis V. Lunev <den@parallels.com>
Signed-off-by: Pavel Emelyanov <xemul@sw.ru>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
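A plain-C sketch of the shape of this fix; the names below are only modeled on the UBC code, not taken from it. The enqueue helper reports whether the requested space is in fact already available, giving the untimed wait loop an exit besides a signal.

/* Plain-C model of returning a "space available" status from the enqueue helper. */
#include <stdbool.h>
#include <stdio.h>

struct ub_model {
	long held;       /* currently charged  */
	long barrier;    /* allowed maximum    */
};

/*
 * Modeled on ub_sock_snd_queue_add(): queue the sock for a wakeup and
 * report whether the space it asked for is in fact already available.
 */
static bool snd_queue_add(struct ub_model *ub, long size)
{
	return ub->held + size <= ub->barrier;
}

static int wait_for_sndbuf(struct ub_model *ub, long size)
{
	int spins = 0;

	/* Without the status flag this loop could spin forever when no
	 * timeout is set; with it, "space already there" ends the wait. */
	while (!snd_queue_add(ub, size)) {
		if (++spins > 3)            /* stand-in for "interrupted by a signal" */
			return -1;
		ub->held -= 10;             /* stand-in for memory being released */
	}
	return 0;
}

int main(void)
{
	struct ub_model ub = { .held = 95, .barrier = 100 };

	printf("wait result: %d\n", wait_for_sndbuf(&ub, 20));
	return 0;
}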
commit 5591a7de607e089e1ade3c6fc6b1fef9f521824a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Apr 3 17:13:14 2009 +0400
ptrace: ban ptracing of a container init from inside the container
The current ptrace engine suffers from strange problems, one of which
is described in bug #1222 - init ends up in the T state after an incorrect
tracer detach.
Fixing it is not that easy, but since ptracing a container's init was never
allowed before, it's OK to ban this (for a while?).
http://bugzilla.openvz.org/show_bug.cgi?id=1222
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f1ff7b5a9b5600e8584fa22ef842898303b35b00
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Fri Mar 27 15:03:57 2009 +0300
VE: fix idle time accounting
Make both accounting paths symmetric: idle time is accounted as idle or iowait,
depending on the number of tasks in the iowait state.
http://bugzilla.openvz.org/show_bug.cgi?id=1217
(#114633)
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3d1ae2f4477c0a5f048f4ee6ce1aab8b5fdd703b
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Mar 10 15:55:35 2009 +0300
NETLINK: disable netns broadcast filtering
There is only one uevent_sock in init_net for all VEs.
Broadcasts are already filtered by the exec_env comparison, so drop the netns check.
http://bugzilla.openvz.org/show_bug.cgi?id=1195
http://git.openvz.org/?p=linux-2.6.24-openvz;a=commit;h=0474535acfde6a
Signed-off-by: Alexey Dobriyan <adobriyan@openvz.org>
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit bd327cf59d64d5ba4ca33707a90c5cb048d80aee
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Mar 10 14:31:18 2009 +0300
pidns: zap ve process only when killing ve's init pid-ns
This prevents task genocide when zapping a nested pid-ns in the same VE,
and affects ve0 only.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 50b28ee671404e4de0dc65e51a31b434e98365f2
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Mar 10 14:27:17 2009 +0300
pidns: lost task debug print uses wrong prototype
Print most interesting fields manually.
http://bugzilla.openvz.org/show_bug.cgi?id=1181
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 235eec98d609a1c55c23ca159a5316d4de85b1d2
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed Mar 18 18:48:28 2009 +0300
bc: fix permissions on /proc/bc
Reading /proc/bc/* is permitted only for those who
have the CAP_DAC_OVERRIDE and CAP_DAC_READ_SEARCH capabilities
set. We should not mark the files as "group" or "other"
readable/executable since they are not.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 76d5b985f75ce7053091d4795969777fd8a9e2d4
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Feb 24 16:57:05 2009 +0300
ve: fix sysfs warnings in case CONFIG_SYSFS_DEPRECATED_V2=n
http://bugzilla.openvz.org/show_bug.cgi?id=1179
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8ce64ebab8668adefd6454c44769432a19a03e13
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Feb 24 16:47:23 2009 +0300
pidns: update leader_pid at pidns attach
After commit fea9d17, it_real_fn sends SIGALRM to task->signal->leader_pid
(used for sys_alarm(...) and sys_setitimer(ITIMER_REAL,...)).
Thus, the __pid_ns_attach_task hack-n-dirty cross-pid-ns task movement must
update this pid too.
http://bugzilla.openvz.org/show_bug.cgi?id=1160
127384
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3550d9ce088e15e0d1e65640e833d1b95702dc5a
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Feb 3 13:57:32 2009 +0300
simfs: don't work with buggy input
Some (buggy) filesystems (aufs for example) pass NULL as mnt to getattr
and hope for the best...
Let's not confuse the user with an oops, at least.
http://bugzilla.openvz.org/show_bug.cgi?id=1054
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5f162271db7a2bef678127fd164c7b3645730695
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Jan 26 15:48:02 2009 +0300
Free skb->nf_bridge in veth_xmit() and venet_xmit()
We free skb->nfct in veth_xmit, but we also have to free skb->nf_bridge.
Note: why does it work in 2.6.24-ovz but not in 2.6.26-ovz?
1. The issue exists only if BRIDGE_NETFILTER=y.
2. nf_hook_register() affects all VEs in 2.6.26-ovz
(in 2.6.24-ovz it doesn't).
Thus the bridge hook ip_sabotage_in is not called in 2.6.24-ovz, but
is called in 2.6.26-ovz.
http://bugzilla.openvz.org/show_bug.cgi?id=1146
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c600236484edea22668c027c9f0e42d78f29ead6
Author: Konstantin Ozerkov <kozerkov@openvz.org>
Date: Fri Jan 23 17:43:33 2009 +0300
Fix broken permissions for Unix98 pty.
This bug is not very critical because modern software can
automatically choose between a legacy pty and a Unix98 one.
Signed-off-by: Konstantin Ozerkov <kozerkov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8207275f91f0b1849edc8cfdcfab4565997ce758
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu Feb 19 23:14:39 2009 +0300
net: PPPoE, PPPoL2TP - don't create sockets without
VE_FEATURE_PPP
If the environment has no VE_FEATURE_PPP feature set
we should prohibit the creation of PPPoE and PPPoL2TP
sockets.
http://bugzilla.openvz.org/show_bug.cgi?id=1187
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 9319ef007e558a552b46e7336214495ac7025d20
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Feb 24 17:15:18 2009 +0300
cpt: fix sysvipc lock init
Commit cbb502 broke the sysvipc restore sequence: kern_ipc_perm was left uninitialized.
This patch:
* fixes kern_ipc_perm initialization and locking
* fixes the rcu_read_lock balance
* removes the inexact id from the idr
* properly propagates EAGAIN from idr_get_new_above to the higher level
lockdep output:
CT: 101: stopped
CT: 101: started
=====================================
[ BUG: bad unlock balance detected! ]
-------------------------------------
init/6760 is trying to release lock (<NULL>) at:
[<ffffffff8110740f>] newary+0x11c/0x148
but there are no more locks to release!
other info that might help us debug this:
1 lock held by init/6760:
#0: (&ids->rw_mutex){--..}, at: [<ffffffff81105744>] ipcget+0x5a/0x1db
stack backtrace:
Pid: 6760, comm: init Not tainted 2.6.27.10 #5
Call Trace:
[<ffffffff8105e620>] print_unlock_inbalance_bug+0x100/0x110
[<ffffffff8105bfe6>] ? trace_hardirqs_on+0xd/0xf
[<ffffffff81128601>] ? get_from_free_list+0x45/0x50
[<ffffffff811287db>] ? idr_get_empty_slot+0x169/0x24d
[<ffffffff8105e6d3>] lock_release_non_nested+0xa3/0x20e
[<ffffffff8105bfe6>] ? trace_hardirqs_on+0xd/0xf
[<ffffffff8110740f>] ? newary+0x11c/0x148
[<ffffffff8105e9b1>] lock_release+0x173/0x1a2
[<ffffffff812bc90e>] _spin_unlock+0x1e/0x2a
[<ffffffff8110740f>] newary+0x11c/0x148
[<ffffffff81105751>] ipcget+0x67/0x1db
[<ffffffff8105bfe6>] ? trace_hardirqs_on+0xd/0xf
[<ffffffff81107281>] sysvipc_setup_sem+0x57/0x59
[<ffffffffa023a59e>] ? file_pread+0x54/0x88 [vzrst]
[<ffffffff811072f3>] ? newary+0x0/0x148
[<ffffffff81107110>] ? sem_security+0x0/0x8
[<ffffffff81107118>] ? sem_more_checks+0x0/0x16
[<ffffffffa0248180>] rst_sysv_ipc+0x11e/0x4bc [vzrst]
[<ffffffffa024408c>] ? fixup_unix_address+0x2f/0x92 [vzrst]
[<ffffffffa0244c05>] ? open_socket+0x482/0x6e9 [vzrst]
[<ffffffffa0245b35>] ? rst_sockets+0x773/0x83a [vzrst]
[<ffffffffa0238bc8>] hook+0x8e3/0x1c4e [vzrst]
[<ffffffff81059e5f>] ? trace_hardirqs_off+0xd/0xf
[<ffffffff8107cbf7>] ? mempool_free_slab+0x12/0x14
[<ffffffff81059e5f>] ? trace_hardirqs_off+0xd/0xf
[<ffffffff81011dc8>] ? native_sched_clock+0x76/0x88
[<ffffffff8105bfe6>] ? trace_hardirqs_on+0xd/0xf
[<ffffffff8105bfae>] ? trace_hardirqs_on_caller+0xf9/0x124
[<ffffffff8105bfe6>] ? trace_hardirqs_on+0xd/0xf
[<ffffffff812bc8a4>] ? _spin_unlock_irq+0x2b/0x30
[<ffffffff8105bfae>] ? trace_hardirqs_on_caller+0xf9/0x124
[<ffffffffa02375ae>] child_rip+0xa/0x14 [vzrst]
[<ffffffff8100c73f>] ? restore_args+0x0/0x30
[<ffffffff8105bfe6>] ? trace_hardirqs_on+0xd/0xf
[<ffffffffa02382e5>] ? hook+0x0/0x1c4e [vzrst]
[<ffffffffa02375a4>] ? child_rip+0x0/0x14 [vzrst]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5f58d37a08228a4fd20c7bf76ac7f25a7cb10d03
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Feb 24 17:05:35 2009 +0300
cpt: fix sysvsem undo list init
The NULL-terminated list was replaced with a struct list_head by commit
4daa28f since 2.6.27.
Sync the initialization with the native get_undo_list(...) constructor.
http://bugzilla.openvz.org/show_bug.cgi?id=1172
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit be0408a1c55dc22cbcb9b1b8e4e54d884bce8ee6
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed Feb 4 19:53:56 2009 +0300
net: vz_security_family_check - allow PF_PPPOX
Since we support PPPoX now, we shouldn't prohibit
the creation of such sockets.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 175327660b42f51185f692fbe951d6e84beff882
Author: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed Feb 4 19:53:51 2009 +0300
ppp: introduce new VE_FEATURE_PPP feature
We will need this feature to manage PPP virtualization.
ve0 has this feature as well.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 080d6d5b976356b4da767a5881809603f0e11a3b
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Feb 4 14:16:42 2009 +0300
Ban resource_counters and fix compilation when they are on
The resource counters are somewhat opposed to the beancounters we
have, so ban them for a while. Besides, if we turn them on and start
using the memory controller we'll get broken compilation in kernel/exit.c.
http://bugzilla.openvz.org/show_bug.cgi?id=1163
http://bugzilla.openvz.org/show_bug.cgi?id=1154
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 44d71b98657d7f2f15ff21fe5d15fc594861ea78
Author: Vitaly Gusev <vgusev@openvz.org>
Date: Wed Feb 4 14:12:52 2009 +0300
Fixed compilation error with CONFIG_KGDB=y
http://bugzilla.openvz.org/show_bug.cgi?id=1163
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 4cadfac75a76fb25c97611ac4acff301e3a0121d
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 2 19:24:00 2009 +0300
ub: fix compilation for SLAB allocator
The kmem_cache struct is private to the slab.c file, so any
dereferences failed. Fixed by using in-slab.c helpers.
http://bugzilla.openvz.org/show_bug.cgi?id=1144
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 2d596de10608185454b1c5f4e673c90e6c2e11fd
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Wed Jan 28 15:26:40 2009 +0300
Pass vpid to sys_fairsched_mvpr()
sys_fairsched_mvpr expects a vpid, but after cpt_dump() current->pid is the
task's global pid. Thus do_env_enter() returns -ESRCH.
Not relevant for earlier kernels, since they used find_task_by_pid,
which in turn is now deprecated.
http://bugzilla.openvz.org/show_bug.cgi?id=1155
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit b8c70094ecfbae2f5830c7478ae2a5906d5bfcff
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Thu Jan 29 11:59:26 2009 +0300
fix autofs4 ia32 compat
autofs4_notify_daemon is called in the context of the task accessing autofs,
thus checking the current task's bitness is wrong in a mixed environment.
Store the autofs daemon's compat mode in the superblock at mount time.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 9a6dd49f6123c2c222e3b3bd08887820e9330a35
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Thu Sep 25 13:06:13 2008 +0400
conntrack: fix oops in nf_ct_frag6_gather
skb->dev == NULL at the NF_LOCAL_OUT hook level, so dev_net(skb->dev)
in the nf_ct_frag6_gather() function causes an oops.
Pass the net namespace directly to nf_ct_frag6_gather() as a parameter
to avoid this issue.
(#122210)
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 823f379de7703522e4463320075d4a8349fbe364
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Jan 14 18:23:02 2009 +0300
bridge: don't leak master device on brctl addif
If we add a second ethernet device to a bridge, the former one leaks.
http://bugzilla.openvz.org/show_bug.cgi?id=1145
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 369a0ca3295a3af4c69a6cb4590d16ee62fb25f9
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Jan 14 18:22:14 2009 +0300
tun: mark tun/tap devices with NETIF_F_VIRTUAL flag
This flag is not only a "don't register me in CTs" sign, but
also a "can be a bridge master device" one.
Need it back.
http://bugzilla.openvz.org/show_bug.cgi?id=1145
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0aae51aaacc075717f6db70fa5ce3cb2563a9499
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Jan 13 18:23:56 2009 +0300
nfs: use kthread_run_ve to start lockd
Lockd is virtualized, so it must be created in VE context.
It worked before (in the 2.6.18 kernel, for example) because lockd had not
yet been rewritten to use the new kthread API, which is not capable of
creating threads in containers.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 13a6d4e0ffbf6bdcab92eb2a81a503c39578180c
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Jan 13 18:23:55 2009 +0300
Don't dereference NULL tsk->mm in ve_move_task
Kthreads are mmless...
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 2be6d439943fa3d785c27a1d977c6ce20861909e
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Jan 13 19:49:02 2009 +0300
Add kthread_create_ve() and kthread_run_ve() functions #2
These functions are like kthread_create() and kthread_run()
but create threads in VE context.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit dc10ea6d7454731f98e3fa407d606dde4c0ac6fa
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Jan 13 18:23:53 2009 +0300
Add do_ve_enter_hook
We will call this hook to enter the VE.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c29037a0cae0c3f5d88dceee098273d320247694
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Jan 12 17:29:54 2009 +0300
nfs: Fix nfs_match_client()
nfs_match_client() can return an nfs_client from another VE.
#266951
Original-patch-by: Denis Lunev <den@openvz.org>
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit eff7bfc5877e6b68a9439c4a1da79d53d92b6b4f
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Fri Jan 9 12:18:20 2009 +0300
ve: sanitize capability checks for namespaces creation
The existing hard check on the namespaces mask is too crude. The
intention was to ban namespace creation for containers, but
there already exists a proper security mechanism to govern this
question.
Switch to the existing capability-driven policy, thus allowing
namespace creation from the HN.
http://bugzilla.openvz.org/show_bug.cgi?id=1113
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7c73b2873754b60dd18edd1201236806174dc93d
Author: Denis Lunev <den@openvz.org>
Date: Mon Dec 29 20:34:32 2008 +0300
NFS: NFS super blocks in different VEs should be different
Teach nfs_compare_super about this.
Bug #265926
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit aed2b93a00e5bfaf08922ee0b3055115ae3e2ab2
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Wed Dec 24 20:32:43 2008 +0300
nfs: Fix access to freed memory
rpc_shutdown_client() frees the xprt, so we can't use it afterwards.
Move put_ve() to the xprt destroy level.
Bug https://bugzilla.sw.ru/show_bug.cgi?id=265628
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit fa8ea58824db1e7631dfb1bdb653296fdc57fba1
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Dec 29 19:37:47 2008 +0300
cpt: Make the proper check for sigmask
The check of TS_RESTORE_SIGMASK was invalid (always false!).
The original code (rhel5 / 2.6.24, from diff-cpt-sigsuspend-lockup-20070131) was:
if (!signal_pending(current) &&
    !test_thread_flag(TIF_RESTORE_SIGMASK)) {
TIF_RESTORE_SIGMASK was replaced with TS_RESTORE_SIGMASK, and after
commit 7648d96 setting TS_RESTORE_SIGMASK always sets TIF_SIGPENDING,
so the second check is not needed.
http://bugzilla.openvz.org/show_bug.cgi?id=1122
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit ceb9c594c44e800a78f921dc34dffa297258ace3
Author: Denis V. Lunev <den@openvz.org>
Date: Wed Sep 24 14:35:09 2008 +0400
netns: enable cross-ve Unix sockets
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f6e060b4bf5c19dda7a7e8f59736f550de158592
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Fri Nov 14 19:19:40 2008 +0300
Correct per-process capabilities bounding set in CT
Otherwise tasks in a container may have unlimited capabilities...
(#127136)
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit e78d1e4a7ff9aa4936c62a0fe644cc2bea27b314
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Fri Oct 31 16:48:47 2008 +0300
net: set ve context when init/exit method is called
Both pernet init and exit methods are called:
- from VE context when VE is created;
- from VE0 context if module registers pernet operations
This difference in approaches leads to many nasty things, since the
init callback can actually be called with the wrong exec_env.
Unify both approaches.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 4aaf2f38170ca8da93bee50104c146beea7b79fb
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Mon Oct 20 15:38:43 2008 +0400
iptables: setup init iptables mask before net initialization
Net initialization uses the iptables init mask and checks
VE_IP_IPTABLES6, VE_IP_FILTER6, VE_IP_MANGLE6.
Thus, without setting it up before net init, the VE's ipv6 iptables
will not be initialized.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 11a2b1bd817dc151a798173f5ba41247ad4c1acf
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Oct 14 19:20:33 2008 +0400
Adjust VE before calling netdev_unregister_kobject/netdev_register_kobject
These functions use visible_net_class.
http://bugzilla.openvz.org/show_bug.cgi?id=1044
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 714af324cb0e9adcce769888b0ac64e38f6b4990
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Oct 14 19:18:57 2008 +0400
Simplify the __dev_change_net_namespace() call by removing parameters.
1. The source VE and destination VE don't need to be passed to
__dev_change_net_namespace(), as the src VE can be obtained from
dev->owner_env and the dst VE from net->owner_ve.
2. The destination VE that was passed to __dev_change_net_namespace()
was wrong, so this patch also fixes that.
Related to the bug http://bugzilla.openvz.org/show_bug.cgi?id=1044
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 86760c475cc42fab9bbdd34345e09cade546d214
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Oct 14 19:10:44 2008 +0400
CPT: revert check on sk_reuse>1
Revert commit ac6f78192054784f02dd47f8e6d7d1c8d75ab173
("[INET]: sk_reuse is valbool", Author: Gerrit Renker <gerrit@erg.abdn.ac.uk>)
The check on sk_reuse > 1 is needed because during restore "bind" fails
for sockets that were bound to the same port with the same sk_family, and
also fails for sockets that are bound to the same port with another sk_family
(example: sshd listening on port 22 on the IPv4 and IPv6 any address).
For additional information see open_listening_sockets() and open_socket().
Related to bug #1034
http://bugzilla.openvz.org/show_bug.cgi?id=1034
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 444a2b211f9a11e34339022d4dc50fc69656febc
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue Oct 14 19:06:53 2008 +0400
Don't do ip6t_do_table() if VE doesn't have xtable
If a VE is not granted to have the xtable then ip6t_do_table()
shouldn't do anything.
Why this is needed for 2.6.27 and is not needed for 2.6.26:
Kernel 2.6.26:
ip6t_local_out_hook() uses init_net.ipv6.ip6table_filter
Kernel 2.6.27:
ip6t_local_out_hook() uses nf_local_out_net(in, out)->ipv6.ip6table_filter
Thus in kernel 2.6.26 VE0's xtable is passed to ip6t_do_table(), but in
kernel 2.6.27 the VE's xtable is.
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit c4e7fea1fdf0522a5ea2cbb4f1d2fb721454e85e
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Oct 1 11:52:31 2008 +0400
cpt: replace BUG_ON-s checking for sizeof-s with BUILD_BUG_ON
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 142dbbc31dbf0d5c85e5ac311035a6f77c9d335a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 30 19:03:04 2008 +0400
Add "VE features" for sit and ipip devices.
Currently these devices are created unconditionally in *each*
ve after the ipip or sit module is loaded. This is bad for many reasons.
I add two features, just like it's done for the 2.6.18-rh5 kernel.
The migration support will become available a bit later - I
don't want to push the existing implementation till we settle
on the new netdevices migration scheme (in rh5, of course).
http://bugzilla.openvz.org/show_bug.cgi?id=874
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5467a3a4c9a77b925fcc4d0cfb24ef2bca260c25
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Sep 22 13:21:20 2008 +0400
autofs: fix default pgrp vnr
The default pgrp should be the virtual nr,
because autofs looks up the pid struct via find_get_pid.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a79201c08bb37c87ca5f393450ef33522884ed4f
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Mon Sep 22 13:20:00 2008 +0400
autofs4: pidns friendly oz_mode
Fix oz_mode detection to prevent an autofs daemon hang inside a CT.
Switch the oz mode process group from pid_t to struct pid.
The same changes as in mainstream commit fa0334f1 for autofs.
http://bugzilla.openvz.org/show_bug.cgi?id=959
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7aa1dff6d85912e7951a2cb058a33e2929a80d91
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Sep 10 13:55:58 2008 +0400
cpt: Make it module by default
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7df2ed99202a0cb803f7ecd47479fb47e2171743
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:25:00 2008 +0400
cpt: bump image version to VERSION_27
Images from older kernels will most likely not get restored...
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 84a726bf66ed4838e6def4434c901f5aa36cc5f5
Author: Denis Lunev <den@openvz.org>
Date: Tue Sep 9 17:55:51 2008 +0400
Double free for UDP socket
A socket residing in the UB space waiting queue could be released. In this
case ub_snd_wakeup running on another CPU could hold/release that
socket, effectively hitting the 0 refcounter a second time.
Signed-off-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 029a59e0492fd3f927e6dd13e37560ce51a7ef8a
Author: Vitaliy Gusev <vgusev@openvz.org>
Date: Wed Oct 8 16:42:33 2008 +0400
Remove code that doesn't allow net != init_net calls in netfilter
Taken from 2.6.26
Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 35fde54348e27bfc15dbbddb7ba77f8004ef31f8
Author: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue Oct 7 12:57:48 2008 +0400
fix wrong size of ub0_percpu.
After commit b3242151 struct percpu_data is dynamically allocated
and has an array for only 1 cpu, so static usage of it does not work.
Plus rework the macros for static percpu variable declaration and initialization.
http://bugzilla.openvz.org/show_bug.cgi?id=1039
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 634d44cb7e025026b8267a553868935b95b79d2e
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Feb 25 00:10:33 2010 +0300
net: NETIF_F_VIRTUAL/_VENET intersects with NETIF_F_LRO
Fortunately, this is not part of the user/kernel interface.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit d1c9c6b158135f3695304431610caa7e052ab86f
Author: Denis Lunev <den@openvz.org>
Date: Fri Sep 12 14:05:02 2008 +0400
Compilation fixes after rebase
Signed-off-by: Denis Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5fcee65dc5574c7b23814a641bdc6c2e177dac6a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Sep 11 13:46:49 2008 +0400
sysctl: Fix sysctls visibility in ves
The new sysctl infrastructure suffers from many problems :\
In order not to rework the whole kernel (including our patches in net)
switch to a simple and stupid scheme (to be fixed if required later):
* all sysctls are visible in a VE read-only
* those required to be writable are registered explicitly, e.g.
- some net/xxx
- utsname
- ipc
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 9d9f1a017892ee241d11d444853ffb47b9aa0b9b
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Sep 10 13:03:03 2008 +0400
cpt: fix task state/exit_state checking
Dead tasks now have TASK_DEAD in their state, not EXIT_DEAD,
which now sits in exit_state only.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit cfbfbd2e9518e0e998b956b0d46d4de55d69ed56
Author: Denis Lunev <den@openvz.org>
Date: Wed Sep 10 12:29:20 2008 +0400
nfs: add support for nfs client in container
Signed-off-by: Denis Lunev <den@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 04bbc5c8bd6e547755059d675a98088da6089f7f
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:19:02 2008 +0400
cpt: ptrace relations logic is rewritten
But from the CPT point of view it seems that:
* the lists' names changed
* ptraced kids are no longer in the parents' children list
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a93f756496a69ddc6f2dc3bd636d3aff66da2806
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:17:42 2008 +0400
cpt: sock_map_fd now has a flags argument
Ulrich's CLOEXEC disease :(
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0e4ec3f1e054f1cfbb2a840a2cfc49e4a2bcdeea
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:16:12 2008 +0400
cpt: semundo lists are rewritten using list_head-s
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 6da0a4f1360f74d70c6f76b12b5c12bebd0bd325
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:15:19 2008 +0400
cpt: find_task_by_pid is removed
find_task_by_pid_ns with explicit use of init_pid_ns is
the replacement.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 0a474fb35bd550e45ea44471e2049d22686b08ff
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:12:32 2008 +0400
cpt: altroot has finally gone
So no need to dump and restore it...
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 91a9ea726054774595f2729c4a48e7c5b05f2d2f
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Sep 9 19:11:13 2008 +0400
cpt: wait_task_inactive has one more argument
... to match the state to wait for. Pass 0 not to
use this filtering.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 3266f648c424fc120fe259dc8e30f7c402f9529a
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Feb 24 20:25:04 2010 +0300
NF: Conntracks virtualisation bits
Mostly conntracks are virtualized already. Need to check
the iptables_mask bits (dropped from this patch) and sysctls.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 48ee59b1171bf93efe87c04016d04d64aa9d376c
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 19 15:04:27 2010 +0300
NF: Netfilter virtualization (without conntracks)
Finally it's fairly simple :)
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 801183f17814b0f99fa907ca73c7908d623e3beb
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 19 12:37:23 2010 +0300
DBG: OpenVZ debugging bits
SysRQ debugger, additional printks on oops, smp_nmi_call_function, etc
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit be8c8cf8f30df3e6b2cc88d8a0137a8719de1ef9
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Fri Feb 19 12:10:06 2010 +0300
TTY: Virtualization bits
The devpts requires rework though.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 4650f334588b5dceeec5d1aa4982c1998193a3fc
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Feb 18 20:15:57 2010 +0300
SECURITY: Virtualization bits
Mostly devcgroup.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 26974fa580312d462effa9e4c6d1db29348c177c
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Thu Feb 18 19:32:03 2010 +0300
VE: Bunch of virtualization bits
This mostly finishes up the core. Some minor stuff left though.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 7dfca0dae87c07b2b7dcf02174462a57d8c55bec
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Wed Feb 17 15:52:44 2010 +0300
NET: Virtualization and accounting bits
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 72e5c027e75e7dd3fbaa312061728b10a7bb11b4
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Feb 16 22:19:43 2010 +0300
SWAP: Proper R/O kludge for CPT
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit f385db6d4c5e735b8d7ca9d9e845494498ff71a2
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Feb 16 21:40:06 2010 +0300
MM: Kmemsize accounting bits
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 84e7a7fc66c78bc8170d34a065f81d68cbada866
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Feb 16 20:47:03 2010 +0300
OOM: Make it beancounters aware
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5198e6ea6a7c9c1a7d890f4c639007fce9290b05
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Feb 16 20:00:48 2010 +0300
MM: Memory manipulation bits
Includes BC accounting (mostly), CPT structs, kstats and others.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5ae684b555876388aec0eca2733ae99dd067cc01
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Tue Feb 16 16:20:52 2010 +0300
VE: Network devices virtualization bits
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8c2a50edaa93d430c1ecddf416a52d1962ba47bc
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 20:47:40 2010 +0300
SYSFS: Virtualize basic stuff
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 6a2f6ce2ce3c99348caf7296a147a82e4e1b75f8
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 20:22:55 2010 +0300
VE: Proc meminfo and stat files virtualization
Made it simpler than it was.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 4d331aee9e1c0601baffd60664c536de01d9f7af
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 19:09:15 2010 +0300
VE: Proc tree virtualization (and some files)
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8f48174c7581fbc1ebe4f1ee536f4cea3b810423
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 16:41:14 2010 +0300
VFS: Virtualization of FUSE and AutoFS
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 1987be4229e24528af07d36611bba22cbaa24619
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 16:09:25 2010 +0300
CPT: Various hacks over the code
Includes many things required by CPT, like structure
publication, new arguments to functions, exports, etc.
Doesn't include inotify though.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit a46ffa4f39f0865a05dc3e550f26f626ef978f4c
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 15:22:58 2010 +0300
VE: Networking core
Venet and Vzethdev drivers and networking-related headers from 2.6.27
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 94d76ab02f7faa0a6725a09eccceeedcb143c899
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 15:22:03 2010 +0300
SimFS: The container root filesystem
Copied from 2.6.27 branch
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 283d0394211e4d18257a3c7904dc27a3e3a5e2ff
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 15:21:21 2010 +0300
Fairsched: Compatibility layer
The vzctl to cfs-sched compatibility layer taken from 2.6.27
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 9eebf121ade0c0571b5a117ff294b792ecd56ca5
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 15:20:28 2010 +0300
VZDQ: The core
Core VZ disk quota functionality copied from 2.6.27 branch
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 8f6326d12d2250c044aad0c151fb134170d55d48
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 15:19:41 2010 +0300
VE: The core
Core virtualization modules copied from 2.6.27 branch.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 5317bec3d3f0fcdf5cbbc0f67fc00889d6756780
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 15:18:33 2010 +0300
CPT: The core
Core checkpointing functionality copied from 2.6.27 branch.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
commit 9845178382de1ea7ddde707006f66dd5b79066e4
Author: Pavel Emelyanov <xemul@openvz.org>
Date: Mon Feb 15 15:17:35 2010 +0300
UBC: The core
Core beancounters functionality copied from 2.6.27.
Neither compiles nor works.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
diff --git a/COPYING.Parallels b/COPYING.Parallels
new file mode 100644
index 0000000..9856a2b
--- /dev/null
+++ b/COPYING.Parallels
@@ -0,0 +1,350 @@
+
+Nothing in this license should be construed as a grant by Parallels of any rights
+beyond the rights specified in the GNU General Public License, and nothing in
+this license should be construed as a waiver by Parallels of its patent, copyright
+and/or trademark rights, beyond the waiver required by the GNU General Public
+License. This license is expressly inapplicable to any product that is not
+within the scope of the GNU General Public License
+
+----------------------------------------
+
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
index 36fead3..674349f 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@ VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 32
EXTRAVERSION =
+VZVERSION = budarin
NAME = Man-Eating Seals of Antiquity
# *DOCUMENTATION*
@@ -352,7 +353,7 @@ KBUILD_AFLAGS := -D__ASSEMBLY__
KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
-export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
+export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION
export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
export CPP AR NM STRIP OBJCOPY OBJDUMP
export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
@@ -1033,7 +1034,8 @@ define filechk_utsrelease.h
echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \
exit 1; \
fi; \
- (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
+ (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; \
+ echo \#define VZVERSION \"$(VZVERSION)\";)
endef
define filechk_version.h
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fbc161d..e6cc64c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2074,6 +2074,8 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
+source "kernel/Kconfig.openvz"
+
source "net/Kconfig"
source "drivers/Kconfig"
@@ -2091,3 +2093,5 @@ source "crypto/Kconfig"
source "arch/x86/kvm/Kconfig"
source "lib/Kconfig"
+
+source "kernel/bc/Kconfig"
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 5294d84..a920d42 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -617,7 +617,7 @@ ia32_sys_call_table:
.quad stub32_iopl /* 110 */
.quad sys_vhangup
.quad quiet_ni_syscall /* old "idle" system call */
- .quad sys32_vm86_warning /* vm86old */
+ .quad quiet_ni_syscall /* vm86old */
.quad compat_sys_wait4
.quad sys_swapoff /* 115 */
.quad compat_sys_sysinfo
@@ -670,7 +670,7 @@ ia32_sys_call_table:
.quad sys_mremap
.quad sys_setresuid16
.quad sys_getresuid16 /* 165 */
- .quad sys32_vm86_warning /* vm86 */
+ .quad quiet_ni_syscall /* vm86 */
.quad quiet_ni_syscall /* query_module */
.quad sys_poll
.quad compat_sys_nfsservctl
@@ -841,4 +841,25 @@ ia32_sys_call_table:
.quad compat_sys_pwritev
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
.quad sys_perf_event_open
+ .rept 500-(.-ia32_sys_call_table)/8
+ .quad sys_ni_syscall
+ .endr
+ .quad sys_fairsched_mknod /* 500 */
+ .quad sys_fairsched_rmnod
+ .quad sys_fairsched_chwt
+ .quad sys_fairsched_mvpr
+ .quad sys_fairsched_rate
+ .quad sys_fairsched_vcpus /* 505 */
+ .quad sys_ni_syscall
+ .quad sys_ni_syscall
+ .quad sys_ni_syscall
+ .quad sys_ni_syscall
+ .quad sys_getluid /* 510 */
+ .quad sys_setluid
+ .quad compat_sys_setublimit
+ .quad compat_sys_ubstat
+ .quad sys_ni_syscall
+ .quad sys_ni_syscall /* 515 */
+ .quad sys_lchmod
+ .quad compat_sys_lutime
ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 016218c..f368a9a 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -623,20 +623,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
advice);
}
-long sys32_vm86_warning(void)
-{
- struct task_struct *me = current;
- static char lastcomm[sizeof(me->comm)];
-
- if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
- compat_printk(KERN_INFO
- "%s: vm86 mode not supported on 64 bit kernel\n",
- me->comm);
- strncpy(lastcomm, me->comm, sizeof(lastcomm));
- }
- return -ENOSYS;
-}
-
long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
char __user *buf, size_t len)
{
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 8ac9d9a..6f2fd90 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -285,7 +285,7 @@ struct task_struct;
#define ARCH_DLINFO_IA32(vdso_enabled) \
do { \
- if (vdso_enabled) { \
+ if (vdso_enabled && sysctl_at_vsyscall) { \
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
} \
@@ -332,9 +332,11 @@ struct linux_binprm;
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
- int uses_interp);
+ int uses_interp,
+ unsigned long map_address);
-extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
+extern int syscall32_setup_pages(struct linux_binprm *, int exstack,
+ unsigned long map_address);
#define compat_arch_setup_additional_pages syscall32_setup_pages
extern unsigned long arch_randomize_brk(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 271de94..e255a04 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -80,7 +80,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
#if PAGETABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT);
}
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
@@ -116,7 +116,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT);
}
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 13b1885..224e817 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -974,8 +974,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
- 0xc0000000 : 0xFFFFe000)
+#define IA32_PAGE_OFFSET 0xc0000000
#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 19c3ce4..4598413 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,7 @@ struct thread_info {
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
+#define TIF_RESUME 29
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -117,6 +118,7 @@ struct thread_info {
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_RESUME (1<<TIF_RESUME)
/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY \
@@ -161,7 +163,8 @@ struct thread_info {
#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
#define alloc_thread_info(tsk) \
- ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
+ ((struct thread_info *)__get_free_pages(THREAD_FLAGS | __GFP_UBC,\
+ THREAD_ORDER))
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index c042729..6e7f232 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -24,7 +24,7 @@ static inline cycles_t get_cycles(void)
unsigned long long ret = 0;
#ifndef CONFIG_X86_TSC
- if (!cpu_has_tsc)
+ if (WARN_ON_ONCE(!cpu_has_tsc))
return 0;
#endif
rdtscll(ret);
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6fb3c20..c870519 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,10 +342,22 @@
#define __NR_pwritev 334
#define __NR_rt_tgsigqueueinfo 335
#define __NR_perf_event_open 336
+#define __NR_fairsched_mknod 500 /* FairScheduler syscalls */
+#define __NR_fairsched_rmnod 501
+#define __NR_fairsched_chwt 502
+#define __NR_fairsched_mvpr 503
+#define __NR_fairsched_rate 504
+#define __NR_fairsched_vcpus 505
+#define __NR_getluid 510
+#define __NR_setluid 511
+#define __NR_setublimit 512
+#define __NR_ubstat 513
+#define __NR_lchmod 516
+#define __NR_lutime 517
#ifdef __KERNEL__
-#define NR_syscalls 337
+#define NR_syscalls 514
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 8d3ad0a..15bc00e 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -661,6 +661,30 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
#define __NR_perf_event_open 298
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_fairsched_vcpus 499
+__SYSCALL(__NR_fairsched_vcpus, sys_fairsched_vcpus)
+#define __NR_getluid 500
+__SYSCALL(__NR_getluid, sys_getluid)
+#define __NR_setluid 501
+__SYSCALL(__NR_setluid, sys_setluid)
+#define __NR_setublimit 502
+__SYSCALL(__NR_setublimit, sys_setublimit)
+#define __NR_ubstat 503
+__SYSCALL(__NR_ubstat, sys_ubstat)
+#define __NR_fairsched_mknod 504 /* FairScheduler syscalls */
+__SYSCALL(__NR_fairsched_mknod, sys_fairsched_mknod)
+#define __NR_fairsched_rmnod 505
+__SYSCALL(__NR_fairsched_rmnod, sys_fairsched_rmnod)
+#define __NR_fairsched_chwt 506
+__SYSCALL(__NR_fairsched_chwt, sys_fairsched_chwt)
+#define __NR_fairsched_mvpr 507
+__SYSCALL(__NR_fairsched_mvpr, sys_fairsched_mvpr)
+#define __NR_fairsched_rate 508
+__SYSCALL(__NR_fairsched_rate, sys_fairsched_rate)
+#define __NR_lchmod 509
+__SYSCALL(__NR_lchmod, sys_lchmod)
+#define __NR_lutime 510
+__SYSCALL(__NR_lutime, sys_lutime)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
@@ -685,6 +709,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __ARCH_WANT_SYS_RT_SIGSUSPEND
#define __ARCH_WANT_SYS_TIME
#define __ARCH_WANT_COMPAT_SYS_TIME
+#define __ARCH_WANT_SYS_RT_SIGSUSPEND
#endif /* __NO_STUBS */
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052..2cf267b 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -18,6 +18,7 @@ extern const char VDSO64_PRELINK[];
#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
extern const char VDSO32_PRELINK[];
+extern const char VDSO32_SYSENTER_RETURN[];
/*
* Given a pointer to the vDSO image, find the pointer to VDSO32_name
* as that symbol is defined in the vDSO sources or linker script.
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6..ee58297 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -435,10 +435,10 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
+ * wait a few IRQs (30 seconds) before doing the oops ...
*/
local_inc(&__get_cpu_var(alert_counter));
- if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz)
+ if (local_read(&__get_cpu_var(alert_counter)) == 30 * nmi_hz)
/*
* die_nmi will return ONLY if NOTIFY_STOP happens..
*/
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e..ce8a3f5 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/init.h>
+#include <linux/sched.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371..0d1ce00 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -320,6 +320,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic)
printk(" on CPU%d, ip %08lx, registers:\n",
smp_processor_id(), regs->ip);
show_registers(regs);
+ nmi_show_regs(regs, 1);
oops_end(flags, regs, 0);
if (do_panic || panic_on_oops)
panic("Non maskable interrupt");
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7..24c02de 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -105,8 +105,9 @@ void show_registers(struct pt_regs *regs)
print_modules();
__show_regs(regs, 0);
- printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
+ printk(KERN_EMERG "Process %.*s (pid: %d, veid: %d, ti=%p task=%p task.ti=%p)\n",
TASK_COMM_LEN, current->comm, task_pid_nr(current),
+ VEID(current->ve_task_info.owner_env),
current_thread_info(), current, task_thread_info(current));
/*
* When in-kernel, we also print out the stack and code at the
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6b..24ec167 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -254,8 +254,10 @@ void show_registers(struct pt_regs *regs)
sp = regs->sp;
printk("CPU %d ", cpu);
__show_regs(regs, 1);
- printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
- cur->comm, cur->pid, task_thread_info(cur), cur);
+ printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n",
+ cur->comm, cur->pid,
+ VEID(VE_TASK_INFO(current)->owner_env),
+ task_thread_info(cur), cur);
/*
* When in-kernel, we also print out the stack and code at the
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d..582db41 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -325,6 +325,7 @@ ENTRY(ret_from_fork)
GET_THREAD_INFO(%ebp)
popl %eax
CFI_ADJUST_CFA_OFFSET -4
+ret_from_fork_tail:
pushl $0x0202 # Reset kernel eflags
CFI_ADJUST_CFA_OFFSET 4
popfl
@@ -333,6 +334,25 @@ ENTRY(ret_from_fork)
CFI_ENDPROC
END(ret_from_fork)
+ENTRY(i386_ret_from_resume)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ movl (%esp),%eax
+ testl %eax,%eax
+ jz 1f
+ pushl %esp
+ call *%eax
+ addl $4,%esp
+1:
+ addl $256,%esp
+ jmp ret_from_fork_tail
+ CFI_ENDPROC
+
/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f..36f56e3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -405,8 +405,12 @@ ENTRY(ret_from_fork)
call schedule_tail # rdi: 'prev' task parameter
+ret_from_fork_tail:
GET_THREAD_INFO(%rcx)
+ btr $TIF_RESUME,TI_flags(%rcx)
+ jc x86_64_ret_from_resume
+ret_from_fork_check:
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
@@ -418,6 +422,18 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
+x86_64_ret_from_resume:
+ movq (%rsp),%rax
+ testq %rax,%rax
+ jz 1f
+ movq %rsp,%rdi
+ call *%rax
+1:
+ addq $256,%rsp
+ cmpq $0,ORIG_RAX(%rsp)
+ jge ret_from_fork_tail
+ RESTORE_REST
+ jmp int_ret_from_sys_call
CFI_ENDPROC
END(ret_from_fork)
@@ -1182,7 +1198,7 @@ ENTRY(kernel_thread)
xorl %r9d,%r9d
# clone now
- call do_fork
+ call do_fork_kthread
movq %rax,RAX(%rsp)
xorl %edi,%edi
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540..84afa2f 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -163,6 +163,7 @@ int init_fpu(struct task_struct *tsk)
set_stopped_child_used_math(tsk);
return 0;
}
+EXPORT_SYMBOL(init_fpu);
int fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60..b8a4a3c 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -13,6 +13,8 @@
#include <linux/smp.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <bc/kmem.h>
#include <asm/system.h>
#include <asm/ldt.h>
@@ -39,9 +41,9 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
- newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+ newldt = ub_vmalloc(mincount * LDT_ENTRY_SIZE);
else
- newldt = (void *)__get_free_page(GFP_KERNEL);
+ newldt = (void *)__get_free_page(GFP_KERNEL_UBC);
if (!newldt)
return -ENOMEM;
@@ -117,6 +119,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
}
return retval;
}
+EXPORT_SYMBOL_GPL(init_new_context);
/*
* No need to lock the MM as we are the last user
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf7956..69478e4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -40,6 +40,8 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
+#include <linux/sysctl.h>
+#include <linux/utsrelease.h>
#include <asm/pgtable.h>
#include <asm/system.h>
@@ -60,6 +62,9 @@
#include <asm/ds.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+EXPORT_SYMBOL(ret_from_fork);
+asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume");
+EXPORT_SYMBOL_GPL(i386_ret_from_resume);
/*
* Return saved PC of a blocked thread.
@@ -144,16 +149,17 @@ void __show_regs(struct pt_regs *regs, int all)
board = dmi_get_system_info(DMI_PRODUCT_NAME);
if (!board)
board = "";
- printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
+	printk("Pid: %d, comm: %s %s (%s %.*s) %s %s\n",
task_pid_nr(current), current->comm,
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board);
+ init_utsname()->version, VZVERSION, board);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
smp_processor_id());
- print_symbol("EIP is at %s\n", regs->ip);
+ if (decode_call_traces)
+ print_symbol("EIP is at %s\n", regs->ip);
printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
@@ -189,6 +195,8 @@ void show_regs(struct pt_regs *regs)
{
__show_regs(regs, 1);
show_trace(NULL, regs, &regs->sp, regs->bp);
+ if (!decode_call_traces)
+ printk(" EIP: [<%08lx>]\n", regs->ip);
}
/*
@@ -197,6 +205,7 @@ void show_regs(struct pt_regs *regs)
* the "args".
*/
extern void kernel_thread_helper(void);
+EXPORT_SYMBOL(kernel_thread_helper);
/*
* Create a kernel thread
@@ -205,6 +214,13 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct pt_regs regs;
+ /* Don't allow kernel_thread() inside VE */
+ if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) {
+ printk("kernel_thread call inside container\n");
+ dump_stack();
+ return -EPERM;
+ }
+
memset(&regs, 0, sizeof(regs));
regs.bx = (unsigned long) fn;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 868fdb4..0cc650d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -25,8 +25,10 @@
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
+#include <linux/sysctl.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
+#include <linux/utsrelease.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
@@ -53,8 +55,6 @@
#include <asm/syscalls.h>
#include <asm/ds.h>
-asmlinkage extern void ret_from_fork(void);
-
DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -169,13 +169,14 @@ void __show_regs(struct pt_regs *regs, int all)
board = dmi_get_system_info(DMI_PRODUCT_NAME);
if (!board)
board = "";
- printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s %s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version, board);
+ init_utsname()->version, VZVERSION, board);
printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip, 1);
+ if (decode_call_traces)
+ printk_address(regs->ip, 1);
printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
regs->sp, regs->flags);
printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
@@ -228,7 +229,9 @@ void show_regs(struct pt_regs *regs)
{
printk(KERN_INFO "CPU %d:", smp_processor_id());
__show_regs(regs, 1);
- show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
+ show_trace(NULL, regs, &regs->sp, regs->bp);
+ if (!decode_call_traces)
+ printk(" EIP: [<%08lx>]\n", regs->ip);
}
void release_thread(struct task_struct *dead_task)
@@ -680,3 +683,20 @@ unsigned long KSTK_ESP(struct task_struct *task)
return (test_tsk_thread_flag(task, TIF_IA32)) ?
(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
+
+long do_fork_kthread(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ if (ve_allow_kthreads || ve_is_super(get_exec_env()))
+ return do_fork(clone_flags, stack_start, regs, stack_size,
+ parent_tidptr, child_tidptr);
+
+ /* Don't allow kernel_thread() inside VE */
+ printk("kernel_thread call inside container\n");
+ dump_stack();
+ return -EPERM;
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76..6ecea3a 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/uaccess.h>
+#include <linux/freezer.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -792,6 +793,9 @@ static void do_signal(struct pt_regs *regs)
if (!user_mode(regs))
return;
+ if (try_to_freeze() && !signal_pending(current))
+ goto no_signal;
+
if (current_thread_info()->status & TS_RESTORE_SIGMASK)
oldset = &current->saved_sigmask;
else
@@ -821,6 +825,7 @@ static void do_signal(struct pt_regs *regs)
return;
}
+no_signal:
/* Did we come from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* Restart the system call - no handlers present */
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ec1de97..29df6fd 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -221,6 +221,11 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
irq_exit();
}
+void send_nmi_ipi_allbutself(void)
+{
+ apic->send_IPI_allbutself(NMI_VECTOR);
+}
+
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 28e963d..54a0ecf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -733,6 +733,12 @@ do_rest:
initial_code = (unsigned long)start_secondary;
stack_start.sp = (void *) c_idle.idle->thread.sp;
+#ifdef CONFIG_VE
+ /* Cosmetic: sleep_time won't be changed afterwards for the idle
+ * thread; keep it 0 rather than -cycles. */
+ VE_TASK_INFO(c_idle.idle)->sleep_time = 0;
+#endif
+
/* start_ip had better be page-aligned! */
start_ip = setup_trampoline();
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 76d70a4..477e261 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -336,3 +336,24 @@ ENTRY(sys_call_table)
.long sys_pwritev
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
+ .rept 500-(.-sys_call_table)/4
+ .long sys_ni_syscall
+ .endr
+ .long sys_fairsched_mknod /* 500 */
+ .long sys_fairsched_rmnod
+ .long sys_fairsched_chwt
+ .long sys_fairsched_mvpr
+ .long sys_fairsched_rate
+ .long sys_fairsched_vcpus /* 505 */
+ .long sys_ni_syscall
+ .long sys_ni_syscall
+ .long sys_ni_syscall
+ .long sys_ni_syscall
+ .long sys_getluid /* 510 */
+ .long sys_setluid
+ .long sys_setublimit
+ .long sys_ubstat
+ .long sys_ni_syscall
+ .long sys_ni_syscall /* 515 */
+ .long sys_lchmod
+ .long sys_lutime
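The hunk above reserves fixed slots in the 32-bit syscall table for the OpenVZ interfaces: the fairsched calls at 500-505 and the user-beancounter calls starting at 510. A minimal userspace sketch of how such a slot could be reached through syscall(2) follows; the slot number is copied from the table above (510 = sys_getluid, i386 only), everything else is illustrative rather than an official API, and the call simply fails with ENOSYS on kernels without this patch.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Slot number taken from the i386 table above; it is patch-specific,
 * not part of mainline headers. */
#define NR_getluid	510

int main(void)
{
	long luid = syscall(NR_getluid);

	if (luid < 0)
		printf("getluid: %s (kernel without the patch?)\n",
		       strerror(errno));
	else
		printf("current luid: %ld\n", luid);
	return 0;
}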
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dce..d1fd061 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -405,7 +405,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog_tick(regs, reason))
+ if (nmi_watchdog_tick(regs, reason) +
+ do_nmi_show_regs(regs, cpu))
return;
if (!do_nmi_callback(regs, cpu))
unknown_nmi_error(reason, regs);
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f379309..6c44e77 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -150,6 +150,10 @@ void __cpuinit check_tsc_sync_source(int cpu)
printk(" passed.\n");
}
+#ifdef CONFIG_VE
+ /* TSC reset. kill whatever might rely on old values */
+ VE_TASK_INFO(current)->wakeup_stamp = 0;
+#endif
/*
* Reset it - just in case we boot another CPU later:
*/
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3b..bbfa7af 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/smp.h>
+#include <linux/syscalls.h>
#include <net/checksum.h>
@@ -17,6 +18,7 @@
EXPORT_SYMBOL(mcount);
#endif
+EXPORT_SYMBOL(kernel_execve);
EXPORT_SYMBOL(kernel_thread);
EXPORT_SYMBOL(__get_user_1);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee90..3e549cd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -689,7 +689,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;
- printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
@@ -909,7 +909,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
return ret;
}
-int show_unhandled_signals = 1;
+int show_unhandled_signals = 0;
static inline int
access_error(unsigned long error_code, int write, struct vm_area_struct *vma)
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f46c340..6b7330c 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
+#include <linux/module.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
@@ -230,6 +231,7 @@ int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
+EXPORT_SYMBOL(pmd_huge);
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index c9ba9de..589a93b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,7 +4,8 @@
#include <asm/tlb.h>
#include <asm/fixmap.h>
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO | __GFP_UBC
+#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -16,7 +17,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return (pte_t *)__get_free_page(PGALLOC_GFP);
+ return (pte_t *)__get_free_page(PGALLOC_KERN_GFP);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08e..42445e5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -256,6 +256,8 @@ void flush_tlb_mm(struct mm_struct *mm)
preempt_enable();
}
+EXPORT_SYMBOL(flush_tlb_mm);
+
void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
struct mm_struct *mm = vma->vm_mm;
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f..b7028c5 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -17,6 +17,8 @@
#include <linux/err.h>
#include <linux/module.h>
+#include <bc/vmpages.h>
+
#include <asm/cpufeature.h>
#include <asm/msr.h>
#include <asm/pgtable.h>
@@ -37,6 +39,8 @@ enum {
#else
#define VDSO_DEFAULT VDSO_ENABLED
#endif
+#undef VDSO_DEFAULT
+#define VDSO_DEFAULT VDSO_DISABLED
#ifdef CONFIG_X86_64
#define vdso_enabled sysctl_vsyscall32
@@ -193,7 +197,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
}
}
-static struct page *vdso32_pages[1];
+struct page *vdso32_pages[1];
+EXPORT_SYMBOL_GPL(vdso32_pages);
#ifdef CONFIG_X86_64
@@ -309,16 +314,30 @@ int __init sysenter_setup(void)
return 0;
}
+EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN);
+EXPORT_SYMBOL_GPL(VDSO32_PRELINK);
+
/* Setup a VMA at program startup for the vsyscall page */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
+ unsigned long map_address)
{
struct mm_struct *mm = current->mm;
- unsigned long addr;
+ unsigned long addr = map_address;
int ret = 0;
bool compat;
+ unsigned long flags;
- if (vdso_enabled == VDSO_DISABLED)
+ if (vdso_enabled == VDSO_DISABLED && map_address == 0) {
+ current->mm->context.vdso = NULL;
return 0;
+ }
+
+ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
+ mm->def_flags;
+
+ ret = -ENOMEM;
+ if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT))
+ goto err_charge;
down_write(&mm->mmap_sem);
@@ -328,19 +347,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
map_compat_vdso(compat);
- if (compat)
- addr = VDSO_HIGH_BASE;
- else {
- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+ if (!compat || map_address) {
+ addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
- }
+ } else
+ addr = VDSO_HIGH_BASE;
current->mm->context.vdso = (void *)addr;
- if (compat_uses_vma || !compat) {
+ if (compat_uses_vma || !compat || map_address) {
/*
* MAYWRITE to allow gdb to COW and set breakpoints
*
@@ -368,9 +386,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
current->mm->context.vdso = NULL;
up_write(&mm->mmap_sem);
+ if (ret < 0)
+ ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL);
+err_charge:
return ret;
}
+EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
#ifdef CONFIG_X86_64
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 21e1aeb..507ba17 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -4,6 +4,7 @@
* Subject to the GPL, v.2
*/
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/init.h>
@@ -99,17 +100,23 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
/* Setup a VMA at program startup for the vsyscall page.
Not called for compat tasks */
-int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp,
+ unsigned long map_address)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
int ret;
- if (!vdso_enabled)
+ if (!vdso_enabled && map_address == 0) {
+ current->mm->context.vdso = NULL;
return 0;
+ }
down_write(&mm->mmap_sem);
- addr = vdso_addr(mm->start_stack, vdso_size);
+ if (map_address)
+ addr = map_address;
+ else
+ addr = vdso_addr(mm->start_stack, vdso_size);
addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
@@ -132,6 +139,7 @@ up_fail:
up_write(&mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL_GPL(arch_setup_additional_pages);
static __init int vdso_setup(char *s)
{
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56..e20fbde 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,28 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
+config BLK_CGROUP
+ bool
+ depends on CGROUPS
+ default n
+ ---help---
+ Generic block IO controller cgroup interface. This is the common
+ cgroup interface which should be used by various IO controlling
+ policies.
+
+ Currently, CFQ IO scheduler uses it to recognize task groups and
+ control disk bandwidth allocation (proportional time slice allocation)
+ to such task groups.
+
+config DEBUG_BLK_CGROUP
+ bool
+ depends on BLK_CGROUP
+ default n
+ ---help---
+ Enable some debugging help. Currently it stores the cgroup path
+ in the blk group which can be used by cfq for tracing various
+ group related activity.
+
endif # BLOCK
config BLOCK_COMPAT
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 7e803fc..9c5f0b5 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -40,6 +40,23 @@ config IOSCHED_CFQ
working environment, suitable for desktop systems.
This is the default I/O scheduler.
+config CFQ_GROUP_IOSCHED
+ bool "CFQ Group Scheduling support"
+ depends on IOSCHED_CFQ && CGROUPS
+ select BLK_CGROUP
+ default n
+ ---help---
+ Enable group IO scheduling in CFQ.
+
+config DEBUG_CFQ_IOSCHED
+ bool "Debug CFQ Scheduling"
+ depends on CFQ_GROUP_IOSCHED
+ select DEBUG_BLK_CGROUP
+ default n
+ ---help---
+ Enable CFQ IO scheduling debugging in CFQ. Currently it makes
+ blktrace output more verbose.
+
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
diff --git a/block/Makefile b/block/Makefile
index ba74ca6..16334c9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
+obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_AS) += as-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
new file mode 100644
index 0000000..444f20b
--- /dev/null
+++ b/block/blk-cgroup.c
@@ -0,0 +1,366 @@
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * Nauman Rafique <nauman@google.com>
+ */
+#include <linux/ioprio.h>
+#include <linux/seq_file.h>
+#include <linux/kdev_t.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include "blk-cgroup.h"
+
+static DEFINE_SPINLOCK(blkio_list_lock);
+static LIST_HEAD(blkio_list);
+
+struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
+EXPORT_SYMBOL_GPL(blkio_root_cgroup);
+
+bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
+{
+ if (!css_tryget(&blkcg->css))
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(blkiocg_css_tryget);
+
+void blkiocg_css_put(struct blkio_cgroup *blkcg)
+{
+ css_put(&blkcg->css);
+}
+EXPORT_SYMBOL_GPL(blkiocg_css_put);
+
+struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
+{
+ return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
+ struct blkio_cgroup, css);
+}
+EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
+
+void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
+ unsigned long time, unsigned long sectors)
+{
+ blkg->time += time;
+ blkg->sectors += sectors;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+
+void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+ struct blkio_group *blkg, void *key, dev_t dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkcg->lock, flags);
+ rcu_assign_pointer(blkg->key, key);
+ blkg->blkcg_id = css_id(&blkcg->css);
+ hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
+ spin_unlock_irqrestore(&blkcg->lock, flags);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* Need to take css reference ? */
+ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
+#endif
+ blkg->dev = dev;
+}
+EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
+
+static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
+{
+ hlist_del_init_rcu(&blkg->blkcg_node);
+ blkg->blkcg_id = 0;
+}
+
+/*
+ * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
+ * indicating that blk_group was unhashed by the time we got to it.
+ */
+int blkiocg_del_blkio_group(struct blkio_group *blkg)
+{
+ struct blkio_cgroup *blkcg;
+ unsigned long flags;
+ struct cgroup_subsys_state *css;
+ int ret = 1;
+
+ rcu_read_lock();
+ css = css_lookup(&blkio_subsys, blkg->blkcg_id);
+ if (!css)
+ goto out;
+
+ blkcg = container_of(css, struct blkio_cgroup, css);
+ spin_lock_irqsave(&blkcg->lock, flags);
+ if (!hlist_unhashed(&blkg->blkcg_node)) {
+ __blkiocg_del_blkio_group(blkg);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&blkcg->lock, flags);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
+
+/* called under rcu_read_lock(). */
+struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
+{
+ struct blkio_group *blkg;
+ struct hlist_node *n;
+ void *__key;
+
+ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ __key = blkg->key;
+ if (__key == key)
+ return blkg;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
+
+#define SHOW_FUNCTION(__VAR) \
+static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
+ struct cftype *cftype) \
+{ \
+ struct blkio_cgroup *blkcg; \
+ \
+ blkcg = cgroup_to_blkio_cgroup(cgroup); \
+ return (u64)blkcg->__VAR; \
+}
+
+SHOW_FUNCTION(weight);
+#undef SHOW_FUNCTION
+
+static int
+blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ struct hlist_node *n;
+ struct blkio_policy_type *blkiop;
+
+ if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
+ return -EINVAL;
+
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ spin_lock(&blkio_list_lock);
+ spin_lock_irq(&blkcg->lock);
+ blkcg->weight = (unsigned int)val;
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_update_group_weight_fn(blkg,
+ blkcg->weight);
+ }
+ spin_unlock_irq(&blkcg->lock);
+ spin_unlock(&blkio_list_lock);
+ return 0;
+}
+
+int blkiocg_set_weight(struct cgroup *cgroup, u64 val)
+{
+ return blkiocg_weight_write(cgroup, NULL, val);
+}
+
+#define SHOW_FUNCTION_PER_GROUP(__VAR) \
+static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
+ struct cftype *cftype, struct seq_file *m) \
+{ \
+ struct blkio_cgroup *blkcg; \
+ struct blkio_group *blkg; \
+ struct hlist_node *n; \
+ \
+ if (!cgroup_lock_live_group(cgroup)) \
+ return -ENODEV; \
+ \
+ blkcg = cgroup_to_blkio_cgroup(cgroup); \
+ rcu_read_lock(); \
+ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
+ if (blkg->dev) \
+ seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
+ MINOR(blkg->dev), blkg->__VAR); \
+ } \
+ rcu_read_unlock(); \
+ cgroup_unlock(); \
+ return 0; \
+}
+
+SHOW_FUNCTION_PER_GROUP(time);
+SHOW_FUNCTION_PER_GROUP(sectors);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+SHOW_FUNCTION_PER_GROUP(dequeue);
+#endif
+#undef SHOW_FUNCTION_PER_GROUP
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue)
+{
+ blkg->dequeue += dequeue;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
+#endif
+
+struct cftype blkio_files[] = {
+ {
+ .name = "weight",
+ .read_u64 = blkiocg_weight_read,
+ .write_u64 = blkiocg_weight_write,
+ },
+ {
+ .name = "time",
+ .read_seq_string = blkiocg_time_read,
+ },
+ {
+ .name = "sectors",
+ .read_seq_string = blkiocg_sectors_read,
+ },
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ {
+ .name = "dequeue",
+ .read_seq_string = blkiocg_dequeue_read,
+ },
+#endif
+};
+
+static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+ return cgroup_add_files(cgroup, subsys, blkio_files,
+ ARRAY_SIZE(blkio_files));
+}
+
+static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ unsigned long flags;
+ struct blkio_group *blkg;
+ void *key;
+ struct blkio_policy_type *blkiop;
+
+ rcu_read_lock();
+remove_entry:
+ spin_lock_irqsave(&blkcg->lock, flags);
+
+ if (hlist_empty(&blkcg->blkg_list)) {
+ spin_unlock_irqrestore(&blkcg->lock, flags);
+ goto done;
+ }
+
+ blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
+ blkcg_node);
+ key = rcu_dereference(blkg->key);
+ __blkiocg_del_blkio_group(blkg);
+
+ spin_unlock_irqrestore(&blkcg->lock, flags);
+
+ /*
+ * This blkio_group is being unlinked as associated cgroup is going
+ * away. Let all the IO controlling policies know about this event.
+ *
+	 * Currently this is a static call to one io controlling policy. Once
+ * we have more policies in place, we need some dynamic registration
+ * of callback function.
+ */
+ spin_lock(&blkio_list_lock);
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_unlink_group_fn(key, blkg);
+ spin_unlock(&blkio_list_lock);
+ goto remove_entry;
+done:
+ free_css_id(&blkio_subsys, &blkcg->css);
+ rcu_read_unlock();
+ kfree(blkcg);
+}
+
+static struct cgroup_subsys_state *
+blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+ struct blkio_cgroup *blkcg, *parent_blkcg;
+
+ if (!cgroup->parent) {
+ blkcg = &blkio_root_cgroup;
+ goto done;
+ }
+
+	/* Currently we do not support hierarchy deeper than two levels (0,1) */
+ parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent);
+ if (css_depth(&parent_blkcg->css) > 0)
+ return ERR_PTR(-EINVAL);
+
+ blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+ if (!blkcg)
+ return ERR_PTR(-ENOMEM);
+
+ blkcg->weight = BLKIO_WEIGHT_DEFAULT;
+done:
+ spin_lock_init(&blkcg->lock);
+ INIT_HLIST_HEAD(&blkcg->blkg_list);
+
+ return &blkcg->css;
+}
+
+/*
+ * We cannot support shared io contexts, as we have no means to support
+ * two tasks with the same ioc in two different groups without major rework
+ * of the main cic data structures. For now we allow a task to change
+ * its cgroup only if it's the only owner of its ioc.
+ */
+static int blkiocg_can_attach(struct cgroup_subsys *subsys,
+ struct cgroup *cgroup, struct task_struct *tsk,
+ bool threadgroup)
+{
+ struct io_context *ioc;
+ int ret = 0;
+
+ /* task_lock() is needed to avoid races with exit_io_context() */
+ task_lock(tsk);
+ ioc = tsk->io_context;
+ if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+ ret = -EINVAL;
+ task_unlock(tsk);
+
+ return ret;
+}
+
+static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
+ struct cgroup *prev, struct task_struct *tsk,
+ bool threadgroup)
+{
+ struct io_context *ioc;
+
+ task_lock(tsk);
+ ioc = tsk->io_context;
+ if (ioc)
+ ioc->cgroup_changed = 1;
+ task_unlock(tsk);
+}
+
+struct cgroup_subsys blkio_subsys = {
+ .name = "blkio",
+ .create = blkiocg_create,
+ .can_attach = blkiocg_can_attach,
+ .attach = blkiocg_attach,
+ .destroy = blkiocg_destroy,
+ .populate = blkiocg_populate,
+ .subsys_id = blkio_subsys_id,
+ .use_id = 1,
+};
+
+void blkio_policy_register(struct blkio_policy_type *blkiop)
+{
+ spin_lock(&blkio_list_lock);
+ list_add_tail(&blkiop->list, &blkio_list);
+ spin_unlock(&blkio_list_lock);
+}
+EXPORT_SYMBOL_GPL(blkio_policy_register);
+
+void blkio_policy_unregister(struct blkio_policy_type *blkiop)
+{
+ spin_lock(&blkio_list_lock);
+ list_del_init(&blkiop->list);
+ spin_unlock(&blkio_list_lock);
+}
+EXPORT_SYMBOL_GPL(blkio_policy_unregister);
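blk-cgroup.c above is only the bookkeeping layer: it tracks blkio_group objects per cgroup and forwards weight changes and group-removal events to whatever IO policies have registered on blkio_list. Below is a minimal sketch of such a policy module, using only the blkio_policy_type/blkio_policy_ops interface declared in blk-cgroup.h further down; the demo_* names are invented for illustration and the module is assumed to live inside block/ so the relative include resolves.

#include <linux/module.h>
#include "blk-cgroup.h"

/* Called when a cgroup goes away and its blkio_group is unlinked. */
static void demo_unlink_group(void *key, struct blkio_group *blkg)
{
	/* drop this policy's per-group state attached to blkg */
}

/* Called when someone writes a new value into blkio.weight. */
static void demo_update_weight(struct blkio_group *blkg, unsigned int weight)
{
	/* recompute this policy's share for blkg based on weight */
}

static struct blkio_policy_type demo_policy = {
	.ops = {
		.blkio_unlink_group_fn		= demo_unlink_group,
		.blkio_update_group_weight_fn	= demo_update_weight,
	},
};

static int __init demo_policy_init(void)
{
	blkio_policy_register(&demo_policy);
	return 0;
}

static void __exit demo_policy_exit(void)
{
	blkio_policy_unregister(&demo_policy);
}

module_init(demo_policy_init);
module_exit(demo_policy_exit);
MODULE_LICENSE("GPL");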
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 0000000..4d316df
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,127 @@
+#ifndef _BLK_CGROUP_H
+#define _BLK_CGROUP_H
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/cgroup.h>
+
+#ifdef CONFIG_BLK_CGROUP
+
+struct blkio_cgroup {
+ struct cgroup_subsys_state css;
+ unsigned int weight;
+ spinlock_t lock;
+ struct hlist_head blkg_list;
+};
+
+struct blkio_group {
+ /* An rcu protected unique identifier for the group */
+ void *key;
+ struct hlist_node blkcg_node;
+ unsigned short blkcg_id;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* Store cgroup path */
+ char path[128];
+ /* How many times this group has been removed from service tree */
+ unsigned long dequeue;
+#endif
+ /* The device MKDEV(major, minor), this group has been created for */
+ dev_t dev;
+
+ /* total disk time and nr sectors dispatched by this group */
+ unsigned long time;
+ unsigned long sectors;
+};
+
+extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
+extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
+
+typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
+typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
+ unsigned int weight);
+
+struct blkio_policy_ops {
+ blkio_unlink_group_fn *blkio_unlink_group_fn;
+ blkio_update_group_weight_fn *blkio_update_group_weight_fn;
+};
+
+struct blkio_policy_type {
+ struct list_head list;
+ struct blkio_policy_ops ops;
+};
+
+/* Blkio controller policy registration */
+extern void blkio_policy_register(struct blkio_policy_type *);
+extern void blkio_policy_unregister(struct blkio_policy_type *);
+
+#else
+
+struct blkio_group {
+};
+
+struct blkio_policy_type {
+};
+
+static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
+static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
+
+#endif
+
+#define BLKIO_WEIGHT_MIN 100
+#define BLKIO_WEIGHT_MAX 1000
+#define BLKIO_WEIGHT_DEFAULT 500
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+static inline char *blkg_path(struct blkio_group *blkg)
+{
+ return blkg->path;
+}
+void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue);
+#else
+static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
+static inline void blkiocg_update_blkio_group_dequeue_stats(
+ struct blkio_group *blkg, unsigned long dequeue) {}
+#endif
+
+#ifdef CONFIG_BLK_CGROUP
+extern struct blkio_cgroup blkio_root_cgroup;
+extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
+extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+ struct blkio_group *blkg, void *key, dev_t dev);
+extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
+extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
+ void *key);
+void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
+ unsigned long time, unsigned long sectors);
+#else
+struct cgroup;
+static inline struct blkio_cgroup *
+cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
+
+static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
+ struct blkio_group *blkg, void *key, dev_t dev)
+{
+}
+
+static inline int
+blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
+
+static inline struct blkio_group *
+blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
+static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
+ unsigned long time, unsigned long sectors)
+{
+}
+#endif
+#endif /* _BLK_CGROUP_H */
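The header above caps group weights to the 100..1000 range (default 500); at run time the weight is set through the blkio.weight file that blkio_files[] in blk-cgroup.c creates in each blkio cgroup directory. A userspace sketch follows; the /cgroup/blkio mount point and the "mygroup" directory are local assumptions, not anything the patch mandates.

#include <stdio.h>
#include <stdlib.h>

/* Assumed location: wherever the blkio cgroup hierarchy happens to be
 * mounted, plus a group directory created beforehand with mkdir. */
#define WEIGHT_FILE	"/cgroup/blkio/mygroup/blkio.weight"

int main(int argc, char **argv)
{
	unsigned long weight = argc > 1 ? strtoul(argv[1], NULL, 10) : 500;
	FILE *f;

	if (weight < 100 || weight > 1000) {	/* BLKIO_WEIGHT_MIN..MAX */
		fprintf(stderr, "weight must be within [100, 1000]\n");
		return 1;
	}

	f = fopen(WEIGHT_FILE, "w");
	if (!f) {
		perror(WEIGHT_FILE);
		return 1;
	}
	fprintf(f, "%lu\n", weight);
	return fclose(f) ? 1 : 0;
}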
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9651c0a..06c6694 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -9,6 +9,7 @@
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
#include <linux/gcd.h>
#include <linux/lcm.h>
+#include <linux/jiffies.h>
#include "blk.h"
@@ -142,7 +143,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
q->nr_batching = BLK_BATCH_REQ;
q->unplug_thresh = 4; /* hmm */
- q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
+ q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
if (q->unplug_delay == 0)
q->unplug_delay = 1;
diff --git a/block/bsg.c b/block/bsg.c
index 0676301..a9fd2d8 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -15,6 +15,7 @@
#include <linux/blkdev.h>
#include <linux/poll.h>
#include <linux/cdev.h>
+#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/uio.h>
#include <linux/idr.h>
@@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
rq->cmd_len = hdr->request_len;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->timeout = (hdr->timeout * HZ) / 1000;
+ rq->timeout = msecs_to_jiffies(hdr->timeout);
if (!rq->timeout)
rq->timeout = q->sg_timeout;
if (!rq->timeout)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aa1e953..023f4e6 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -9,9 +9,11 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
+#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
+#include "blk-cgroup.h"
/*
* tunables
@@ -27,6 +29,8 @@ static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
+static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
+static const int cfq_hist_divisor = 4;
/*
* offset from end of service tree
@@ -40,6 +44,10 @@ static int cfq_slice_idle = HZ / 125;
#define CFQ_SLICE_SCALE (5)
#define CFQ_HW_QUEUE_MIN (5)
+#define CFQ_SERVICE_SHIFT 12
+
+#define CFQQ_SEEK_THR 8 * 1024
+#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR)
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
@@ -57,6 +65,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define sample_valid(samples) ((samples) > 80)
+#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
/*
* Most of our rbtree usage is for sorting with min extraction, so
@@ -67,8 +76,12 @@ static DEFINE_SPINLOCK(ioc_gone_lock);
struct cfq_rb_root {
struct rb_root rb;
struct rb_node *left;
+ unsigned count;
+ u64 min_vdisktime;
+ struct rb_node *active;
+ unsigned total_weight;
};
-#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, }
+#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
/*
* Per process-grouping structure
@@ -99,6 +112,11 @@ struct cfq_queue {
/* fifo list of requests in sort_list */
struct list_head fifo;
+ /* time when queue got scheduled in to dispatch first request. */
+ unsigned long dispatch_start;
+ unsigned int allocated_slice;
+ /* time when first request from queue completed and slice started. */
+ unsigned long slice_start;
unsigned long slice_end;
long slice_resid;
unsigned int slice_dispatch;
@@ -112,7 +130,70 @@ struct cfq_queue {
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
+ unsigned int seek_samples;
+ u64 seek_total;
+ sector_t seek_mean;
+ sector_t last_request_pos;
+
pid_t pid;
+
+ struct cfq_rb_root *service_tree;
+ struct cfq_queue *new_cfqq;
+ struct cfq_group *cfqg;
+ struct cfq_group *orig_cfqg;
+ /* Sectors dispatched in current dispatch round */
+ unsigned long nr_sectors;
+};
+
+/*
+ * First index in the service_trees.
+ * IDLE is handled separately, so it has negative index
+ */
+enum wl_prio_t {
+ BE_WORKLOAD = 0,
+ RT_WORKLOAD = 1,
+ IDLE_WORKLOAD = 2,
+};
+
+/*
+ * Second index in the service_trees.
+ */
+enum wl_type_t {
+ ASYNC_WORKLOAD = 0,
+ SYNC_NOIDLE_WORKLOAD = 1,
+ SYNC_WORKLOAD = 2
+};
+
+/* This is per cgroup per device grouping structure */
+struct cfq_group {
+ /* group service_tree member */
+ struct rb_node rb_node;
+
+ /* group service_tree key */
+ u64 vdisktime;
+ unsigned int weight;
+ bool on_st;
+
+ /* number of cfqq currently on this group */
+ int nr_cfqq;
+
+	/* Per group busy queues average. Useful for workload slice calc. */
+ unsigned int busy_queues_avg[2];
+ /*
+	 * rr lists of queues with requests, one rr for each priority class.
+ * Counts are embedded in the cfq_rb_root
+ */
+ struct cfq_rb_root service_trees[2][3];
+ struct cfq_rb_root service_tree_idle;
+
+ unsigned long saved_workload_slice;
+ enum wl_type_t saved_workload;
+ enum wl_prio_t saved_serving_prio;
+ struct blkio_group blkg;
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ struct hlist_node cfqd_node;
+ atomic_t ref;
+#endif
};
/*
@@ -120,11 +201,18 @@ struct cfq_queue {
*/
struct cfq_data {
struct request_queue *queue;
+ /* Root service tree for cfq_groups */
+ struct cfq_rb_root grp_service_tree;
+ struct cfq_group root_group;
/*
- * rr list of queues with requests and the count of them
+ * The priority currently being served
*/
- struct cfq_rb_root service_tree;
+ enum wl_prio_t serving_prio;
+ enum wl_type_t serving_type;
+ unsigned long workload_expires;
+ struct cfq_group *serving_group;
+ bool noidle_tree_requires_idle;
/*
* Each priority tree is sorted by next_request position. These
@@ -143,8 +231,14 @@ struct cfq_data {
*/
int rq_queued;
int hw_tag;
- int hw_tag_samples;
- int rq_in_driver_peak;
+ /*
+ * hw_tag can be
+ * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
+ * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
+ * 0 => no NCQ
+ */
+ int hw_tag_est_depth;
+ unsigned int hw_tag_samples;
/*
* idle window management
@@ -174,6 +268,7 @@ struct cfq_data {
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
unsigned int cfq_latency;
+ unsigned int cfq_group_isolation;
struct list_head cic_list;
@@ -182,9 +277,28 @@ struct cfq_data {
*/
struct cfq_queue oom_cfqq;
- unsigned long last_end_sync_rq;
+ unsigned long last_delayed_sync;
+
+ /* List of cfq groups being managed on this device*/
+ struct hlist_head cfqg_list;
+ struct rcu_head rcu;
};
+static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
+
+static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
+ enum wl_prio_t prio,
+ enum wl_type_t type)
+{
+ if (!cfqg)
+ return NULL;
+
+ if (prio == IDLE_WORKLOAD)
+ return &cfqg->service_tree_idle;
+
+ return &cfqg->service_trees[prio][type];
+}
+
enum cfqq_state_flags {
CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
@@ -195,8 +309,10 @@ enum cfqq_state_flags {
CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
CFQ_CFQQ_FLAG_sync, /* synchronous queue */
- CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */
- CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */
+ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
+	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
+ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
+ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
};
#define CFQ_CFQQ_FNS(name) \
@@ -223,14 +339,78 @@ CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
-CFQ_CFQQ_FNS(coop_preempt);
+CFQ_CFQQ_FNS(split_coop);
+CFQ_CFQQ_FNS(deep);
+CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
+#ifdef CONFIG_DEBUG_CFQ_IOSCHED
+#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
+ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
+ blkg_path(&(cfqq)->cfqg->blkg), ##args);
+
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
+ blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
+ blkg_path(&(cfqg)->blkg), ##args); \
+
+#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
+#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
+#endif
#define cfq_log(cfqd, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
+/* Traverses through cfq group service trees */
+#define for_each_cfqg_st(cfqg, i, j, st) \
+ for (i = 0; i <= IDLE_WORKLOAD; i++) \
+ for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
+ : &cfqg->service_tree_idle; \
+ (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
+ (i == IDLE_WORKLOAD && j == 0); \
+ j++, st = i < IDLE_WORKLOAD ? \
+ &cfqg->service_trees[i][j]: NULL) \
+
+
+static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
+{
+ if (cfq_class_idle(cfqq))
+ return IDLE_WORKLOAD;
+ if (cfq_class_rt(cfqq))
+ return RT_WORKLOAD;
+ return BE_WORKLOAD;
+}
+
+
+static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
+{
+ if (!cfq_cfqq_sync(cfqq))
+ return ASYNC_WORKLOAD;
+ if (!cfq_cfqq_idle_window(cfqq))
+ return SYNC_NOIDLE_WORKLOAD;
+ return SYNC_WORKLOAD;
+}
+
+static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
+ struct cfq_data *cfqd,
+ struct cfq_group *cfqg)
+{
+ if (wl == IDLE_WORKLOAD)
+ return cfqg->service_tree_idle.count;
+
+ return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
+ + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
+ + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
+}
+
+static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
+ struct cfq_group *cfqg)
+{
+ return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
+ + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
+}
+
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
struct io_context *, gfp_t);
@@ -279,7 +459,7 @@ static int cfq_queue_empty(struct request_queue *q)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- return !cfqd->busy_queues;
+ return !cfqd->rq_queued;
}
/*
@@ -303,10 +483,110 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
+static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
+{
+ u64 d = delta << CFQ_SERVICE_SHIFT;
+
+ d = d * BLKIO_WEIGHT_DEFAULT;
+ do_div(d, cfqg->weight);
+ return d;
+}
+
+static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
+{
+ s64 delta = (s64)(vdisktime - min_vdisktime);
+ if (delta > 0)
+ min_vdisktime = vdisktime;
+
+ return min_vdisktime;
+}
+
+static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
+{
+ s64 delta = (s64)(vdisktime - min_vdisktime);
+ if (delta < 0)
+ min_vdisktime = vdisktime;
+
+ return min_vdisktime;
+}
+
+static void update_min_vdisktime(struct cfq_rb_root *st)
+{
+ u64 vdisktime = st->min_vdisktime;
+ struct cfq_group *cfqg;
+
+ if (st->active) {
+ cfqg = rb_entry_cfqg(st->active);
+ vdisktime = cfqg->vdisktime;
+ }
+
+ if (st->left) {
+ cfqg = rb_entry_cfqg(st->left);
+ vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
+ }
+
+ st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
+}
+
+/*
+ * get averaged number of queues of RT/BE priority.
+ * average is updated, with a formula that gives more weight to higher numbers,
+ * average is updated with a formula that gives more weight to higher numbers,
+ * so that it follows sudden increases quickly and decreases slowly
+
+static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
+ struct cfq_group *cfqg, bool rt)
+{
+ unsigned min_q, max_q;
+ unsigned mult = cfq_hist_divisor - 1;
+ unsigned round = cfq_hist_divisor / 2;
+ unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
+
+ min_q = min(cfqg->busy_queues_avg[rt], busy);
+ max_q = max(cfqg->busy_queues_avg[rt], busy);
+ cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
+ cfq_hist_divisor;
+ return cfqg->busy_queues_avg[rt];
+}
+
+static inline unsigned
+cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+
+ return cfq_target_latency * cfqg->weight / st->total_weight;
+}
+
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+ unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
+ if (cfqd->cfq_latency) {
+ /*
+ * interested queues (we consider only the ones with the same
+ * priority class in the cfq group)
+ */
+ unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
+ cfq_class_rt(cfqq));
+ unsigned sync_slice = cfqd->cfq_slice[1];
+ unsigned expect_latency = sync_slice * iq;
+ unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
+
+ if (expect_latency > group_slice) {
+ unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
+ /* scale low_slice according to IO priority
+ * and sync vs async */
+ unsigned low_slice =
+ min(slice, base_low_slice * slice / sync_slice);
+ /* the adapted slice value is scaled to fit all iqs
+ * into the target latency */
+ slice = max(slice * group_slice / expect_latency,
+ low_slice);
+ }
+ }
+ cfqq->slice_start = jiffies;
+ cfqq->slice_end = jiffies + slice;
+ cfqq->allocated_slice = slice;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
}
@@ -331,9 +611,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq)
* behind the head is penalized and only allowed to a certain extent.
*/
static struct request *
-cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
+cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
{
- sector_t last, s1, s2, d1 = 0, d2 = 0;
+ sector_t s1, s2, d1 = 0, d2 = 0;
unsigned long back_max;
#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
@@ -356,8 +636,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
s1 = blk_rq_pos(rq1);
s2 = blk_rq_pos(rq2);
- last = cfqd->last_position;
-
/*
* by definition, 1KiB is 2 sectors
*/
@@ -425,6 +703,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
*/
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
+ /* Service tree is empty */
+ if (!root->count)
+ return NULL;
+
if (!root->left)
root->left = rb_first(&root->rb);
@@ -434,6 +716,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
return NULL;
}
+static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
+{
+ if (!root->left)
+ root->left = rb_first(&root->rb);
+
+ if (root->left)
+ return rb_entry_cfqg(root->left);
+
+ return NULL;
+}
+
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
rb_erase(n, root);
@@ -445,6 +738,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
if (root->left == n)
root->left = NULL;
rb_erase_init(n, &root->rb);
+ --root->count;
}
/*
@@ -471,7 +765,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
next = rb_entry_rq(rbnext);
}
- return cfq_choose_req(cfqd, next, prev);
+ return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
}
static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
@@ -480,12 +774,334 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
/*
* just an approximation, should be ok.
*/
- return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) -
+ return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}
+static inline s64
+cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+ return cfqg->vdisktime - st->min_vdisktime;
+}
+
+static void
+__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
+{
+ struct rb_node **node = &st->rb.rb_node;
+ struct rb_node *parent = NULL;
+ struct cfq_group *__cfqg;
+ s64 key = cfqg_key(st, cfqg);
+ int left = 1;
+
+ while (*node != NULL) {
+ parent = *node;
+ __cfqg = rb_entry_cfqg(parent);
+
+ if (key < cfqg_key(st, __cfqg))
+ node = &parent->rb_left;
+ else {
+ node = &parent->rb_right;
+ left = 0;
+ }
+ }
+
+ if (left)
+ st->left = &cfqg->rb_node;
+
+ rb_link_node(&cfqg->rb_node, parent, node);
+ rb_insert_color(&cfqg->rb_node, &st->rb);
+}
+
+static void
+cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_group *__cfqg;
+ struct rb_node *n;
+
+ cfqg->nr_cfqq++;
+ if (cfqg->on_st)
+ return;
+
+ /*
+ * Currently put the group at the end. Later implement something
+	 * so that groups get a smaller vtime based on their weights, so that
+	 * a group does not lose everything if it was not continuously backlogged.
+ */
+ n = rb_last(&st->rb);
+ if (n) {
+ __cfqg = rb_entry_cfqg(n);
+ cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+ } else
+ cfqg->vdisktime = st->min_vdisktime;
+
+ __cfq_group_service_tree_add(st, cfqg);
+ cfqg->on_st = true;
+ st->total_weight += cfqg->weight;
+}
+
+static void
+cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+
+ if (st->active == &cfqg->rb_node)
+ st->active = NULL;
+
+ BUG_ON(cfqg->nr_cfqq < 1);
+ cfqg->nr_cfqq--;
+
+ /* If there are other cfq queues under this group, don't delete it */
+ if (cfqg->nr_cfqq)
+ return;
+
+ cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
+ cfqg->on_st = false;
+ st->total_weight -= cfqg->weight;
+ if (!RB_EMPTY_NODE(&cfqg->rb_node))
+ cfq_rb_erase(&cfqg->rb_node, st);
+ cfqg->saved_workload_slice = 0;
+ blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
+}
+
+static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
+{
+ unsigned int slice_used;
+
+ /*
+ * Queue got expired before even a single request completed or
+ * got expired immediately after first request completion.
+ */
+ if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
+ /*
+ * Also charge the seek time incurred to the group, otherwise
+		 * if there are multiple queues in the group, each can dispatch
+ * a single request on seeky media and cause lots of seek time
+ * and group will never know it.
+ */
+ slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
+ 1);
+ } else {
+ slice_used = jiffies - cfqq->slice_start;
+ if (slice_used > cfqq->allocated_slice)
+ slice_used = cfqq->allocated_slice;
+ }
+
+ cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
+ cfqq->nr_sectors);
+ return slice_used;
+}
+
+static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
+ struct cfq_queue *cfqq)
+{
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ unsigned int used_sl, charge_sl;
+ int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
+ - cfqg->service_tree_idle.count;
+
+ BUG_ON(nr_sync < 0);
+ used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
+
+ if (!cfq_cfqq_sync(cfqq) && !nr_sync)
+ charge_sl = cfqq->allocated_slice;
+
+ /* Can't update vdisktime while group is on service tree */
+ cfq_rb_erase(&cfqg->rb_node, st);
+ cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
+ __cfq_group_service_tree_add(st, cfqg);
+
+ /* This group is being expired. Save the context */
+ if (time_after(cfqd->workload_expires, jiffies)) {
+ cfqg->saved_workload_slice = cfqd->workload_expires
+ - jiffies;
+ cfqg->saved_workload = cfqd->serving_type;
+ cfqg->saved_serving_prio = cfqd->serving_prio;
+ } else
+ cfqg->saved_workload_slice = 0;
+
+ cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
+ st->min_vdisktime);
+ blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
+ cfqq->nr_sectors);
+}
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
+{
+ if (blkg)
+ return container_of(blkg, struct cfq_group, blkg);
+ return NULL;
+}
+
+void
+cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
+{
+ cfqg_of_blkg(blkg)->weight = weight;
+}
+
+static struct cfq_group *
+cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
+{
+ struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
+ struct cfq_group *cfqg = NULL;
+ void *key = cfqd;
+ int i, j;
+ struct cfq_rb_root *st;
+ struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
+ unsigned int major, minor;
+
+ /* Do we need to take this reference */
+ if (!blkiocg_css_tryget(blkcg))
+		return NULL;
+
+ cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg || !create)
+ goto done;
+
+ cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
+ if (!cfqg)
+ goto done;
+
+ cfqg->weight = blkcg->weight;
+ for_each_cfqg_st(cfqg, i, j, st)
+ *st = CFQ_RB_ROOT;
+ RB_CLEAR_NODE(&cfqg->rb_node);
+
+ /*
+ * Take the initial reference that will be released on destroy
+ * This can be thought of a joint reference by cgroup and
+ * elevator which will be dropped by either elevator exit
+ * or cgroup deletion path depending on who is exiting first.
+ */
+ atomic_set(&cfqg->ref, 1);
+
+ /* Add group onto cgroup list */
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
+ MKDEV(major, minor));
+
+ /* Add group on cfqd list */
+ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
+
+done:
+ blkiocg_css_put(blkcg);
+ return cfqg;
+}
+
+/*
+ * Search for the cfq group current task belongs to. If create = 1, then also
+ * create the cfq group if it does not exist. request_queue lock must be held.
+ */
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+{
+ struct cgroup *cgroup;
+ struct cfq_group *cfqg = NULL;
+
+ rcu_read_lock();
+ cgroup = task_cgroup(current, blkio_subsys_id);
+ cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
+ if (!cfqg && create)
+ cfqg = &cfqd->root_group;
+ rcu_read_unlock();
+ return cfqg;
+}
+
+static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
+{
+ /* Currently, all async queues are mapped to root group */
+ if (!cfq_cfqq_sync(cfqq))
+ cfqg = &cfqq->cfqd->root_group;
+
+ cfqq->cfqg = cfqg;
+ /* cfqq reference on cfqg */
+ atomic_inc(&cfqq->cfqg->ref);
+}
+
+static void cfq_put_cfqg(struct cfq_group *cfqg)
+{
+ struct cfq_rb_root *st;
+ int i, j;
+
+ BUG_ON(atomic_read(&cfqg->ref) <= 0);
+ if (!atomic_dec_and_test(&cfqg->ref))
+ return;
+ for_each_cfqg_st(cfqg, i, j, st)
+ BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+ kfree(cfqg);
+}
+
+static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ /* Something wrong if we are trying to remove same group twice */
+ BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
+
+ hlist_del_init(&cfqg->cfqd_node);
+
+ /*
+ * Put the reference taken at the time of creation so that when all
+ * queues are gone, group can be destroyed.
+ */
+ cfq_put_cfqg(cfqg);
+}
+
+static void cfq_release_cfq_groups(struct cfq_data *cfqd)
+{
+ struct hlist_node *pos, *n;
+ struct cfq_group *cfqg;
+
+ hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
+ /*
+ * If cgroup removal path got to blk_group first and removed
+ * it from cgroup list, then it will take care of destroying
+ * cfqg also.
+ */
+ if (!blkiocg_del_blkio_group(&cfqg->blkg))
+ cfq_destroy_cfqg(cfqd, cfqg);
+ }
+}
+
+/*
+ * Blk cgroup controller notification saying that blkio_group object is being
+ * delinked as associated cgroup object is going away. That also means that
+ * no new IO will come in this group. So get rid of this group as soon as
+ * any pending IO in the group is finished.
+ *
+ * This function is called under rcu_read_lock(). key is the rcu protected
+ * pointer. That means "key" is a valid cfq_data pointer as long as we hold the
+ * rcu read lock.
+ *
+ * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
+ * it should not be NULL as even if elevator was exiting, cgroup deletion
+ * path got to it first.
+ */
+void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
+{
+ unsigned long flags;
+ struct cfq_data *cfqd = key;
+
+ spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+ cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
+ spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+}
+
+#else /* GROUP_IOSCHED */
+static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
+{
+ return &cfqd->root_group;
+}
+static inline void
+cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
+ cfqq->cfqg = cfqg;
+}
+
+static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
+static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
+
+#endif /* GROUP_IOSCHED */
+
/*
- * The cfqd->service_tree holds all pending cfq_queue's that have
+ * The cfqd->service_trees holds all pending cfq_queue's that have
* requests waiting to be processed. It is sorted in the order that
* we will service the queues.
*/
@@ -495,11 +1111,42 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct rb_node **p, *parent;
struct cfq_queue *__cfqq;
unsigned long rb_key;
+ struct cfq_rb_root *service_tree;
int left;
+ int new_cfqq = 1;
+ int group_changed = 0;
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (!cfqd->cfq_group_isolation
+ && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
+ && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
+ /* Move this cfq to root group */
+ cfq_log_cfqq(cfqd, cfqq, "moving to root group");
+ if (!RB_EMPTY_NODE(&cfqq->rb_node))
+ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ cfqq->orig_cfqg = cfqq->cfqg;
+ cfqq->cfqg = &cfqd->root_group;
+ atomic_inc(&cfqd->root_group.ref);
+ group_changed = 1;
+ } else if (!cfqd->cfq_group_isolation
+ && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
+ /* cfqq is sequential now needs to go to its original group */
+ BUG_ON(cfqq->cfqg != &cfqd->root_group);
+ if (!RB_EMPTY_NODE(&cfqq->rb_node))
+ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
+ cfq_put_cfqg(cfqq->cfqg);
+ cfqq->cfqg = cfqq->orig_cfqg;
+ cfqq->orig_cfqg = NULL;
+ group_changed = 1;
+ cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
+ }
+#endif
+ service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
+ cfqq_type(cfqq));
if (cfq_class_idle(cfqq)) {
rb_key = CFQ_IDLE_DELAY;
- parent = rb_last(&cfqd->service_tree.rb);
+ parent = rb_last(&service_tree->rb);
if (parent && parent != &cfqq->rb_node) {
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
rb_key += __cfqq->rb_key;
@@ -517,23 +1164,27 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqq->slice_resid = 0;
} else {
rb_key = -HZ;
- __cfqq = cfq_rb_first(&cfqd->service_tree);
+ __cfqq = cfq_rb_first(service_tree);
rb_key += __cfqq ? __cfqq->rb_key : jiffies;
}
if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+ new_cfqq = 0;
/*
* same position, nothing more to do
*/
- if (rb_key == cfqq->rb_key)
+ if (rb_key == cfqq->rb_key &&
+ cfqq->service_tree == service_tree)
return;
- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+ cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
+ cfqq->service_tree = NULL;
}
left = 1;
parent = NULL;
- p = &cfqd->service_tree.rb.rb_node;
+ cfqq->service_tree = service_tree;
+ p = &service_tree->rb.rb_node;
while (*p) {
struct rb_node **n;
@@ -541,35 +1192,28 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
/*
- * sort RT queues first, we always want to give
- * preference to them. IDLE queues goes to the back.
- * after that, sort on the next service time.
+		 * sort by key, which represents service time.
*/
- if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq))
+ if (time_before(rb_key, __cfqq->rb_key))
n = &(*p)->rb_left;
- else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq))
- n = &(*p)->rb_right;
- else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq))
- n = &(*p)->rb_left;
- else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq))
- n = &(*p)->rb_right;
- else if (time_before(rb_key, __cfqq->rb_key))
- n = &(*p)->rb_left;
- else
+ else {
n = &(*p)->rb_right;
-
- if (n == &(*p)->rb_right)
left = 0;
+ }
p = n;
}
if (left)
- cfqd->service_tree.left = &cfqq->rb_node;
+ service_tree->left = &cfqq->rb_node;
cfqq->rb_key = rb_key;
rb_link_node(&cfqq->rb_node, parent, p);
- rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb);
+ rb_insert_color(&cfqq->rb_node, &service_tree->rb);
+ service_tree->count++;
+ if ((add_front || !new_cfqq) && !group_changed)
+ return;
+ cfq_group_service_tree_add(cfqd, cfqq->cfqg);
}
static struct cfq_queue *
@@ -671,13 +1315,16 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(!cfq_cfqq_on_rr(cfqq));
cfq_clear_cfqq_on_rr(cfqq);
- if (!RB_EMPTY_NODE(&cfqq->rb_node))
- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree);
+ if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
+ cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
+ cfqq->service_tree = NULL;
+ }
if (cfqq->p_root) {
rb_erase(&cfqq->p_node, cfqq->p_root);
cfqq->p_root = NULL;
}
+ cfq_group_service_tree_del(cfqd, cfqq->cfqg);
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
}
@@ -688,7 +1335,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void cfq_del_rq_rb(struct request *rq)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
- struct cfq_data *cfqd = cfqq->cfqd;
const int sync = rq_is_sync(rq);
BUG_ON(!cfqq->queued[sync]);
@@ -696,8 +1342,17 @@ static void cfq_del_rq_rb(struct request *rq)
elv_rb_del(&cfqq->sort_list, rq);
- if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
- cfq_del_cfqq_rr(cfqd, cfqq);
+ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
+ /*
+ * Queue will be deleted from service tree when we actually
+ * expire it later. Right now just remove it from prio tree
+ * as it is empty.
+ */
+ if (cfqq->p_root) {
+ rb_erase(&cfqq->p_node, cfqq->p_root);
+ cfqq->p_root = NULL;
+ }
+ }
}
static void cfq_add_rq_rb(struct request *rq)
@@ -722,7 +1377,7 @@ static void cfq_add_rq_rb(struct request *rq)
* check if this request is a better next-serve candidate
*/
prev = cfqq->next_rq;
- cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq);
+ cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
/*
* adjust priority tree position, if ->next_rq changes
@@ -829,6 +1484,7 @@ static void
cfq_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
+ struct cfq_queue *cfqq = RQ_CFQQ(rq);
/*
* reposition in fifo if next is older than rq
*/
@@ -838,6 +1494,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
rq_set_fifo_time(rq, rq_fifo_time(next));
}
+ if (cfqq->next_rq == next)
+ cfqq->next_rq = rq;
cfq_remove_request(next);
}
@@ -871,8 +1529,12 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd,
{
if (cfqq) {
cfq_log_cfqq(cfqd, cfqq, "set_active");
+ cfqq->slice_start = 0;
+ cfqq->dispatch_start = jiffies;
+ cfqq->allocated_slice = 0;
cfqq->slice_end = 0;
cfqq->slice_dispatch = 0;
+ cfqq->nr_sectors = 0;
cfq_clear_cfqq_wait_request(cfqq);
cfq_clear_cfqq_must_dispatch(cfqq);
@@ -899,6 +1561,16 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
del_timer(&cfqd->idle_slice_timer);
cfq_clear_cfqq_wait_request(cfqq);
+ cfq_clear_cfqq_wait_busy(cfqq);
+
+ /*
+ * If this cfqq is shared between multiple processes, check to
+ * make sure that those processes are still issuing I/Os within
+ * the mean seek distance. If not, it may be time to break the
+ * queues apart again.
+ */
+ if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
+ cfq_mark_cfqq_split_coop(cfqq);
/*
* store what was left of this slice, if the queue idled/timed out
@@ -908,11 +1580,19 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
}
+ cfq_group_served(cfqd, cfqq->cfqg, cfqq);
+
+ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
+ cfq_del_cfqq_rr(cfqd, cfqq);
+
cfq_resort_rr_list(cfqd, cfqq);
if (cfqq == cfqd->active_queue)
cfqd->active_queue = NULL;
+ if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
+ cfqd->grp_service_tree.active = NULL;
+
if (cfqd->active_cic) {
put_io_context(cfqd->active_cic->ioc);
cfqd->active_cic = NULL;
@@ -933,10 +1613,39 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
*/
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
- if (RB_EMPTY_ROOT(&cfqd->service_tree.rb))
+ struct cfq_rb_root *service_tree =
+ service_tree_for(cfqd->serving_group, cfqd->serving_prio,
+ cfqd->serving_type);
+
+ if (!cfqd->rq_queued)
return NULL;
- return cfq_rb_first(&cfqd->service_tree);
+ /* There is nothing to dispatch */
+ if (!service_tree)
+ return NULL;
+ if (RB_EMPTY_ROOT(&service_tree->rb))
+ return NULL;
+ return cfq_rb_first(service_tree);
+}
+
+static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
+{
+ struct cfq_group *cfqg;
+ struct cfq_queue *cfqq;
+ int i, j;
+ struct cfq_rb_root *st;
+
+ if (!cfqd->rq_queued)
+ return NULL;
+
+ cfqg = cfq_get_next_cfqg(cfqd);
+ if (!cfqg)
+ return NULL;
+
+ for_each_cfqg_st(cfqg, i, j, st)
+ if ((cfqq = cfq_rb_first(st)) != NULL)
+ return cfqq;
+ return NULL;
}
/*
@@ -945,14 +1654,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
struct cfq_queue *cfqq)
{
- if (!cfqq) {
+ if (!cfqq)
cfqq = cfq_get_next_queue(cfqd);
- if (cfqq && !cfq_cfqq_coop_preempt(cfqq))
- cfq_clear_cfqq_coop(cfqq);
- }
-
- if (cfqq)
- cfq_clear_cfqq_coop_preempt(cfqq);
__cfq_set_active_queue(cfqd, cfqq);
return cfqq;
@@ -967,16 +1670,17 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
return cfqd->last_position - blk_rq_pos(rq);
}
-#define CIC_SEEK_THR 8 * 1024
-#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR)
-
-static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq)
+static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ struct request *rq, bool for_preempt)
{
- struct cfq_io_context *cic = cfqd->active_cic;
- sector_t sdist = cic->seek_mean;
+ sector_t sdist = cfqq->seek_mean;
+
+ if (!sample_valid(cfqq->seek_samples))
+ sdist = CFQQ_SEEK_THR;
- if (!sample_valid(cic->seek_samples))
- sdist = CIC_SEEK_THR;
+ /* if seek_mean is big, using it as close criteria is meaningless */
+ if (sdist > CFQQ_SEEK_THR && !for_preempt)
+ sdist = CFQQ_SEEK_THR;
return cfq_dist_from_last(cfqd, rq) <= sdist;
}
@@ -1005,7 +1709,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
* will contain the closest sector.
*/
__cfqq = rb_entry(parent, struct cfq_queue, p_node);
- if (cfq_rq_close(cfqd, __cfqq->next_rq))
+ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
return __cfqq;
if (blk_rq_pos(__cfqq->next_rq) < sector)
@@ -1016,7 +1720,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
return NULL;
__cfqq = rb_entry(node, struct cfq_queue, p_node);
- if (cfq_rq_close(cfqd, __cfqq->next_rq))
+ if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false))
return __cfqq;
return NULL;
@@ -1033,16 +1737,19 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
* assumption.
*/
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
- struct cfq_queue *cur_cfqq,
- bool probe)
+ struct cfq_queue *cur_cfqq)
{
struct cfq_queue *cfqq;
+ if (!cfq_cfqq_sync(cur_cfqq))
+ return NULL;
+ if (CFQQ_SEEKY(cur_cfqq))
+ return NULL;
+
/*
- * A valid cfq_io_context is necessary to compare requests against
- * the seek_mean of the current cfqq.
+ * Don't search priority tree if it's the only queue in the group.
*/
- if (!cfqd->active_cic)
+ if (cur_cfqq->cfqg->nr_cfqq == 1)
return NULL;
/*
@@ -1054,14 +1761,55 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
if (!cfqq)
return NULL;
- if (cfq_cfqq_coop(cfqq))
+ /* If new queue belongs to different cfq_group, don't choose it */
+ if (cur_cfqq->cfqg != cfqq->cfqg)
+ return NULL;
+
+ /*
+ * It only makes sense to merge sync queues.
+ */
+ if (!cfq_cfqq_sync(cfqq))
+ return NULL;
+ if (CFQQ_SEEKY(cfqq))
+ return NULL;
+
+ /*
+ * Do not merge queues of different priority classes
+ */
+ if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
return NULL;
- if (!probe)
- cfq_mark_cfqq_coop(cfqq);
return cfqq;
}
+/*
+ * Determine whether we should enforce idle window for this queue.
+ */
+
+static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ enum wl_prio_t prio = cfqq_prio(cfqq);
+ struct cfq_rb_root *service_tree = cfqq->service_tree;
+
+ BUG_ON(!service_tree);
+ BUG_ON(!service_tree->count);
+
+ /* We never do for idle class queues. */
+ if (prio == IDLE_WORKLOAD)
+ return false;
+
+ /* We do for queues that were marked with idle window flag. */
+ if (cfq_cfqq_idle_window(cfqq) &&
+ !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
+ return true;
+
+ /*
+ * Otherwise, we do only if they are the last ones
+ * in their service tree.
+ */
+ return service_tree->count == 1 && cfq_cfqq_sync(cfqq);
+}
+
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;
@@ -1082,13 +1830,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
/*
* idle is disabled, either manually or by past process history
*/
- if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq))
+ if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
return;
/*
- * still requests with the driver, don't idle
+ * still active requests from this queue, don't idle
*/
- if (rq_in_driver(cfqd))
+ if (cfqq->dispatched)
return;
/*
@@ -1109,14 +1857,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
cfq_mark_cfqq_wait_request(cfqq);
- /*
- * we don't want to idle for seeks, but we do want to allow
- * fair distribution of slice time for a process doing back-to-back
- * seeks. so allow a little bit of time for him to submit a new rq
- */
sl = cfqd->cfq_slice_idle;
- if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
- sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT));
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
@@ -1139,6 +1880,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
if (cfq_cfqq_sync(cfqq))
cfqd->sync_flight++;
+ cfqq->nr_sectors += blk_rq_sectors(rq);
}
/*
@@ -1175,6 +1917,186 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
}
/*
+ * Must be called with the queue_lock held.
+ */
+static int cfqq_process_refs(struct cfq_queue *cfqq)
+{
+ int process_refs, io_refs;
+
+ io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
+ process_refs = atomic_read(&cfqq->ref) - io_refs;
+ BUG_ON(process_refs < 0);
+ return process_refs;
+}
+
+static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
+{
+ int process_refs, new_process_refs;
+ struct cfq_queue *__cfqq;
+
+ /* Avoid a circular list and skip interim queue merges */
+ while ((__cfqq = new_cfqq->new_cfqq)) {
+ if (__cfqq == cfqq)
+ return;
+ new_cfqq = __cfqq;
+ }
+
+ process_refs = cfqq_process_refs(cfqq);
+ /*
+ * If the process for the cfqq has gone away, there is no
+ * sense in merging the queues.
+ */
+ if (process_refs == 0)
+ return;
+
+ /*
+ * Merge in the direction of the lesser amount of work.
+ */
+ new_process_refs = cfqq_process_refs(new_cfqq);
+ if (new_process_refs >= process_refs) {
+ cfqq->new_cfqq = new_cfqq;
+ atomic_add(process_refs, &new_cfqq->ref);
+ } else {
+ new_cfqq->new_cfqq = cfqq;
+ atomic_add(new_process_refs, &cfqq->ref);
+ }
+}
+
+static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
+ struct cfq_group *cfqg, enum wl_prio_t prio)
+{
+ struct cfq_queue *queue;
+ int i;
+ bool key_valid = false;
+ unsigned long lowest_key = 0;
+ enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
+
+ for (i = 0; i <= SYNC_WORKLOAD; ++i) {
+ /* select the one with lowest rb_key */
+ queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
+ if (queue &&
+ (!key_valid || time_before(queue->rb_key, lowest_key))) {
+ lowest_key = queue->rb_key;
+ cur_best = i;
+ key_valid = true;
+ }
+ }
+
+ return cur_best;
+}
+
+static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
+{
+ unsigned slice;
+ unsigned count;
+ struct cfq_rb_root *st;
+ unsigned group_slice;
+
+ if (!cfqg) {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
+ return;
+ }
+
+ /* Choose next priority. RT > BE > IDLE */
+ if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = RT_WORKLOAD;
+ else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
+ cfqd->serving_prio = BE_WORKLOAD;
+ else {
+ cfqd->serving_prio = IDLE_WORKLOAD;
+ cfqd->workload_expires = jiffies + 1;
+ return;
+ }
+
+ /*
+ * For RT and BE, we have to choose also the type
+ * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
+ * expiration time
+ */
+ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+ count = st->count;
+
+ /*
+ * check workload expiration, and that we still have other queues ready
+ */
+ if (count && !time_after(jiffies, cfqd->workload_expires))
+ return;
+
+ /* otherwise select new workload type */
+ cfqd->serving_type =
+ cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
+ st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
+ count = st->count;
+
+ /*
+ * the workload slice is computed as a fraction of target latency
+ * proportional to the number of queues in that workload, over
+ * all the queues in the same priority class
+ */
+ group_slice = cfq_group_slice(cfqd, cfqg);
+
+ slice = group_slice * count /
+ max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
+ cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
+
+ if (cfqd->serving_type == ASYNC_WORKLOAD) {
+ unsigned int tmp;
+
+ /*
+ * Async queues are currently system wide. Just taking
+		 * proportion of queues within the same group will lead to higher
+ * async ratio system wide as generally root group is going
+ * to have higher weight. A more accurate thing would be to
+		 * calculate system wide async/sync ratio.
+ */
+ tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
+ tmp = tmp/cfqd->busy_queues;
+ slice = min_t(unsigned, slice, tmp);
+
+ /* async workload slice is scaled down according to
+ * the sync/async slice ratio. */
+ slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
+ } else
+ /* sync workload slice is at least 2 * cfq_slice_idle */
+ slice = max(slice, 2 * cfqd->cfq_slice_idle);
+
+ slice = max_t(unsigned, slice, CFQ_MIN_TT);
+ cfqd->workload_expires = jiffies + slice;
+ cfqd->noidle_tree_requires_idle = false;
+}
+
+static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
+{
+ struct cfq_rb_root *st = &cfqd->grp_service_tree;
+ struct cfq_group *cfqg;
+
+ if (RB_EMPTY_ROOT(&st->rb))
+ return NULL;
+ cfqg = cfq_rb_first_group(st);
+ st->active = &cfqg->rb_node;
+ update_min_vdisktime(st);
+ return cfqg;
+}
+
+static void cfq_choose_cfqg(struct cfq_data *cfqd)
+{
+ struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
+
+ cfqd->serving_group = cfqg;
+
+ /* Restore the workload type data */
+ if (cfqg->saved_workload_slice) {
+ cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
+ cfqd->serving_type = cfqg->saved_workload;
+ cfqd->serving_prio = cfqg->saved_serving_prio;
+ } else
+ cfqd->workload_expires = jiffies - 1;
+
+ choose_service_tree(cfqd, cfqg);
+}
+
+/*
* Select a queue for service. If we have a current active queue,
* check whether to continue servicing it, or retrieve and set a new one.
*/
@@ -1186,13 +2108,37 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
if (!cfqq)
goto new_queue;
+ if (!cfqd->rq_queued)
+ return NULL;
+
/*
- * The active queue has run out of time, expire it and select new.
+ * We were waiting for group to get backlogged. Expire the queue
*/
- if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
+ if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
goto expire;
/*
+ * The active queue has run out of time, expire it and select new.
+ */
+ if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
+ /*
+ * If slice had not expired at the completion of last request
+ * we might not have turned on wait_busy flag. Don't expire
+ * the queue yet. Allow the group to get backlogged.
+ *
+		 * The very fact that we have used the slice means that we
+ * have been idling all along on this queue and it should be
+ * ok to wait for this request to complete.
+ */
+ if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
+ && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
+ cfqq = NULL;
+ goto keep_queue;
+ } else
+ goto expire;
+ }
+
+ /*
* The active queue has requests and isn't expired, allow it to
* dispatch.
*/
@@ -1203,11 +2149,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
* If another queue has a request waiting within our mean seek
* distance, let it run. The expire code will check for close
* cooperators and put the close queue at the front of the service
- * tree.
+ * tree. If possible, merge the expiring queue with the new cfqq.
*/
- new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0);
- if (new_cfqq)
+ new_cfqq = cfq_close_cooperator(cfqd, cfqq);
+ if (new_cfqq) {
+ if (!cfqq->new_cfqq)
+ cfq_setup_merge(cfqq, new_cfqq);
goto expire;
+ }
/*
* No requests pending. If the active queue still has requests in
@@ -1215,7 +2164,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
* conditions to happen (or time out) before selecting a new queue.
*/
if (timer_pending(&cfqd->idle_slice_timer) ||
- (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) {
+ (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
cfqq = NULL;
goto keep_queue;
}
@@ -1223,6 +2172,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
expire:
cfq_slice_expired(cfqd, 0);
new_queue:
+ /*
+ * Current queue expired. Check if we have to switch to a new
+ * service tree
+ */
+ if (!new_cfqq)
+ cfq_choose_cfqg(cfqd);
+
cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
return cfqq;
@@ -1238,6 +2194,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
}
BUG_ON(!list_empty(&cfqq->fifo));
+
+ /* By default cfqq is not expired if it is empty. Do it explicitly */
+ __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
return dispatched;
}
@@ -1250,11 +2209,10 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
struct cfq_queue *cfqq;
int dispatched = 0;
- while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL)
+ while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL)
dispatched += __cfq_forced_dispatch_cfqq(cfqq);
cfq_slice_expired(cfqd, 0);
-
BUG_ON(cfqd->busy_queues);
cfq_log(cfqd, "forced_dispatch=%d", dispatched);
@@ -1268,7 +2226,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
/*
* Drain async requests before we start sync IO
*/
- if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+ if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
return false;
/*
@@ -1298,9 +2256,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return false;
/*
- * Sole queue user, allow bigger slice
+ * Sole queue user, no limit
*/
- max_dispatch *= 4;
+ max_dispatch = -1;
}
/*
@@ -1309,7 +2267,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
* based on the last sync IO we serviced
*/
if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
- unsigned long last_sync = jiffies - cfqd->last_end_sync_rq;
+ unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
unsigned int depth;
depth = last_sync / cfqd->cfq_slice[1];
@@ -1407,11 +2365,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
* task holds one reference to the queue, dropped when task exits. each rq
* in-flight on this queue also holds a reference, dropped when rq is freed.
*
+ * Each cfq queue took a reference on the parent group. Drop it now.
* queue lock must be held here.
*/
static void cfq_put_queue(struct cfq_queue *cfqq)
{
struct cfq_data *cfqd = cfqq->cfqd;
+ struct cfq_group *cfqg, *orig_cfqg;
BUG_ON(atomic_read(&cfqq->ref) <= 0);
@@ -1421,14 +2381,19 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
cfq_log_cfqq(cfqd, cfqq, "put_queue");
BUG_ON(rb_first(&cfqq->sort_list));
BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
- BUG_ON(cfq_cfqq_on_rr(cfqq));
+ cfqg = cfqq->cfqg;
+ orig_cfqg = cfqq->orig_cfqg;
if (unlikely(cfqd->active_queue == cfqq)) {
__cfq_slice_expired(cfqd, cfqq, 0);
cfq_schedule_dispatch(cfqd);
}
+ BUG_ON(cfq_cfqq_on_rr(cfqq));
kmem_cache_free(cfq_pool, cfqq);
+ cfq_put_cfqg(cfqg);
+ if (orig_cfqg)
+ cfq_put_cfqg(orig_cfqg);
}
/*
@@ -1518,11 +2483,29 @@ static void cfq_free_io_context(struct io_context *ioc)
static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
+ struct cfq_queue *__cfqq, *next;
+
if (unlikely(cfqq == cfqd->active_queue)) {
__cfq_slice_expired(cfqd, cfqq, 0);
cfq_schedule_dispatch(cfqd);
}
+ /*
+ * If this queue was scheduled to merge with another queue, be
+ * sure to drop the reference taken on that queue (and others in
+ * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
+ */
+ __cfqq = cfqq->new_cfqq;
+ while (__cfqq) {
+ if (__cfqq == cfqq) {
+ WARN(1, "cfqq->new_cfqq loop detected\n");
+ break;
+ }
+ next = __cfqq->new_cfqq;
+ cfq_put_queue(__cfqq);
+ __cfqq = next;
+ }
+
cfq_put_queue(cfqq);
}
@@ -1703,14 +2686,51 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqq->pid = pid;
}
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
+{
+ struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
+ struct cfq_data *cfqd = cic->key;
+ unsigned long flags;
+ struct request_queue *q;
+
+ if (unlikely(!cfqd))
+ return;
+
+ q = cfqd->queue;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+
+ if (sync_cfqq) {
+ /*
+ * Drop reference to sync queue. A new sync queue will be
+ * assigned in new group upon arrival of a fresh request.
+ */
+ cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
+ cic_set_cfqq(cic, NULL, 1);
+ cfq_put_queue(sync_cfqq);
+ }
+
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void cfq_ioc_set_cgroup(struct io_context *ioc)
+{
+ call_for_each_cic(ioc, changed_cgroup);
+ ioc->cgroup_changed = 0;
+}
+#endif /* CONFIG_CFQ_GROUP_IOSCHED */
+
static struct cfq_queue *
cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
struct io_context *ioc, gfp_t gfp_mask)
{
struct cfq_queue *cfqq, *new_cfqq = NULL;
struct cfq_io_context *cic;
+ struct cfq_group *cfqg;
retry:
+ cfqg = cfq_get_cfqg(cfqd, 1);
cic = cfq_cic_lookup(cfqd, ioc);
/* cic always exists here */
cfqq = cic_to_cfqq(cic, is_sync);
@@ -1741,6 +2761,7 @@ retry:
if (cfqq) {
cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
cfq_init_prio_data(cfqq, ioc);
+ cfq_link_cfqq_cfqg(cfqq, cfqg);
cfq_log_cfqq(cfqd, cfqq, "alloced");
} else
cfqq = &cfqd->oom_cfqq;
@@ -1932,6 +2953,10 @@ out:
if (unlikely(ioc->ioprio_changed))
cfq_ioc_set_ioprio(ioc);
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (unlikely(ioc->cgroup_changed))
+ cfq_ioc_set_cgroup(ioc);
+#endif
return cic;
err_free:
cfq_cic_free(cic);
@@ -1952,33 +2977,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
}
static void
-cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
+cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct request *rq)
{
sector_t sdist;
u64 total;
- if (!cic->last_request_pos)
+ if (!cfqq->last_request_pos)
sdist = 0;
- else if (cic->last_request_pos < blk_rq_pos(rq))
- sdist = blk_rq_pos(rq) - cic->last_request_pos;
+ else if (cfqq->last_request_pos < blk_rq_pos(rq))
+ sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
else
- sdist = cic->last_request_pos - blk_rq_pos(rq);
+ sdist = cfqq->last_request_pos - blk_rq_pos(rq);
/*
* Don't allow the seek distance to get too large from the
* odd fragment, pagein, etc
*/
- if (cic->seek_samples <= 60) /* second&third seek */
- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024);
+ if (cfqq->seek_samples <= 60) /* second&third seek */
+ sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024);
else
- sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64);
+ sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64);
- cic->seek_samples = (7*cic->seek_samples + 256) / 8;
- cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8;
- total = cic->seek_total + (cic->seek_samples/2);
- do_div(total, cic->seek_samples);
- cic->seek_mean = (sector_t)total;
+ cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8;
+ cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8;
+ total = cfqq->seek_total + (cfqq->seek_samples/2);
+ do_div(total, cfqq->seek_samples);
+ cfqq->seek_mean = (sector_t)total;
}
/*
@@ -1999,14 +3024,15 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
+ if (cfqq->queued[0] + cfqq->queued[1] >= 4)
+ cfq_mark_cfqq_deep(cfqq);
+
if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
- (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic)))
+ (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples)
+ && CFQQ_SEEKY(cfqq)))
enable_idle = 0;
else if (sample_valid(cic->ttime_samples)) {
- unsigned int slice_idle = cfqd->cfq_slice_idle;
- if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic))
- slice_idle = msecs_to_jiffies(CFQ_MIN_TT);
- if (cic->ttime_mean > slice_idle)
+ if (cic->ttime_mean > cfqd->cfq_slice_idle)
enable_idle = 0;
else
enable_idle = 1;
@@ -2035,9 +3061,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
if (!cfqq)
return false;
- if (cfq_slice_used(cfqq))
- return true;
-
if (cfq_class_idle(new_cfqq))
return false;
@@ -2045,12 +3068,31 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
return true;
/*
+ * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
+ */
+ if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
+ return false;
+
+ /*
* if the new request is sync, but the currently running queue is
* not, let the sync request have priority.
*/
if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
return true;
+ if (new_cfqq->cfqg != cfqq->cfqg)
+ return false;
+
+ if (cfq_slice_used(cfqq))
+ return true;
+
+ /* Allow preemption only if we are idling on sync-noidle tree */
+ if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
+ cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
+ new_cfqq->service_tree->count == 2 &&
+ RB_EMPTY_ROOT(&cfqq->sort_list))
+ return true;
+
/*
* So both queues are sync. Let the new request get disk time if
* it's a metadata request and the current queue is doing regular IO.
@@ -2071,16 +3113,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
* if this request is as-good as one we would expect from the
* current cfqq, let it preempt
*/
- if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) ||
- cfqd->busy_queues == 1)) {
- /*
- * Mark new queue coop_preempt, so its coop flag will not be
- * cleared when new queue gets scheduled at the very first time
- */
- cfq_mark_cfqq_coop_preempt(new_cfqq);
- cfq_mark_cfqq_coop(new_cfqq);
+ if (cfq_rq_close(cfqd, cfqq, rq, true))
return true;
- }
return false;
}
@@ -2121,10 +3155,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqq->meta_pending++;
cfq_update_io_thinktime(cfqd, cic);
- cfq_update_io_seektime(cfqd, cic, rq);
+ cfq_update_io_seektime(cfqd, cfqq, rq);
cfq_update_idle_window(cfqd, cfqq, cic);
- cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+ cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
if (cfqq == cfqd->active_queue) {
/*
@@ -2141,9 +3175,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
cfqd->busy_queues > 1) {
del_timer(&cfqd->idle_slice_timer);
- __blk_run_queue(cfqd->queue);
- }
- cfq_mark_cfqq_must_dispatch(cfqq);
+ cfq_clear_cfqq_wait_request(cfqq);
+ __blk_run_queue(cfqd->queue);
+ } else
+ cfq_mark_cfqq_must_dispatch(cfqq);
}
} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
/*
@@ -2165,10 +3200,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
cfq_log_cfqq(cfqd, cfqq, "insert_request");
cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
- cfq_add_rq_rb(rq);
-
rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
list_add_tail(&rq->queuelist, &cfqq->fifo);
+ cfq_add_rq_rb(rq);
cfq_rq_enqueued(cfqd, cfqq, rq);
}
@@ -2179,23 +3213,64 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
*/
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
- if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
- cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
+ struct cfq_queue *cfqq = cfqd->active_queue;
+
+ if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth)
+ cfqd->hw_tag_est_depth = rq_in_driver(cfqd);
+
+ if (cfqd->hw_tag == 1)
+ return;
if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
return;
+ /*
+	 * If active queue doesn't have enough requests and can idle, cfq might not
+ * dispatch sufficient requests to hardware. Don't zero hw_tag in this
+ * case
+ */
+ if (cfqq && cfq_cfqq_idle_window(cfqq) &&
+ cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
+ CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN)
+ return;
+
if (cfqd->hw_tag_samples++ < 50)
return;
- if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
+ if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
cfqd->hw_tag = 1;
else
cfqd->hw_tag = 0;
+}
+
+static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ struct cfq_io_context *cic = cfqd->active_cic;
+
+ /* If there are other queues in the group, don't wait */
+ if (cfqq->cfqg->nr_cfqq > 1)
+ return false;
+
+ if (cfq_slice_used(cfqq))
+ return true;
+
+ /* if slice left is less than think time, wait busy */
+ if (cic && sample_valid(cic->ttime_samples)
+ && (cfqq->slice_end - jiffies < cic->ttime_mean))
+ return true;
+
+ /*
+	 * If think time is less than a jiffy then ttime_mean=0 and above
+ * will not be true. It might happen that slice has not expired yet
+ * but will expire soon (4-5 ns) during select_queue(). To cover the
+ * case where think time is less than a jiffy, mark the queue wait
+ * busy if only 1 jiffy is left in the slice.
+ */
+ if (cfqq->slice_end - jiffies == 1)
+ return true;
- cfqd->hw_tag_samples = 0;
- cfqd->rq_in_driver_peak = 0;
+ return false;
}
static void cfq_completed_request(struct request_queue *q, struct request *rq)
@@ -2206,7 +3281,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
unsigned long now;
now = jiffies;
- cfq_log_cfqq(cfqd, cfqq, "complete");
+ cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
cfq_update_hw_tag(cfqd);
@@ -2220,7 +3295,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
if (sync) {
RQ_CIC(rq)->last_end_request = now;
- cfqd->last_end_sync_rq = now;
+ if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
+ cfqd->last_delayed_sync = now;
}
/*
@@ -2234,18 +3310,39 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
cfq_set_prio_slice(cfqd, cfqq);
cfq_clear_cfqq_slice_new(cfqq);
}
+
+ /*
+	 * Should we wait for the next request to come in before we expire
+	 * the queue?
+ */
+ if (cfq_should_wait_busy(cfqd, cfqq)) {
+ cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
+ cfq_mark_cfqq_wait_busy(cfqq);
+ }
+
/*
- * If there are no requests waiting in this queue, and
- * there are other queues ready to issue requests, AND
- * those other queues are issuing requests within our
- * mean seek distance, give them a chance to run instead
- * of idling.
+ * Idling is not enabled on:
+ * - expired queues
+ * - idle-priority queues
+ * - async queues
+ * - queues with still some requests queued
+ * - when there is a close cooperator
*/
if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
cfq_slice_expired(cfqd, 1);
- else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) &&
- sync && !rq_noidle(rq))
- cfq_arm_slice_timer(cfqd);
+ else if (sync && cfqq_empty &&
+ !cfq_close_cooperator(cfqd, cfqq)) {
+ cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
+ /*
+ * Idling is enabled for SYNC_WORKLOAD.
+ * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
+ * only if we processed at least one !rq_noidle request
+ */
+ if (cfqd->serving_type == SYNC_WORKLOAD
+ || cfqd->noidle_tree_requires_idle
+ || cfqq->cfqg->nr_cfqq == 1)
+ cfq_arm_slice_timer(cfqd);
+ }
}
if (!rq_in_driver(cfqd))
@@ -2269,12 +3366,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
cfqq->ioprio = IOPRIO_NORM;
} else {
/*
- * check if we need to unboost the queue
+ * unboost the queue (if needed)
*/
- if (cfqq->ioprio_class != cfqq->org_ioprio_class)
- cfqq->ioprio_class = cfqq->org_ioprio_class;
- if (cfqq->ioprio != cfqq->org_ioprio)
- cfqq->ioprio = cfqq->org_ioprio;
+ cfqq->ioprio_class = cfqq->org_ioprio_class;
+ cfqq->ioprio = cfqq->org_ioprio;
}
}
@@ -2338,6 +3433,35 @@ static void cfq_put_request(struct request *rq)
}
}
+static struct cfq_queue *
+cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
+ struct cfq_queue *cfqq)
+{
+ cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
+ cic_set_cfqq(cic, cfqq->new_cfqq, 1);
+ cfq_mark_cfqq_coop(cfqq->new_cfqq);
+ cfq_put_queue(cfqq);
+ return cic_to_cfqq(cic, 1);
+}
+
+/*
+ * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
+ * was the last process referring to said cfqq.
+ */
+static struct cfq_queue *
+split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
+{
+ if (cfqq_process_refs(cfqq) == 1) {
+ cfqq->pid = current->pid;
+ cfq_clear_cfqq_coop(cfqq);
+ cfq_clear_cfqq_split_coop(cfqq);
+ return cfqq;
+ }
+
+ cic_set_cfqq(cic, NULL, 1);
+ cfq_put_queue(cfqq);
+ return NULL;
+}
/*
* Allocate cfq data structures associated with this request.
*/
@@ -2360,10 +3484,30 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
if (!cic)
goto queue_fail;
+new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
cic_set_cfqq(cic, cfqq, is_sync);
+ } else {
+ /*
+ * If the queue was seeky for too long, break it apart.
+ */
+ if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
+ cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
+ cfqq = split_cfqq(cic, cfqq);
+ if (!cfqq)
+ goto new_queue;
+ }
+
+ /*
+ * Check to see if this queue is scheduled to merge with
+ * another, closely cooperating queue. The merging of
+ * queues happens here as it must be done in process context.
+ * The reference on new_cfqq was taken in merge_cfqqs.
+ */
+ if (cfqq->new_cfqq)
+ cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
}
cfqq->allocated[rw]++;
@@ -2438,6 +3582,11 @@ static void cfq_idle_slice_timer(unsigned long data)
*/
if (!RB_EMPTY_ROOT(&cfqq->sort_list))
goto out_kick;
+
+ /*
+ * Queue depth flag is reset only when the idle didn't succeed
+ */
+ cfq_clear_cfqq_deep(cfqq);
}
expire:
cfq_slice_expired(cfqd, timed_out);
@@ -2468,6 +3617,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
cfq_put_queue(cfqd->async_idle_cfqq);
}
+static void cfq_cfqd_free(struct rcu_head *head)
+{
+ kfree(container_of(head, struct cfq_data, rcu));
+}
+
static void cfq_exit_queue(struct elevator_queue *e)
{
struct cfq_data *cfqd = e->elevator_data;
@@ -2489,25 +3643,49 @@ static void cfq_exit_queue(struct elevator_queue *e)
}
cfq_put_async_queues(cfqd);
+ cfq_release_cfq_groups(cfqd);
+ blkiocg_del_blkio_group(&cfqd->root_group.blkg);
spin_unlock_irq(q->queue_lock);
cfq_shutdown_timer_wq(cfqd);
- kfree(cfqd);
+ /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
+ call_rcu(&cfqd->rcu, cfq_cfqd_free);
}
static void *cfq_init_queue(struct request_queue *q)
{
struct cfq_data *cfqd;
- int i;
+ int i, j;
+ struct cfq_group *cfqg;
+ struct cfq_rb_root *st;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd)
return NULL;
- cfqd->service_tree = CFQ_RB_ROOT;
+ /* Init root service tree */
+ cfqd->grp_service_tree = CFQ_RB_ROOT;
+
+ /* Init root group */
+ cfqg = &cfqd->root_group;
+ for_each_cfqg_st(cfqg, i, j, st)
+ *st = CFQ_RB_ROOT;
+ RB_CLEAR_NODE(&cfqg->rb_node);
+ /* Give preference to root group over other groups */
+ cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
+
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ /*
+ * Take a reference to root group which we never drop. This is just
+ * to make sure that cfq_put_cfqg() does not try to kfree root group
+ */
+ atomic_set(&cfqg->ref, 1);
+ blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
+ 0);
+#endif
/*
* Not strictly needed (since RB_ROOT just clears the node and we
* zeroed cfqd on alloc), but better be safe in case someone decides
@@ -2523,6 +3701,7 @@ static void *cfq_init_queue(struct request_queue *q)
*/
cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
atomic_inc(&cfqd->oom_cfqq.ref);
+ cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
INIT_LIST_HEAD(&cfqd->cic_list);
@@ -2544,8 +3723,14 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
cfqd->cfq_latency = 1;
- cfqd->hw_tag = 1;
- cfqd->last_end_sync_rq = jiffies;
+ cfqd->cfq_group_isolation = 0;
+ cfqd->hw_tag = -1;
+ /*
+ * we optimistically start assuming sync ops weren't delayed in last
+ * second, in order to have larger depth for async operations.
+ */
+ cfqd->last_delayed_sync = jiffies - HZ;
+ INIT_RCU_HEAD(&cfqd->rcu);
return cfqd;
}
@@ -2614,6 +3799,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
+SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
@@ -2646,6 +3832,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
+STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
#undef STORE_FUNCTION
#define CFQ_ATTR(name) \
@@ -2662,6 +3849,7 @@ static struct elv_fs_entry cfq_attrs[] = {
CFQ_ATTR(slice_async_rq),
CFQ_ATTR(slice_idle),
CFQ_ATTR(low_latency),
+ CFQ_ATTR(group_isolation),
__ATTR_NULL
};
@@ -2691,6 +3879,17 @@ static struct elevator_type iosched_cfq = {
.elevator_owner = THIS_MODULE,
};
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+static struct blkio_policy_type blkio_policy_cfq = {
+ .ops = {
+ .blkio_unlink_group_fn = cfq_unlink_blkio_group,
+ .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
+ },
+};
+#else
+static struct blkio_policy_type blkio_policy_cfq;
+#endif
+
static int __init cfq_init(void)
{
/*
@@ -2705,6 +3904,7 @@ static int __init cfq_init(void)
return -ENOMEM;
elv_register(&iosched_cfq);
+ blkio_policy_register(&blkio_policy_cfq);
return 0;
}
@@ -2712,6 +3912,7 @@ static int __init cfq_init(void)
static void __exit cfq_exit(void)
{
DECLARE_COMPLETION_ONSTACK(all_gone);
+ blkio_policy_unregister(&blkio_policy_cfq);
elv_unregister(&iosched_cfq);
ioc_gone = &all_gone;
/* ioc_gone's update must be visible before reading ioc_count */
diff --git a/block/elevator.c b/block/elevator.c
index a847046..7e0fe67 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -959,12 +959,12 @@ void elv_unregister(struct elevator_type *e)
*/
if (e->ops.trim) {
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
task_lock(p);
if (p->io_context)
e->ops.trim(p->io_context);
task_unlock(p);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
read_unlock(&tasklist_lock);
}
diff --git a/block/genhd.c b/block/genhd.c
index 517e433..c97c1d1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -22,9 +22,7 @@
#include "blk.h"
static DEFINE_MUTEX(block_class_lock);
-#ifndef CONFIG_SYSFS_DEPRECATED
struct kobject *block_depr;
-#endif
/* for extended dynamic devt allocation, currently only one major is used */
#define MAX_EXT_DEVT (1 << MINORBITS)
@@ -793,7 +791,7 @@ static int __init genhd_device_init(void)
{
int error;
- block_class.dev_kobj = sysfs_dev_block_kobj;
+ block_class.dev_kobj = ve_sysfs_dev_block_kobj;
error = class_register(&block_class);
if (unlikely(error))
return error;
@@ -802,10 +800,10 @@ static int __init genhd_device_init(void)
register_blkdev(BLOCK_EXT_MAJOR, "blkext");
-#ifndef CONFIG_SYSFS_DEPRECATED
- /* create top-level block dir */
- block_depr = kobject_create_and_add("block", NULL);
-#endif
+ if (!sysfs_deprecated)
+ /* create top-level block dir */
+ block_depr = kobject_create_and_add("block", NULL);
+
return 0;
}
@@ -997,6 +995,7 @@ static void disk_release(struct device *dev)
struct class block_class = {
.name = "block",
};
+EXPORT_SYMBOL(block_class);
static char *block_devnode(struct device *dev, mode_t *mode)
{
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index ee37727..0361045 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -10,7 +10,7 @@ config UEVENT_HELPER_PATH
config DEVTMPFS
bool "Create a kernel maintained /dev tmpfs (EXPERIMENTAL)"
- depends on HOTPLUG && SHMEM && TMPFS
+ depends on HOTPLUG && SHMEM && TMPFS && !VE
help
This creates a tmpfs filesystem, and mounts it at bootup
and mounts it at /dev. The kernel driver core creates device
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 2ca7f5b..88d57ee 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -129,7 +129,12 @@ extern char *make_class_name(const char *name, struct kobject *kobj);
extern int devres_release_all(struct device *dev);
+#ifndef CONFIG_VE
extern struct kset *devices_kset;
+#define ve_devices_kset devices_kset
+#else
+#define ve_devices_kset (get_exec_env()->devices_kset)
+#endif
#if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS)
extern void module_add_driver(struct module *mod, struct device_driver *drv);
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 63c143e..5038fe8 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -439,21 +439,20 @@ static void device_remove_attrs(struct bus_type *bus, struct device *dev)
}
}
-#ifdef CONFIG_SYSFS_DEPRECATED
static int make_deprecated_bus_links(struct device *dev)
{
- return sysfs_create_link(&dev->kobj,
- &dev->bus->p->subsys.kobj, "bus");
+ if (sysfs_deprecated)
+ return sysfs_create_link(&dev->kobj,
+ &dev->bus->p->subsys.kobj, "bus");
+ else
+ return 0;
}
static void remove_deprecated_bus_links(struct device *dev)
{
- sysfs_remove_link(&dev->kobj, "bus");
+ if (sysfs_deprecated)
+ sysfs_remove_link(&dev->kobj, "bus");
}
-#else
-static inline int make_deprecated_bus_links(struct device *dev) { return 0; }
-static inline void remove_deprecated_bus_links(struct device *dev) { }
-#endif
/**
* bus_add_device - add device to bus
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 6e2c3b0..ac27503 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -19,6 +19,8 @@
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
#include "base.h"
#define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr)
@@ -74,8 +76,14 @@ static struct kobj_type class_ktype = {
};
/* Hotplug events for classes go to the class class_subsys */
-static struct kset *class_kset;
+#ifndef CONFIG_VE
+struct kset *class_kset;
+EXPORT_SYMBOL_GPL(class_kset);
+#define visible_class_kset class_kset
+#else
+#define visible_class_kset (get_exec_env()->class_kset)
+#endif
int class_create_file(struct class *cls, const struct class_attribute *attr)
{
@@ -173,14 +181,14 @@ int __class_register(struct class *cls, struct lock_class_key *key)
/* set the default /sys/dev directory for devices of this class */
if (!cls->dev_kobj)
- cls->dev_kobj = sysfs_dev_char_kobj;
+ cls->dev_kobj = ve_sysfs_dev_char_kobj;
-#if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK)
+#if defined(CONFIG_BLOCK)
/* let the block class directory show up in the root of sysfs */
- if (cls != &block_class)
- cp->class_subsys.kobj.kset = class_kset;
+ if (!sysfs_deprecated || cls != &block_class)
+ cp->class_subsys.kobj.kset = visible_class_kset;
#else
- cp->class_subsys.kobj.kset = class_kset;
+ cp->class_subsys.kobj.kset = visible_class_kset;
#endif
cp->class_subsys.kobj.ktype = &class_ktype;
cp->class = cls;
@@ -265,7 +273,6 @@ void class_destroy(struct class *cls)
class_unregister(cls);
}
-#ifdef CONFIG_SYSFS_DEPRECATED
char *make_class_name(const char *name, struct kobject *kobj)
{
char *class_name;
@@ -282,7 +289,6 @@ char *make_class_name(const char *name, struct kobject *kobj)
strcat(class_name, kobject_name(kobj));
return class_name;
}
-#endif
/**
* class_dev_iter_init - initialize class device iterator
@@ -508,7 +514,7 @@ struct class_compat *class_compat_register(const char *name)
cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL);
if (!cls)
return NULL;
- cls->kobj = kobject_create_and_add(name, &class_kset->kobj);
+ cls->kobj = kobject_create_and_add(name, &visible_class_kset->kobj);
if (!cls->kobj) {
kfree(cls);
return NULL;
@@ -577,13 +583,20 @@ void class_compat_remove_link(struct class_compat *cls, struct device *dev,
}
EXPORT_SYMBOL_GPL(class_compat_remove_link);
-int __init classes_init(void)
+int classes_init(void)
{
- class_kset = kset_create_and_add("class", NULL, NULL);
- if (!class_kset)
+ visible_class_kset = kset_create_and_add("class", NULL, NULL);
+ if (!visible_class_kset)
return -ENOMEM;
return 0;
}
+EXPORT_SYMBOL_GPL(classes_init);
+
+void classes_fini(void)
+{
+ kset_unregister(visible_class_kset);
+}
+EXPORT_SYMBOL_GPL(classes_fini);
EXPORT_SYMBOL_GPL(class_create_file);
EXPORT_SYMBOL_GPL(class_remove_file);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index f33d768..49ec490 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -23,15 +23,22 @@
#include <linux/semaphore.h>
#include <linux/mutex.h>
#include <linux/async.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
#include "base.h"
#include "power/power.h"
int (*platform_notify)(struct device *dev) = NULL;
int (*platform_notify_remove)(struct device *dev) = NULL;
+#ifndef CONFIG_VE
static struct kobject *dev_kobj;
+#define ve_dev_kobj dev_kobj
struct kobject *sysfs_dev_char_kobj;
struct kobject *sysfs_dev_block_kobj;
+#else
+#define ve_dev_kobj (get_exec_env()->dev_kobj)
+#endif
#ifdef CONFIG_BLOCK
static inline int device_is_not_partition(struct device *dev)
@@ -192,7 +199,9 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj,
if (dev->driver)
add_uevent_var(env, "DRIVER=%s", dev->driver->name);
-#ifdef CONFIG_SYSFS_DEPRECATED
+ if (!sysfs_deprecated)
+ goto skip;
+
if (dev->class) {
struct device *parent = dev->parent;
@@ -221,7 +230,7 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj,
add_uevent_var(env, "PHYSDEVDRIVER=%s",
dev->driver->name);
}
-#endif
+skip:
/* have the bus specific function add its stuff */
if (dev->bus && dev->bus->uevent) {
@@ -438,8 +447,9 @@ static ssize_t show_dev(struct device *dev, struct device_attribute *attr,
static struct device_attribute devt_attr =
__ATTR(dev, S_IRUGO, show_dev, NULL);
-/* kset to create /sys/devices/ */
+#ifndef CONFIG_VE
struct kset *devices_kset;
+#endif
/**
* device_create_file - create sysfs attribute file for device.
@@ -557,7 +567,7 @@ static void klist_children_put(struct klist_node *n)
*/
void device_initialize(struct device *dev)
{
- dev->kobj.kset = devices_kset;
+ dev->kobj.kset = ve_devices_kset;
kobject_init(&dev->kobj, &device_ktype);
INIT_LIST_HEAD(&dev->dma_pools);
init_MUTEX(&dev->sem);
@@ -568,8 +578,7 @@ void device_initialize(struct device *dev)
set_dev_node(dev, -1);
}
-#ifdef CONFIG_SYSFS_DEPRECATED
-static struct kobject *get_device_parent(struct device *dev,
+static struct kobject *get_device_parent_dep(struct device *dev,
struct device *parent)
{
/* class devices without a parent live in /sys/class/<classname>/ */
@@ -582,22 +591,25 @@ static struct kobject *get_device_parent(struct device *dev,
return NULL;
}
-static inline void cleanup_device_parent(struct device *dev) {}
-static inline void cleanup_glue_dir(struct device *dev,
+static inline void cleanup_device_parent_dep(struct device *dev) {}
+static inline void cleanup_glue_dir_dep(struct device *dev,
struct kobject *glue_dir) {}
+#ifndef CONFIG_VE
+static struct kobject *virtual_dir = NULL;
#else
+# define virtual_dir (get_exec_env()->_virtual_dir)
+#endif
+
static struct kobject *virtual_device_parent(struct device *dev)
{
- static struct kobject *virtual_dir = NULL;
-
if (!virtual_dir)
virtual_dir = kobject_create_and_add("virtual",
- &devices_kset->kobj);
+ &ve_devices_kset->kobj);
return virtual_dir;
}
-static struct kobject *get_device_parent(struct device *dev,
+static struct kobject *get_device_parent_nodep(struct device *dev,
struct device *parent)
{
int retval;
@@ -658,7 +670,7 @@ static struct kobject *get_device_parent(struct device *dev,
return NULL;
}
-static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
+static void cleanup_glue_dir_nodep(struct device *dev, struct kobject *glue_dir)
{
/* see if we live in a "glue" directory */
if (!glue_dir || !dev->class ||
@@ -668,11 +680,36 @@ static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
kobject_put(glue_dir);
}
+static void cleanup_device_parent_nodep(struct device *dev)
+{
+ cleanup_glue_dir_nodep(dev, dev->kobj.parent);
+}
+
+static struct kobject *get_device_parent(struct device *dev,
+ struct device *parent)
+{
+ if (sysfs_deprecated)
+ return get_device_parent_dep(dev, parent);
+ else
+ return get_device_parent_nodep(dev, parent);
+}
+
+static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
+{
+ if (sysfs_deprecated)
+ cleanup_glue_dir_dep(dev, glue_dir);
+ else
+ cleanup_glue_dir_nodep(dev, glue_dir);
+}
+
static void cleanup_device_parent(struct device *dev)
{
- cleanup_glue_dir(dev, dev->kobj.parent);
+ if (sysfs_deprecated)
+ cleanup_device_parent_dep(dev);
+ else
+ cleanup_device_parent_nodep(dev);
}
-#endif
+
static void setup_parent(struct device *dev, struct device *parent)
{
@@ -695,7 +732,9 @@ static int device_add_class_symlinks(struct device *dev)
if (error)
goto out;
-#ifdef CONFIG_SYSFS_DEPRECATED
+ if (!sysfs_deprecated)
+ goto nodep;
+
/* stacked class devices need a symlink in the class directory */
if (dev->kobj.parent != &dev->class->p->class_subsys.kobj &&
device_is_not_partition(dev)) {
@@ -720,7 +759,7 @@ static int device_add_class_symlinks(struct device *dev)
&parent->kobj,
"device");
if (error)
- goto out_busid;
+ goto out_busid_dep;
class_name = make_class_name(dev->class->name,
&dev->kobj);
@@ -736,12 +775,14 @@ static int device_add_class_symlinks(struct device *dev)
out_device:
if (dev->parent && device_is_not_partition(dev))
sysfs_remove_link(&dev->kobj, "device");
-out_busid:
+out_busid_dep:
if (dev->kobj.parent != &dev->class->p->class_subsys.kobj &&
device_is_not_partition(dev))
sysfs_remove_link(&dev->class->p->class_subsys.kobj,
dev_name(dev));
-#else
+ goto out_subsys;
+
+nodep:
/* link in the class directory pointing to the device */
error = sysfs_create_link(&dev->class->p->class_subsys.kobj,
&dev->kobj, dev_name(dev));
@@ -752,14 +793,12 @@ out_busid:
error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
"device");
if (error)
- goto out_busid;
+ goto out_busid_nodep;
}
return 0;
-out_busid:
+out_busid_nodep:
sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev));
-#endif
-
out_subsys:
sysfs_remove_link(&dev->kobj, "subsystem");
out:
@@ -771,7 +810,9 @@ static void device_remove_class_symlinks(struct device *dev)
if (!dev->class)
return;
-#ifdef CONFIG_SYSFS_DEPRECATED
+ if (!sysfs_deprecated)
+ goto nodep;
+
if (dev->parent && device_is_not_partition(dev)) {
char *class_name;
@@ -787,13 +828,14 @@ static void device_remove_class_symlinks(struct device *dev)
device_is_not_partition(dev))
sysfs_remove_link(&dev->class->p->class_subsys.kobj,
dev_name(dev));
-#else
+ goto done;
+
+nodep:
if (dev->parent && device_is_not_partition(dev))
sysfs_remove_link(&dev->kobj, "device");
sysfs_remove_link(&dev->class->p->class_subsys.kobj, dev_name(dev));
-#endif
-
+done:
sysfs_remove_link(&dev->kobj, "subsystem");
}
@@ -832,7 +874,7 @@ static struct kobject *device_to_dev_kobj(struct device *dev)
if (dev->class)
kobj = dev->class->dev_kobj;
else
- kobj = sysfs_dev_char_kobj;
+ kobj = ve_sysfs_dev_char_kobj;
return kobj;
}
@@ -1270,31 +1312,43 @@ struct device *device_find_child(struct device *parent, void *data,
return child;
}
-int __init devices_init(void)
+int devices_init(void)
{
- devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
- if (!devices_kset)
- return -ENOMEM;
- dev_kobj = kobject_create_and_add("dev", NULL);
- if (!dev_kobj)
+ ve_devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
+ if (!ve_devices_kset)
+ goto dev_kset_err;
+ ve_dev_kobj = kobject_create_and_add("dev", NULL);
+ if (!ve_dev_kobj)
goto dev_kobj_err;
- sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
- if (!sysfs_dev_block_kobj)
+ ve_sysfs_dev_block_kobj = kobject_create_and_add("block", ve_dev_kobj);
+ if (!ve_sysfs_dev_block_kobj)
goto block_kobj_err;
- sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
- if (!sysfs_dev_char_kobj)
+ ve_sysfs_dev_char_kobj = kobject_create_and_add("char", ve_dev_kobj);
+ if (!ve_sysfs_dev_char_kobj)
goto char_kobj_err;
return 0;
char_kobj_err:
- kobject_put(sysfs_dev_block_kobj);
+ kobject_put(ve_sysfs_dev_block_kobj);
block_kobj_err:
- kobject_put(dev_kobj);
+ kobject_put(ve_dev_kobj);
dev_kobj_err:
- kset_unregister(devices_kset);
+ kset_unregister(ve_devices_kset);
+dev_kset_err:
return -ENOMEM;
}
+EXPORT_SYMBOL_GPL(devices_init);
+
+void devices_fini(void)
+{
+ kobject_put(ve_sysfs_dev_char_kobj);
+ kobject_put(ve_sysfs_dev_block_kobj);
+ kobject_put(ve_dev_kobj);
+ kset_unregister(ve_devices_kset);
+}
+EXPORT_SYMBOL_GPL(devices_fini);
+
EXPORT_SYMBOL_GPL(device_for_each_child);
EXPORT_SYMBOL_GPL(device_find_child);
@@ -1556,10 +1610,8 @@ int device_rename(struct device *dev, char *new_name)
pr_debug("device: '%s': %s: renaming to '%s'\n", dev_name(dev),
__func__, new_name);
-#ifdef CONFIG_SYSFS_DEPRECATED
- if ((dev->class) && (dev->parent))
+ if (sysfs_deprecated && (dev->class) && (dev->parent))
old_class_name = make_class_name(dev->class->name, &dev->kobj);
-#endif
old_device_name = kstrdup(dev_name(dev), GFP_KERNEL);
if (!old_device_name) {
@@ -1571,8 +1623,7 @@ int device_rename(struct device *dev, char *new_name)
if (error)
goto out;
-#ifdef CONFIG_SYSFS_DEPRECATED
- if (old_class_name) {
+ if (sysfs_deprecated && old_class_name) {
new_class_name = make_class_name(dev->class->name, &dev->kobj);
if (new_class_name) {
error = sysfs_create_link_nowarn(&dev->parent->kobj,
@@ -1583,8 +1634,7 @@ int device_rename(struct device *dev, char *new_name)
sysfs_remove_link(&dev->parent->kobj, old_class_name);
}
}
-#else
- if (dev->class) {
+ if (!sysfs_deprecated && dev->class) {
error = sysfs_create_link_nowarn(&dev->class->p->class_subsys.kobj,
&dev->kobj, dev_name(dev));
if (error)
@@ -1592,7 +1642,6 @@ int device_rename(struct device *dev, char *new_name)
sysfs_remove_link(&dev->class->p->class_subsys.kobj,
old_device_name);
}
-#endif
out:
put_device(dev);
@@ -1610,9 +1659,11 @@ static int device_move_class_links(struct device *dev,
struct device *new_parent)
{
int error = 0;
-#ifdef CONFIG_SYSFS_DEPRECATED
char *class_name;
+ if (!sysfs_deprecated)
+ goto nodep;
+
class_name = make_class_name(dev->class->name, &dev->kobj);
if (!class_name) {
error = -ENOMEM;
@@ -1636,14 +1687,14 @@ static int device_move_class_links(struct device *dev,
out:
kfree(class_name);
return error;
-#else
+
+nodep:
if (old_parent)
sysfs_remove_link(&dev->kobj, "device");
if (new_parent)
error = sysfs_create_link(&dev->kobj, &new_parent->kobj,
"device");
return error;
-#endif
}
/**
@@ -1734,7 +1785,12 @@ void device_shutdown(void)
{
struct device *dev, *devn;
- list_for_each_entry_safe_reverse(dev, devn, &devices_kset->list,
+ if (!ve_is_super(get_exec_env())) {
+ printk("BUG: device_shutdown call from inside VE\n");
+ return;
+ }
+
+ list_for_each_entry_safe_reverse(dev, devn, &ve_devices_kset->list,
kobj.entry) {
if (dev->bus && dev->bus->shutdown) {
dev_dbg(dev, "shutdown\n");
@@ -1744,8 +1800,9 @@ void device_shutdown(void)
dev->driver->shutdown(dev);
}
}
- kobject_put(sysfs_dev_char_kobj);
- kobject_put(sysfs_dev_block_kobj);
- kobject_put(dev_kobj);
+
+ kobject_put(ve_sysfs_dev_char_kobj);
+ kobject_put(ve_sysfs_dev_block_kobj);
+ kobject_put(ve_dev_kobj);
async_synchronize_full();
}
diff --git a/drivers/base/sys.c b/drivers/base/sys.c
index 0d90390..d3bfe47 100644
--- a/drivers/base/sys.c
+++ b/drivers/base/sys.c
@@ -20,6 +20,8 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/pm.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>
@@ -494,7 +496,7 @@ EXPORT_SYMBOL_GPL(sysdev_resume);
int __init system_bus_init(void)
{
- system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj);
+ system_kset = kset_create_and_add("system", NULL, &ve_devices_kset->kobj);
if (!system_kset)
return -ENOMEM;
return 0;
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 6aad99e..e1b1536 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -458,7 +458,7 @@ config UNIX98_PTYS
config DEVPTS_MULTIPLE_INSTANCES
bool "Support multiple instances of devpts"
- depends on UNIX98_PTYS
+ depends on UNIX98_PTYS && !VE
default n
---help---
Enable support for multiple instances of devpts filesystem.
diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c
index 950837c..b25fa54 100644
--- a/drivers/char/keyboard.c
+++ b/drivers/char/keyboard.c
@@ -162,6 +162,7 @@ unsigned char kbd_sysrq_xlate[KEY_MAX + 1] =
static int sysrq_down;
static int sysrq_alt_use;
#endif
+int sysrq_key_scancode = KEY_SYSRQ;
static int sysrq_alt;
/*
@@ -1067,6 +1068,9 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode,
{
int code;
+ if (keycode == sysrq_key_scancode && sysrq_alt)
+ goto sysrq;
+
switch (keycode) {
case KEY_PAUSE:
put_queue(vc, 0xe1);
@@ -1085,6 +1089,7 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode,
break;
case KEY_SYSRQ:
+sysrq:
/*
* Real AT keyboards (that's what we're trying
* to emulate here emit 0xe0 0x2a 0xe0 0x37 when
@@ -1179,7 +1184,8 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw)
printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode);
#ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */
- if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) {
+ if ((keycode == sysrq_key_scancode || keycode == KEY_SYSRQ) &&
+ (sysrq_down || (down == 1 && sysrq_alt))) {
if (!sysrq_down) {
sysrq_down = down;
sysrq_alt_use = sysrq_alt;
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 62f282e..2dd6714 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -30,16 +30,22 @@
#include <linux/bitops.h>
#include <linux/devpts_fs.h>
+#include <bc/misc.h>
+
#include <asm/system.h>
#ifdef CONFIG_UNIX98_PTYS
-static struct tty_driver *ptm_driver;
-static struct tty_driver *pts_driver;
+struct tty_driver *ptm_driver;
+struct tty_driver *pts_driver;
+EXPORT_SYMBOL(ptm_driver);
+EXPORT_SYMBOL(pts_driver);
#endif
static void pty_close(struct tty_struct *tty, struct file *filp)
{
BUG_ON(!tty);
+
+ ub_pty_uncharge(tty);
if (tty->driver->subtype == PTY_TYPE_MASTER)
WARN_ON(tty->count > 1);
else {
@@ -58,8 +64,12 @@ static void pty_close(struct tty_struct *tty, struct file *filp)
if (tty->driver->subtype == PTY_TYPE_MASTER) {
set_bit(TTY_OTHER_CLOSED, &tty->flags);
#ifdef CONFIG_UNIX98_PTYS
- if (tty->driver == ptm_driver)
+ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ struct ve_struct *old_env;
+ old_env = set_exec_env(tty->owner_env);
devpts_pty_kill(tty->link);
+ (void)set_exec_env(old_env);
+ }
#endif
tty_vhangup(tty->link);
}
@@ -201,6 +211,10 @@ static int pty_open(struct tty_struct *tty, struct file *filp)
if (tty->link->count != 1)
goto out;
+ retval = -ENOMEM;
+ if (ub_pty_charge(tty))
+ goto out;
+
clear_bit(TTY_OTHER_CLOSED, &tty->link->flags);
set_bit(TTY_THROTTLED, &tty->flags);
retval = 0;
@@ -358,9 +372,12 @@ static const struct tty_operations slave_pty_ops_bsd = {
.resize = pty_resize
};
+struct tty_driver *pty_driver, *pty_slave_driver;
+EXPORT_SYMBOL(pty_driver);
+EXPORT_SYMBOL(pty_slave_driver);
+
static void __init legacy_pty_init(void)
{
- struct tty_driver *pty_driver, *pty_slave_driver;
if (legacy_count <= 0)
return;
@@ -645,7 +662,7 @@ static int __ptmx_open(struct inode *inode, struct file *filp)
return index;
mutex_lock(&tty_mutex);
- tty = tty_init_dev(ptm_driver, index, 1);
+ tty = tty_init_dev(get_exec_env()->ptm_driver, index, NULL, 1);
mutex_unlock(&tty_mutex);
if (IS_ERR(tty)) {
@@ -661,7 +678,7 @@ static int __ptmx_open(struct inode *inode, struct file *filp)
if (retval)
goto out1;
- retval = ptm_driver->ops->open(tty, filp);
+ retval = get_exec_env()->ptm_driver->ops->open(tty, filp);
if (!retval)
return 0;
out1:
@@ -744,6 +761,9 @@ static void __init unix98_pty_init(void)
register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0)
panic("Couldn't register /dev/ptmx driver\n");
device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx");
+#ifdef CONFIG_VE
+ get_ve0()->ptm_driver = ptm_driver;
+#endif
}
#else
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 44203ff..4288c77 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -37,7 +37,10 @@
#include <linux/vt_kern.h>
#include <linux/workqueue.h>
#include <linux/hrtimer.h>
+#include <linux/kallsyms.h>
+#include <linux/slab.h>
#include <linux/oom.h>
+#include <linux/nmi.h>
#include <asm/ptrace.h>
#include <asm/irq_regs.h>
@@ -250,8 +253,8 @@ static struct sysrq_key_op sysrq_showallcpus_op = {
static void sysrq_handle_showregs(int key, struct tty_struct *tty)
{
struct pt_regs *regs = get_irq_regs();
- if (regs)
- show_regs(regs);
+
+ nmi_show_regs(regs, 0);
perf_event_print_debug();
}
static struct sysrq_key_op sysrq_showregs_op = {
@@ -303,6 +306,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = {
static void sysrq_handle_showmem(int key, struct tty_struct *tty)
{
show_mem();
+ show_slab_info();
}
static struct sysrq_key_op sysrq_showmem_op = {
.handler = sysrq_handle_showmem,
@@ -318,7 +322,7 @@ static void send_sig_all(int sig)
{
struct task_struct *p;
- for_each_process(p) {
+ for_each_process_all(p) {
if (p->mm && !is_global_init(p))
/* Not swapper, init nor kernel thread */
force_sig(sig, p);
@@ -394,7 +398,267 @@ static struct sysrq_key_op sysrq_unrt_op = {
/* Key Operations table and lock */
static DEFINE_SPINLOCK(sysrq_key_table_lock);
-static struct sysrq_key_op *sysrq_key_table[36] = {
+#define SYSRQ_KEY_TABLE_LENGTH 37
+static struct sysrq_key_op **sysrq_key_table;
+static struct sysrq_key_op *sysrq_default_key_table[];
+
+#ifdef CONFIG_SYSRQ_DEBUG
+#define SYSRQ_NAMELEN_MAX 64
+#define SYSRQ_DUMP_LINES 32
+
+static struct sysrq_key_op *sysrq_debug_key_table[];
+static struct sysrq_key_op *sysrq_input_key_table[];
+static unsigned long *dump_address;
+static int orig_console_loglevel;
+static void (*sysrq_input_return)(char *) = NULL;
+
+static void dump_mem(void)
+{
+ unsigned long value[4];
+ mm_segment_t old_fs;
+ int line, err;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = 0;
+
+ for (line = 0; line < SYSRQ_DUMP_LINES; line++) {
+ err |= __get_user(value[0], dump_address++);
+ err |= __get_user(value[1], dump_address++);
+ err |= __get_user(value[2], dump_address++);
+ err |= __get_user(value[3], dump_address++);
+ if (err) {
+ printk("Invalid address %p\n", dump_address - 4);
+ break;
+ }
+#if BITS_PER_LONG == 32
+ printk("0x%p: %08lx %08lx %08lx %08lx\n",
+ dump_address - 4,
+ value[0], value[1], value[2], value[3]);
+#else
+ printk("0x%p: %016lx %016lx %016lx %016lx\n",
+ dump_address - 4,
+ value[0], value[1], value[2], value[3]);
+#endif
+ }
+ set_fs(old_fs);
+}
+
+static void write_mem(unsigned long val)
+{
+ mm_segment_t old_fs;
+ unsigned long old_val;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ if (__get_user(old_val, dump_address)) {
+ printk("Invalid address %p\n", dump_address);
+ goto out;
+ }
+
+#if BITS_PER_LONG == 32
+ printk("Changing [%p] from %08lx to %08lx\n",
+ dump_address, old_val, val);
+#else
+ printk("Changing [%p] from %016lx to %016lx\n",
+ dump_address, old_val, val);
+#endif
+ __put_user(val, dump_address);
+out:
+ set_fs(old_fs);
+}
+
+static void handle_read(int key, struct tty_struct *tty)
+{
+ static int pos;
+ static int upper_case;
+ static char str[SYSRQ_NAMELEN_MAX];
+
+ if (key == 0) {
+ /* actually 0 is not shift only... */
+ upper_case = 1;
+ return;
+ }
+
+ if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) {
+ /* enter */
+ sysrq_key_table = sysrq_debug_key_table;
+ str[pos] = '\0';
+ pos = upper_case = 0;
+ printk("\n");
+ if (sysrq_input_return == NULL)
+ printk("No return handler!!!\n");
+ else
+ sysrq_input_return(str);
+ return;
+ };
+
+	/* check for allowed symbols */
+ if (key == '-') {
+ if (upper_case)
+ key = '_';
+ goto correct;
+ };
+ if (key >= 'a' && key <= 'z') {
+ if (upper_case)
+ key = key - 'a' + 'A';
+ goto correct;
+ };
+ if (key >= '0' && key <= '9')
+ goto correct;
+
+ upper_case = 0;
+ return;
+
+correct:
+ str[pos] = key;
+ printk("%c", (char)key);
+ pos++;
+ upper_case = 0;
+}
+
+static struct sysrq_key_op input_read = {
+ .handler = handle_read,
+ .help_msg = "",
+ .action_msg = NULL,
+};
+
+static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+ [0 ... SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read,
+};
+
+static void return_dump_mem(char *str)
+{
+ unsigned long address;
+ char *end;
+
+ address = simple_strtoul(str, &end, 0);
+ if (*end != '\0') {
+ printk("Bad address [%s]\n", str);
+ return;
+ }
+
+ dump_address = (unsigned long *)address;
+ dump_mem();
+}
+
+static void handle_dump_mem(int key, struct tty_struct *tty)
+{
+ sysrq_input_return = return_dump_mem;
+ sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_dump_mem = {
+ .handler = handle_dump_mem,
+ .help_msg = "Dump",
+ .action_msg = "Enter address:",
+};
+
+static void return_resolve(char *str)
+{
+ unsigned long address;
+
+ address = kallsyms_lookup_name(str);
+ printk("%s : %lx\n", str, address);
+ if (address) {
+ dump_address = (unsigned long *)address;
+ printk("Now you can dump it via X\n");
+ }
+}
+
+static void handle_resolve(int key, struct tty_struct *tty)
+{
+ sysrq_input_return = return_resolve;
+ sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_resolve = {
+ .handler = handle_resolve,
+ .help_msg = "Resolve",
+ .action_msg = "Enter symbol name:",
+};
+
+static void return_write_mem(char *str)
+{
+ unsigned long address;
+ unsigned long value;
+ char *end;
+
+ address = simple_strtoul(str, &end, 0);
+ if (*end != '-') {
+ printk("Bad address in %s\n", str);
+ return;
+ }
+ value = simple_strtoul(end + 1, &end, 0);
+ if (*end != '\0') {
+ printk("Bad value in %s\n", str);
+ return;
+ }
+
+ dump_address = (unsigned long *)address;
+ write_mem(value);
+}
+
+static void handle_write_mem(int key, struct tty_struct *tty)
+{
+ sysrq_input_return = return_write_mem;
+ sysrq_key_table = sysrq_input_key_table;
+}
+
+static struct sysrq_key_op debug_write_mem = {
+ .handler = handle_write_mem,
+ .help_msg = "Writemem",
+ .action_msg = "Enter address-value:",
+};
+
+static void handle_next(int key, struct tty_struct *tty)
+{
+ dump_mem();
+}
+
+static struct sysrq_key_op debug_next = {
+ .handler = handle_next,
+ .help_msg = "neXt",
+ .action_msg = "continuing",
+};
+
+static void handle_quit(int key, struct tty_struct *tty)
+{
+ sysrq_key_table = sysrq_default_key_table;
+ console_loglevel = orig_console_loglevel;
+}
+
+static struct sysrq_key_op debug_quit = {
+ .handler = handle_quit,
+ .help_msg = "Quit",
+ .action_msg = "Thank you for using debugger",
+};
+
+static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+ [13] = &debug_dump_mem, /* d */
+ [26] = &debug_quit, /* q */
+ [27] = &debug_resolve, /* r */
+ [32] = &debug_write_mem, /* w */
+ [33] = &debug_next, /* x */
+};
+
+static void sysrq_handle_debug(int key, struct tty_struct *tty)
+{
+ orig_console_loglevel = console_loglevel;
+ console_loglevel = 8;
+ sysrq_key_table = sysrq_debug_key_table;
+	printk("Welcome to sysrq debugging mode\n"
+ "Press H for help\n");
+}
+
+static struct sysrq_key_op sysrq_debug_op = {
+ .handler = sysrq_handle_debug,
+ .help_msg = "debuG",
+ .action_msg = "Select desired action",
+};
+#endif
+
+static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
&sysrq_loglevel_op, /* 0 */
&sysrq_loglevel_op, /* 1 */
&sysrq_loglevel_op, /* 2 */
@@ -417,7 +681,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
&sysrq_term_op, /* e */
&sysrq_moom_op, /* f */
/* g: May be registered for the kernel debugger */
+#ifdef CONFIG_SYSRQ_DEBUG
+ &sysrq_debug_op, /* g */
+#else
NULL, /* g */
+#endif
NULL, /* h - reserved for help */
&sysrq_kill_op, /* i */
#ifdef CONFIG_BLOCK
@@ -449,8 +717,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
/* y: May be registered on sparc64 for global register dump */
NULL, /* y */
&sysrq_ftrace_dump_op, /* z */
+ NULL, /* for debugger */
};
+static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table;
+
/* key2index calculation, -1 on invalid index */
static int sysrq_key_table_key2index(int key)
{
@@ -460,6 +731,10 @@ static int sysrq_key_table_key2index(int key)
retval = key - '0';
else if ((key >= 'a') && (key <= 'z'))
retval = key + 10 - 'a';
+#ifdef CONFIG_SYSRQ_DEBUG
+ else if (key == 0 || key == 0x0d || key == '-')
+ retval = SYSRQ_KEY_TABLE_LENGTH - 1;
+#endif
else
retval = -1;
return retval;
@@ -470,21 +745,21 @@ static int sysrq_key_table_key2index(int key)
*/
struct sysrq_key_op *__sysrq_get_key_op(int key)
{
- struct sysrq_key_op *op_p = NULL;
- int i;
+ struct sysrq_key_op *op_p = NULL;
+ int i;
i = sysrq_key_table_key2index(key);
if (i != -1)
- op_p = sysrq_key_table[i];
- return op_p;
+ op_p = sysrq_key_table[i];
+ return op_p;
}
static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p)
{
- int i = sysrq_key_table_key2index(key);
+ int i = sysrq_key_table_key2index(key);
- if (i != -1)
- sysrq_key_table[i] = op_p;
+ if (i != -1)
+ sysrq_key_table[i] = op_p;
}
/*
@@ -507,25 +782,25 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask)
*/
orig_log_level = console_loglevel;
console_loglevel = 7;
- printk(KERN_INFO "SysRq : ");
- op_p = __sysrq_get_key_op(key);
- if (op_p) {
+ op_p = __sysrq_get_key_op(key);
+ if (op_p) {
/*
* Should we check for enabled operations (/proc/sysrq-trigger
* should not) and is the invoked operation enabled?
*/
if (!check_mask || sysrq_on_mask(op_p->enable_mask)) {
- printk("%s\n", op_p->action_msg);
+ if (op_p->action_msg)
+ printk("%s\n", op_p->action_msg);
console_loglevel = orig_log_level;
op_p->handler(key, tty);
} else {
printk("This sysrq operation is disabled.\n");
}
} else {
- printk("HELP : ");
+ printk("SysRq HELP : ");
/* Only print the help msg once per handler */
- for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) {
+ for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) {
if (sysrq_key_table[i]) {
int j;
@@ -555,7 +830,7 @@ void handle_sysrq(int key, struct tty_struct *tty)
EXPORT_SYMBOL(handle_sysrq);
static int __sysrq_swap_key_ops(int key, struct sysrq_key_op *insert_op_p,
- struct sysrq_key_op *remove_op_p)
+ struct sysrq_key_op *remove_op_p)
{
int retval;
@@ -591,12 +866,29 @@ EXPORT_SYMBOL(unregister_sysrq_key);
static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
+ struct ve_struct *cur = get_exec_env();
+ static int pnum = 10;
+
if (count) {
- char c;
+ int i, cnt;
+ char c[32];
- if (get_user(c, buf))
+ cnt = min(count, sizeof(c));
+ if (copy_from_user(c, buf, cnt))
return -EFAULT;
- __handle_sysrq(c, NULL, 0);
+
+
+ for (i = 0; i < cnt && c[i] != '\n'; i++) {
+ if (!ve_is_super(cur)) {
+ if (!pnum)
+ continue;
+ printk("SysRq: CT#%u sent '%c' magic key.\n",
+ cur->veid, c[i]);
+ pnum--;
+ continue;
+ }
+ __handle_sysrq(c[i], NULL, 0);
+ }
}
return count;
}
@@ -607,7 +899,7 @@ static const struct file_operations proc_sysrq_trigger_operations = {
static int __init sysrq_init(void)
{
- proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations);
+ proc_create("sysrq-trigger", S_IWUSR, &glob_proc_root, &proc_sysrq_trigger_operations);
return 0;
}
module_init(sysrq_init);
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 53ffcfc..2571f59 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -96,6 +96,8 @@
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/seq_file.h>
+#include <linux/nsproxy.h>
+#include <linux/ve.h>
#include <linux/uaccess.h>
#include <asm/system.h>
@@ -106,6 +108,7 @@
#include <linux/kmod.h>
#include <linux/nsproxy.h>
+#include <bc/kmem.h>
#undef TTY_DEBUG_HANGUP
@@ -130,6 +133,7 @@ EXPORT_SYMBOL(tty_std_termios);
into this file */
LIST_HEAD(tty_drivers); /* linked list of tty drivers */
+EXPORT_SYMBOL(tty_drivers);
/* Mutex to protect creating and releasing a tty. This is shared with
vt.c for deeply disgusting hack reasons */
@@ -166,7 +170,7 @@ static void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty);
struct tty_struct *alloc_tty_struct(void)
{
- return kzalloc(sizeof(struct tty_struct), GFP_KERNEL);
+ return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_UBC);
}
/**
@@ -274,9 +278,29 @@ static struct tty_driver *get_tty_driver(dev_t device, int *index)
if (device < base || device >= base + p->num)
continue;
*index = device - base;
- return tty_driver_kref_get(p);
+#ifdef CONFIG_VE
+ if (in_interrupt())
+ goto found;
+ if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR
+#ifdef CONFIG_UNIX98_PTYS
+ && (p->major<UNIX98_PTY_MASTER_MAJOR ||
+ p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) &&
+ (p->major<UNIX98_PTY_SLAVE_MAJOR ||
+ p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1)
+#endif
+ )
+ goto found;
+ if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env()))
+ goto found;
+ if (!ve_accessible_strict(p->owner_env, get_exec_env()))
+ continue;
+#endif
+ goto found;
}
return NULL;
+
+found:
+ return tty_driver_kref_get(p);
}
#ifdef CONFIG_CONSOLE_POLL
@@ -1169,7 +1193,7 @@ int tty_init_termios(struct tty_struct *tty)
tp = tty->driver->termios[idx];
if (tp == NULL) {
- tp = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL);
+ tp = kzalloc(sizeof(struct ktermios[2]), GFP_KERNEL_UBC);
if (tp == NULL)
return -ENOMEM;
memcpy(tp, &tty->driver->init_termios,
@@ -1297,7 +1321,7 @@ static int tty_reopen(struct tty_struct *tty)
*/
struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
- int first_ok)
+ struct tty_struct *i_tty, int first_ok)
{
struct tty_struct *tty;
int retval;
@@ -1707,7 +1731,7 @@ void tty_release_dev(struct file *filp)
static int __tty_open(struct inode *inode, struct file *filp)
{
- struct tty_struct *tty = NULL;
+ struct tty_struct *tty = NULL, *c_tty = NULL;
int noctty, retval;
struct tty_driver *driver;
int index;
@@ -1731,6 +1755,7 @@ retry_open:
}
driver = tty_driver_kref_get(tty->driver);
index = tty->index;
+ c_tty = tty;
filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
/* noctty = 1; */
/* FIXME: Should we take a driver reference ? */
@@ -1740,6 +1765,12 @@ retry_open:
#ifdef CONFIG_VT
if (device == MKDEV(TTY_MAJOR, 0)) {
extern struct tty_driver *console_driver;
+#ifdef CONFIG_VE
+ if (!ve_is_super(get_exec_env())) {
+ mutex_unlock(&tty_mutex);
+ return -ENODEV;
+ }
+#endif
driver = tty_driver_kref_get(console_driver);
index = fg_console;
noctty = 1;
@@ -1748,6 +1779,12 @@ retry_open:
#endif
if (device == MKDEV(TTYAUX_MAJOR, 1)) {
struct tty_driver *console_driver = console_device(&index);
+#ifdef CONFIG_VE
+ if (!ve_is_super(get_exec_env())) {
+ mutex_unlock(&tty_mutex);
+ return -ENODEV;
+ }
+#endif
if (console_driver) {
driver = tty_driver_kref_get(console_driver);
if (driver) {
@@ -1782,7 +1819,7 @@ got_driver:
if (retval)
tty = ERR_PTR(retval);
} else
- tty = tty_init_dev(driver, index, 0);
+ tty = tty_init_dev(driver, index, c_tty, 0);
mutex_unlock(&tty_mutex);
tty_driver_kref_put(driver);
@@ -2078,6 +2115,8 @@ static int tioccons(struct file *file)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!ve_is_super(get_exec_env()))
+ return -EACCES;
if (file->f_op->write == redirected_tty_write) {
struct file *f;
spin_lock(&redirect_lock);
@@ -2658,7 +2697,7 @@ void __do_SAK(struct tty_struct *tty)
/* Now kill any processes that happen to have the
* tty open.
*/
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
if (p->signal->tty == tty) {
printk(KERN_NOTICE "SAK: killed process %d"
" (%s): task_session(p)==tty->session\n",
@@ -2690,7 +2729,7 @@ void __do_SAK(struct tty_struct *tty)
spin_unlock(&p->files->file_lock);
}
task_unlock(p);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
read_unlock(&tasklist_lock);
#endif
}
@@ -2757,6 +2796,7 @@ void initialize_tty_struct(struct tty_struct *tty,
tty->ops = driver->ops;
tty->index = idx;
tty_line_name(driver, idx, tty->name);
+ tty->owner_env = driver->owner_env;
}
/**
@@ -2849,6 +2889,7 @@ struct tty_driver *alloc_tty_driver(int lines)
driver->magic = TTY_DRIVER_MAGIC;
driver->num = lines;
/* later we'll move allocation of tables here */
+ driver->owner_env = get_ve(get_exec_env());
}
return driver;
}
@@ -2883,6 +2924,7 @@ static void destruct_tty_driver(struct kref *kref)
kfree(p);
cdev_del(&driver->cdev);
}
+ put_ve(driver->owner_env);
kfree(driver);
}
@@ -2957,6 +2999,7 @@ int tty_register_driver(struct tty_driver *driver)
}
mutex_lock(&tty_mutex);
+ driver->owner_env = get_exec_env();
list_add(&driver->tty_drivers, &tty_drivers);
mutex_unlock(&tty_mutex);
@@ -3130,3 +3173,43 @@ static int __init tty_init(void)
return 0;
}
module_init(tty_init);
+
+#ifdef CONFIG_UNIX98_PTYS
+int init_ve_tty_class(void)
+{
+ struct class * ve_tty_class;
+ struct device * ve_ptmx_dev_class;
+
+ ve_tty_class = class_create(THIS_MODULE, "tty");
+ if (IS_ERR(ve_tty_class))
+ return -ENOMEM;
+
+ ve_ptmx_dev_class = device_create(ve_tty_class, NULL,
+ MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx");
+ if (IS_ERR(ve_ptmx_dev_class)) {
+ class_destroy(ve_tty_class);
+ return PTR_ERR(ve_ptmx_dev_class);
+ }
+
+ get_exec_env()->tty_class = ve_tty_class;
+ return 0;
+}
+
+void fini_ve_tty_class(void)
+{
+ struct class *ve_tty_class = get_exec_env()->tty_class;
+
+ device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2));
+ class_destroy(ve_tty_class);
+}
+#else
+int init_ve_tty_class(void)
+{
+ return 0;
+}
+void fini_ve_tty_class(void)
+{
+}
+#endif
+EXPORT_SYMBOL(init_ve_tty_class);
+EXPORT_SYMBOL(fini_ve_tty_class);
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 246323d..3e91ef9 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -41,6 +41,10 @@ ucc_geth_driver-objs := ucc_geth.o ucc_geth_ethtool.o
obj-$(CONFIG_FSL_PQ_MDIO) += fsl_pq_mdio.o
+obj-$(CONFIG_VE_NETDEV) += vznetdev.o
+vznetdev-objs := open_vznet.o venet_core.o
+obj-$(CONFIG_VE_ETHDEV) += vzethdev.o
+
#
# link order important here
#
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 1bc654a..3ecee33 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -75,6 +75,12 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
struct pcpu_lstats *pcpu_lstats, *lb_stats;
int len;
+#ifdef CONFIG_VE
+ if (unlikely(get_exec_env()->disable_net)) {
+ kfree_skb(skb);
+ return 0;
+ }
+#endif
skb_orphan(skb);
skb->protocol = eth_type_trans(skb, dev);
@@ -153,10 +159,16 @@ static void loopback_dev_free(struct net_device *dev)
free_netdev(dev);
}
+static void loopback_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx)
+{
+}
+
static const struct net_device_ops loopback_ops = {
.ndo_init = loopback_dev_init,
.ndo_start_xmit= loopback_xmit,
.ndo_get_stats = loopback_get_stats,
+ .ndo_cpt = loopback_cpt,
};
/*
@@ -177,7 +189,8 @@ static void loopback_setup(struct net_device *dev)
| NETIF_F_NO_CSUM
| NETIF_F_HIGHDMA
| NETIF_F_LLTX
- | NETIF_F_NETNS_LOCAL;
+ | NETIF_F_NETNS_LOCAL
+ | NETIF_F_VIRTUAL;
dev->ethtool_ops = &loopback_ethtool_ops;
dev->header_ops = &eth_header_ops;
dev->netdev_ops = &loopback_ops;
diff --git a/drivers/net/open_vznet.c b/drivers/net/open_vznet.c
new file mode 100644
index 0000000..79bf640
--- /dev/null
+++ b/drivers/net/open_vznet.c
@@ -0,0 +1,244 @@
+/*
+ * open_vznet.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * Virtual Networking device used to change VE ownership on packets
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <linux/inet.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <linux/venet.h>
+
+void veip_stop(struct ve_struct *ve)
+{
+ struct list_head *p, *tmp;
+
+ write_lock_irq(&veip_hash_lock);
+ if (ve->veip == NULL)
+ goto unlock;
+ list_for_each_safe(p, tmp, &ve->veip->ip_lh) {
+ struct ip_entry_struct *ptr;
+ ptr = list_entry(p, struct ip_entry_struct, ve_list);
+ ptr->active_env = NULL;
+ list_del(&ptr->ve_list);
+ list_del(&ptr->ip_hash);
+ kfree(ptr);
+ }
+ veip_put(ve->veip);
+ ve->veip = NULL;
+ if (!ve_is_super(ve))
+ module_put(THIS_MODULE);
+unlock:
+ write_unlock_irq(&veip_hash_lock);
+}
+
+int veip_start(struct ve_struct *ve)
+{
+ int err, get;
+
+ err = 0;
+ write_lock_irq(&veip_hash_lock);
+ get = ve->veip == NULL;
+ ve->veip = veip_findcreate(ve->veid);
+ if (ve->veip == NULL)
+ err = -ENOMEM;
+ write_unlock_irq(&veip_hash_lock);
+ if (err == 0 && get && !ve_is_super(ve))
+ __module_get(THIS_MODULE);
+ return err;
+}
+
+int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+ struct ip_entry_struct *entry, *found;
+ int err;
+
+ entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ if (ve->veip == NULL) {
+ /* This can happen if we load venet AFTER ve was started */
+ err = veip_start(ve);
+ if (err < 0)
+ goto out;
+ }
+
+ write_lock_irq(&veip_hash_lock);
+ err = -EADDRINUSE;
+ found = venet_entry_lookup(addr);
+ if (found != NULL)
+ goto out_unlock;
+
+ entry->active_env = ve;
+ entry->addr = *addr;
+ ip_entry_hash(entry, ve->veip);
+
+ err = 0;
+ entry = NULL;
+out_unlock:
+ write_unlock_irq(&veip_hash_lock);
+out:
+ if (entry != NULL)
+ kfree(entry);
+ return err;
+}
+
+int veip_entry_del(envid_t veid, struct ve_addr_struct *addr)
+{
+ struct ip_entry_struct *found;
+ int err;
+
+ err = -EADDRNOTAVAIL;
+ write_lock_irq(&veip_hash_lock);
+ found = venet_entry_lookup(addr);
+ if (found == NULL)
+ goto out;
+ if (found->active_env->veid != veid)
+ goto out;
+
+ err = 0;
+ found->active_env = NULL;
+
+ list_del(&found->ip_hash);
+ list_del(&found->ve_list);
+ kfree(found);
+out:
+ write_unlock_irq(&veip_hash_lock);
+ return err;
+}
+
+static int skb_extract_addr(struct sk_buff *skb,
+ struct ve_addr_struct *addr, int dir)
+{
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ addr->family = AF_INET;
+ addr->key[0] = 0;
+ addr->key[1] = 0;
+ addr->key[2] = 0;
+ addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr);
+ return 0;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case __constant_htons(ETH_P_IPV6):
+ addr->family = AF_INET6;
+ memcpy(&addr->key, dir ?
+ ipv6_hdr(skb)->daddr.s6_addr32 :
+ ipv6_hdr(skb)->saddr.s6_addr32,
+ sizeof(addr->key));
+ return 0;
+#endif
+ }
+
+ return -EAFNOSUPPORT;
+}
+
+static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir)
+{
+ struct ip_entry_struct *entry;
+ struct ve_addr_struct addr;
+
+ if (skb_extract_addr(skb, &addr, dir) < 0)
+ return NULL;
+
+ entry = venet_entry_lookup(&addr);
+ if (entry == NULL)
+ return NULL;
+
+ return entry->active_env;
+}
+
+int venet_change_skb_owner(struct sk_buff *skb)
+{
+ struct ve_struct *ve, *ve_old;
+
+ ve_old = skb->owner_env;
+
+ read_lock(&veip_hash_lock);
+ if (!ve_is_super(ve_old)) {
+ /* from VE to host */
+ ve = venet_find_ve(skb, 0);
+ if (ve == NULL)
+ goto out_drop;
+ if (!ve_accessible_strict(ve, ve_old))
+ goto out_source;
+ skb->owner_env = get_ve0();
+ } else {
+ /* from host to VE */
+ ve = venet_find_ve(skb, 1);
+ if (ve == NULL)
+ goto out_drop;
+ skb->owner_env = ve;
+ }
+ read_unlock(&veip_hash_lock);
+
+ return 0;
+
+out_drop:
+ read_unlock(&veip_hash_lock);
+ return -ESRCH;
+
+out_source:
+ read_unlock(&veip_hash_lock);
+ if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) {
+ printk(KERN_WARNING "Dropped packet, source wrong "
+ "veid=%u src-IP=%u.%u.%u.%u "
+ "dst-IP=%u.%u.%u.%u\n",
+ skb->owner_env->veid,
+ NIPQUAD(ip_hdr(skb)->saddr),
+ NIPQUAD(ip_hdr(skb)->daddr));
+ }
+ return -EACCES;
+}
+
+#ifdef CONFIG_PROC_FS
+int veip_seq_show(struct seq_file *m, void *v)
+{
+ struct list_head *p;
+ struct ip_entry_struct *entry;
+ char s[40];
+
+ p = (struct list_head *)v;
+ if (p == ip_entry_hash_table) {
+ seq_puts(m, "Version: 2.5\n");
+ return 0;
+ }
+ entry = list_entry(p, struct ip_entry_struct, ip_hash);
+ veaddr_print(s, sizeof(s), &entry->addr);
+ seq_printf(m, "%39s %10u\n", s, 0);
+ return 0;
+}
+#endif
+
+__exit void veip_cleanup(void)
+{
+ int i;
+
+ write_lock_irq(&veip_hash_lock);
+ for (i = 0; i < VEIP_HASH_SZ; i++)
+ while (!list_empty(ip_entry_hash_table + i)) {
+ struct ip_entry_struct *entry;
+
+ entry = list_first_entry(ip_entry_hash_table + i,
+ struct ip_entry_struct, ip_hash);
+ list_del(&entry->ip_hash);
+ kfree(entry);
+ }
+ write_unlock_irq(&veip_hash_lock);
+}
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo Virtual Network Device");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index 2559991..19d17f0 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -77,6 +77,7 @@
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/vzcalluser.h>
#include <linux/nsproxy.h>
#include <net/net_namespace.h>
@@ -547,6 +548,9 @@ static int pppoe_create(struct net *net, struct socket *sock)
{
struct sock *sk;
+ if (!(get_exec_env()->features & VE_FEATURE_PPP))
+ return -EACCES;
+
sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
if (!sk)
return -ENOMEM;
diff --git a/drivers/net/pppol2tp.c b/drivers/net/pppol2tp.c
index 5910df6..0b64d3d 100644
--- a/drivers/net/pppol2tp.c
+++ b/drivers/net/pppol2tp.c
@@ -97,6 +97,7 @@
#include <net/ip.h>
#include <net/udp.h>
#include <net/xfrm.h>
+#include <linux/vzcalluser.h>
#include <asm/byteorder.h>
#include <asm/atomic.h>
@@ -1588,6 +1589,9 @@ static int pppol2tp_create(struct net *net, struct socket *sock)
int error = -ENOMEM;
struct sock *sk;
+ if (!(get_exec_env()->features & VE_FEATURE_PPP))
+ return -EACCES;
+
sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
if (!sk)
goto out;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 4fdfa2a..a052759 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -61,6 +61,7 @@
#include <linux/crc32.h>
#include <linux/nsproxy.h>
#include <linux/virtio_net.h>
+#include <linux/file.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
@@ -69,6 +70,9 @@
#include <asm/system.h>
#include <asm/uaccess.h>
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
/* Uncomment to enable debugging */
/* #define TUN_DEBUG 1 */
@@ -93,6 +97,7 @@ struct tun_file {
atomic_t count;
struct tun_struct *tun;
struct net *net;
+ struct file *file;
};
struct tun_sock;
@@ -124,6 +129,15 @@ static inline struct tun_sock *tun_sk(struct sock *sk)
return container_of(sk, struct tun_sock, sk);
}
+static void __tun_attach(struct tun_struct *tun, struct tun_file *tfile)
+{
+ tfile->tun = tun;
+ tun->tfile = tfile;
+ dev_hold(tun->dev);
+ sock_hold(tun->socket.sk);
+ atomic_inc(&tfile->count);
+}
+
static int tun_attach(struct tun_struct *tun, struct file *file)
{
struct tun_file *tfile = file->private_data;
@@ -142,12 +156,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file)
goto out;
err = 0;
- tfile->tun = tun;
- tun->tfile = tfile;
- dev_hold(tun->dev);
- sock_hold(tun->socket.sk);
- atomic_inc(&tfile->count);
-
+ __tun_attach(tun, tfile);
out:
netif_tx_unlock_bh(tun->dev);
return err;
@@ -418,12 +427,16 @@ tun_net_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
+static void tun_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context * ctx);
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
.ndo_stop = tun_net_close,
.ndo_start_xmit = tun_net_xmit,
.ndo_change_mtu = tun_net_change_mtu,
+ .ndo_cpt = tun_cpt,
};
static const struct net_device_ops tap_netdev_ops = {
@@ -435,6 +448,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_set_multicast_list = tun_net_mclist,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
+ .ndo_cpt = tun_cpt,
};
/* Initialize net device. */
@@ -513,12 +527,8 @@ static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
struct sk_buff *skb;
int err;
- /* Under a page? Don't bother with paged skb. */
- if (prepad + len < PAGE_SIZE || !linear)
- linear = len;
-
- skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
- &err);
+ linear = len;
+ skb = sock_alloc_send_skb(sk, prepad + linear, noblock, &err);
if (!skb)
return ERR_PTR(err);
@@ -819,6 +829,7 @@ static void tun_setup(struct net_device *dev)
dev->ethtool_ops = &tun_ethtool_ops;
dev->destructor = tun_free_netdev;
+ dev->features |= NETIF_F_VIRTUAL;
}
/* Trivial set of netlink ops to allow deleting tun or tap
@@ -864,6 +875,29 @@ static struct proto tun_proto = {
.obj_size = sizeof(struct tun_sock),
};
+static int tun_sk_alloc_init(struct net *net, struct tun_struct *tun,
+ struct sock **psk)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
+ if (!sk)
+ return -ENOMEM;
+
+ init_waitqueue_head(&tun->socket.wait);
+ sock_init_data(&tun->socket, sk);
+ sk->sk_write_space = tun_sock_write_space;
+ sk->sk_sndbuf = INT_MAX;
+
+ container_of(sk, struct tun_sock, sk)->tun = tun;
+
+ security_tun_dev_post_create(sk);
+
+ *psk = sk;
+ return 0;
+
+}
+
static int tun_flags(struct tun_struct *tun)
{
int flags = 0;
@@ -932,7 +966,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
if (((tun->owner != -1 && cred->euid != tun->owner) ||
(tun->group != -1 && !in_egroup_p(tun->group))) &&
- !capable(CAP_NET_ADMIN))
+ !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
return -EPERM;
err = security_tun_dev_attach(tun->socket.sk);
if (err < 0)
@@ -946,7 +980,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
char *name;
unsigned long flags = 0;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
return -EPERM;
err = security_tun_dev_create();
if (err < 0)
@@ -980,20 +1014,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->flags = flags;
tun->txflt.count = 0;
- err = -ENOMEM;
- sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
- if (!sk)
+ err = tun_sk_alloc_init(net, tun, &sk);
+ if (err)
goto err_free_dev;
- init_waitqueue_head(&tun->socket.wait);
- sock_init_data(&tun->socket, sk);
- sk->sk_write_space = tun_sock_write_space;
- sk->sk_sndbuf = INT_MAX;
-
- container_of(sk, struct tun_sock, sk)->tun = tun;
-
- security_tun_dev_post_create(sk);
-
tun_net_init(dev);
if (strchr(dev->name, '%')) {
@@ -1006,9 +1030,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
if (err < 0)
goto err_free_sk;
- if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
- device_create_file(&tun->dev->dev, &dev_attr_owner) ||
- device_create_file(&tun->dev->dev, &dev_attr_group))
+ if ((dev_net(tun->dev) == &init_net) &&
+ (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
+ device_create_file(&tun->dev->dev, &dev_attr_owner) ||
+ device_create_file(&tun->dev->dev, &dev_attr_group)))
printk(KERN_ERR "Failed to create tun sysfs files\n");
sk->sk_destruct = tun_sock_destruct;
@@ -1316,6 +1341,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->tun = NULL;
tfile->net = get_net(current->nsproxy->net_ns);
file->private_data = tfile;
+ tfile->file = file;
return 0;
}
@@ -1457,6 +1483,226 @@ static const struct ethtool_ops tun_ethtool_ops = {
.set_rx_csum = tun_set_rx_csum
};
+static void cpt_dump_tap_filter(struct tap_filter *flt,
+ struct cpt_ops *ops, struct cpt_context *ctx)
+{
+ struct cpt_tap_filter_image v;
+ loff_t saved_obj;
+
+ ops->push_object(&saved_obj, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_TAP_FILTER;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ v.cpt_count = flt->count;
+
+ BUILD_BUG_ON(sizeof(flt->mask) != sizeof(v.cpt_mask));
+ memcpy(v.cpt_mask, flt->mask, sizeof(v.cpt_mask));
+
+ BUILD_BUG_ON(sizeof(flt->addr) != sizeof(v.cpt_addr));
+ memcpy(v.cpt_addr, flt->addr, sizeof(v.cpt_addr));
+
+ ops->write(&v, sizeof(v), ctx);
+
+ ops->pop_object(&saved_obj, ctx);
+}
+
+static void tun_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context * ctx)
+{
+ struct cpt_tuntap_image v;
+ struct tun_struct *tun;
+
+ tun = netdev_priv(dev);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_TUNTAP;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ v.cpt_owner = tun->owner;
+ v.cpt_flags = tun->flags;
+
+ if (tun->tfile->file)
+ v.cpt_bindfile = ops->lookup_object(CPT_OBJ_FILE, tun->tfile->file, ctx);
+
+ v.cpt_if_flags = 0;
+ memset(v.cpt_dev_addr, 0, sizeof(v.cpt_dev_addr));
+ memset(v.cpt_chr_filter, 0, sizeof(v.cpt_chr_filter));
+ memset(v.cpt_net_filter, 0, sizeof(v.cpt_net_filter));
+
+ ops->write(&v, sizeof(v), ctx);
+
+ cpt_dump_tap_filter(&tun->txflt, ops, ctx);
+}
+
+static int rst_restore_tap_filter(loff_t start, struct cpt_tuntap_image *ti,
+ struct tap_filter *flt, struct rst_ops *ops,
+ struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_tap_filter_image fi;
+ loff_t pos;
+
+ /* disable filtering */
+ flt->count = 0;
+
+ pos = start + ti->cpt_hdrlen;
+
+ /* no tap filter image? */
+ if (pos >= start + ti->cpt_next)
+ goto convert;
+
+ err = ops->get_object(CPT_OBJ_NET_TAP_FILTER, pos,
+ &fi, sizeof(fi), ctx);
+ if (err)
+ return err;
+
+ BUILD_BUG_ON(sizeof(flt->mask) != sizeof(fi.cpt_mask));
+ memcpy(flt->mask, fi.cpt_mask, sizeof(fi.cpt_mask));
+
+ BUILD_BUG_ON(sizeof(flt->addr) != sizeof(fi.cpt_addr));
+ memcpy(flt->addr, fi.cpt_addr, sizeof(fi.cpt_addr));
+
+ flt->count = fi.cpt_count;
+
+ return 0;
+
+convert:
+ /** From OLD filtering code:
+ * Decide whether to accept this packet. This code is designed to
+ * behave identically to an Ethernet interface. Accept the packet if
+ * - we are promiscuous.
+ * - the packet is addressed to us.
+ * - the packet is broadcast.
+ * - the packet is multicast and
+ *    - we are multicast promiscuous.
+ * - we belong to the multicast group.
+ */
+
+ /* accept all, this is default if filter is untouched */
+ if (ti->cpt_if_flags & IFF_PROMISC)
+ return 0;
+
+ /* accept packets addressed to character device's hardware address */
+ BUILD_BUG_ON(sizeof(flt->addr[0]) != sizeof(ti->cpt_dev_addr));
+ memcpy(flt->addr[0], ti->cpt_dev_addr, sizeof(ti->cpt_dev_addr));
+
+ /* accept broadcast */
+ memset(flt->addr[1], ~0, sizeof(flt->addr[1]));
+
+ /* accept hashed multicast: hash function the same as in old code */
+ BUILD_BUG_ON(sizeof(flt->mask) != sizeof(ti->cpt_chr_filter));
+ memcpy(flt->mask, ti->cpt_chr_filter, sizeof(ti->cpt_chr_filter));
+
+ /* accept all multicast */
+ if (ti->cpt_if_flags & IFF_ALLMULTI)
+ memset(flt->mask, ~0, sizeof(flt->mask));
+
+ /* two exact filters: hw addr and broadcast */
+ flt->count = 2;
+
+ return 0;
+}
+
+static int tun_rst(loff_t start, struct cpt_netdev_image *di,
+ struct rst_ops *ops, struct cpt_context *ctx)
+{
+ int err = -ENODEV;
+ struct cpt_tuntap_image ti;
+ struct net_device *dev;
+ struct file *bind_file = NULL;
+ struct tun_struct *tun;
+ struct tun_file *tfile;
+ struct sock *sk;
+ loff_t pos;
+
+ pos = start + di->cpt_hdrlen;
+ err = ops->get_object(CPT_OBJ_NET_TUNTAP, pos,
+ &ti, sizeof(ti), ctx);
+ if (err)
+ return err;
+
+ if (ti.cpt_bindfile) {
+ bind_file = ops->rst_file(ti.cpt_bindfile, -1, ctx);
+ if (IS_ERR(bind_file))
+ return PTR_ERR(bind_file);
+ }
+
+ tfile = kmalloc(sizeof(*tfile), GFP_KERNEL);
+ if (!tfile)
+ goto out;
+
+ atomic_set(&tfile->count, 0);
+ tfile->tun = NULL;
+ tfile->net = get_net(current->nsproxy->net_ns);
+ tfile->file = bind_file;
+
+ err = -ENOMEM;
+ dev = alloc_netdev(sizeof(struct tun_struct), di->cpt_name, tun_setup);
+ if (!dev)
+ goto out_tf;
+
+ tun = netdev_priv(dev);
+
+ tun->dev = dev;
+ tun->owner = ti.cpt_owner;
+ tun->flags = ti.cpt_flags;
+ tun_net_init(dev);
+
+ err = tun_sk_alloc_init(current->nsproxy->net_ns, tun, &sk);
+ if (err)
+ goto out_netdev;
+
+ err = rst_restore_tap_filter(pos, &ti, &tun->txflt, ops, ctx);
+ if (err < 0)
+ goto out_sk;
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out_sk;
+
+ pos += ti.cpt_next;
+ if (pos < start + di->cpt_next) {
+ struct cpt_hwaddr_image hw;
+ /* Restore hardware address */
+ err = ops->get_object(CPT_OBJ_NET_HWADDR, pos,
+ &hw, sizeof(hw), ctx);
+ if (err)
+ goto out_unreg;
+
+ memcpy(dev->dev_addr, hw.cpt_dev_addr,
+ sizeof(hw.cpt_dev_addr));
+ }
+
+ sk->sk_destruct = tun_sock_destruct;
+ bind_file->private_data = tfile;
+ __tun_attach(tun, tfile);
+
+ fput(bind_file);
+ return 0;
+
+out_unreg:
+ unregister_netdevice(dev);
+out_sk:
+ sock_put(sk);
+out_netdev:
+ free_netdev(dev);
+out_tf:
+ put_net(tfile->net);
+ kfree(tfile);
+out:
+ fput(bind_file);
+ return err;
+}
+
+static struct netdev_rst tun_netdev_rst = {
+ .cpt_object = CPT_OBJ_NET_TUNTAP,
+ .ndo_rst = tun_rst,
+};
+
static int __init tun_init(void)
{
@@ -1476,6 +1722,8 @@ static int __init tun_init(void)
printk(KERN_ERR "tun: Can't register misc device %d\n", TUN_MINOR);
goto err_misc;
}
+
+ register_netdev_rst(&tun_netdev_rst);
return 0;
err_misc:
rtnl_link_unregister(&tun_link_ops);
@@ -1485,6 +1733,7 @@ err_linkops:
static void tun_cleanup(void)
{
+ unregister_netdev_rst(&tun_netdev_rst);
misc_deregister(&tun_miscdev);
rtnl_link_unregister(&tun_link_ops);
}
diff --git a/drivers/net/venet_core.c b/drivers/net/venet_core.c
new file mode 100644
index 0000000..317fbb0
--- /dev/null
+++ b/drivers/net/venet_core.c
@@ -0,0 +1,864 @@
+/*
+ * venet_core.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * Common part for Virtuozzo virtual network devices
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/addrconf.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/if_ether.h> /* For the statistics structure. */
+#include <linux/if_arp.h> /* For ARPHRD_ETHER */
+#include <linux/ethtool.h>
+#include <linux/venet.h>
+#include <linux/ve_proto.h>
+#include <linux/vzctl.h>
+#include <linux/vzctl_venet.h>
+
+struct list_head ip_entry_hash_table[VEIP_HASH_SZ];
+rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED;
+LIST_HEAD(veip_lh);
+
+#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1))
+
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip)
+{
+ list_add(&entry->ip_hash,
+ ip_entry_hash_table +
+ ip_entry_hash_function(entry->addr.key[3]));
+ list_add(&entry->ve_list, &veip->ip_lh);
+}
+
+void veip_put(struct veip_struct *veip)
+{
+ if (!list_empty(&veip->ip_lh))
+ return;
+ if (!list_empty(&veip->src_lh))
+ return;
+ if (!list_empty(&veip->dst_lh))
+ return;
+
+ list_del(&veip->list);
+ kfree(veip);
+}
+
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr)
+{
+ struct ip_entry_struct *entry;
+
+ list_for_each_entry (entry, ip_entry_hash_table +
+ ip_entry_hash_function(addr->key[3]), ip_hash)
+ if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+ return entry;
+ return NULL;
+}
+
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+ struct ve_addr_struct *addr)
+{
+ struct ext_entry_struct *entry;
+
+ if (ve->veip == NULL)
+ return NULL;
+
+ list_for_each_entry (entry, &ve->veip->ext_lh, list)
+ if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+ return entry;
+ return NULL;
+}
+
+int venet_ext_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+ struct ext_entry_struct *entry, *found;
+ int err;
+
+ if (ve->veip == NULL)
+ return -ENONET;
+
+ entry = kzalloc(sizeof(struct ext_entry_struct), GFP_KERNEL);
+ if (entry == NULL)
+ return -ENOMEM;
+
+ write_lock_irq(&veip_hash_lock);
+ err = -EADDRINUSE;
+ found = venet_ext_lookup(ve, addr);
+ if (found != NULL)
+ goto out_unlock;
+
+ entry->addr = *addr;
+ list_add(&entry->list, &ve->veip->ext_lh);
+ err = 0;
+ entry = NULL;
+out_unlock:
+ write_unlock_irq(&veip_hash_lock);
+ if (entry != NULL)
+ kfree(entry);
+ return err;
+}
+
+int venet_ext_del(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+ struct ext_entry_struct *found;
+ int err;
+
+ if (ve->veip == NULL)
+ return -ENONET;
+
+ err = -EADDRNOTAVAIL;
+ write_lock_irq(&veip_hash_lock);
+ found = venet_ext_lookup(ve, addr);
+ if (found == NULL)
+ goto out;
+
+ list_del(&found->list);
+ kfree(found);
+ err = 0;
+out:
+ write_unlock_irq(&veip_hash_lock);
+ return err;
+}
+
+void venet_ext_clean(struct ve_struct *ve)
+{
+ struct ext_entry_struct *entry, *tmp;
+
+ if (ve->veip == NULL)
+ return;
+
+ write_lock_irq(&veip_hash_lock);
+ list_for_each_entry_safe (entry, tmp, &ve->veip->ext_lh, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ write_unlock_irq(&veip_hash_lock);
+}
+
+struct veip_struct *veip_find(envid_t veid)
+{
+ struct veip_struct *ptr;
+
+ list_for_each_entry(ptr, &veip_lh, list) {
+ if (ptr->veid != veid)
+ continue;
+ return ptr;
+ }
+ return NULL;
+}
+
+struct veip_struct *veip_findcreate(envid_t veid)
+{
+ struct veip_struct *ptr;
+
+ ptr = veip_find(veid);
+ if (ptr != NULL)
+ return ptr;
+
+ ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC);
+ if (ptr == NULL)
+ return NULL;
+ memset(ptr, 0, sizeof(struct veip_struct));
+ INIT_LIST_HEAD(&ptr->ip_lh);
+ INIT_LIST_HEAD(&ptr->src_lh);
+ INIT_LIST_HEAD(&ptr->dst_lh);
+ INIT_LIST_HEAD(&ptr->ext_lh);
+ ptr->veid = veid;
+ list_add(&ptr->list, &veip_lh);
+ return ptr;
+}
+
+static int convert_sockaddr(struct sockaddr *addr, int addrlen,
+ struct ve_addr_struct *veaddr)
+{
+ int err;
+
+ switch (addr->sa_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin;
+
+ err = -EINVAL;
+ if (addrlen != sizeof(struct sockaddr_in))
+ break;
+
+ err = 0;
+ sin = (struct sockaddr_in *)addr;
+ veaddr->family = AF_INET;
+ veaddr->key[0] = 0;
+ veaddr->key[1] = 0;
+ veaddr->key[2] = 0;
+ veaddr->key[3] = sin->sin_addr.s_addr;
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sin;
+
+ err = -EINVAL;
+ if (addrlen != sizeof(struct sockaddr_in6))
+ break;
+
+ err = 0;
+ sin = (struct sockaddr_in6 *)addr;
+ veaddr->family = AF_INET6;
+ memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key));
+ break;
+ }
+ default:
+ err = -EAFNOSUPPORT;
+ }
+ return err;
+}
+
+int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+ struct ve_addr_struct *veaddr)
+{
+ int err;
+ char addr[MAX_SOCK_ADDR];
+
+ err = move_addr_to_kernel(uaddr, addrlen, (struct sockaddr *)&addr);
+ if (err < 0)
+ goto out;
+
+ err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr);
+out:
+ return err;
+}
+
+void veaddr_print(char *str, int len, struct ve_addr_struct *a)
+{
+ if (a->family == AF_INET)
+ snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3]));
+ else
+ snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x",
+ ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF,
+ ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF,
+ ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF,
+ ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF
+ );
+}
+
+/*
+ * Device functions
+ */
+
+static int venet_open(struct net_device *dev)
+{
+ if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE))
+ return -EBUSY;
+ return 0;
+}
+
+static int venet_close(struct net_device *master)
+{
+ if (!ve_is_super(get_exec_env()))
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+static void venet_destructor(struct net_device *dev)
+{
+ struct venet_stats *stats = (struct venet_stats *)dev->ml_priv;
+ if (stats == NULL)
+ return;
+ free_percpu(stats->real_stats);
+ kfree(stats);
+ dev->ml_priv = NULL;
+}
+
+/*
+ * The higher levels take care of making this non-reentrant (it's
+ * called with bh's disabled).
+ */
+static int venet_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_device_stats *stats;
+ struct net_device *rcv = NULL;
+ int length;
+
+ stats = venet_stats(dev, smp_processor_id());
+ if (unlikely(get_exec_env()->disable_net))
+ goto outf;
+
+ if (skb->protocol == __constant_htons(ETH_P_IP)) {
+ struct iphdr *iph;
+ iph = ip_hdr(skb);
+ if (ipv4_is_multicast(iph->daddr))
+ goto outf;
+ } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ip6h;
+ ip6h = ipv6_hdr(skb);
+ if (ipv6_addr_is_multicast(&ip6h->daddr))
+ goto outf;
+ skb_orphan(skb);
+ } else {
+ goto outf;
+ }
+
+ if (venet_change_skb_owner(skb) < 0)
+ goto outf;
+
+ if (unlikely(skb->owner_env->disable_net))
+ goto outf;
+
+ rcv = skb->owner_env->_venet_dev;
+ if (!rcv)
+ /* VE going down */
+ goto outf;
+
+ dev_hold(rcv);
+
+ if (!(rcv->flags & IFF_UP)) {
+ /* Target VE does not want to receive packets */
+ dev_put(rcv);
+ goto outf;
+ }
+
+ skb->pkt_type = PACKET_HOST;
+ skb->dev = rcv;
+
+ skb_reset_mac_header(skb);
+ memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len);
+
+ nf_reset(skb);
+ length = skb->len;
+
+ netif_rx(skb);
+
+ stats->tx_bytes += length;
+ stats->tx_packets++;
+ if (rcv) {
+ struct net_device_stats *rcv_stats;
+
+ rcv_stats = venet_stats(rcv, smp_processor_id());
+ rcv_stats->rx_bytes += length;
+ rcv_stats->rx_packets++;
+ dev_put(rcv);
+ }
+
+ return 0;
+
+outf:
+ kfree_skb(skb);
+ ++stats->tx_dropped;
+ return 0;
+}
+
+static struct net_device_stats *get_stats(struct net_device *dev)
+{
+ int i;
+ struct venet_stats *stats;
+
+ stats = (struct venet_stats *)dev->ml_priv;
+ memset(&stats->stats, 0, sizeof(struct net_device_stats));
+ for (i=0; i < NR_CPUS; i++) {
+ struct net_device_stats *dev_stats;
+
+ if (!cpu_possible(i))
+ continue;
+ dev_stats = venet_stats(dev, i);
+ stats->stats.rx_bytes += dev_stats->rx_bytes;
+ stats->stats.tx_bytes += dev_stats->tx_bytes;
+ stats->stats.rx_packets += dev_stats->rx_packets;
+ stats->stats.tx_packets += dev_stats->tx_packets;
+ }
+
+ return &stats->stats;
+}
+
+/* Initialize the rest of the LOOPBACK device. */
+int venet_init_dev(struct net_device *dev)
+{
+ struct venet_stats *stats;
+
+ stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL);
+ if (stats == NULL)
+ goto fail;
+ stats->real_stats = alloc_percpu(struct net_device_stats);
+ if (stats->real_stats == NULL)
+ goto fail_free;
+ dev->ml_priv = stats;
+
+ /*
+ * Fill in the generic fields of the device structure.
+ */
+ dev->type = ARPHRD_VOID;
+ dev->hard_header_len = ETH_HLEN;
+ dev->mtu = 1500; /* eth_mtu */
+ dev->tx_queue_len = 0;
+
+ memset(dev->broadcast, 0xFF, ETH_ALEN);
+
+ /* New-style flags. */
+ dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT;
+ return 0;
+
+fail_free:
+ kfree(stats);
+fail:
+ return -ENOMEM;
+}
+
+static const struct net_device_ops venet_netdev_ops;
+
+static int
+venet_set_op(struct net_device *dev, u32 data,
+ int (*fop)(struct net_device *, u32))
+{
+
+ struct ve_struct *ve;
+ int ret = 0;
+
+ read_lock(&ve_list_lock);
+ for_each_ve(ve) {
+ struct ve_struct *ve_old;
+
+ ve_old = set_exec_env(ve);
+ read_lock(&dev_base_lock);
+ for_each_netdev(ve->ve_netns, dev) {
+ if (dev->netdev_ops == &venet_netdev_ops)
+ ret = fop(dev, data);
+ }
+ read_unlock(&dev_base_lock);
+ set_exec_env(ve_old);
+
+ if (ret < 0)
+ break;
+ }
+ read_unlock(&ve_list_lock);
+ return ret;
+}
+
+static unsigned long common_features;
+
+static int venet_op_set_sg(struct net_device *dev, u32 data)
+{
+ if (!ve_is_super(get_exec_env()))
+ return -EPERM;
+
+ if (data)
+ common_features |= NETIF_F_SG;
+ else
+ common_features &= ~NETIF_F_SG;
+
+ return venet_set_op(dev, data, ethtool_op_set_sg);
+}
+
+static int venet_op_set_tx_csum(struct net_device *dev, u32 data)
+{
+ if (!ve_is_super(get_exec_env()))
+ return -EPERM;
+
+ if (data)
+ common_features |= NETIF_F_IP_CSUM;
+ else
+ common_features &= ~NETIF_F_IP_CSUM;
+
+ return venet_set_op(dev, data, ethtool_op_set_tx_csum);
+}
+
+static int
+venet_op_set_tso(struct net_device *dev, u32 data)
+{
+ if (!ve_is_super(get_exec_env()))
+ return -EPERM;
+
+ if (data)
+ common_features |= NETIF_F_TSO;
+ else
+ common_features &= ~NETIF_F_TSO;
+
+ return venet_set_op(dev, data, ethtool_op_set_tso);
+}
+
+#define venet_op_set_rx_csum venet_op_set_tx_csum
+
+static struct ethtool_ops venet_ethtool_ops = {
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = venet_op_set_sg,
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = venet_op_set_tx_csum,
+ .get_rx_csum = ethtool_op_get_tx_csum,
+ .set_rx_csum = venet_op_set_rx_csum,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = venet_op_set_tso,
+};
+
+static void venet_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx)
+{
+}
+
+static const struct net_device_ops venet_netdev_ops = {
+ .ndo_start_xmit = venet_xmit,
+ .ndo_get_stats = get_stats,
+ .ndo_open = venet_open,
+ .ndo_stop = venet_close,
+ .ndo_init = venet_init_dev,
+ .ndo_cpt = venet_cpt,
+};
+
+static void venet_setup(struct net_device *dev)
+{
+ /*
+ * No other features, as they are:
+	 *	- checksumming is required, and nobody else will do our job
+ */
+ dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
+ NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED;
+
+ dev->netdev_ops = &venet_netdev_ops;
+ dev->destructor = venet_destructor;
+
+ dev->features |= common_features;
+
+ SET_ETHTOOL_OPS(dev, &venet_ethtool_ops);
+}
+
+#ifdef CONFIG_PROC_FS
+static void veaddr_seq_print(struct seq_file *m, struct ve_struct *ve)
+{
+ struct ip_entry_struct *entry;
+
+ read_lock(&veip_hash_lock);
+ if (ve->veip == NULL)
+ goto unlock;
+ list_for_each_entry (entry, &ve->veip->ip_lh, ve_list) {
+ char addr[40];
+
+ if (entry->active_env == NULL)
+ continue;
+
+ veaddr_print(addr, sizeof(addr), &entry->addr);
+ if (entry->addr.family == AF_INET)
+ seq_printf(m, " %15s", addr);
+ else
+ seq_printf(m, " %39s", addr);
+ }
+unlock:
+ read_unlock(&veip_hash_lock);
+}
+
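+/* seq_file iterator over ip_entry_hash_table; the walk holds veip_hash_lock. */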
+static void *veip_seq_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t l;
+ struct list_head *p;
+ int i;
+
+ l = *pos;
+ write_lock_irq(&veip_hash_lock);
+ if (l == 0)
+ return ip_entry_hash_table;
+ for (i = 0; i < VEIP_HASH_SZ; i++) {
+ list_for_each(p, ip_entry_hash_table + i) {
+ if (--l == 0)
+ return p;
+ }
+ }
+ return NULL;
+}
+
+static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct list_head *p;
+
+ p = (struct list_head *)v;
+ while (1) {
+ p = p->next;
+ if (p < ip_entry_hash_table ||
+ p >= ip_entry_hash_table + VEIP_HASH_SZ) {
+ (*pos)++;
+ return p;
+ }
+ if (++p >= ip_entry_hash_table + VEIP_HASH_SZ)
+ return NULL;
+ }
+ return NULL;
+}
+
+static void veip_seq_stop(struct seq_file *m, void *v)
+{
+ write_unlock_irq(&veip_hash_lock);
+}
+
+static struct seq_operations veip_seq_op = {
+ .start = veip_seq_start,
+ .next = veip_seq_next,
+ .stop = veip_seq_stop,
+ .show = veip_seq_show,
+};
+
+static int veip_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &veip_seq_op);
+}
+
+static struct file_operations proc_veip_operations = {
+ .open = veip_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif
+
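+/*
+ * Back-end for the VENETCTL_VE_IP_MAP ioctl: add or delete a venet IP
+ * (or an external IP) for the VE identified by veid.
+ */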
+static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr,
+ int addrlen)
+{
+ int err;
+ struct ve_struct *ve;
+ struct ve_addr_struct addr;
+
+ err = -EPERM;
+ if (!capable_setveid())
+ goto out;
+
+ err = sockaddr_to_veaddr(uaddr, addrlen, &addr);
+ if (err < 0)
+ goto out;
+
+ switch (op)
+ {
+ case VE_IP_ADD:
+ ve = get_ve_by_id(veid);
+ err = -ESRCH;
+ if (!ve)
+ goto out;
+
+ down_read(&ve->op_sem);
+ if (ve->is_running)
+ err = veip_entry_add(ve, &addr);
+ up_read(&ve->op_sem);
+ put_ve(ve);
+ break;
+
+ case VE_IP_DEL:
+ err = veip_entry_del(veid, &addr);
+ break;
+ case VE_IP_EXT_ADD:
+ ve = get_ve_by_id(veid);
+ err = -ESRCH;
+ if (!ve)
+ goto out;
+
+ down_read(&ve->op_sem);
+ err = venet_ext_add(ve, &addr);
+ up_read(&ve->op_sem);
+ put_ve(ve);
+ break;
+ case VE_IP_EXT_DEL:
+ ve = get_ve_by_id(veid);
+ err = -ESRCH;
+ if (!ve)
+ goto out;
+
+ down_read(&ve->op_sem);
+ err = venet_ext_del(ve, &addr);
+ up_read(&ve->op_sem);
+ put_ve(ve);
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ err = -ENOTTY;
+ switch(cmd) {
+ case VENETCTL_VE_IP_MAP: {
+ struct vzctl_ve_ip_map s;
+ err = -EFAULT;
+ if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+ break;
+ err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen);
+ break;
+ }
+ }
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ switch(cmd) {
+ case VENETCTL_COMPAT_VE_IP_MAP: {
+ struct compat_vzctl_ve_ip_map cs;
+
+ err = -EFAULT;
+ if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+ break;
+
+ err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr),
+ cs.addrlen);
+ break;
+ }
+ default:
+ err = venet_ioctl(file, cmd, arg);
+ break;
+ }
+ return err;
+}
+#endif
+
+static struct vzioctlinfo venetcalls = {
+ .type = VENETCTLTYPE,
+ .ioctl = venet_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = compat_venet_ioctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
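+/* Allocate and register the venet device inside the VE's network namespace. */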
+int venet_dev_start(struct ve_struct *ve)
+{
+ struct net_device *dev_venet;
+ int err;
+
+ dev_venet = alloc_netdev(0, "venet%d", venet_setup);
+ if (!dev_venet)
+ return -ENOMEM;
+ dev_net_set(dev_venet, ve->ve_netns);
+ err = dev_alloc_name(dev_venet, dev_venet->name);
+	if (err < 0)
+ goto err;
+ if ((err = register_netdev(dev_venet)) != 0)
+ goto err;
+ ve->_venet_dev = dev_venet;
+ return 0;
+err:
+ free_netdev(dev_venet);
+ printk(KERN_ERR "VENET initialization error err=%d\n", err);
+ return err;
+}
+
+static int venet_start(void *data)
+{
+ struct ve_struct *env;
+ int err;
+
+ env = (struct ve_struct *)data;
+ if (env->veip)
+ return -EEXIST;
+
+ err = veip_start(env);
+ if (err != 0)
+ return err;
+
+ err = venet_dev_start(env);
+ if (err)
+ goto err_free;
+ return 0;
+
+err_free:
+ veip_stop(env);
+ return err;
+}
+
+static void venet_stop(void *data)
+{
+ struct ve_struct *env;
+ struct net_device *dev;
+
+ env = (struct ve_struct *)data;
+ venet_ext_clean(env);
+ veip_stop(env);
+
+ dev = env->_venet_dev;
+ if (dev == NULL)
+ return;
+
+ unregister_netdev(dev);
+ env->_venet_dev = NULL;
+ free_netdev(dev);
+}
+
+static struct ve_hook venet_ve_hook = {
+ .init = venet_start,
+ .fini = venet_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_NET,
+};
+
+__init int venet_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *de;
+#endif
+ int i, err;
+
+ if (get_ve0()->_venet_dev != NULL)
+ return -EEXIST;
+
+ for (i = 0; i < VEIP_HASH_SZ; i++)
+ INIT_LIST_HEAD(ip_entry_hash_table + i);
+
+ err = venet_start(get_ve0());
+ if (err)
+ return err;
+
+#ifdef CONFIG_PROC_FS
+ de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir,
+ &proc_veip_operations);
+ if (de == NULL)
+ printk(KERN_WARNING "venet: can't make veip proc entry\n");
+#endif
+
+ ve_hook_register(VE_SS_CHAIN, &venet_ve_hook);
+ vzioctl_register(&venetcalls);
+ vzmon_register_veaddr_print_cb(veaddr_seq_print);
+ return 0;
+}
+
+__exit void venet_exit(void)
+{
+ vzmon_unregister_veaddr_print_cb(veaddr_seq_print);
+ vzioctl_unregister(&venetcalls);
+ ve_hook_unregister(&venet_ve_hook);
+
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("veip", proc_vz_dir);
+#endif
+ venet_stop(get_ve0());
+ veip_cleanup();
+}
+
+module_init(venet_init);
+module_exit(venet_exit);
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 52af501..68b47b9 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -180,6 +180,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
skb->mark = 0;
secpath_reset(skb);
nf_reset(skb);
+ skb_init_brmark(skb);
length = skb->len;
diff --git a/drivers/net/vzethdev.c b/drivers/net/vzethdev.c
new file mode 100644
index 0000000..ed8ed97
--- /dev/null
+++ b/drivers/net/vzethdev.c
@@ -0,0 +1,749 @@
+/*
+ * vzethdev.c
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * Virtual ethernet device used to change VE ownership on packets
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/if_ether.h> /* For the statistics structure. */
+#include <linux/if_arp.h> /* For ARPHRD_ETHER */
+#include <linux/if_bridge.h>
+#include <linux/ethtool.h>
+#include <linux/ve_proto.h>
+#include <linux/veth.h>
+#include <linux/vzctl.h>
+#include <linux/vzctl_veth.h>
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/vzcalluser.h>
+
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
+static LIST_HEAD(veth_hwaddr_list);
+static DEFINE_RWLOCK(ve_hwaddr_lock);
+static DECLARE_MUTEX(hwaddr_sem);
+
+struct net_device * veth_dev_start(char *dev_addr, char *name);
+
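+/* Find the veth pair whose host-side (VE0) device has the given name. */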
+struct veth_struct *hwaddr_entry_lookup(char *name)
+{
+ struct veth_struct *entry;
+
+ list_for_each_entry(entry, &veth_hwaddr_list, hwaddr_list) {
+ BUG_ON(entry->pair == NULL);
+ if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0)
+ return entry;
+ }
+ return NULL;
+}
+
+int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name,
+ char *dev_addr_ve, char *name_ve)
+{
+ struct net_device *dev_ve;
+ struct net_device *dev_ve0;
+ struct ve_struct *old_env;
+ char dev_name[IFNAMSIZ];
+ int err;
+
+ down(&hwaddr_sem);
+
+ if (name[0] == '\0')
+ snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid);
+ else {
+ memcpy(dev_name, name, IFNAMSIZ - 1);
+ dev_name[IFNAMSIZ - 1] = '\0';
+ }
+ dev_ve0 = veth_dev_start(dev_addr, dev_name);
+ if (IS_ERR(dev_ve0)) {
+ err = PTR_ERR(dev_ve0);
+ goto err;
+ }
+
+ old_env = set_exec_env(ve);
+ if (name_ve[0] == '\0')
+ sprintf(dev_name, "eth%%d");
+ else {
+ memcpy(dev_name, name_ve, IFNAMSIZ - 1);
+ dev_name[IFNAMSIZ - 1] = '\0';
+ }
+ dev_ve = veth_dev_start(dev_addr_ve, dev_name);
+ if (IS_ERR(dev_ve)) {
+ err = PTR_ERR(dev_ve);
+ goto err_ve;
+ }
+ set_exec_env(old_env);
+ veth_from_netdev(dev_ve)->pair = dev_ve0;
+ veth_from_netdev(dev_ve)->me = dev_ve;
+ veth_from_netdev(dev_ve0)->pair = dev_ve;
+ veth_from_netdev(dev_ve0)->me = dev_ve0;
+
+ write_lock(&ve_hwaddr_lock);
+ list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list);
+ write_unlock(&ve_hwaddr_lock);
+
+ up(&hwaddr_sem);
+ return 0;
+
+err_ve:
+ set_exec_env(old_env);
+ unregister_netdev(dev_ve0);
+err:
+ up(&hwaddr_sem);
+ return err;
+}
+
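+/* Remove a veth pair from the hwaddr list and unregister both devices. */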
+void veth_pair_del(struct ve_struct *env, struct veth_struct *entry)
+{
+ struct net_device *dev;
+ struct ve_struct *old_env;
+
+ write_lock(&ve_hwaddr_lock);
+ list_del(&entry->hwaddr_list);
+ write_unlock(&ve_hwaddr_lock);
+
+ dev = entry->pair;
+ BUG_ON(entry->pair == NULL);
+
+ veth_from_netdev(dev)->pair = NULL;
+ entry->pair = NULL;
+ rtnl_lock();
+ old_env = set_exec_env(dev->owner_env);
+ dev_close(dev);
+
+ /*
+	 * Now the VE0-side device does not send or receive anything,
+ * i.e. dev->hard_start_xmit won't be called.
+ */
+ set_exec_env(env);
+ unregister_netdevice(veth_to_netdev(entry));
+ set_exec_env(dev->owner_env);
+ unregister_netdevice(dev);
+ set_exec_env(old_env);
+ rtnl_unlock();
+}
+
+int veth_entry_del(struct ve_struct *ve, char *name)
+{
+ struct veth_struct *found;
+ int err;
+
+ err = -ENODEV;
+ down(&hwaddr_sem);
+ found = hwaddr_entry_lookup(name);
+ if (found == NULL)
+ goto out;
+ if (veth_to_netdev(found)->owner_env != ve)
+ goto out;
+
+ err = 0;
+ veth_pair_del(ve, found);
+
+out:
+ up(&hwaddr_sem);
+ return err;
+}
+
+int veth_allow_change_mac(envid_t veid, char *name, int allow)
+{
+ struct ve_struct *ve;
+ struct veth_struct *found;
+ int err;
+
+ err = -ESRCH;
+ ve = get_ve_by_id(veid);
+ if (!ve)
+ return err;
+
+ down_read(&ve->op_sem);
+ if (!ve->is_running)
+ goto out_ve;
+ err = -ENODEV;
+ down(&hwaddr_sem);
+ found = hwaddr_entry_lookup(name);
+ if (found == NULL)
+ goto out_sem;
+ if (veth_to_netdev(found)->owner_env != ve)
+ goto out_sem;
+
+ err = 0;
+ found->allow_mac_change = allow;
+
+out_sem:
+ up(&hwaddr_sem);
+out_ve:
+ up_read(&ve->op_sem);
+ put_ve(ve);
+ return err;
+}
+
+/*
+ * Device functions
+ */
+
+static int veth_open(struct net_device *dev)
+{
+ return 0;
+}
+
+static int veth_close(struct net_device *master)
+{
+ return 0;
+}
+
+static void veth_destructor(struct net_device *dev)
+{
+ free_percpu(veth_from_netdev(dev)->real_stats);
+ free_netdev(dev);
+}
+
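+/* Fold the per-cpu counters into the device's net_device_stats. */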
+static struct net_device_stats *get_stats(struct net_device *dev)
+{
+ int i;
+ struct net_device_stats *stats;
+
+ stats = &veth_from_netdev(dev)->stats;
+ memset(stats, 0, sizeof(struct net_device_stats));
+ for (i = 0; i < NR_CPUS; i++) {
+ struct net_device_stats *dev_stats;
+
+ if (!cpu_possible(i))
+ continue;
+ dev_stats = veth_stats(dev, i);
+ stats->rx_bytes += dev_stats->rx_bytes;
+ stats->tx_bytes += dev_stats->tx_bytes;
+ stats->rx_packets += dev_stats->rx_packets;
+ stats->tx_packets += dev_stats->tx_packets;
+ stats->tx_dropped += dev_stats->tx_dropped;
+ }
+
+ return stats;
+}
+
+/*
+ * The higher levels take care of making this non-reentrant (it's
+ * called with bh's disabled).
+ */
+static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_device_stats *stats;
+ struct net_device *rcv = NULL;
+ struct veth_struct *entry;
+ int length;
+
+ stats = veth_stats(dev, smp_processor_id());
+ if (unlikely(get_exec_env()->disable_net))
+ goto outf;
+
+ entry = veth_from_netdev(dev);
+ rcv = entry->pair;
+ if (!rcv)
+ /* VE going down */
+ goto outf;
+
+ if (!(rcv->flags & IFF_UP)) {
+ /* Target VE does not want to receive packets */
+ goto outf;
+ }
+
+ if (unlikely(rcv->owner_env->disable_net))
+ goto outf;
+ /* Filtering */
+ if (ve_is_super(dev->owner_env) &&
+ !veth_from_netdev(rcv)->allow_mac_change) {
+ /* from VE0 to VEX */
+ if (ve_is_super(rcv->owner_env))
+ goto out;
+ if (is_multicast_ether_addr(
+ ((struct ethhdr *)skb->data)->h_dest))
+ goto out;
+ if (!rcv->br_port &&
+ compare_ether_addr(((struct ethhdr *)skb->data)->h_dest, rcv->dev_addr))
+ goto outf;
+ } else if (!ve_is_super(dev->owner_env) &&
+ !entry->allow_mac_change) {
+ /* from VEX to VE0 */
+ if (!skb->dev->br_port &&
+ compare_ether_addr(((struct ethhdr *)skb->data)->h_source, dev->dev_addr))
+ goto outf;
+ }
+
+out:
+ skb->owner_env = rcv->owner_env;
+
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, rcv);
+
+ if (skb->protocol != __constant_htons(ETH_P_IP))
+ skb_orphan(skb);
+
+ nf_reset(skb);
+ length = skb->len;
+ skb_init_brmark(skb);
+
+ netif_rx(skb);
+
+ stats->tx_bytes += length;
+ stats->tx_packets++;
+ if (rcv) {
+ struct net_device_stats *rcv_stats;
+ rcv_stats = veth_stats(rcv, smp_processor_id());
+ rcv_stats->rx_bytes += length;
+ rcv_stats->rx_packets++;
+ }
+
+ return 0;
+
+outf:
+ kfree_skb(skb);
+ stats->tx_dropped++;
+ return 0;
+}
+
+static int veth_set_mac(struct net_device *dev, void *p)
+{
+ struct sockaddr *addr = p;
+
+ if (!ve_is_super(dev->owner_env) &&
+ !veth_from_netdev(dev)->allow_mac_change)
+ return -EPERM;
+ if (netif_running(dev))
+ return -EBUSY;
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+
+ return 0;
+}
+
+int veth_init_dev(struct net_device *dev)
+{
+ veth_from_netdev(dev)->real_stats =
+ alloc_percpu(struct net_device_stats);
+ if (veth_from_netdev(dev)->real_stats == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int
+veth_set_op(struct net_device *dev, u32 data,
+ int (*fop)(struct net_device *, u32))
+{
+ struct net_device *pair;
+ int ret = 0;
+
+ ret = fop(dev, data);
+ if (ret < 0)
+ goto out;
+
+ pair = veth_from_netdev(dev)->pair;
+ if (pair)
+ ret = fop(pair, data);
+out:
+ return ret;
+}
+
+static int veth_op_set_sg(struct net_device *dev, u32 data)
+{
+ return veth_set_op(dev, data, ethtool_op_set_sg);
+}
+
+static int veth_op_set_tx_csum(struct net_device *dev, u32 data)
+{
+ return veth_set_op(dev, data, ethtool_op_set_tx_csum);
+}
+
+static int
+veth_op_set_tso(struct net_device *dev, u32 data)
+{
+ return veth_set_op(dev, data, ethtool_op_set_tso);
+}
+
+#define veth_op_set_rx_csum veth_op_set_tx_csum
+
+static struct ethtool_ops veth_ethtool_ops = {
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = veth_op_set_sg,
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .set_tx_csum = veth_op_set_tx_csum,
+ .get_rx_csum = ethtool_op_get_tx_csum,
+ .set_rx_csum = veth_op_set_rx_csum,
+ .get_tso = ethtool_op_get_tso,
+ .set_tso = veth_op_set_tso,
+};
+
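+/* Dump veth-specific state (the allow_mac_change flag) into the checkpoint image. */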
+static void veth_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx)
+{
+ struct cpt_veth_image v;
+ struct veth_struct *veth;
+
+ veth = veth_from_netdev(dev);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_VETH;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ v.cpt_allow_mac_change = veth->allow_mac_change;
+
+ ops->write(&v, sizeof(v), ctx);
+}
+
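+/* Restore the allow_mac_change flag of a veth device from a checkpoint image. */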
+static int veth_rst(loff_t pos, struct cpt_netdev_image *di,
+ struct rst_ops *ops,
+ struct cpt_context *ctx)
+
+{
+ int err;
+ struct cpt_veth_image vi;
+ struct veth_struct *veth;
+ struct net_device *dev;
+
+ pos = pos + di->cpt_hdrlen;
+ err = ops->get_object(CPT_OBJ_NET_VETH, pos,
+ &vi, sizeof(vi), ctx);
+ if (err)
+ return err;
+
+ dev = __dev_get_by_name(get_exec_env()->ve_ns->net_ns, di->cpt_name);
+ if (dev == NULL)
+ return -ENODEV;
+
+ veth = veth_from_netdev(dev);
+ veth->allow_mac_change = vi.cpt_allow_mac_change;
+
+ return 0;
+}
+
+static struct netdev_rst veth_netdev_rst = {
+ .cpt_object = CPT_OBJ_NET_VETH,
+ .ndo_rst = veth_rst,
+};
+
+static const struct net_device_ops veth_ops = {
+ .ndo_init = veth_init_dev,
+ .ndo_start_xmit = veth_xmit,
+ .ndo_get_stats = get_stats,
+ .ndo_open = veth_open,
+ .ndo_stop = veth_close,
+ .ndo_set_mac_address = veth_set_mac,
+ .ndo_cpt = veth_cpt,
+};
+
+static void veth_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->netdev_ops = &veth_ops;
+ dev->destructor = veth_destructor;
+ dev->tx_queue_len = 0;
+
+ /*
+	 * No other features are set:
+	 * - checksumming is required, and nobody else will do it for us
+ */
+ dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
+ NETIF_F_HIGHDMA;
+
+ SET_ETHTOOL_OPS(dev, &veth_ethtool_ops);
+}
+
+#ifdef CONFIG_PROC_FS
+#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
+#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5]
+static int vehwaddr_seq_show(struct seq_file *m, void *v)
+{
+ struct list_head *p;
+ struct veth_struct *entry;
+
+ p = (struct list_head *)v;
+ if (p == &veth_hwaddr_list) {
+ seq_puts(m, "Version: 1.0\n");
+ return 0;
+ }
+ entry = list_entry(p, struct veth_struct, hwaddr_list);
+ seq_printf(m, ADDR_FMT " %16s ",
+ ADDR_ARG(entry->pair->dev_addr), entry->pair->name);
+ seq_printf(m, ADDR_FMT " %16s %10u %5s\n",
+ ADDR_ARG(veth_to_netdev(entry)->dev_addr),
+ veth_to_netdev(entry)->name,
+ VEID(veth_to_netdev(entry)->owner_env),
+ entry->allow_mac_change ? "allow" : "deny");
+ return 0;
+}
+
+static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos)
+{
+ read_lock(&ve_hwaddr_lock);
+ return seq_list_start_head(&veth_hwaddr_list, *pos);
+}
+
+static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &veth_hwaddr_list, pos);
+}
+
+static void vehwaddr_seq_stop(struct seq_file *m, void *v)
+{
+ read_unlock(&ve_hwaddr_lock);
+}
+
+static struct seq_operations vehwaddr_seq_op = {
+ .start = vehwaddr_seq_start,
+ .next = vehwaddr_seq_next,
+ .stop = vehwaddr_seq_stop,
+ .show = vehwaddr_seq_show,
+};
+
+static int vehwaddr_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &vehwaddr_seq_op);
+}
+
+static struct file_operations proc_vehwaddr_operations = {
+ .open = vehwaddr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+#endif
+
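+/*
+ * Back-end for the VETHCTL_VE_HWADDR ioctl: create or delete a veth pair
+ * for the VE, or allow/deny MAC address changes from inside the VE.
+ */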
+int real_ve_hwaddr(envid_t veid, int op,
+ unsigned char *dev_addr, int addrlen, char *name,
+ unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve)
+{
+ int err;
+ struct ve_struct *ve;
+ char ve_addr[ETH_ALEN];
+
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto out;
+
+ err = -EINVAL;
+ switch (op) {
+ case VE_ETH_ADD:
+ if (addrlen != ETH_ALEN)
+ goto out;
+ if (addrlen_ve != ETH_ALEN && addrlen_ve != 0)
+ goto out;
+		/* If no VE address is given, derive it from dev_addr by
+		 * setting bit 0x80 in byte 3 (which must be clear here). */
+ if (addrlen_ve == 0 && (dev_addr[3] & 0x80))
+ goto out;
+ if (addrlen_ve == 0) {
+ memcpy(ve_addr, dev_addr, ETH_ALEN);
+ ve_addr[3] |= 0x80;
+ } else {
+ memcpy(ve_addr, dev_addr_ve, ETH_ALEN);
+ }
+
+ ve = get_ve_by_id(veid);
+ err = -ESRCH;
+ if (!ve)
+ goto out;
+
+ down_read(&ve->op_sem);
+ if (ve->is_running)
+ err = veth_entry_add(ve, dev_addr, name, ve_addr, name_ve);
+ up_read(&ve->op_sem);
+ put_ve(ve);
+ break;
+
+ case VE_ETH_DEL:
+ if (name[0] == '\0')
+ goto out;
+ ve = get_ve_by_id(veid);
+ err = -ESRCH;
+ if (!ve)
+ goto out;
+
+ down_read(&ve->op_sem);
+ if (ve->is_running)
+ err = veth_entry_del(ve, name);
+ up_read(&ve->op_sem);
+ put_ve(ve);
+ break;
+ case VE_ETH_ALLOW_MAC_CHANGE:
+ case VE_ETH_DENY_MAC_CHANGE:
+ err = veth_allow_change_mac(veid, name,
+ op == VE_ETH_ALLOW_MAC_CHANGE);
+ break;
+ }
+
+out:
+ return err;
+}
+
+int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ err = -ENOTTY;
+ switch(cmd) {
+ case VETHCTL_VE_HWADDR: {
+ struct vzctl_ve_hwaddr s;
+
+ err = -EFAULT;
+ if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+ break;
+ err = real_ve_hwaddr(s.veid, s.op, s.dev_addr, s.addrlen,
+ s.dev_name, s.dev_addr_ve, s.addrlen_ve,
+ s.dev_name_ve);
+ }
+ break;
+ }
+ return err;
+}
+
+static struct vzioctlinfo vethcalls = {
+ .type = VETHCTLTYPE,
+ .ioctl = veth_ioctl,
+ .compat_ioctl = veth_ioctl,
+ .owner = THIS_MODULE,
+};
+
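+/* Allocate, name and register one end of a veth pair in the current exec env's netns. */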
+struct net_device * veth_dev_start(char *dev_addr, char *name)
+{
+ struct net_device *dev;
+ int err;
+
+ if (!is_valid_ether_addr(dev_addr))
+ return ERR_PTR(-EADDRNOTAVAIL);
+
+ dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+ dev->nd_net = get_exec_env()->ve_netns;
+ if (strchr(dev->name, '%')) {
+ err = dev_alloc_name(dev, dev->name);
+ if (err < 0)
+ goto err;
+ }
+ if ((err = register_netdev(dev)) != 0)
+ goto err;
+
+ memcpy(dev->dev_addr, dev_addr, ETH_ALEN);
+ dev->addr_len = ETH_ALEN;
+
+ return dev;
+err:
+ free_netdev(dev);
+ printk(KERN_ERR "%s initialization error err=%d\n", name, err);
+ return ERR_PTR(err);
+}
+
+static int veth_start(void *data)
+{
+ return 0;
+}
+
+static void veth_stop(void *data)
+{
+ struct ve_struct *env;
+ struct veth_struct *entry, *tmp;
+
+ env = (struct ve_struct *)data;
+ down(&hwaddr_sem);
+ list_for_each_entry_safe(entry, tmp, &veth_hwaddr_list, hwaddr_list)
+ if (VEID(env) == VEID(veth_to_netdev(entry)->owner_env))
+ veth_pair_del(env, entry);
+ up(&hwaddr_sem);
+}
+
+static struct ve_hook veth_ve_hook = {
+ .init = veth_start,
+ .fini = veth_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_NET,
+};
+
+__init int veth_init(void)
+{
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *de;
+
+ de = proc_create("veth", S_IFREG|S_IRUSR, proc_vz_dir,
+ &proc_vehwaddr_operations);
+ if (de == NULL)
+ printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n");
+#endif
+
+ register_netdev_rst(&veth_netdev_rst);
+ ve_hook_register(VE_SS_CHAIN, &veth_ve_hook);
+ vzioctl_register(&vethcalls);
+ return 0;
+}
+
+__exit void veth_exit(void)
+{
+ struct veth_struct *entry;
+ struct list_head *tmp, *n;
+ struct ve_struct *ve;
+
+ vzioctl_unregister(&vethcalls);
+ ve_hook_unregister(&veth_ve_hook);
+ unregister_netdev_rst(&veth_netdev_rst);
+
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("veth", proc_vz_dir);
+#endif
+
+ down(&hwaddr_sem);
+ list_for_each_safe(tmp, n, &veth_hwaddr_list) {
+ entry = list_entry(tmp, struct veth_struct, hwaddr_list);
+ ve = get_ve(veth_to_netdev(entry)->owner_env);
+
+ veth_pair_del(ve, entry);
+
+ put_ve(ve);
+ }
+ up(&hwaddr_sem);
+}
+
+module_init(veth_init);
+module_exit(veth_exit);
+
+MODULE_AUTHOR("Andrey Mirkin <amirkin@sw.ru>");
+MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device");
+MODULE_LICENSE("GPL v2");
+
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 554626e..41ca449 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -401,9 +401,8 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
device_initialize(&shost->shost_gendev);
dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
-#ifndef CONFIG_SYSFS_DEPRECATED
- shost->shost_gendev.bus = &scsi_bus_type;
-#endif
+ if (!sysfs_deprecated)
+ shost->shost_gendev.bus = &scsi_bus_type;
shost->shost_gendev.type = &scsi_host_type;
device_initialize(&shost->shost_dev);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 47291bc..142a991 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -415,9 +415,8 @@ static struct scsi_target *scsi_alloc_target(struct device *parent,
starget->reap_ref = 1;
dev->parent = get_device(parent);
dev_set_name(dev, "target%d:%d:%d", shost->host_no, channel, id);
-#ifndef CONFIG_SYSFS_DEPRECATED
- dev->bus = &scsi_bus_type;
-#endif
+ if (!sysfs_deprecated)
+ dev->bus = &scsi_bus_type;
dev->type = &scsi_target_type;
starget->id = id;
starget->channel = channel;
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44ef..f48e240 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -63,6 +63,14 @@ source "fs/autofs/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
+config SIM_FS
+ tristate "VPS filesystem"
+ default m
+ help
+	  This file system is a part of Virtuozzo. It introduces a fake
+	  superblock and block device into a VE to hide the real device
+	  and to report statfs results taken from the quota.
+
config CUSE
	tristate "Character device in Userspace support"
depends on FUSE_FS
diff --git a/fs/Makefile b/fs/Makefile
index af6d047..d45ecab 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -53,6 +53,8 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
obj-y += quota/
+obj-$(CONFIG_SIM_FS) += simfs.o
+
obj-$(CONFIG_PROC_FS) += proc/
obj-y += partitions/
obj-$(CONFIG_SYSFS) += sysfs/
diff --git a/fs/aio.c b/fs/aio.c
index 02a2c93..1f18b09 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -43,13 +43,16 @@
#endif
/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
+DEFINE_SPINLOCK(aio_nr_lock);
+EXPORT_SYMBOL_GPL(aio_nr_lock);
unsigned long aio_nr; /* current system wide number of aio requests */
+EXPORT_SYMBOL_GPL(aio_nr);
unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
/*----end sysctl variables---*/
static struct kmem_cache *kiocb_cachep;
-static struct kmem_cache *kioctx_cachep;
+struct kmem_cache *kioctx_cachep;
+EXPORT_SYMBOL_GPL(kioctx_cachep);
static struct workqueue_struct *aio_wq;
@@ -60,7 +63,7 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
static DEFINE_SPINLOCK(fput_lock);
static LIST_HEAD(fput_head);
-static void aio_kick_handler(struct work_struct *);
+void aio_kick_handler(struct work_struct *);
static void aio_queue_work(struct kioctx *);
/* aio_setup
@@ -343,7 +346,7 @@ static void aio_cancel_all(struct kioctx *ctx)
spin_unlock_irq(&ctx->ctx_lock);
}
-static void wait_for_all_aios(struct kioctx *ctx)
+void wait_for_all_aios(struct kioctx *ctx)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
@@ -366,6 +369,7 @@ static void wait_for_all_aios(struct kioctx *ctx)
out:
spin_unlock_irq(&ctx->ctx_lock);
}
+EXPORT_SYMBOL_GPL(wait_for_all_aios);
/* wait_on_sync_kiocb:
* Waits on the given sync kiocb to complete.
@@ -818,7 +822,7 @@ static inline void aio_run_all_iocbs(struct kioctx *ctx)
* space.
* Run on aiod's context.
*/
-static void aio_kick_handler(struct work_struct *work)
+void aio_kick_handler(struct work_struct *work)
{
struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
mm_segment_t oldfs = get_fs();
@@ -839,7 +843,7 @@ static void aio_kick_handler(struct work_struct *work)
if (requeue)
queue_delayed_work(aio_wq, &ctx->wq, 0);
}
-
+EXPORT_SYMBOL_GPL(aio_kick_handler);
/*
* Called by kick_iocb to queue the kiocb for retry
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index cea5219..1217caf 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = {
.name = "autofs",
.get_sb = autofs_get_sb,
.kill_sb = autofs_kill_sb,
+ .fs_flags = FS_VIRTUALIZED,
};
static int __init init_autofs_fs(void)
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index e1734f2..ccf87cb 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -78,7 +78,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
*uid = current_uid();
*gid = current_gid();
- *pgrp = task_pgrp_nr(current);
+ *pgrp = task_pgrp_vnr(current);
*minproto = *maxproto = AUTOFS_PROTO_VERSION;
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401c..7144001 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -362,7 +362,7 @@ static int autofs_root_unlink(struct inode *dir, struct dentry *dentry)
/* This allows root to remove symlinks */
lock_kernel();
- if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) {
+ if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) {
unlock_kernel();
return -EACCES;
}
@@ -556,7 +556,7 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp,
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
- if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+ if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
return -EPERM;
switch(cmd) {
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 8f7cdde..41b42bd 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -119,7 +119,7 @@ struct autofs_sb_info {
u32 magic;
int pipefd;
struct file *pipe;
- pid_t oz_pgrp;
+ struct pid *oz_pgrp;
int catatonic;
int version;
int sub_version;
@@ -136,6 +136,7 @@ struct autofs_sb_info {
spinlock_t lookup_lock;
struct list_head active_list;
struct list_head expiring_list;
+ unsigned is32bit:1;
};
static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
@@ -153,7 +154,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
filesystem without "magic".) */
static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
- return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+ return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
/* Does a dentry have some pending activity? */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 00bf8fc..5ce47da 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -385,7 +385,8 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
fput(pipe);
goto out;
}
- sbi->oz_pgrp = task_pgrp_nr(current);
+ put_pid(sbi->oz_pgrp);
+ sbi->oz_pgrp = get_pid(task_pgrp(current));
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 9722e4b..afd983f 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = {
.name = "autofs",
.get_sb = autofs_get_sb,
.kill_sb = autofs4_kill_sb,
+ .fs_flags = FS_VIRTUALIZED,
};
static int __init init_autofs4_fs(void)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 69c8142..47a39bc 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -171,6 +171,8 @@ void autofs4_kill_sb(struct super_block *sb)
/* Clean up and release dangling references */
autofs4_force_release(sbi);
+ put_pid(sbi->oz_pgrp);
+
sb->s_fs_info = NULL;
kfree(sbi);
@@ -192,7 +194,7 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, ",uid=%u", root_inode->i_uid);
if (root_inode->i_gid != 0)
seq_printf(m, ",gid=%u", root_inode->i_gid);
- seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+ seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
seq_printf(m, ",minproto=%d", sbi->min_proto);
seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -237,7 +239,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
*uid = current_uid();
*gid = current_gid();
- *pgrp = task_pgrp_nr(current);
+ *pgrp = task_pgrp_vnr(current);
*minproto = AUTOFS_MIN_PROTO_VERSION;
*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -322,6 +324,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+ pid_t pgrp;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -334,13 +337,16 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->pipe = NULL;
sbi->catatonic = 1;
sbi->exp_timeout = 0;
- sbi->oz_pgrp = task_pgrp_nr(current);
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
set_autofs_type_indirect(&sbi->type);
sbi->min_proto = 0;
sbi->max_proto = 0;
+#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+ sbi->is32bit = 1;
+#endif
mutex_init(&sbi->wq_mutex);
spin_lock_init(&sbi->fs_lock);
sbi->queues = NULL;
@@ -373,7 +379,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Can this call block? */
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
- &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
+ &pgrp, &sbi->type, &sbi->min_proto,
&sbi->max_proto)) {
printk("autofs: called with bogus options\n");
goto fail_dput;
@@ -401,12 +407,20 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
+ DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pgrp);
+
+ sbi->oz_pgrp = find_get_pid(pgrp);
+
+ if (!sbi->oz_pgrp) {
+ printk("autofs: could not find process group %d\n", pgrp);
+ goto fail_dput;
+ }
+
pipe = fget(pipefd);
if (!pipe) {
printk("autofs: could not open pipe file descriptor\n");
- goto fail_dput;
+ goto fail_put_pid;
}
if (!pipe->f_op || !pipe->f_op->write)
goto fail_fput;
@@ -427,6 +441,8 @@ fail_fput:
printk("autofs: pipe file descriptor does not contain proper ops\n");
fput(pipe);
/* fall through */
+fail_put_pid:
+ put_pid(sbi->oz_pgrp);
fail_dput:
dput(root);
goto fail_free;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index b96a3c5..263c27a 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -689,7 +689,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_info *p_ino;
/* This allows root to remove symlinks */
- if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+ if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
return -EACCES;
if (atomic_dec_and_test(&ino->count)) {
@@ -883,7 +883,7 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
- if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+ if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN))
return -EPERM;
switch(cmd) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 2341375..f1408f1 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -148,6 +148,16 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
pktsz = sizeof(*packet);
+#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
+ /*
+	 * On x86_64 the autofs_v5_packet struct is padded with 4 bytes,
+	 * which breaks an autofs daemon running in ia32 emulation mode.
+	 *
+	 * Reduce the size when running in 32-bit mode to match what
+	 * userspace expects.
+ */
+ if (sbi->is32bit)
+ pktsz -= 4;
+#endif
packet->wait_queue_token = wq->wait_queue_token;
packet->len = wq->name.len;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 0133b5a..1379e26 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -311,12 +311,12 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
(N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
{
- printk(KERN_NOTICE "executable not page aligned\n");
+ ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n");
}
if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
{
- printk(KERN_WARNING
+ ve_printk(VE_LOG, KERN_WARNING
"fd_offset is not page aligned. Please convert program: %s\n",
bprm->file->f_path.dentry->d_name.name);
}
@@ -425,7 +425,7 @@ static int load_aout_library(struct file *file)
if (printk_ratelimit())
{
- printk(KERN_WARNING
+ ve_printk(VE_LOG, KERN_WARNING
"N_TXTOFF is not page aligned. Please convert library: %s\n",
file->f_path.dentry->d_name.name);
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1ed37ba..e8ef26b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -437,7 +437,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
eppnt = elf_phdata;
for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
if (eppnt->p_type == PT_LOAD) {
- int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+ int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO;
int elf_prot = 0;
unsigned long vaddr = 0;
unsigned long k, map_addr;
@@ -789,7 +789,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (elf_ppnt->p_flags & PF_X)
elf_prot |= PROT_EXEC;
- elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
+ elf_flags = MAP_PRIVATE | MAP_DENYWRITE |
+ MAP_EXECUTABLE | MAP_EXECPRIO;
vaddr = elf_ppnt->p_vaddr;
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
@@ -922,7 +923,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
set_binfmt(&elf_format);
#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
- retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+ retval = arch_setup_additional_pages(bprm, !!elf_interpreter, 0);
if (retval < 0) {
send_sig(SIGKILL, current, 0);
goto out;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c4e8353..8180165 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -28,6 +28,7 @@
#include <linux/mount.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
+#include <linux/ve_proto.h>
#include <asm/uaccess.h>
@@ -35,8 +36,15 @@ enum {
VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
};
+#ifdef CONFIG_VE
+#define bm_entries(ve) ((ve)->bm_entries)
+#define bm_enabled(ve) ((ve)->bm_enabled)
+#else
static LIST_HEAD(entries);
static int enabled = 1;
+#define bm_entries(ve) (entries)
+#define bm_enabled(ve) (enabled)
+#endif
enum {Enabled, Magic};
#define MISC_FMT_PRESERVE_ARGV0 (1<<31)
@@ -56,21 +64,30 @@ typedef struct {
} Node;
static DEFINE_RWLOCK(entries_lock);
+#ifdef CONFIG_VE
+#define bm_fs_type(ve) (*(ve)->bm_fs_type)
+#define bm_mnt(ve) ((ve)->bm_mnt)
+#define bm_entry_count(ve) ((ve)->bm_entry_count)
+#else
static struct file_system_type bm_fs_type;
static struct vfsmount *bm_mnt;
static int entry_count;
+#define bm_fs_type(ve) (bm_fs_type)
+#define bm_mnt(ve) (bm_mnt)
+#define bm_entry_count(ve) (bm_entry_count)
+#endif
/*
* Check if we support the binfmt
* if we do, return the node, else NULL
* locking is done in load_misc_binary
*/
-static Node *check_file(struct linux_binprm *bprm)
+static Node *check_file(struct ve_struct *ve, struct linux_binprm *bprm)
{
char *p = strrchr(bprm->interp, '.');
struct list_head *l;
- list_for_each(l, &entries) {
+ list_for_each(l, &bm_entries(ve)) {
Node *e = list_entry(l, Node, list);
char *s;
int j;
@@ -111,9 +128,10 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
char *iname_addr = iname;
int retval;
int fd_binary = -1;
+ struct ve_struct *ve = get_exec_env();
retval = -ENOEXEC;
- if (!enabled)
+ if (!bm_enabled(ve))
goto _ret;
retval = -ENOEXEC;
@@ -122,7 +140,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
/* to keep locking time low, we copy the interpreter string */
read_lock(&entries_lock);
- fmt = check_file(bprm);
+ fmt = check_file(ve, bprm);
if (fmt)
strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
read_unlock(&entries_lock);
@@ -507,7 +525,7 @@ static void bm_clear_inode(struct inode *inode)
kfree(inode->i_private);
}
-static void kill_node(Node *e)
+static void kill_node(struct ve_struct *ve, Node *e)
{
struct dentry *dentry;
@@ -523,7 +541,7 @@ static void kill_node(Node *e)
dentry->d_inode->i_nlink--;
d_drop(dentry);
dput(dentry);
- simple_release_fs(&bm_mnt, &entry_count);
+ simple_release_fs(&bm_mnt(ve), &bm_entry_count(ve));
}
}
@@ -562,7 +580,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
mutex_lock(&root->d_inode->i_mutex);
- kill_node(e);
+ kill_node(get_exec_env(), e);
mutex_unlock(&root->d_inode->i_mutex);
dput(root);
@@ -587,6 +605,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
struct dentry *root, *dentry;
struct super_block *sb = file->f_path.mnt->mnt_sb;
int err = 0;
+ struct ve_struct *ve = get_exec_env();
e = create_entry(buffer, count);
@@ -610,7 +629,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
if (!inode)
goto out2;
- err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
+ err = simple_pin_fs(&bm_fs_type(ve), &bm_mnt(ve), &bm_entry_count(ve));
if (err) {
iput(inode);
inode = NULL;
@@ -623,7 +642,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
d_instantiate(dentry, inode);
write_lock(&entries_lock);
- list_add(&e->list, &entries);
+ list_add(&e->list, &bm_entries(ve));
write_unlock(&entries_lock);
err = 0;
@@ -649,26 +668,31 @@ static const struct file_operations bm_register_operations = {
static ssize_t
bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
- char *s = enabled ? "enabled\n" : "disabled\n";
+ struct ve_struct *ve = get_exec_env();
+ char *s = bm_enabled(ve) ? "enabled\n" : "disabled\n";
return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
}
+static void dm_genocide(struct ve_struct *ve)
+{
+ while (!list_empty(&bm_entries(ve)))
+ kill_node(ve, list_entry(bm_entries(ve).next, Node, list));
+}
+
static ssize_t bm_status_write(struct file * file, const char __user * buffer,
size_t count, loff_t *ppos)
{
+ struct ve_struct *ve = get_exec_env();
int res = parse_command(buffer, count);
struct dentry *root;
switch (res) {
- case 1: enabled = 0; break;
- case 2: enabled = 1; break;
+ case 1: bm_enabled(ve) = 0; break;
+ case 2: bm_enabled(ve) = 1; break;
case 3: root = dget(file->f_path.mnt->mnt_sb->s_root);
mutex_lock(&root->d_inode->i_mutex);
-
- while (!list_empty(&entries))
- kill_node(list_entry(entries.next, Node, list));
-
+ dm_genocide(ve);
mutex_unlock(&root->d_inode->i_mutex);
dput(root);
default: return res;
@@ -719,6 +743,53 @@ static struct file_system_type bm_fs_type = {
.kill_sb = kill_litter_super,
};
+#ifdef CONFIG_VE
+static void __ve_binfmt_init(struct ve_struct *ve, struct file_system_type *fs)
+{
+ ve->bm_fs_type = fs;
+ INIT_LIST_HEAD(&ve->bm_entries);
+ ve->bm_enabled = 1;
+ ve->bm_mnt = NULL;
+ ve->bm_entry_count = 0;
+}
+
+static int ve_binfmt_init(void *x)
+{
+ struct ve_struct *ve = x;
+ struct file_system_type *fs_type;
+ int err;
+
+ err = register_ve_fs_type(ve, &bm_fs_type, &fs_type, NULL);
+ if (err == 0)
+ __ve_binfmt_init(ve, fs_type);
+
+ return err;
+}
+
+static void ve_binfmt_fini(void *x)
+{
+ struct ve_struct *ve = x;
+
+ /*
+	 * no locks needed since the exec_ve is dead and no one will
+	 * mess with the bm_xxx fields any longer
+ */
+ if (!ve->bm_fs_type)
+ return;
+ dm_genocide(ve);
+ unregister_ve_fs_type(ve->bm_fs_type, NULL);
+ kfree(ve->bm_fs_type);
+ ve->bm_fs_type = NULL;
+}
+
+static struct ve_hook ve_binfmt_hook = {
+ .init = ve_binfmt_init,
+ .fini = ve_binfmt_fini,
+ .priority = HOOK_PRIO_FS,
+ .owner = THIS_MODULE,
+};
+#endif
+
static int __init init_misc_binfmt(void)
{
int err = register_filesystem(&bm_fs_type);
@@ -727,11 +798,17 @@ static int __init init_misc_binfmt(void)
if (err)
unregister_filesystem(&bm_fs_type);
}
+
+ if (!err) {
+ __ve_binfmt_init(get_ve0(), &bm_fs_type);
+ ve_hook_register(VE_SS_CHAIN, &ve_binfmt_hook);
+ }
return err;
}
static void __exit exit_misc_binfmt(void)
{
+ ve_hook_unregister(&ve_binfmt_hook);
unregister_binfmt(&misc_format);
unregister_filesystem(&bm_fs_type);
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9b9e3dc..fe0cca1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1602,7 +1602,7 @@ int __invalidate_device(struct block_device *bdev)
* hold).
*/
shrink_dcache_sb(sb);
- res = invalidate_inodes(sb);
+ res = invalidate_inodes_check(sb, 1);
drop_super(sb);
}
invalidate_bdev(bdev);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6fa5302..34c1563 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -671,14 +671,18 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
static void __set_page_dirty(struct page *page,
struct address_space *mapping, int warn)
{
+ int acct = 0;
+
spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
+ acct = account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&mapping->tree_lock);
+ if (acct)
+ task_io_account_write(page, PAGE_CACHE_SIZE, 0);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
diff --git a/fs/compat.c b/fs/compat.c
index d576b55..284386f 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/vfs.h>
+#include <linux/virtinfo.h>
#include <linux/ioctl.h>
#include <linux/init.h>
#include <linux/smb.h>
@@ -73,6 +74,18 @@ int compat_printk(const char *fmt, ...)
#include "read_write.h"
+int ve_compat_printk(int dst, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+ if (!compat_log)
+ return 0;
+ va_start(ap, fmt);
+ ret = ve_vprintk(dst, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
/*
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
@@ -91,6 +104,21 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __
return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
}
+asmlinkage long compat_sys_lutime(char __user * filename,
+ struct compat_utimbuf __user *t)
+{
+ struct timespec tv[2];
+
+ if (t) {
+ if (get_user(tv[0].tv_sec, &t->actime) ||
+ get_user(tv[1].tv_sec, &t->modtime))
+ return -EFAULT;
+ tv[0].tv_nsec = 0;
+ tv[1].tv_nsec = 0;
+ }
+ return do_utimes(AT_FDCWD, filename, t ? tv : NULL, AT_SYMLINK_NOFOLLOW);
+}
+
asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, struct compat_timespec __user *t, int flags)
{
struct timespec tv[2];
@@ -269,6 +297,8 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
struct kstatfs tmp;
error = vfs_statfs(path.dentry, &tmp);
if (!error)
+ error = faudit_statfs(path.mnt->mnt_sb, &tmp);
+ if (!error)
error = put_compat_statfs(buf, &tmp);
path_put(&path);
}
@@ -287,6 +317,8 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
goto out;
error = vfs_statfs(file->f_path.dentry, &tmp);
if (!error)
+ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp);
+ if (!error)
error = put_compat_statfs(buf, &tmp);
fput(file);
out:
@@ -337,6 +369,8 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
struct kstatfs tmp;
error = vfs_statfs(path.dentry, &tmp);
if (!error)
+ error = faudit_statfs(path.mnt->mnt_sb, &tmp);
+ if (!error)
error = put_compat_statfs64(buf, &tmp);
path_put(&path);
}
@@ -358,6 +392,8 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
goto out;
error = vfs_statfs(file->f_path.dentry, &tmp);
if (!error)
+ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp);
+ if (!error)
error = put_compat_statfs64(buf, &tmp);
fput(file);
out:
@@ -1469,6 +1505,10 @@ int compat_do_execve(char * filename,
bool clear_in_exec;
int retval;
+ retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL);
+ if (retval)
+ return retval;
+
retval = unshare_files(&displaced);
if (retval)
goto out_ret;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d84e705..960f82f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2753,7 +2753,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
if (!isprint(buf[1]))
sprintf(buf, "%02x", buf[1]);
- compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+ ve_compat_printk(VE_LOG, "ioctl32(%s:%d): Unknown cmd fd(%d) "
"cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
current->comm, current->pid,
(int)fd, (unsigned int)cmd, buf,
diff --git a/fs/dcache.c b/fs/dcache.c
index a100fa3..7fce87d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -26,6 +26,7 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/file.h>
+#include <linux/namei.h>
#include <asm/uaccess.h>
#include <linux/security.h>
#include <linux/seqlock.h>
@@ -33,8 +34,15 @@
#include <linux/bootmem.h>
#include <linux/fs_struct.h>
#include <linux/hardirq.h>
+#include <linux/kernel_stat.h>
+#include <linux/vzstat.h>
+#include <linux/fdtable.h>
+#include <net/inet_sock.h>
#include "internal.h"
+#include <bc/dcache.h>
+#include <bc/dcache_op.h>
+
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
@@ -43,7 +51,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(dcache_lock);
-static struct kmem_cache *dentry_cache __read_mostly;
+struct kmem_cache *dentry_cache __read_mostly;
#define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
@@ -173,6 +181,7 @@ static struct dentry *d_kill(struct dentry *dentry)
list_del(&dentry->d_u.d_child);
dentry_stat.nr_dentry--; /* For d_free, below */
+ preempt_enable_no_resched();
/*drops the locks, at that point nobody can reach this dentry */
dentry_iput(dentry);
if (IS_ROOT(dentry))
@@ -220,15 +229,22 @@ void dput(struct dentry *dentry)
repeat:
if (atomic_read(&dentry->d_count) == 1)
might_sleep();
- if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
- return;
+ preempt_disable();
+ if (unlikely(ub_dentry_on)) {
+ spin_lock(&dcache_lock);
+ if (!atomic_dec_and_test(&dentry->d_count)) {
+ ub_dentry_uncharge_locked(dentry);
+ spin_unlock(&dcache_lock);
+ goto out_preempt;
+ }
+ } else {
+ if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
+ goto out_preempt;
+ }
spin_lock(&dentry->d_lock);
- if (atomic_read(&dentry->d_count)) {
- spin_unlock(&dentry->d_lock);
- spin_unlock(&dcache_lock);
- return;
- }
+ if (atomic_read(&dentry->d_count))
+ goto out_unlock;
/*
* AV: ->d_delete() is _NOT_ allowed to block now.
@@ -244,8 +260,12 @@ repeat:
dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
}
+out_unlock:
spin_unlock(&dentry->d_lock);
+ ub_dentry_uncharge_locked(dentry);
spin_unlock(&dcache_lock);
+out_preempt:
+ preempt_enable();
return;
unhash_it:
@@ -253,9 +273,21 @@ unhash_it:
kill_it:
/* if dentry was on the d_lru list delete it from there */
dentry_lru_del(dentry);
+
+ if (unlikely(ub_dentry_on)) {
+ struct user_beancounter *ub;
+
+ ub = dentry->dentry_bc.d_ub;
+ BUG_ON(!ub_dput_testzero(dentry));
+ uncharge_dcache(ub, dentry->dentry_bc.d_ubsize);
+ put_beancounter(ub);
+ }
+
dentry = d_kill(dentry);
- if (dentry)
+ preempt_disable();
+ if (dentry)
goto repeat;
+ preempt_enable();
}
/**
@@ -321,6 +353,7 @@ static inline struct dentry * __dget_locked(struct dentry *dentry)
{
atomic_inc(&dentry->d_count);
dentry_lru_del_init(dentry);
+ ub_dentry_charge_nofail(dentry);
return dentry;
}
@@ -423,6 +456,7 @@ static void prune_one_dentry(struct dentry * dentry)
__acquires(dcache_lock)
{
__d_drop(dentry);
+ preempt_disable();
dentry = d_kill(dentry);
/*
@@ -438,6 +472,7 @@ static void prune_one_dentry(struct dentry * dentry)
dentry->d_op->d_delete(dentry);
dentry_lru_del_init(dentry);
__d_drop(dentry);
+ preempt_disable();
dentry = d_kill(dentry);
spin_lock(&dcache_lock);
}
@@ -732,6 +767,8 @@ void shrink_dcache_for_umount(struct super_block *sb)
dentry = sb->s_root;
sb->s_root = NULL;
+ /* "/" was also charged in d_alloc_root() */
+ ub_dentry_uncharge(dentry);
atomic_dec(&dentry->d_count);
shrink_dcache_for_umount_subtree(dentry);
@@ -891,12 +928,18 @@ void shrink_dcache_parent(struct dentry * parent)
*/
static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
{
+ int res = -1;
+
+ KSTAT_PERF_ENTER(shrink_dcache)
if (nr) {
if (!(gfp_mask & __GFP_FS))
- return -1;
+ goto out;
prune_dcache(nr);
}
- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+ res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+out:
+ KSTAT_PERF_LEAVE(shrink_dcache)
+ return res;
}
static struct shrinker dcache_shrinker = {
@@ -919,21 +962,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
struct dentry *dentry;
char *dname;
+ dname = NULL;
+ if (name->len > DNAME_INLINE_LEN-1) {
+ dname = kmalloc(name->len + 1, GFP_KERNEL);
+ if (!dname)
+ goto err_name;
+ }
+
+ ub_dentry_alloc_start();
+
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
if (!dentry)
- return NULL;
+ goto err_alloc;
- if (name->len > DNAME_INLINE_LEN-1) {
- dname = kmalloc(name->len + 1, GFP_KERNEL);
- if (!dname) {
- kmem_cache_free(dentry_cache, dentry);
- return NULL;
- }
- } else {
+ preempt_disable();
+ if (dname == NULL)
dname = dentry->d_iname;
- }
dentry->d_name.name = dname;
+ if (ub_dentry_alloc(dentry))
+ goto err_charge;
+
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
@@ -961,12 +1010,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
}
spin_lock(&dcache_lock);
- if (parent)
+ if (parent) {
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
+ if (parent->d_flags & DCACHE_VIRTUAL)
+ dentry->d_flags |= DCACHE_VIRTUAL;
+ }
dentry_stat.nr_dentry++;
spin_unlock(&dcache_lock);
+ preempt_enable();
+ ub_dentry_alloc_end();
return dentry;
+
+err_charge:
+ preempt_enable();
+ kmem_cache_free(dentry_cache, dentry);
+err_alloc:
+ if (name->len > DNAME_INLINE_LEN - 1)
+ kfree(dname);
+ ub_dentry_alloc_end();
+err_name:
+ return NULL;
}
struct dentry *d_alloc_name(struct dentry *parent, const char *name)
@@ -1363,12 +1427,12 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
unsigned int hash = name->hash;
const unsigned char *str = name->name;
struct hlist_head *head = d_hash(parent,hash);
- struct dentry *found = NULL;
struct hlist_node *node;
- struct dentry *dentry;
+ struct dentry *dentry, *found;
rcu_read_lock();
+ found = NULL;
hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
struct qstr *qstr;
@@ -1408,6 +1472,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
atomic_inc(&dentry->d_count);
found = dentry;
+
+ if (ub_dentry_charge(found))
+ goto charge_failure;
+
spin_unlock(&dentry->d_lock);
break;
next:
@@ -1416,6 +1484,14 @@ next:
rcu_read_unlock();
return found;
+
+charge_failure:
+ spin_unlock(&found->d_lock);
+ rcu_read_unlock();
+ /* dentry is now unhashed, just kill it */
+ dput(found);
+ /* ... and fail lookup */
+ return NULL;
}
/**
@@ -1884,6 +1960,16 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
}
/**
+ * d_root_check - checks if dentry is accessible from current's fs root
+ * @dentry: dentry to be verified
+ * @vfsmnt: vfsmnt to which the dentry belongs
+ */
+int d_root_check(struct path *path)
+{
+ return PTR_ERR(d_path(path, NULL, 0));
+}
+
+/**
* __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry (may be modified by this function)
@@ -1908,18 +1994,21 @@ char *__d_path(const struct path *path, struct path *root,
struct vfsmount *vfsmnt = path->mnt;
char *end = buffer + buflen;
char *retval;
+ int deleted;
+ struct vfsmount *oldmnt = vfsmnt;
spin_lock(&vfsmount_lock);
- prepend(&end, &buflen, "\0", 1);
- if (d_unlinked(dentry) &&
- (prepend(&end, &buflen, " (deleted)", 10) != 0))
+ if (buffer) {
+ prepend(&end, &buflen, "\0", 1);
+ if (buflen < 1)
goto Elong;
+ }
+ deleted = (!IS_ROOT(dentry) && d_unhashed(dentry));
- if (buflen < 1)
- goto Elong;
/* Get '/' right */
retval = end-1;
- *retval = '/';
+ if (buffer)
+ *retval = '/';
for (;;) {
struct dentry * parent;
@@ -1937,20 +2026,43 @@ char *__d_path(const struct path *path, struct path *root,
}
parent = dentry->d_parent;
prefetch(parent);
- if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
- (prepend(&end, &buflen, "/", 1) != 0))
+ if (buffer && ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+ (prepend(&end, &buflen, "/", 1) != 0)))
goto Elong;
retval = end;
dentry = parent;
}
out:
+ if (deleted && buffer &&
+ prepend(&end, &buflen, " (deleted)", 10) != 0)
+ goto Elong;
+out_err:
spin_unlock(&vfsmount_lock);
- return retval;
+ return buffer ? retval : NULL;
global_root:
+ /*
+ * We traversed the tree upward and reached a root, but the given
+ * lookup terminal point wasn't encountered. It means either that the
+ * dentry is out of our scope or belongs to an abstract space like
+ * sock_mnt or pipe_mnt. Check for it.
+ *
+ * There are different options to check it.
+ * We may assume that any dentry tree is unreachable unless it's
+ * connected to `root' (defined as fs root of init aka child reaper)
+ * and expose all paths that are not connected to it.
+ * The other option is to allow exposing of known abstract spaces
+ * explicitly and hide the path information for other cases.
+ * This approach is more safe, let's take it. 2001/04/22 SAW
+ */
+ if (!(oldmnt->mnt_sb->s_flags & MS_NOUSER)) {
+ retval = ERR_PTR(-EINVAL);
+ goto out_err;
+ }
+
retval += 1; /* hit the slash */
- if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
+ if (buffer && prepend_name(&retval, &buflen, &dentry->d_name) != 0)
goto Elong;
root->mnt = vfsmnt;
root->dentry = dentry;
@@ -1958,8 +2070,9 @@ global_root:
Elong:
retval = ERR_PTR(-ENAMETOOLONG);
- goto out;
+ goto out_err;
}
+EXPORT_SYMBOL(__d_path);
/**
* d_path - return the path of a dentry
@@ -1989,8 +2102,11 @@ char *d_path(const struct path *path, char *buf, int buflen)
* thus don't need to be hashed. They also don't need a name until a
* user wants to identify the object in /proc/pid/fd/. The little hack
* below allows us to generate a name for these objects on demand:
+ *
+ * pipefs and socketfs d_dname methods assume a valid buffer;
+ * d_root_check() supplies a NULL one for access checks.
*/
- if (path->dentry->d_op && path->dentry->d_op->d_dname)
+ if (buf && path->dentry->d_op && path->dentry->d_op->d_dname)
return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
read_lock(&current->fs->lock);
@@ -2005,6 +2121,231 @@ char *d_path(const struct path *path, char *buf, int buflen)
return res;
}
+#ifdef CONFIG_VE
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/file.h>
+#include <linux/mnt_namespace.h>
+#include <linux/vzratelimit.h>
+
+static void mark_sub_tree_virtual(struct dentry *d)
+{
+ struct dentry *orig_root;
+
+ orig_root = d;
+ while (1) {
+ spin_lock(&d->d_lock);
+ d->d_flags |= DCACHE_VIRTUAL;
+ spin_unlock(&d->d_lock);
+
+ if (!list_empty(&d->d_subdirs)) {
+ d = list_entry(d->d_subdirs.next,
+ struct dentry, d_u.d_child);
+ continue;
+ }
+ if (d == orig_root)
+ break;
+ while (d == list_entry(d->d_parent->d_subdirs.prev,
+ struct dentry, d_u.d_child)) {
+ d = d->d_parent;
+ if (d == orig_root)
+ goto out;
+ }
+ d = list_entry(d->d_u.d_child.next,
+ struct dentry, d_u.d_child);
+ }
+out:
+ return;
+}
+
+void mark_tree_virtual(struct path *path)
+{
+ struct vfsmount *orig_rootmnt;
+ struct vfsmount *m = path->mnt;
+ struct dentry *d = path->dentry;
+
+ spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
+ orig_rootmnt = m;
+ while (1) {
+ mark_sub_tree_virtual(d);
+ if (!list_empty(&m->mnt_mounts)) {
+ m = list_entry(m->mnt_mounts.next,
+ struct vfsmount, mnt_child);
+ d = m->mnt_root;
+ continue;
+ }
+ if (m == orig_rootmnt)
+ break;
+ while (m == list_entry(m->mnt_parent->mnt_mounts.prev,
+ struct vfsmount, mnt_child)) {
+ m = m->mnt_parent;
+ if (m == orig_rootmnt)
+ goto out;
+ }
+ m = list_entry(m->mnt_child.next,
+ struct vfsmount, mnt_child);
+ d = m->mnt_root;
+ }
+out:
+ spin_unlock(&vfsmount_lock);
+ spin_unlock(&dcache_lock);
+}
+EXPORT_SYMBOL(mark_tree_virtual);
+
+static struct vz_rate_info area_ri = { 20, 10*HZ };
+#define VE_AREA_ACC_CHECK 0x0001
+#define VE_AREA_ACC_DENY 0x0002
+#define VE_AREA_EXEC_CHECK 0x0010
+#define VE_AREA_EXEC_DENY 0x0020
+#define VE0_AREA_ACC_CHECK 0x0100
+#define VE0_AREA_ACC_DENY 0x0200
+#define VE0_AREA_EXEC_CHECK 0x1000
+#define VE0_AREA_EXEC_DENY 0x2000
+int ve_area_access_check = 0;
+
+static void print_connection_info(struct task_struct *tsk)
+{
+ struct files_struct *files;
+ struct fdtable *fdt;
+ int fd;
+
+ files = get_files_struct(tsk);
+ if (!files)
+ return;
+
+ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ for (fd = 0; fd < fdt->max_fds; fd++) {
+ struct file *file;
+ struct inode *inode;
+ struct socket *socket;
+ struct sock *sk;
+ struct inet_sock *inet;
+
+ file = fdt->fd[fd];
+ if (file == NULL)
+ continue;
+
+ inode = file->f_dentry->d_inode;
+ if (!S_ISSOCK(inode->i_mode))
+ continue;
+
+ socket = SOCKET_I(inode);
+ if (socket == NULL)
+ continue;
+
+ sk = socket->sk;
+ if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+ || sk->sk_type != SOCK_STREAM)
+ continue;
+
+ inet = inet_sk(sk);
+ printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n",
+ NIPQUAD(inet->daddr), ntohs(inet->dport),
+ inet->num);
+ }
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+}
+
+static void check_alert(struct path *path, char *str)
+{
+ struct task_struct *tsk;
+ unsigned long page;
+ struct super_block *sb;
+ char *p;
+
+ if (!vz_ratelimit(&area_ri))
+ return;
+
+ tsk = current;
+ p = ERR_PTR(-ENOMEM);
+ page = __get_free_page(GFP_KERNEL);
+ if (page) {
+ spin_lock(&dcache_lock);
+ p = __d_path(path, &tsk->fs->root, (char *)page, PAGE_SIZE);
+ spin_unlock(&dcache_lock);
+ }
+ if (IS_ERR(p))
+ p = "(undefined)";
+
+ sb = path->dentry->d_sb;
+ printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n"
+ "Task %d/%d[%s] from VE%d, execenv %d\n",
+ str, p, sb->s_type->owner_env->veid,
+ sb->s_type->name, sb->s_dev,
+ tsk->pid, task_pid_vnr(tsk), tsk->comm,
+ VE_TASK_INFO(tsk)->owner_env->veid,
+ get_exec_env()->veid);
+
+ free_page(page);
+
+ print_connection_info(tsk);
+
+ read_lock(&tasklist_lock);
+ tsk = tsk->parent;
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+
+ printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n",
+ tsk->pid, task_pid_vnr(tsk), tsk->comm,
+ VE_TASK_INFO(tsk)->owner_env->veid);
+
+ print_connection_info(tsk);
+ put_task_struct(tsk);
+ dump_stack();
+}
+#endif
+
+int check_area_access_ve(struct path *path)
+{
+#ifdef CONFIG_VE
+ int check, alert, deny;
+
+ if (ve_is_super(get_exec_env())) {
+ check = ve_area_access_check & VE0_AREA_ACC_CHECK;
+ alert = path->dentry->d_flags & DCACHE_VIRTUAL;
+ deny = ve_area_access_check & VE0_AREA_ACC_DENY;
+ } else {
+ check = ve_area_access_check & VE_AREA_ACC_CHECK;
+ alert = !(path->dentry->d_flags & DCACHE_VIRTUAL);
+ deny = ve_area_access_check & VE_AREA_ACC_DENY;
+ }
+
+ if (check && alert)
+ check_alert(path, "Access");
+ if (deny && alert)
+ return -EACCES;
+#endif
+ return 0;
+}
+
+#if 0
+int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt)
+{
+#ifdef CONFIG_VE
+ int check, alert, deny;
+
+ if (ve_is_super(get_exec_env())) {
+ check = ve_area_access_check & VE0_AREA_EXEC_CHECK;
+ alert = dentry->d_flags & DCACHE_VIRTUAL;
+ deny = ve_area_access_check & VE0_AREA_EXEC_DENY;
+ } else {
+ check = ve_area_access_check & VE_AREA_EXEC_CHECK;
+ alert = !(dentry->d_flags & DCACHE_VIRTUAL);
+ deny = ve_area_access_check & VE_AREA_EXEC_DENY;
+ }
+
+ if (check && alert)
+ check_alert(mnt, dentry, "Exec");
+ if (deny && alert)
+ return -EACCES;
+#endif
+ return 0;
+}
+#endif
+
/*
* Helper function for dentry_operations.d_dname() members
*/
@@ -2190,10 +2531,12 @@ resume:
goto repeat;
}
atomic_dec(&dentry->d_count);
+ ub_dentry_uncharge_locked(dentry);
}
if (this_parent != root) {
next = this_parent->d_u.d_child.next;
atomic_dec(&this_parent->d_count);
+ ub_dentry_uncharge_locked(this_parent);
this_parent = this_parent->d_parent;
goto resume;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 39c6ee8..8f43266 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -228,9 +228,12 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
{
struct dentry *dentry = NULL;
int error;
+ struct user_beancounter *ub;
pr_debug("debugfs: creating file '%s'\n",name);
+ ub = set_exec_ub(get_ub0());
+
error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
&debugfs_mount_count);
if (error)
@@ -244,6 +247,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
goto exit;
}
exit:
+ set_exec_ub(ub);
return dentry;
}
EXPORT_SYMBOL_GPL(debugfs_create_file);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc..4a0ee5e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -38,7 +38,9 @@
extern int pty_limit; /* Config limit on Unix98 ptys */
static DEFINE_MUTEX(allocated_ptys_lock);
+#ifndef CONFIG_VE
static struct vfsmount *devpts_mnt;
+#endif
struct pts_mount_opts {
int setuid;
@@ -83,7 +85,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
return inode->i_sb;
#endif
- return devpts_mnt->mnt_sb;
+ return get_exec_env()->devpts_mnt->mnt_sb;
}
#define PARSE_MOUNT 0
@@ -421,11 +423,12 @@ static void devpts_kill_sb(struct super_block *sb)
kill_litter_super(sb);
}
-static struct file_system_type devpts_fs_type = {
+struct file_system_type devpts_fs_type = {
.name = "devpts",
.get_sb = devpts_get_sb,
.kill_sb = devpts_kill_sb,
};
+EXPORT_SYMBOL(devpts_fs_type);
/*
* The normal naming convention is simply /dev/pts/<number>; this conforms
@@ -566,9 +569,9 @@ static int __init init_devpts_fs(void)
{
int err = register_filesystem(&devpts_fs_type);
if (!err) {
- devpts_mnt = kern_mount(&devpts_fs_type);
- if (IS_ERR(devpts_mnt)) {
- err = PTR_ERR(devpts_mnt);
+ get_ve0()->devpts_mnt = kern_mount(&devpts_fs_type);
+ if (IS_ERR(get_ve0()->devpts_mnt)) {
+ err = PTR_ERR(get_ve0()->devpts_mnt);
unregister_filesystem(&devpts_fs_type);
}
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87..006a735 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -658,7 +658,7 @@ submit_page_section(struct dio *dio, struct page *page,
/*
* Read accounting is performed in submit_bio()
*/
- task_io_account_write(len);
+ task_io_account_write(page, len, 1);
}
/*
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c0..8f10643 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -31,6 +31,7 @@
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
+#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <asm/uaccess.h>
@@ -86,11 +87,6 @@
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
-struct epoll_filefd {
- struct file *file;
- int fd;
-};
-
/*
* Structure used to track possible nested calls, for too deep recursions
* and loop cycles.
@@ -110,82 +106,6 @@ struct nested_calls {
spinlock_t lock;
};
-/*
- * Each file descriptor added to the eventpoll interface will
- * have an entry of this type linked to the "rbr" RB tree.
- */
-struct epitem {
- /* RB tree node used to link this structure to the eventpoll RB tree */
- struct rb_node rbn;
-
- /* List header used to link this structure to the eventpoll ready list */
- struct list_head rdllink;
-
- /*
- * Works together "struct eventpoll"->ovflist in keeping the
- * single linked chain of items.
- */
- struct epitem *next;
-
- /* The file descriptor information this item refers to */
- struct epoll_filefd ffd;
-
- /* Number of active wait queue attached to poll operations */
- int nwait;
-
- /* List containing poll wait queues */
- struct list_head pwqlist;
-
- /* The "container" of this item */
- struct eventpoll *ep;
-
- /* List header used to link this item to the "struct file" items list */
- struct list_head fllink;
-
- /* The structure that describe the interested events and the source fd */
- struct epoll_event event;
-};
-
-/*
- * This structure is stored inside the "private_data" member of the file
- * structure and rapresent the main data sructure for the eventpoll
- * interface.
- */
-struct eventpoll {
- /* Protect the this structure access */
- spinlock_t lock;
-
- /*
- * This mutex is used to ensure that files are not removed
- * while epoll is using them. This is held during the event
- * collection loop, the file cleanup path, the epoll file exit
- * code and the ctl operations.
- */
- struct mutex mtx;
-
- /* Wait queue used by sys_epoll_wait() */
- wait_queue_head_t wq;
-
- /* Wait queue used by file->poll() */
- wait_queue_head_t poll_wait;
-
- /* List of ready file descriptors */
- struct list_head rdllist;
-
- /* RB tree root used to store monitored fd structs */
- struct rb_root rbr;
-
- /*
- * This is a single linked list that chains all the "struct epitem" that
- * happened while transfering ready events to userspace w/out
- * holding ->lock.
- */
- struct epitem *ovflist;
-
- /* The user that created the eventpoll descriptor */
- struct user_struct *user;
-};
-
/* Wait structure used by the poll hooks */
struct eppoll_entry {
/* List header used to link this structure to the "struct epitem" */
@@ -225,7 +145,8 @@ static int max_user_watches __read_mostly;
/*
* This mutex is used to serialize ep_free() and eventpoll_release_file().
*/
-static DEFINE_MUTEX(epmutex);
+DEFINE_MUTEX(epmutex);
+EXPORT_SYMBOL_GPL(epmutex);
/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;
@@ -672,10 +593,11 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
}
/* File callbacks that implement the eventpoll file behaviour */
-static const struct file_operations eventpoll_fops = {
+const struct file_operations eventpoll_fops = {
.release = ep_eventpoll_release,
.poll = ep_eventpoll_poll
};
+EXPORT_SYMBOL(eventpoll_fops);
/* Fast test to see if the file is an evenpoll file */
static inline int is_file_epoll(struct file *f)
@@ -757,7 +679,7 @@ free_uid:
* are protected by the "mtx" mutex, and ep_find() must be called with
* "mtx" held.
*/
-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
int kcmp;
struct rb_node *rbp;
@@ -780,6 +702,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
return epir;
}
+EXPORT_SYMBOL_GPL(ep_find);
/*
* This is the callback that is passed to the wait queue wakeup
@@ -895,7 +818,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
/*
* Must be called with "mtx" held.
*/
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
@@ -994,6 +917,7 @@ error_unregister:
return error;
}
+EXPORT_SYMBOL(ep_insert);
/*
* Modify the interest event mask by dropping an event if the new mask
@@ -1220,6 +1144,7 @@ SYSCALL_DEFINE1(epoll_create, int, size)
return sys_epoll_create1(0);
}
+EXPORT_SYMBOL(sys_epoll_create);
/*
* The following function implements the controller interface for
diff --git a/fs/exec.c b/fs/exec.c
index 56da15f..6ea8efa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
+#include <linux/virtinfo.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/smp_lock.h>
@@ -62,6 +63,8 @@
#include <asm/tlb.h>
#include "internal.h"
+#include <bc/vmpages.h>
+
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
unsigned int core_pipe_limit;
@@ -69,6 +72,8 @@ int suid_dumpable = 0;
/* The maximal length of core_pattern is also specified in sysctl.c */
+int sysctl_at_vsyscall;
+
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
@@ -230,9 +235,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
struct vm_area_struct *vma = NULL;
struct mm_struct *mm = bprm->mm;
+ err = -ENOMEM;
+ if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags,
+ NULL, UB_SOFT))
+ goto err_charge;
+
bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (!vma)
- return -ENOMEM;
+ goto err_alloc;
down_write(&mm->mmap_sem);
vma->vm_mm = mm;
@@ -259,6 +269,9 @@ err:
up_write(&mm->mmap_sem);
bprm->vma = NULL;
kmem_cache_free(vm_area_cachep, vma);
+err_alloc:
+ ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL);
+err_charge:
return err;
}
@@ -711,10 +724,11 @@ int kernel_read(struct file *file, loff_t offset,
EXPORT_SYMBOL(kernel_read);
-static int exec_mmap(struct mm_struct *mm)
+static int exec_mmap(struct linux_binprm *bprm)
{
struct task_struct *tsk;
- struct mm_struct * old_mm, *active_mm;
+ struct mm_struct *old_mm, *active_mm, *mm;
+ int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
@@ -734,6 +748,10 @@ static int exec_mmap(struct mm_struct *mm)
return -EINTR;
}
}
+
+ ret = 0;
+ mm = bprm->mm;
+ mm->vps_dumpable = 1;
task_lock(tsk);
active_mm = tsk->active_mm;
tsk->mm = mm;
@@ -741,15 +759,25 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
task_unlock(tsk);
arch_pick_mmap_layout(mm);
+ bprm->mm = NULL; /* We're using it now */
+
+#ifdef CONFIG_VZ_GENCALLS
+ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXECMMAP,
+ bprm) & NOTIFY_FAIL) {
+ /* similar to binfmt_elf */
+ send_sig(SIGKILL, current, 0);
+ ret = -ENOMEM;
+ }
+#endif
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
- return 0;
+ return ret;
}
mmdrop(active_mm);
- return 0;
+ return ret;
}
/*
@@ -844,6 +872,10 @@ static int de_thread(struct task_struct *tsk)
transfer_pid(leader, tsk, PIDTYPE_PGID);
transfer_pid(leader, tsk, PIDTYPE_SID);
list_replace_rcu(&leader->tasks, &tsk->tasks);
+#ifdef CONFIG_VE
+ list_replace_rcu(&leader->ve_task_info.vetask_list,
+ &tsk->ve_task_info.vetask_list);
+#endif
tsk->group_leader = tsk;
leader->group_leader = tsk;
@@ -962,12 +994,10 @@ int flush_old_exec(struct linux_binprm * bprm)
/*
* Release all of the old mmap stuff
*/
- retval = exec_mmap(bprm->mm);
+ retval = exec_mmap(bprm);
if (retval)
goto out;
- bprm->mm = NULL; /* We're using it now */
-
current->flags &= ~PF_RANDOMIZE;
flush_thread();
current->personality &= ~bprm->per_clear;
@@ -1315,6 +1345,10 @@ int do_execve(char * filename,
bool clear_in_exec;
int retval;
+ retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL);
+ if (retval)
+ return retval;
+
retval = unshare_files(&displaced);
if (retval)
goto out_ret;
@@ -1566,7 +1600,7 @@ static int zap_process(struct task_struct *start)
signal_wake_up(t, 1);
nr++;
}
- } while_each_thread(start, t);
+ } while_each_thread_ve(start, t);
return nr;
}
@@ -1621,7 +1655,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
* next_thread().
*/
rcu_read_lock();
- for_each_process(g) {
+ for_each_process_ve(g) {
if (g == tsk->group_leader)
continue;
if (g->flags & PF_KTHREAD)
@@ -1636,7 +1670,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
}
break;
}
- } while_each_thread(g, p);
+ } while_each_thread_ve(g, p);
}
rcu_read_unlock();
done:
@@ -1804,7 +1838,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
/*
* If another thread got here first, or we are not dumpable, bail out.
*/
- if (mm->core_state || !get_dumpable(mm)) {
+ if (mm->core_state || !get_dumpable(mm) || mm->vps_dumpable != 1) {
up_write(&mm->mmap_sem);
put_cred(cred);
goto fail;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index dd7175c..eb08505 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -31,6 +31,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
@@ -262,6 +263,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry)
struct page * page;
int err = -ENOENT;
+ vfs_dq_init(inode);
+
de = ext2_find_entry (dir, &dentry->d_name, &page);
if (!de)
goto out;
@@ -304,6 +307,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
struct ext2_dir_entry_2 * old_de;
int err = -ENOENT;
+ if (new_inode)
+ vfs_dq_init(new_inode);
+
old_de = ext2_find_entry (old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1a9ffee..ba5ef60 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1426,7 +1426,7 @@ static struct file_system_type ext2_fs_type = {
.name = "ext2",
.get_sb = ext2_get_sb,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED,
};
static int __init init_ext2_fs(void)
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8897481..54b4ba6 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -78,7 +78,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE))
+ if (!capable(CAP_SYS_ADMIN))
goto flags_out;
}
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index aad6400..7be0b93 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1340,7 +1340,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
if (err)
ext3_std_error(dir->i_sb, err);
brelse(bh);
- return 0;
+ return err;
}
/*
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index ca3068f..0c4978f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2986,7 +2986,7 @@ static struct file_system_type ext3_fs_type = {
.name = "ext3",
.get_sb = ext3_get_sb,
.kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED,
};
static int __init init_ext3_fs(void)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 16efcee..3833fe9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5770,9 +5770,14 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
int ret = -EINVAL;
void *fsdata;
struct file *file = vma->vm_file;
- struct inode *inode = file->f_path.dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
+ struct inode *inode;
+ struct address_space *mapping;
+
+ if (file->f_op->get_host)
+ file = file->f_op->get_host(file);
+ inode = file->f_path.dentry->d_inode;
+ mapping = inode->i_mapping;
/*
* Get i_alloc_sem to stop truncates messing with the inode. We cannot
* get i_mutex because we are already holding mmap_sem.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193..0ae6e52 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -77,7 +77,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE))
+ if (!capable(CAP_SYS_ADMIN))
goto flags_out;
}
if (oldflags & EXT4_EXTENTS_FL) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 97e01dc..e24ca00 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -126,6 +126,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
}
return sys_dup3(oldfd, newfd, 0);
}
+EXPORT_SYMBOL_GPL(sys_dup2);
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
@@ -149,6 +150,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
struct inode * inode = filp->f_path.dentry->d_inode;
int error = 0;
+ if (!capable(CAP_SYS_RAWIO) && !odirect_enable)
+ arg &= ~O_DIRECT;
+
/*
* O_APPEND cannot be cleared if the file is marked as append-only
* and the file is open for write.
@@ -742,7 +746,7 @@ EXPORT_SYMBOL(kill_fasync);
static int __init fasync_init(void)
{
fasync_cache = kmem_cache_create("fasync_cache",
- sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+ sizeof(struct fasync_struct), 0, SLAB_PANIC|SLAB_UBC, NULL);
return 0;
}
diff --git a/fs/file.c b/fs/file.c
index 87e1290..2ae9cad 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -21,6 +22,8 @@
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
+#include <bc/kmem.h>
+
struct fdtable_defer {
spinlock_t lock;
struct work_struct wq;
@@ -42,9 +45,9 @@ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
static inline void * alloc_fdmem(unsigned int size)
{
if (size <= PAGE_SIZE)
- return kmalloc(size, GFP_KERNEL);
+ return kmalloc(size, GFP_KERNEL_UBC);
else
- return vmalloc(size);
+ return ub_vmalloc(size);
}
static inline void free_fdarr(struct fdtable *fdt)
@@ -163,7 +166,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (unlikely(nr > sysctl_nr_open))
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_UBC);
if (!fdt)
goto out;
fdt->max_fds = nr;
@@ -198,7 +201,7 @@ out:
* Return <0 error code on error; 1 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
-static int expand_fdtable(struct files_struct *files, int nr)
+int expand_fdtable(struct files_struct *files, int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
@@ -238,6 +241,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
}
return 1;
}
+EXPORT_SYMBOL_GPL(expand_fdtable);
/*
* Expand files.
diff --git a/fs/file_table.c b/fs/file_table.c
index 666c7ce..5351b54 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -22,9 +22,14 @@
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
+#include <linux/ve.h>
#include <asm/atomic.h>
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+#include <bc/misc.h>
+
/* sysctl tunables... */
struct files_stat_struct files_stat = {
.max_files = NR_FILE
@@ -34,7 +39,8 @@ struct files_stat_struct files_stat = {
__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
/* SLAB cache for file structures */
-static struct kmem_cache *filp_cachep __read_mostly;
+struct kmem_cache *filp_cachep __read_mostly;
+EXPORT_SYMBOL_GPL(filp_cachep);
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
@@ -43,13 +49,16 @@ static inline void file_free_rcu(struct rcu_head *head)
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
put_cred(f->f_cred);
+ put_ve(f->owner_env);
kmem_cache_free(filp_cachep, f);
}
static inline void file_free(struct file *f)
{
- percpu_counter_dec(&nr_files);
file_check_state(f);
+ if (f->f_ub == get_ub0())
+ percpu_counter_dec(&nr_files);
+ ub_file_uncharge(f);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -103,11 +112,14 @@ struct file *get_empty_filp(void)
const struct cred *cred = current_cred();
static int old_max;
struct file * f;
+ int acct;
+ acct = (get_exec_ub() == get_ub0());
/*
* Privileged users can go above max_files
*/
- if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+ if (acct && get_nr_files() >= files_stat.max_files &&
+ !capable(CAP_SYS_ADMIN)) {
/*
* percpu_counters are inaccurate. Do an expensive check before
* we go and fail.
@@ -120,7 +132,13 @@ struct file *get_empty_filp(void)
if (f == NULL)
goto fail;
- percpu_counter_inc(&nr_files);
+ if (ub_file_charge(f))
+ goto fail_ch;
+ if (acct)
+ percpu_counter_inc(&nr_files);
+
+ f->owner_env = get_ve(get_exec_env());
+
if (security_file_alloc(f))
goto fail_sec;
@@ -146,6 +164,10 @@ fail_sec:
file_free(f);
fail:
return NULL;
+
+fail_ch:
+ kmem_cache_free(filp_cachep, f);
+ return NULL;
}
EXPORT_SYMBOL(get_empty_filp);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e..bd5c213 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -14,6 +14,9 @@
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/sched.h> /* for 'current' */
+#include <linux/mount.h>
+#include <linux/ve.h>
#include <asm/uaccess.h>
/*
@@ -23,8 +26,8 @@
* During the unload module must call unregister_filesystem().
* We can access the fields of list element if:
* 1) spinlock is held or
- * 2) we hold the reference to the module.
- * The latter can be guaranteed by call of try_module_get(); if it
+ * 2) we hold the reference to the element.
+ * The latter can be guaranteed by a call to try_get_filesystem(); if it
* returned 0 we must skip the element, otherwise we got the reference.
* Once the reference is obtained we can drop the spinlock.
*/
@@ -32,24 +35,46 @@
static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock);
+int try_get_filesystem(struct file_system_type *fs)
+{
+ if (try_module_get(fs->owner)) {
+ (void)get_ve(fs->owner_env);
+ return 1;
+ }
+ return 0;
+}
+
/* WARNING: This can be used only if we _already_ own a reference */
void get_filesystem(struct file_system_type *fs)
{
+ (void)get_ve(fs->owner_env);
__module_get(fs->owner);
}
void put_filesystem(struct file_system_type *fs)
{
module_put(fs->owner);
+ put_ve(fs->owner_env);
}
-static struct file_system_type **find_filesystem(const char *name, unsigned len)
+static inline int check_ve_fstype(struct file_system_type *p,
+ struct ve_struct *env)
+{
+ return ((p->fs_flags & FS_VIRTUALIZED) ||
+ ve_accessible_strict(p->owner_env, env));
+}
+
+static struct file_system_type **find_filesystem(const char *name, unsigned len,
+ struct ve_struct *env)
{
struct file_system_type **p;
- for (p=&file_systems; *p; p=&(*p)->next)
+ for (p=&file_systems; *p; p=&(*p)->next) {
+ if (!check_ve_fstype(*p, env))
+ continue;
if (strlen((*p)->name) == len &&
strncmp((*p)->name, name, len) == 0)
break;
+ }
return p;
}
@@ -75,8 +100,12 @@ int register_filesystem(struct file_system_type * fs)
if (fs->next)
return -EBUSY;
INIT_LIST_HEAD(&fs->fs_supers);
+ if (fs->owner_env == NULL)
+ fs->owner_env = get_ve0();
+ if (fs->proto == NULL)
+ fs->proto = fs;
write_lock(&file_systems_lock);
- p = find_filesystem(fs->name, strlen(fs->name));
+ p = find_filesystem(fs->name, strlen(fs->name), fs->owner_env);
if (*p)
res = -EBUSY;
else
@@ -120,6 +149,75 @@ int unregister_filesystem(struct file_system_type * fs)
EXPORT_SYMBOL(unregister_filesystem);
+#ifdef CONFIG_VE
+int register_ve_fs_type(struct ve_struct *ve, struct file_system_type *template,
+ struct file_system_type **p_fs_type, struct vfsmount **p_mnt)
+{
+ struct vfsmount *mnt;
+ struct file_system_type *local_fs_type;
+ int ret;
+
+ local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *),
+ GFP_KERNEL);
+ if (local_fs_type == NULL)
+ return -ENOMEM;
+
+ local_fs_type->name = template->name;
+ local_fs_type->fs_flags = template->fs_flags;
+ local_fs_type->get_sb = template->get_sb;
+ local_fs_type->kill_sb = template->kill_sb;
+ local_fs_type->owner = template->owner;
+ local_fs_type->owner_env = ve;
+ local_fs_type->proto = template;
+
+ get_filesystem(local_fs_type); /* get_ve() inside */
+
+ ret = register_filesystem(local_fs_type);
+ if (ret)
+ goto reg_err;
+
+ if (p_mnt == NULL)
+ goto done;
+
+ mnt = vfs_kern_mount(local_fs_type, 0, local_fs_type->name, NULL);
+ if (IS_ERR(mnt))
+ goto mnt_err;
+
+ *p_mnt = mnt;
+done:
+ *p_fs_type = local_fs_type;
+ return 0;
+
+mnt_err:
+ ret = PTR_ERR(mnt);
+ unregister_filesystem(local_fs_type); /* does not put */
+
+reg_err:
+ put_filesystem(local_fs_type);
+ kfree(local_fs_type);
+ printk(KERN_DEBUG
+ "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret);
+ return ret;
+}
+
+EXPORT_SYMBOL(register_ve_fs_type);
+
+void unregister_ve_fs_type(struct file_system_type *local_fs_type,
+ struct vfsmount *local_fs_mount)
+{
+ if (local_fs_mount == NULL && local_fs_type == NULL)
+ return;
+
+ unregister_filesystem(local_fs_type);
+ umount_ve_fs_type(local_fs_type);
+ if (local_fs_mount)
+ kern_umount(local_fs_mount); /* alias to mntput, drop our ref */
+ put_filesystem(local_fs_type);
+}
+
+EXPORT_SYMBOL(unregister_ve_fs_type);
+#endif
+
static int fs_index(const char __user * __name)
{
struct file_system_type * tmp;
@@ -133,11 +231,14 @@ static int fs_index(const char __user * __name)
err = -EINVAL;
read_lock(&file_systems_lock);
- for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
+ for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) {
+ if (!check_ve_fstype(tmp, get_exec_env()))
+ continue;
if (strcmp(tmp->name,name) == 0) {
err = index;
break;
}
+ index++;
}
read_unlock(&file_systems_lock);
putname(name);
@@ -150,9 +251,15 @@ static int fs_name(unsigned int index, char __user * buf)
int len, res;
read_lock(&file_systems_lock);
- for (tmp = file_systems; tmp; tmp = tmp->next, index--)
- if (index <= 0 && try_module_get(tmp->owner))
- break;
+ for (tmp = file_systems; tmp; tmp = tmp->next) {
+ if (!check_ve_fstype(tmp, get_exec_env()))
+ continue;
+ if (!index) {
+ if (try_get_filesystem(tmp))
+ break;
+ } else
+ index--;
+ }
read_unlock(&file_systems_lock);
if (!tmp)
return -EINVAL;
@@ -170,8 +277,9 @@ static int fs_maxindex(void)
int index;
read_lock(&file_systems_lock);
- for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
- ;
+ for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next)
+ if (check_ve_fstype(tmp, get_exec_env()))
+ index++;
read_unlock(&file_systems_lock);
return index;
}
@@ -207,9 +315,10 @@ int __init get_filesystem_list(char *buf)
read_lock(&file_systems_lock);
tmp = file_systems;
while (tmp && len < PAGE_SIZE - 80) {
- len += sprintf(buf+len, "%s\t%s\n",
- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
- tmp->name);
+ if (check_ve_fstype(tmp, get_exec_env()))
+ len += sprintf(buf+len, "%s\t%s\n",
+ (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+ tmp->name);
tmp = tmp->next;
}
read_unlock(&file_systems_lock);
@@ -224,9 +333,12 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
read_lock(&file_systems_lock);
tmp = file_systems;
while (tmp) {
+ if (!check_ve_fstype(tmp, get_exec_env()))
+ goto next; /* skip in VE */
seq_printf(m, "%s\t%s\n",
(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
tmp->name);
+next:
tmp = tmp->next;
}
read_unlock(&file_systems_lock);
@@ -247,7 +359,7 @@ static const struct file_operations filesystems_proc_fops = {
static int __init proc_filesystems_init(void)
{
- proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+ proc_create("filesystems", 0, &glob_proc_root, &filesystems_proc_fops);
return 0;
}
module_init(proc_filesystems_init);
@@ -258,8 +370,8 @@ static struct file_system_type *__get_fs_type(const char *name, int len)
struct file_system_type *fs;
read_lock(&file_systems_lock);
- fs = *(find_filesystem(name, len));
- if (fs && !try_module_get(fs->owner))
+ fs = *(find_filesystem(name, len, get_exec_env()));
+ if (fs && !try_get_filesystem(fs))
fs = NULL;
read_unlock(&file_systems_lock);
return fs;
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee0590..777c90a 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -47,7 +47,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
int count = 0;
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ do_each_thread_ve(g, p) {
task_lock(p);
fs = p->fs;
if (fs) {
@@ -67,7 +67,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
write_unlock(&fs->lock);
}
task_unlock(p);
- } while_each_thread(g, p);
+ } while_each_thread_ve(g, p);
read_unlock(&tasklist_lock);
while (count--)
path_put(old_root);
@@ -96,6 +96,7 @@ void exit_fs(struct task_struct *tsk)
free_fs_struct(fs);
}
}
+EXPORT_SYMBOL(exit_fs);
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 3773fd6..df26800 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -10,6 +10,8 @@
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/ve_proto.h>
#define FUSE_CTL_SUPER_MAGIC 0x65735543
@@ -17,7 +19,11 @@
* This is non-NULL when the single instance of the control filesystem
* exists. Protected by fuse_mutex
*/
+#ifdef CONFIG_VE
+#define fuse_control_sb (get_exec_env()->_fuse_control_sb)
+#else
static struct super_block *fuse_control_sb;
+#endif
static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
{
@@ -345,12 +351,51 @@ static struct file_system_type fuse_ctl_fs_type = {
.kill_sb = fuse_ctl_kill_sb,
};
+#ifdef CONFIG_VE
+static int fuse_ctl_start(void *data)
+{
+ struct ve_struct *ve;
+
+ ve = (struct ve_struct *)data;
+ if (ve->fuse_ctl_fs_type != NULL)
+ return -EBUSY;
+
+ return register_ve_fs_type(ve, &fuse_ctl_fs_type,
+ &ve->fuse_ctl_fs_type, NULL);
+}
+
+static void fuse_ctl_stop(void *data)
+{
+ struct ve_struct *ve;
+
+ ve = (struct ve_struct *)data;
+ if (ve->fuse_ctl_fs_type == NULL)
+ return;
+
+ unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL);
+ ve->fuse_ctl_fs_type = NULL;
+}
+
+static struct ve_hook fuse_ctl_ve_hook = {
+ .init = fuse_ctl_start,
+ .fini = fuse_ctl_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_FS,
+};
+#endif
+
int __init fuse_ctl_init(void)
{
- return register_filesystem(&fuse_ctl_fs_type);
+ int err;
+
+ err = register_filesystem(&fuse_ctl_fs_type);
+ if (err == 0)
+ ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook);
+ return err;
}
void fuse_ctl_cleanup(void)
{
+ ve_hook_unregister(&fuse_ctl_ve_hook);
unregister_filesystem(&fuse_ctl_fs_type);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462..8b3387f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -44,7 +44,11 @@
#define FUSE_ALLOW_OTHER (1 << 1)
/** List of active connections */
+#ifdef CONFIG_VE
+#define fuse_conn_list (get_exec_env()->_fuse_conn_list)
+#else
extern struct list_head fuse_conn_list;
+#endif
/** Global mutex protecting fuse_conn_list and the control filesystem */
extern struct mutex fuse_mutex;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1a822ce..00dbf5f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -20,13 +20,16 @@
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/exportfs.h>
+#include <linux/ve_proto.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
MODULE_LICENSE("GPL");
static struct kmem_cache *fuse_inode_cachep;
+#ifndef CONFIG_VE
struct list_head fuse_conn_list;
+#endif
DEFINE_MUTEX(fuse_mutex);
static int set_global_limit(const char *val, struct kernel_param *kp);
@@ -1194,6 +1197,41 @@ static void fuse_sysfs_cleanup(void)
kobject_put(fuse_kobj);
}
+#ifdef CONFIG_VE
+static int fuse_start(void *data)
+{
+ struct ve_struct *ve;
+
+ ve = (struct ve_struct *)data;
+ if (ve->fuse_fs_type != NULL)
+ return -EBUSY;
+
+ INIT_LIST_HEAD(&ve->_fuse_conn_list);
+ return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL);
+}
+
+static void fuse_stop(void *data)
+{
+ struct ve_struct *ve;
+
+ ve = (struct ve_struct *)data;
+ if (ve->fuse_fs_type == NULL)
+ return;
+
+ unregister_ve_fs_type(ve->fuse_fs_type, NULL);
+ kfree(ve->fuse_fs_type);
+ ve->fuse_fs_type = NULL;
+ BUG_ON(!list_empty(&ve->_fuse_conn_list));
+}
+
+static struct ve_hook fuse_ve_hook = {
+ .init = fuse_start,
+ .fini = fuse_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_FS,
+};
+#endif
+
static int __init fuse_init(void)
{
int res;
@@ -1218,6 +1256,7 @@ static int __init fuse_init(void)
if (res)
goto err_sysfs_cleanup;
+ ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook);
sanitize_global_limit(&max_user_bgreq);
sanitize_global_limit(&max_user_congthresh);
@@ -1237,6 +1276,7 @@ static void __exit fuse_exit(void)
{
printk(KERN_DEBUG "fuse exit\n");
+ ve_hook_unregister(&fuse_ve_hook);
fuse_ctl_cleanup();
fuse_sysfs_cleanup();
fuse_fs_cleanup();
diff --git a/fs/inode.c b/fs/inode.c
index 4d8e3be..ab63b5f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -8,10 +8,13 @@
#include <linux/mm.h>
#include <linux/dcache.h>
#include <linux/init.h>
+#include <linux/kernel_stat.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/writeback.h>
#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/mnt_namespace.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/rwsem.h>
@@ -27,6 +30,7 @@
#include <linux/mount.h>
#include <linux/async.h>
#include <linux/posix_acl.h>
+#include <linux/vzstat.h>
/*
* This is needed for the following functions:
@@ -106,7 +110,8 @@ static DECLARE_RWSEM(iprune_sem);
*/
struct inodes_stat_t inodes_stat;
-static struct kmem_cache *inode_cachep __read_mostly;
+struct kmem_cache *inode_cachep __read_mostly;
+
static void wake_up_inode(struct inode *inode)
{
@@ -125,19 +130,22 @@ static void wake_up_inode(struct inode *inode)
* These are initializations that need to be done on every inode
* allocation as the fields are not initialised by slab allocation.
*/
+
+static struct address_space_operations vfs_empty_aops;
+const struct inode_operations vfs_empty_iops;
+static const struct file_operations vfs_empty_fops;
+EXPORT_SYMBOL(vfs_empty_iops);
+
int inode_init_always(struct super_block *sb, struct inode *inode)
{
- static const struct address_space_operations empty_aops;
- static const struct inode_operations empty_iops;
- static const struct file_operations empty_fops;
struct address_space *const mapping = &inode->i_data;
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
atomic_set(&inode->i_count, 1);
- inode->i_op = &empty_iops;
- inode->i_fop = &empty_fops;
+ inode->i_op = &vfs_empty_iops;
+ inode->i_fop = &vfs_empty_fops;
inode->i_nlink = 1;
inode->i_uid = 0;
inode->i_gid = 0;
@@ -163,15 +171,15 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
goto out_free_security;
spin_lock_init(&inode->i_lock);
- lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
+ lockdep_set_class(&inode->i_lock, &sb->s_type->proto->i_lock_key);
mutex_init(&inode->i_mutex);
- lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+ lockdep_set_class(&inode->i_mutex, &sb->s_type->proto->i_mutex_key);
init_rwsem(&inode->i_alloc_sem);
- lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+ lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->proto->i_alloc_sem_key);
- mapping->a_ops = &empty_aops;
+ mapping->a_ops = &vfs_empty_aops;
mapping->host = inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
@@ -370,13 +378,76 @@ static void dispose_list(struct list_head *head)
spin_unlock(&inode_lock);
}
+static void show_header(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ printk("VFS: Busy inodes after unmount. "
+ "sb = %p, fs type = %s, sb count = %d, "
+ "sb->s_root = %s\n", sb,
+ (sb->s_type != NULL) ? sb->s_type->name : "",
+ sb->s_count,
+ (sb->s_root != NULL) ?
+ (char *)sb->s_root->d_name.name : "");
+}
+
+static void show_inode(struct inode *inode)
+{
+ struct dentry *d;
+ struct vfsmount *mnt;
+ int i;
+
+ printk("inode = %p, inode->i_count = %d, "
+ "inode->i_nlink = %d, "
+ "inode->i_mode = %d, "
+ "inode->i_state = %ld, "
+ "inode->i_flags = %d, "
+ "inode->i_devices.next = %p, "
+ "inode->i_devices.prev = %p, "
+ "inode->i_ino = %ld\n",
+ inode,
+ atomic_read(&inode->i_count),
+ inode->i_nlink,
+ inode->i_mode,
+ inode->i_state,
+ inode->i_flags,
+ inode->i_devices.next,
+ inode->i_devices.prev,
+ inode->i_ino);
+ printk("inode dump: ");
+ for (i = 0; i < sizeof(*inode); i++)
+ printk("%2.2x ", *((u_char *)inode + i));
+ printk("\n");
+ list_for_each_entry(d, &inode->i_dentry, d_alias) {
+ printk(" d_alias %s d_count=%d d_flags=%x\n",
+ d->d_name.name, atomic_read(&d->d_count), d->d_flags);
+ for (i = 0; i < sizeof(*d); i++)
+ printk("%2.2x ", *((u_char *)d + i));
+ printk("\n");
+ }
+
+ spin_lock(&vfsmount_lock);
+ list_for_each_entry(mnt, &get_task_mnt_ns(current)->list, mnt_list) {
+ if (mnt->mnt_sb != inode->i_sb)
+ continue;
+ printk("mnt=%p count=%d flags=%x exp_mask=%x\n",
+ mnt, atomic_read(&mnt->mnt_count),
+ mnt->mnt_flags,
+ mnt->mnt_expiry_mark);
+ for (i = 0; i < sizeof(*mnt); i++)
+ printk("%2.2x ", *((u_char *)mnt + i));
+ printk("\n");
+ }
+ spin_unlock(&vfsmount_lock);
+}
+
/*
* Invalidate all inodes for a device.
*/
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
+static int invalidate_list(struct list_head *head, struct list_head *dispose, int check)
{
struct list_head *next;
- int busy = 0, count = 0;
+ int busy = 0, count = 0, once = 1;
next = head->next;
for (;;) {
@@ -406,6 +477,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
continue;
}
busy = 1;
+
+ if (check) {
+ if (once) {
+ once = 0;
+ show_header(inode);
+ }
+ show_inode(inode);
+ }
}
/* only unused inodes may be cached with i_count zero */
inodes_stat.nr_unused -= count;
@@ -420,7 +499,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
* fails because there are busy inodes then a non zero value is returned.
* If the discard is successful all the inodes have been discarded.
*/
-int invalidate_inodes(struct super_block *sb)
+int invalidate_inodes_check(struct super_block *sb, int check)
{
int busy;
LIST_HEAD(throw_away);
@@ -429,7 +508,7 @@ int invalidate_inodes(struct super_block *sb)
spin_lock(&inode_lock);
inotify_unmount_inodes(&sb->s_inodes);
fsnotify_unmount_inodes(&sb->s_inodes);
- busy = invalidate_list(&sb->s_inodes, &throw_away);
+ busy = invalidate_list(&sb->s_inodes, &throw_away, check);
spin_unlock(&inode_lock);
dispose_list(&throw_away);
@@ -437,7 +516,7 @@ int invalidate_inodes(struct super_block *sb)
return busy;
}
-EXPORT_SYMBOL(invalidate_inodes);
+EXPORT_SYMBOL(invalidate_inodes_check);
static int can_unuse(struct inode *inode)
{
@@ -528,6 +607,7 @@ static void prune_icache(int nr_to_scan)
*/
static int shrink_icache_memory(int nr, gfp_t gfp_mask)
{
+ KSTAT_PERF_ENTER(shrink_icache)
if (nr) {
/*
* Nasty deadlock avoidance. We may hold various FS locks,
@@ -538,6 +618,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
return -1;
prune_icache(nr);
}
+ KSTAT_PERF_LEAVE(shrink_icache)
return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
@@ -695,7 +776,7 @@ void unlock_new_inode(struct inode *inode)
mutex_destroy(&inode->i_mutex);
mutex_init(&inode->i_mutex);
lockdep_set_class(&inode->i_mutex,
- &type->i_mutex_dir_key);
+ &type->proto->i_mutex_dir_key);
}
}
#endif
@@ -1258,7 +1339,7 @@ int generic_detach_inode(struct inode *inode)
if (!(inode->i_state & (I_DIRTY|I_SYNC)))
list_move(&inode->i_list, &inode_unused);
inodes_stat.nr_unused++;
- if (sb->s_flags & MS_ACTIVE) {
+ if (sb->s_flags & MS_ACTIVE && !(inode->i_flags & S_NOUNUSE)) {
spin_unlock(&inode_lock);
return 0;
}
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28..25f7275 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -26,6 +26,8 @@
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
+#include <linux/nsproxy.h>
+#include <linux/ve_proto.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
@@ -78,8 +80,11 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
int data = IOPRIO_PRIO_DATA(ioprio);
struct task_struct *p, *g;
struct user_struct *user;
- struct pid *pgrp;
int ret;
+ struct pid *pgrp;
+
+ if (!ve_is_super(get_exec_env()))
+ return -EPERM;
switch (class) {
case IOPRIO_CLASS_RT:
@@ -137,17 +142,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
if (!user)
break;
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
if (__task_cred(p)->uid != who)
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
goto free_uid;
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
free_uid:
if (who)
free_uid(user);
break;
+ case IOPRIO_WHO_UBC:
+ if (class != IOPRIO_CLASS_BE) {
+ ret = -ERANGE;
+ break;
+ }
+
+ ret = ve_set_ioprio(who, data);
+ break;
default:
ret = -EINVAL;
}
@@ -192,9 +205,9 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
- struct pid *pgrp;
int ret = -ESRCH;
int tmpio;
+ struct pid *pgrp;
read_lock(&tasklist_lock);
switch (which) {
@@ -230,7 +243,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
if (!user)
break;
- do_each_thread(g, p) {
+ do_each_thread_ve(g, p) {
if (__task_cred(p)->uid != user->uid)
continue;
tmpio = get_task_ioprio(p);
@@ -240,7 +253,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
- } while_each_thread(g, p);
+ } while_each_thread_ve(g, p);
if (who)
free_uid(user);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249f..64cf48d 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -155,12 +155,15 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
{
struct nlm_rqst *call;
int status;
+ struct ve_struct *ve;
nlm_get_host(host);
call = nlm_alloc_call(host);
if (call == NULL)
return -ENOMEM;
+ ve = set_exec_env(host->owner_env);
+
nlmclnt_locks_init_private(fl, host);
/* Set up the argument struct */
nlmclnt_setlockargs(call, fl);
@@ -182,6 +185,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
unlock_kernel();
dprintk("lockd: clnt proc returns %d\n", status);
+ (void)set_exec_env(ve);
return status;
}
EXPORT_SYMBOL_GPL(nlmclnt_proc);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 4600c20..55cc770 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -96,6 +96,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
struct hlist_node *pos;
struct nlm_host *host;
struct nsm_handle *nsm = NULL;
+ struct ve_struct *ve;
mutex_lock(&nlm_host_mutex);
@@ -109,10 +110,14 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
* different NLM rpc_clients into one single nlm_host object.
* This would allow us to have one nlm_host per address.
*/
+
+ ve = get_exec_env();
chain = &nlm_hosts[nlm_hash_address(ni->sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
continue;
+ if (!ve_accessible_strict(host->owner_env, ve))
+ continue;
/* See if we have an NSM handle for this client */
if (!nsm)
@@ -186,6 +191,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
spin_lock_init(&host->h_lock);
INIT_LIST_HEAD(&host->h_granted);
INIT_LIST_HEAD(&host->h_reclaim);
+ host->owner_env = ve;
nrhosts++;
@@ -567,3 +573,50 @@ nlm_gc_hosts(void)
next_gc = jiffies + NLM_HOST_COLLECT;
}
+
+#ifdef CONFIG_VE
+void ve_nlm_shutdown_hosts(struct ve_struct *ve)
+{
+ envid_t veid = ve->veid;
+ int i;
+
+ dprintk("lockd: shutting down host module for ve %d\n", veid);
+ mutex_lock(&nlm_host_mutex);
+
+ /* Perform a garbage collection pass */
+ for (i = 0; i < NLM_HOST_NRHASH; i++) {
+ struct nlm_host *host;
+ struct hlist_node *pos;
+
+ hlist_for_each_entry(host, pos, &nlm_hosts[i], h_hash) {
+ struct rpc_clnt *clnt;
+
+ if (ve != host->owner_env)
+ continue;
+
+ hlist_del(&host->h_hash);
+ if (host->h_nsmhandle)
+ host->h_nsmhandle->sm_monitored = 0;
+ dprintk("lockd: delete host %s ve %d\n", host->h_name,
+ veid);
+ if ((clnt = host->h_rpcclnt) != NULL) {
+ if (!list_empty(&clnt->cl_tasks)) {
+ struct rpc_xprt *xprt;
+
+ printk(KERN_WARNING
+ "lockd: active RPC handle\n");
+ rpc_killall_tasks(clnt);
+ xprt = clnt->cl_xprt;
+ xprt_disconnect_done(xprt);
+ xprt->ops->close(xprt);
+ } else
+ rpc_shutdown_client(clnt);
+ }
+ kfree(host);
+ nrhosts--;
+ }
+ }
+
+ mutex_unlock(&nlm_host_mutex);
+}
+#endif
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 1a54ae1..b0cde74 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -27,6 +27,7 @@
#include <linux/mutex.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/ve_proto.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/stats.h>
@@ -47,25 +48,29 @@ struct nlmsvc_binding * nlmsvc_ops;
EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
-static unsigned int nlmsvc_users;
-static struct task_struct *nlmsvc_task;
-static struct svc_rqst *nlmsvc_rqst;
-unsigned long nlmsvc_timeout;
/*
* These can be set at insmod time (useful for NFS as root filesystem),
* and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
*/
-static unsigned long nlm_grace_period;
static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
static int nlm_udpport, nlm_tcpport;
+#ifndef CONFIG_VE
+static unsigned int _nlmsvc_users;
+static struct task_struct *_nlmsvc_task;
+static struct svc_rqst *_nlmsvc_rqst;
+static unsigned long _nlmsvc_grace_period;
+unsigned long _nlmsvc_timeout;
+#endif
+
/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
static unsigned int nlm_max_connections = 1024;
/*
* Constants needed for the sysctl interface.
*/
+static unsigned long nlm_grace_period;
static const unsigned long nlm_grace_period_min = 0;
static const unsigned long nlm_grace_period_max = 240;
static const unsigned long nlm_timeout_min = 3;
@@ -171,6 +176,10 @@ lockd(void *vrqstp)
*/
err = svc_recv(rqstp, timeout);
if (err == -EAGAIN || err == -EINTR) {
+#ifdef CONFIG_VE
+ if (!get_exec_env()->is_running)
+ break;
+#endif
preverr = err;
continue;
}
@@ -309,7 +318,7 @@ int lockd_up(void)
svc_sock_update_bufs(serv);
serv->sv_maxconn = nlm_max_connections;
- nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+ nlmsvc_task = kthread_run_ve(get_exec_env(), lockd, nlmsvc_rqst, serv->sv_name);
if (IS_ERR(nlmsvc_task)) {
error = PTR_ERR(nlmsvc_task);
svc_exit_thread(nlmsvc_rqst);
@@ -347,12 +356,12 @@ lockd_down(void)
} else {
printk(KERN_ERR "lockd_down: no users! task=%p\n",
nlmsvc_task);
- BUG();
+ goto out;
}
if (!nlmsvc_task) {
printk(KERN_ERR "lockd_down: no lockd running.\n");
- BUG();
+ goto out;
}
kthread_stop(nlmsvc_task);
svc_exit_thread(nlmsvc_rqst);
@@ -497,6 +506,29 @@ static int lockd_authenticate(struct svc_rqst *rqstp)
return SVC_DENIED;
}
+#ifdef CONFIG_VE
+extern void ve_nlm_shutdown_hosts(struct ve_struct *ve);
+
+static int ve_lockd_start(void *data)
+{
+ return 0;
+}
+
+static void ve_lockd_stop(void *data)
+{
+ struct ve_struct *ve = (struct ve_struct *)data;
+
+ ve_nlm_shutdown_hosts(ve);
+ flush_scheduled_work();
+}
+
+static struct ve_hook lockd_hook = {
+ .init = ve_lockd_start,
+ .fini = ve_lockd_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_FS,
+};
+#endif
param_set_min_max(port, int, simple_strtol, 0, 65535)
param_set_min_max(grace_period, unsigned long, simple_strtoul,
@@ -525,16 +557,20 @@ module_param(nlm_max_connections, uint, 0644);
static int __init init_nlm(void)
{
+ ve_hook_register(VE_SS_CHAIN, &lockd_hook);
#ifdef CONFIG_SYSCTL
nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root);
- return nlm_sysctl_table ? 0 : -ENOMEM;
-#else
- return 0;
+ if (nlm_sysctl_table == NULL) {
+ ve_hook_unregister(&lockd_hook);
+ return -ENOMEM;
+ }
#endif
+ return 0;
}
static void __exit exit_nlm(void)
{
+ ve_hook_unregister(&lockd_hook);
/* FIXME: delete all NLM clients */
nlm_shutdown_hosts();
#ifdef CONFIG_SYSCTL
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da..2fdf8cf 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -334,6 +334,9 @@ nlmsvc_is_client(void *data, struct nlm_host *dummy)
{
struct nlm_host *host = data;
+ if (!ve_accessible_strict(host->owner_env, get_exec_env()))
+ return 0;
+
if (host->h_server) {
/* we are destroying locks even though the client
* hasn't asked us too, so don't unmonitor the
diff --git a/fs/locks.c b/fs/locks.c
index a8794f2..4d3fa3f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -130,6 +130,8 @@
#include <asm/uaccess.h>
+#include <bc/misc.h>
+
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
@@ -146,9 +148,25 @@ static LIST_HEAD(blocked_list);
static struct kmem_cache *filelock_cache __read_mostly;
/* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+static struct file_lock *locks_alloc_lock(int charge)
{
- return kmem_cache_alloc(filelock_cache, GFP_KERNEL);
+ struct file_lock *fl;
+
+ fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL);
+#ifdef CONFIG_BEANCOUNTERS
+ if (fl == NULL)
+ goto out;
+ fl->fl_charged = 0;
+ if (!charge)
+ goto out;
+ if (!ub_flock_charge(fl, 1))
+ goto out;
+
+ kmem_cache_free(filelock_cache, fl);
+ fl = NULL;
+out:
+#endif
+ return fl;
}
void locks_release_private(struct file_lock *fl)
@@ -174,6 +192,7 @@ static void locks_free_lock(struct file_lock *fl)
BUG_ON(!list_empty(&fl->fl_block));
BUG_ON(!list_empty(&fl->fl_link));
+ ub_flock_uncharge(fl);
locks_release_private(fl);
kmem_cache_free(filelock_cache, fl);
}
@@ -277,7 +296,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock,
if (type < 0)
return type;
- fl = locks_alloc_lock();
+ fl = locks_alloc_lock(type != F_UNLCK);
if (fl == NULL)
return -ENOMEM;
@@ -464,7 +483,7 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl)
/* Allocate a file_lock initialised to this type of lease */
static struct file_lock *lease_alloc(struct file *filp, int type)
{
- struct file_lock *fl = locks_alloc_lock();
+ struct file_lock *fl = locks_alloc_lock(1);
int error = -ENOMEM;
if (fl == NULL)
@@ -735,8 +754,13 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
goto find_conflict;
if (request->fl_type != F_UNLCK) {
+ /*
+ * A non-F_UNLCK request must already be charged in
+ * flock_make_lock(). Strictly speaking, new_fl should be
+ * charged rather than the request, but we try to fail earlier.
+ */
error = -ENOMEM;
- new_fl = locks_alloc_lock();
+ new_fl = locks_alloc_lock(0);
if (new_fl == NULL)
goto out;
error = 0;
@@ -788,6 +812,10 @@ find_conflict:
}
if (request->fl_flags & FL_ACCESS)
goto out;
+
+ set_flock_charged(new_fl);
+ unset_flock_charged(request);
+
locks_copy_lock(new_fl, request);
locks_insert_lock(before, new_fl);
new_fl = NULL;
@@ -819,8 +847,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
if (!(request->fl_flags & FL_ACCESS) &&
(request->fl_type != F_UNLCK ||
request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
- new_fl = locks_alloc_lock();
- new_fl2 = locks_alloc_lock();
+ if (request->fl_type != F_UNLCK)
+ new_fl = locks_alloc_lock(1);
+ else
+ new_fl = NULL;
+ new_fl2 = locks_alloc_lock(0);
}
lock_kernel();
@@ -954,7 +985,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
* bail out.
*/
error = -ENOLCK; /* "no luck" */
- if (right && left == right && !new_fl2)
+ if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2))
goto out;
error = 0;
@@ -965,23 +996,32 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
goto out;
}
- if (!new_fl) {
- error = -ENOLCK;
+ error = -ENOLCK;
+ if (!new_fl)
+ goto out;
+ if (right && (left == right) && ub_flock_charge(new_fl, 1))
goto out;
- }
locks_copy_lock(new_fl, request);
locks_insert_lock(before, new_fl);
new_fl = NULL;
+ error = 0;
}
if (right) {
if (left == right) {
/* The new lock breaks the old one in two pieces,
* so we have to use the second new lock.
*/
+ error = -ENOLCK;
+ if (added && ub_flock_charge(new_fl2,
+ request->fl_type != F_UNLCK))
+ goto out;
+ /* FIXME move all fl_charged manipulations in ub code */
+ set_flock_charged(new_fl2);
left = new_fl2;
new_fl2 = NULL;
locks_copy_lock(left, right);
locks_insert_lock(before, left);
+ error = 0;
}
right->fl_start = request->fl_end + 1;
locks_wake_up_blocks(right);
@@ -1366,7 +1406,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
if (arg != F_UNLCK) {
error = -ENOMEM;
- new_fl = locks_alloc_lock();
+ new_fl = locks_alloc_lock(1);
if (new_fl == NULL)
goto out;
@@ -1610,6 +1650,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
out:
return error;
}
+EXPORT_SYMBOL_GPL(sys_flock);
/**
* vfs_test_lock - test file byte range lock
@@ -1770,7 +1811,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
struct flock __user *l)
{
- struct file_lock *file_lock = locks_alloc_lock();
+ struct file_lock *file_lock = locks_alloc_lock(0);
struct flock flock;
struct inode *inode;
struct file *f;
@@ -1888,7 +1929,7 @@ out:
int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
struct flock64 __user *l)
{
- struct file_lock *file_lock = locks_alloc_lock();
+ struct file_lock *file_lock = locks_alloc_lock(0);
struct flock64 flock;
struct inode *inode;
struct file *f;
@@ -2159,6 +2200,8 @@ static int locks_show(struct seq_file *f, void *v)
struct file_lock *fl, *bfl;
fl = list_entry(v, struct file_lock, fl_link);
+ if (!ve_accessible(fl->fl_file->owner_env, get_exec_env()))
+ goto out;
lock_get_status(f, fl, (long)f->private, "");
@@ -2166,6 +2209,7 @@ static int locks_show(struct seq_file *f, void *v)
lock_get_status(f, bfl, (long)f->private, " ->");
f->private++;
+out:
return 0;
}
@@ -2207,7 +2251,7 @@ static const struct file_operations proc_locks_operations = {
static int __init proc_locks_init(void)
{
- proc_create("locks", 0, NULL, &proc_locks_operations);
+ proc_create("locks", 0, &glob_proc_root, &proc_locks_operations);
return 0;
}
module_init(proc_locks_init);
@@ -2294,7 +2338,7 @@ EXPORT_SYMBOL(lock_may_write);
static int __init filelock_init(void)
{
filelock_cache = kmem_cache_create("file_lock_cache",
- sizeof(struct file_lock), 0, SLAB_PANIC,
+ sizeof(struct file_lock), 0, SLAB_PANIC|SLAB_UBC,
init_once);
return 0;
}
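
The fs/locks.c hunks above charge each granted lock against the owning beancounter: locks_alloc_lock() grows a "charge" argument, flock_make_lock() charges non-F_UNLCK requests up front, and __posix_lock_file() charges the extra file_lock needed when an existing lock is split. A failed charge surfaces to user space as an ordinary locking error (that ENOMEM comes back from flock(2) and ENOLCK from fcntl(2) is an inference from the hunks above, not documented behaviour). A minimal user-space sketch of the call shape and error handling, with no OpenVZ-specific API and a throw-away demo path:

/* flock-demo.c - take one BSD and one POSIX lock, report failures */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/flock-demo", O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* BSD-style lock: on a patched kernel the file_lock allocation is
	 * charged to the container's beancounter before the lock is taken */
	if (flock(fd, LOCK_EX | LOCK_NB) < 0)
		fprintf(stderr, "flock: %s\n", strerror(errno));
	else
		puts("flock acquired");

	/* POSIX lock: splitting an existing range may need a second,
	 * separately charged file_lock (see __posix_lock_file above) */
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,
	};
	if (fcntl(fd, F_SETLK, &fl) < 0)
		fprintf(stderr, "fcntl(F_SETLK): %s\n", strerror(errno));
	else
		puts("posix lock acquired");

	close(fd);
	return 0;
}

On an unpatched kernel both locks are simply granted; the program only illustrates where a container hitting its lock quota would start to see errors.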
diff --git a/fs/namei.c b/fs/namei.c
index b0afbd4..84f4037 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -143,6 +143,7 @@ char * getname(const char __user * filename)
{
char *tmp, *result;
+ /*ub_dentry_checkup();*/
result = ERR_PTR(-ENOMEM);
tmp = __getname();
if (tmp) {
@@ -428,6 +429,21 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
if (!dentry)
dentry = d_lookup(parent, name);
+ /*
+ * The revalidation rules are simple:
+ * The d_revalidate operation is called when we're about to use a
+ * cached dentry rather than call d_lookup.
+ * The d_revalidate method may unhash the dentry itself or return FALSE;
+ * in either case, if the dentry can be released, d_lookup will be called.
+ *
+ * Additionally, by request of NFS people
+ * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c)
+ * d_revalidate is called when `/', `.' or `..' are looked up.
+ * Since re-lookup is impossible on them, we introduce a hack and
+ * return an error in this case.
+ *
+ * 2003/02/19 SAW
+ */
if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
dentry = do_revalidate(dentry, nd);
@@ -479,6 +495,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
struct dentry * result;
struct inode *dir = parent->d_inode;
+repeat:
mutex_lock(&dir->i_mutex);
/*
* First re-do the cached lookup just in case it was created
@@ -525,7 +542,7 @@ out_unlock:
if (result->d_op && result->d_op->d_revalidate) {
result = do_revalidate(result, nd);
if (!result)
- result = ERR_PTR(-ENOENT);
+ goto repeat;
}
return result;
}
@@ -765,6 +782,12 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
nd->path.mnt == nd->root.mnt) {
break;
}
+#ifdef CONFIG_VE
+ if (nd->path.dentry == get_exec_env()->root_path.dentry &&
+ nd->path.mnt == get_exec_env()->root_path.mnt) {
+ break;
+ }
+#endif
spin_lock(&dcache_lock);
if (nd->path.dentry != nd->path.mnt->mnt_root) {
nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -805,6 +828,10 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
if (dentry->d_op && dentry->d_op->d_revalidate)
goto need_revalidate;
done:
+ if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) {
+ dput(dentry);
+ return -ENOENT;
+ }
path->mnt = mnt;
path->dentry = dentry;
__follow_mount(path);
@@ -836,6 +863,7 @@ fail:
static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
{
return inode && unlikely(inode->i_op->follow_link) &&
+ !(lookup_flags & LOOKUP_STRICT) &&
((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
}
@@ -853,6 +881,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
struct inode *inode;
int err;
unsigned int lookup_flags = nd->flags;
+ int real_components = 0;
while (*name=='/')
name++;
@@ -921,6 +950,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
break;
}
/* This does the actual lookups.. */
+ real_components++;
err = do_lookup(nd, &this, &next);
if (err)
break;
@@ -931,6 +961,9 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
goto out_dput;
if (inode->i_op->follow_link) {
+ err = -ENOENT;
+ if (lookup_flags & LOOKUP_STRICT)
+ goto out_dput;
err = do_follow_link(&next, nd);
if (err)
goto return_err;
@@ -996,27 +1029,41 @@ lookup_parent:
nd->last_type = LAST_NORM;
if (this.name[0] != '.')
goto return_base;
- if (this.len == 1)
+ if (this.len == 1) {
nd->last_type = LAST_DOT;
- else if (this.len == 2 && this.name[1] == '.')
+ goto return_reval;
+ } else if (this.len == 2 && this.name[1] == '.') {
nd->last_type = LAST_DOTDOT;
- else
- goto return_base;
+ goto return_reval;
+ }
+return_base:
+ if (!(nd->flags & LOOKUP_NOAREACHECK)) {
+ err = check_area_access_ve(&nd->path);
+ if (err)
+ break;
+ }
+ return 0;
return_reval:
/*
* We bypassed the ordinary revalidation routines.
* We may need to check the cached dentry for staleness.
*/
- if (nd->path.dentry && nd->path.dentry->d_sb &&
+ if (!real_components && nd->path.dentry && nd->path.dentry->d_sb &&
(nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
err = -ESTALE;
/* Note: we do not d_invalidate() */
if (!nd->path.dentry->d_op->d_revalidate(
nd->path.dentry, nd))
+ /*
+ * This lookup is for `/' or `.' or `..'.
+ * The filesystem unhashed the dentry itself
+ * inside d_revalidate (otherwise, d_invalidate
+ * wouldn't succeed). As a special courtesy to
+ * NFS we return an error. 2003/02/19 SAW
+ */
break;
}
-return_base:
- return 0;
+ goto return_base;
out_dput:
path_put_conditional(&next, nd);
break;
@@ -2095,6 +2142,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
{
return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
+EXPORT_SYMBOL_GPL(sys_mknod);
int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
@@ -2159,6 +2207,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
{
return sys_mkdirat(AT_FDCWD, pathname, mode);
}
+EXPORT_SYMBOL_GPL(sys_mkdir);
/*
* We try to drop the dentry early: we should have
@@ -2186,6 +2235,7 @@ void dentry_unhash(struct dentry *dentry)
spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
}
+EXPORT_SYMBOL(sys_symlink);
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -2273,6 +2323,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
return do_rmdir(AT_FDCWD, pathname);
}
+EXPORT_SYMBOL_GPL(sys_rmdir);
int vfs_unlink(struct inode *dir, struct dentry *dentry)
{
@@ -2380,6 +2431,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
return do_unlinkat(AT_FDCWD, pathname);
}
+EXPORT_SYMBOL_GPL(sys_unlink);
int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
{
@@ -2550,6 +2602,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
{
return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
+EXPORT_SYMBOL(sys_rename);
/*
* The worst of all namespace operations - renaming directory. "Perverted"
@@ -2661,6 +2714,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
const char *old_name;
+ if (vfs_dq_rename(old_dentry->d_inode, old_dir, new_dir))
+ return -EXDEV;
+
if (old_dentry->d_inode == new_dentry->d_inode)
return 0;
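
The follow_dotdot() hunk above adds a second stop condition for "..": besides the task's own fs root, path walking also refuses to climb above the container root recorded in the VE (get_exec_env()->root_path, an OpenVZ-specific structure). From user space the rule is the familiar one that ".." of the root directory is the root itself; a minimal sketch that only uses standard calls:

/* dotdot-demo.c - ".." from the root does not escape the root */
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[PATH_MAX];

	if (chdir("/") < 0 || chdir("..") < 0) {
		perror("chdir");
		return 1;
	}
	if (!getcwd(buf, sizeof(buf))) {
		perror("getcwd");
		return 1;
	}
	/* prints "/" - inside a container the same holds for the VE root,
	 * which is what the hunk above enforces */
	printf("after /..: %s\n", buf);
	return 0;
}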
diff --git a/fs/namespace.c b/fs/namespace.c
index bdc3cb4..d811360 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -29,6 +29,7 @@
#include <linux/log2.h>
#include <linux/idr.h>
#include <linux/fs_struct.h>
+#include <linux/fsnotify_backend.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include "pnode.h"
@@ -48,7 +50,8 @@ static int mnt_group_start = 1;
static struct list_head *mount_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
-static struct rw_semaphore namespace_sem;
+struct rw_semaphore namespace_sem;
+EXPORT_SYMBOL_GPL(namespace_sem);
/* /sys/fs */
struct kobject *fs_kobj;
@@ -136,11 +139,12 @@ struct vfsmount *alloc_vfsmnt(const char *name)
goto out_free_cache;
if (name) {
- mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+ mnt->mnt_devname = kstrdup(name, GFP_KERNEL_UBC);
if (!mnt->mnt_devname)
goto out_free_id;
}
+ mnt->owner = VEID(get_exec_env());
atomic_set(&mnt->mnt_count, 1);
INIT_LIST_HEAD(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
@@ -517,7 +521,7 @@ static void commit_tree(struct vfsmount *mnt)
touch_mnt_namespace(n);
}
-static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
+struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
{
struct list_head *next = p->mnt_mounts.next;
if (next == &p->mnt_mounts) {
@@ -532,6 +536,7 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
}
return list_entry(next, struct vfsmount, mnt_child);
}
+EXPORT_SYMBOL(next_mnt);
static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
{
@@ -629,6 +634,7 @@ repeat:
spin_unlock(&vfsmount_lock);
acct_auto_close_mnt(mnt);
security_sb_umount_close(mnt);
+ fsnotify_unmount_mnt(mnt);
goto repeat;
}
}
@@ -789,15 +795,50 @@ static void show_type(struct seq_file *m, struct super_block *sb)
}
}
+static int prepare_mnt_root_mangle(struct path *path,
+ char **path_buf, char **ret_path)
+{
+ /* skip FS_NOMOUNT mounts (rootfs) */
+ if (path->mnt->mnt_sb->s_flags & MS_NOUSER)
+ return -EACCES;
+
+ *path_buf = (char *)__get_free_page(GFP_KERNEL);
+ if (!*path_buf)
+ return -ENOMEM;
+
+ *ret_path = d_path(path, *path_buf, PAGE_SIZE);
+ if (IS_ERR(*ret_path)) {
+ free_page((unsigned long)*path_buf);
+ /*
+ * This means that the file position will still be incremented,
+ * i.e. the "invisible" vfsmounts will leak into the entry count.
+ */
+ return -EACCES;
+ }
+ return 0;
+}
+
static int show_vfsmnt(struct seq_file *m, void *v)
{
struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
- int err = 0;
+ int err;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
-
- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+ char *path_buf, *path;
+
+ err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path);
+ if (err < 0)
+ return (err == -EACCES ? 0 : err);
+
+ if (ve_is_super(get_exec_env()) ||
+ !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC))
+ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+ else {
+ seq_puts(m, "/dev/");
+ mangle(m, mnt->mnt_sb->s_type->name);
+ }
seq_putc(m, ' ');
- seq_path(m, &mnt_path, " \t\n\\");
+ mangle(m, path);
+ free_page((unsigned long) path_buf);
seq_putc(m, ' ');
show_type(m, mnt->mnt_sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
@@ -884,18 +925,27 @@ static int show_vfsstat(struct seq_file *m, void *v)
{
struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
- int err = 0;
+ char *path_buf, *path;
+ int err;
+
+ err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path);
+ if (err < 0)
+ return (err == -EACCES ? 0 : err);
/* device */
if (mnt->mnt_devname) {
seq_puts(m, "device ");
- mangle(m, mnt->mnt_devname);
+ if (ve_is_super(get_exec_env()))
+ mangle(m, mnt->mnt_devname);
+ else
+ mangle(m, mnt->mnt_sb->s_type->name);
} else
seq_puts(m, "no device");
/* mount point */
seq_puts(m, " mounted on ");
- seq_path(m, &mnt_path, " \t\n\\");
+ mangle(m, path);
+ free_page((unsigned long)path_buf);
seq_putc(m, ' ');
/* file system type */
@@ -1107,6 +1157,36 @@ static int do_umount(struct vfsmount *mnt, int flags)
return retval;
}
+#ifdef CONFIG_VE
+void umount_ve_fs_type(struct file_system_type *local_fs_type)
+{
+ struct vfsmount *mnt;
+ struct list_head *p, *q;
+ LIST_HEAD(kill);
+ LIST_HEAD(umount_list);
+
+ down_write(&namespace_sem);
+ spin_lock(&vfsmount_lock);
+ list_for_each_safe(p, q, &current->nsproxy->mnt_ns->list) {
+ mnt = list_entry(p, struct vfsmount, mnt_list);
+ if (mnt->mnt_sb->s_type != local_fs_type)
+ continue;
+ list_del(p);
+ list_add(p, &kill);
+ }
+
+ while (!list_empty(&kill)) {
+ LIST_HEAD(kill2);
+ mnt = list_entry(kill.next, struct vfsmount, mnt_list);
+ umount_tree(mnt, 1, &kill2);
+ list_splice(&kill2, &umount_list);
+ }
+ spin_unlock(&vfsmount_lock);
+ up_write(&namespace_sem);
+ release_mounts(&umount_list);
+}
+#endif
+
/*
* Now umount can handle mount points as well as block devices.
* This is important for filesystems which use unnamed block devices.
@@ -1130,7 +1210,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
goto dput_and_out;
retval = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
goto dput_and_out;
retval = do_umount(path.mnt, flags);
@@ -1156,7 +1236,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
static int mount_is_safe(struct path *path)
{
- if (capable(CAP_SYS_ADMIN))
+ if (capable(CAP_VE_SYS_ADMIN))
return 0;
return -EPERM;
#ifdef notyet
@@ -1425,6 +1505,8 @@ static int do_change_type(struct path *path, int flag)
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
+ if (!ve_accessible_veid(path->mnt->owner, get_exec_env()->veid))
+ return -EPERM;
down_write(&namespace_sem);
if (type == MS_SHARED) {
@@ -1447,7 +1529,7 @@ static int do_change_type(struct path *path, int flag)
* do loopback mount.
*/
static int do_loopback(struct path *path, char *old_name,
- int recurse)
+ int recurse, int mnt_flags)
{
struct path old_path;
struct vfsmount *mnt = NULL;
@@ -1477,6 +1559,7 @@ static int do_loopback(struct path *path, char *old_name,
if (!mnt)
goto out;
+ mnt->mnt_flags |= mnt_flags;
err = graft_tree(mnt, path);
if (err) {
LIST_HEAD(umount_list);
@@ -1520,7 +1603,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
int err;
struct super_block *sb = path->mnt->mnt_sb;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
return -EPERM;
if (!check_mnt(path->mnt))
@@ -1529,6 +1612,9 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
if (path->dentry != path->mnt->mnt_root)
return -EINVAL;
+ if (!ve_accessible_veid(path->mnt->owner, get_exec_env()->veid))
+ return -EPERM;
+
down_write(&sb->s_umount);
if (flags & MS_BIND)
err = change_mount_flags(path->mnt, flags);
@@ -1562,7 +1648,7 @@ static int do_move_mount(struct path *path, char *old_name)
struct path old_path, parent_path;
struct vfsmount *p;
int err = 0;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
@@ -1570,6 +1656,10 @@ static int do_move_mount(struct path *path, char *old_name)
if (err)
return err;
+ err = -EPERM;
+ if (!ve_accessible_veid(old_path.mnt->owner, get_exec_env()->veid))
+ goto out_nosem;
+
down_write(&namespace_sem);
while (d_mountpoint(path->dentry) &&
follow_down(path))
@@ -1627,6 +1717,7 @@ out:
up_write(&namespace_sem);
if (!err)
path_put(&parent_path);
+out_nosem:
path_put(&old_path);
return err;
}
@@ -1644,7 +1735,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
return -EINVAL;
/* we need capabilities... */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
return -EPERM;
lock_kernel();
@@ -1685,6 +1776,11 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
goto unlock;
newmnt->mnt_flags = mnt_flags;
+
+ /* make this before graft_tree reveals mnt_root to the world... */
+ if (path->dentry->d_flags & DCACHE_VIRTUAL)
+ newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL;
+
if ((err = graft_tree(newmnt, path)))
goto unlock;
@@ -1959,7 +2055,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&path, dev_name, flags & MS_REC);
+ retval = do_loopback(&path, dev_name, flags & MS_REC, mnt_flags);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&path, flags);
else if (flags & MS_MOVE)
@@ -2122,6 +2218,7 @@ out_dir:
out_type:
return ret;
}
+EXPORT_SYMBOL_GPL(sys_mount);
/*
* pivot_root Semantics:
@@ -2281,7 +2378,7 @@ void __init mnt_init(void)
init_rwsem(&namespace_sem);
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, NULL);
mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
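
show_vfsmnt() and show_vfsstat() above stop using seq_path() and instead run the mount point through d_path() plus mangle(), and for containers whose filesystem type carries FS_MANGLE_PROC they report the device as "/dev/<fstype>" rather than the real device name. mangle() octal-escapes space, tab, newline and backslash, so /proc/mounts fields still need unescaping on the way back; a small reader that undoes it, assuming only the standard /proc/mounts format:

/* mounts-demo.c - print device -> mountpoint pairs from /proc/mounts */
#include <stdio.h>
#include <string.h>

/* reverse the "\040"-style octal escaping applied by mangle()/seq_escape() */
static void unmangle(char *s)
{
	char *out = s;

	while (*s) {
		if (s[0] == '\\' &&
		    s[1] >= '0' && s[1] <= '3' &&
		    s[2] >= '0' && s[2] <= '7' &&
		    s[3] >= '0' && s[3] <= '7') {
			*out++ = (char)(((s[1] - '0') << 6) |
					((s[2] - '0') << 3) |
					 (s[3] - '0'));
			s += 4;
		} else {
			*out++ = *s++;
		}
	}
	*out = '\0';
}

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/mounts", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		char dev[1024], mnt[1024];

		if (sscanf(line, "%1023s %1023s", dev, mnt) == 2) {
			unmangle(dev);
			unmangle(mnt);
			printf("%s -> %s\n", dev, mnt);
		}
	}
	fclose(f);
	return 0;
}

The "/dev/<fstype>" substitution itself is one-way: inside a container the real device name is deliberately hidden, so there is nothing to recover from user space.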
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 127ed5c..95a31c8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -125,6 +125,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
atomic_set(&clp->cl_count, 1);
clp->cl_cons_state = NFS_CS_INITING;
+ clp->owner_env = get_exec_env();
memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen);
clp->cl_addrlen = cl_init->addrlen;
@@ -364,6 +365,7 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
{
struct nfs_client *clp;
+ struct ve_struct *ve = get_exec_env();
spin_lock(&nfs_client_lock);
list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
@@ -378,6 +380,9 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
if (clp->rpc_ops->version != nfsversion)
continue;
+ if (!ve_accessible_strict(clp->owner_env, ve))
+ continue;
+
/* Match only the IP address, not the port number */
if (!nfs_sockaddr_match_ipaddr(addr, clap))
continue;
@@ -398,6 +403,7 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
{
struct sockaddr *sap = (struct sockaddr *)&clp->cl_addr;
u32 nfsvers = clp->rpc_ops->version;
+ struct ve_struct *ve = get_exec_env();
spin_lock(&nfs_client_lock);
list_for_each_entry_continue(clp, &nfs_client_list, cl_share_link) {
@@ -411,6 +417,9 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
if (clp->rpc_ops->version != nfsvers)
continue;
+ if (!ve_accessible_strict(clp->owner_env, ve))
+ continue;
+
/* Match only the IP address, not the port number */
if (!nfs_sockaddr_match_ipaddr(sap, clap))
continue;
@@ -431,13 +440,18 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
{
struct nfs_client *clp;
const struct sockaddr *sap = data->addr;
+ struct ve_struct *ve;
+ ve = get_exec_env();
list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
/* Don't match clients that failed to initialise properly */
if (clp->cl_cons_state < 0)
continue;
+ if (!ve_accessible_strict(clp->owner_env, ve))
+ continue;
+
/* Different NFS versions cannot share the same nfs_client */
if (clp->rpc_ops != data->rpc_ops)
continue;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4bf23f6..253438f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -53,6 +53,9 @@
#include <linux/nfs_xdr.h>
#include <linux/magic.h>
#include <linux/parser.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+#include <linux/ve_nfs.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -250,7 +253,8 @@ static struct file_system_type nfs_fs_type = {
.name = "nfs",
.get_sb = nfs_get_sb,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|
+ FS_BINARY_MOUNTDATA|FS_VIRTUALIZED,
};
struct file_system_type nfs_xdev_fs_type = {
@@ -258,7 +262,8 @@ struct file_system_type nfs_xdev_fs_type = {
.name = "nfs",
.get_sb = nfs_xdev_get_sb,
.kill_sb = nfs_kill_super,
- .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+ .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|
+ FS_BINARY_MOUNTDATA|FS_VIRTUALIZED,
};
static const struct super_operations nfs_sops = {
@@ -350,6 +355,55 @@ static struct shrinker acl_shrinker = {
.seeks = DEFAULT_SEEKS,
};
+#ifdef CONFIG_VE
+static int ve_nfs_start(void *data)
+{
+ return 0;
+}
+
+static void ve_nfs_stop(void *data)
+{
+ struct ve_struct *ve;
+ struct super_block *sb;
+
+ flush_scheduled_work();
+
+ ve = (struct ve_struct *)data;
+ /* Basically, on a valid stop we can be here iff NFS was mounted
+	   read-only. In that case force-stopping the client is not a problem.
+ If we are here and NFS is read-write, we are in a FORCE stop, so
+ force the client to stop.
+ Lock daemon is already dead.
+ Only superblock client remains. Den */
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ struct rpc_clnt *clnt;
+ struct rpc_xprt *xprt;
+ if (sb->s_type != &nfs_fs_type)
+ continue;
+ clnt = NFS_SB(sb)->client;
+ if (!ve_accessible_strict(clnt->cl_xprt->owner_env, ve))
+ continue;
+ clnt->cl_broken = 1;
+ rpc_killall_tasks(clnt);
+
+ xprt = clnt->cl_xprt;
+ xprt_disconnect_done(xprt);
+ xprt->ops->close(xprt);
+ }
+ spin_unlock(&sb_lock);
+
+ flush_scheduled_work();
+}
+
+static struct ve_hook nfs_hook = {
+ .init = ve_nfs_start,
+ .fini = ve_nfs_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_NET_POST,
+};
+#endif
+
/*
* Register the NFS filesystems
*/
@@ -370,6 +424,7 @@ int __init register_nfs_fs(void)
goto error_2;
#endif
register_shrinker(&acl_shrinker);
+ ve_hook_register(VE_SS_CHAIN, &nfs_hook);
return 0;
#ifdef CONFIG_NFS_V4
@@ -388,6 +443,7 @@ error_0:
void __exit unregister_nfs_fs(void)
{
unregister_shrinker(&acl_shrinker);
+ ve_hook_unregister(&nfs_hook);
#ifdef CONFIG_NFS_V4
unregister_filesystem(&nfs4_fs_type);
#endif
@@ -1794,6 +1850,11 @@ static int nfs_validate_mount_data(void *options,
goto out_v3_not_compiled;
#endif /* !CONFIG_NFS_V3 */
+ if (!(args->flags & NFS_MOUNT_VER3)) {
+ printk("NFSv2 is broken and not supported\n");
+ return -EPROTONOSUPPORT;
+ }
+
return 0;
out_no_data:
@@ -2079,6 +2140,10 @@ static int nfs_compare_super(struct super_block *sb, void *data)
struct nfs_server *server = sb_mntdata->server, *old = NFS_SB(sb);
int mntflags = sb_mntdata->mntflags;
+ if (!ve_accessible_strict(old->client->cl_xprt->owner_env,
+ get_exec_env()))
+ return 0;
+
if (!nfs_compare_super_address(old, server))
return 0;
/* Note: NFS_MOUNT_UNSHARED == NFS4_MOUNT_UNSHARED */
@@ -2107,6 +2172,11 @@ static int nfs_get_sb(struct file_system_type *fs_type,
.mntflags = flags,
};
int error = -ENOMEM;
+ struct ve_struct *ve;
+
+ ve = get_exec_env();
+ if (!ve_is_super(ve) && !(ve->features & VE_FEATURE_NFS))
+ return -ENODEV;
data = nfs_alloc_parsed_mount_data(3);
mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
@@ -2237,6 +2307,11 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
.mntflags = flags,
};
int error;
+ struct ve_struct *ve;
+
+ ve = get_exec_env();
+ if (!ve_is_super(ve) && !(ve->features & VE_FEATURE_NFS))
+ return -ENODEV;
dprintk("--> nfs_xdev_get_sb()\n");
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85..0f763f5 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -90,6 +90,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/writeback.h> /* for inode_lock */
+#include <linux/mount.h>
#include <asm/atomic.h>
@@ -262,6 +263,23 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
}
}
+static void fsnotify_detach_mnt(struct inode *inode)
+{
+ struct fsnotify_mark_entry *entry;
+ struct hlist_node *pos;
+ struct fsnotify_group *group;
+
+ spin_lock(&inode->i_lock);
+ hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
+ spin_lock(&entry->lock);
+ group = entry->group;
+ if (group->ops->detach_mnt)
+ group->ops->detach_mnt(entry);
+ spin_unlock(&entry->lock);
+ }
+ spin_unlock(&inode->i_lock);
+}
+
/*
* given a group and inode, find the mark associated with that combination.
* if found take a reference to that mark and return it, else return NULL
@@ -362,7 +380,7 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
* of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
* We temporarily drop inode_lock, however, and CAN block.
*/
-void fsnotify_unmount_inodes(struct list_head *list)
+static void fsnotify_unmount(struct list_head *list, struct vfsmount *mnt)
{
struct inode *inode, *next_i, *need_iput = NULL;
@@ -414,13 +432,29 @@ void fsnotify_unmount_inodes(struct list_head *list)
if (need_iput_tmp)
iput(need_iput_tmp);
- /* for each watch, send FS_UNMOUNT and then remove it */
- fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+ if (mnt)
+ fsnotify_detach_mnt(inode);
+ else {
+ /* for each watch, send FS_UNMOUNT and then remove it */
+ fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
- fsnotify_inode_delete(inode);
+ fsnotify_inode_delete(inode);
+ }
iput(inode);
spin_lock(&inode_lock);
}
}
+
+void fsnotify_unmount_inodes(struct list_head *list)
+{
+ fsnotify_unmount(list, NULL);
+}
+
+void fsnotify_unmount_mnt(struct vfsmount *mnt)
+{
+ spin_lock(&inode_lock);
+ fsnotify_unmount(&mnt->mnt_sb->s_inodes, mnt);
+ spin_unlock(&inode_lock);
+}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index f234f3a..21faa74 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -13,6 +13,7 @@ struct inotify_inode_mark_entry {
/* fsnotify_mark_entry MUST be the first thing */
struct fsnotify_mark_entry fsn_entry;
int wd;
+ struct path path;
};
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e27960c..9b31a34 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -29,6 +29,7 @@
#include <linux/slab.h> /* kmem_* */
#include <linux/types.h>
#include <linux/sched.h>
+#include <linux/mount.h>
#include "inotify.h"
@@ -161,10 +162,25 @@ void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
kmem_cache_free(event_priv_cachep, event_priv);
}
+static void inotify_detach_mnt(struct fsnotify_mark_entry *fe)
+{
+ struct inotify_inode_mark_entry *e;
+
+ e = container_of(fe, struct inotify_inode_mark_entry, fsn_entry);
+ if (e->path.dentry) {
+ dput(e->path.dentry);
+ e->path.dentry = NULL;
+ mnt_unpin(e->path.mnt);
+ mntput(e->path.mnt);
+ e->path.dentry = NULL;
+ }
+}
+
const struct fsnotify_ops inotify_fsnotify_ops = {
.handle_event = inotify_handle_event,
.should_send_event = inotify_should_send_event,
.free_group_priv = inotify_free_group_priv,
.free_event_priv = inotify_free_event_priv,
.freeing_mark = inotify_freeing_mark,
+ .detach_mnt = inotify_detach_mnt,
};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 22ef16a..d9909cd 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -40,6 +40,7 @@
#include <linux/uaccess.h>
#include <linux/poll.h>
#include <linux/wait.h>
+#include <linux/module.h>
#include "inotify.h"
@@ -340,7 +341,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
return ret;
}
-static const struct file_operations inotify_fops = {
+const struct file_operations inotify_fops = {
.poll = inotify_poll,
.read = inotify_read,
.fasync = inotify_fasync,
@@ -348,6 +349,7 @@ static const struct file_operations inotify_fops = {
.unlocked_ioctl = inotify_ioctl,
.compat_ioctl = inotify_ioctl,
};
+EXPORT_SYMBOL(inotify_fops);
/*
@@ -461,6 +463,12 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry)
{
struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
+ if (ientry->path.dentry) {
+ dput(ientry->path.dentry);
+ mnt_unpin(ientry->path.mnt);
+ mntput(ientry->path.mnt);
+ }
+
kmem_cache_free(inotify_inode_mark_cachep, ientry);
}
@@ -527,16 +535,13 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
return ret;
}
-static int inotify_new_watch(struct fsnotify_group *group,
- struct inode *inode,
- u32 arg)
+int __inotify_new_watch(struct fsnotify_group *group,
+ struct path *path, __u32 mask, int wd)
{
struct inotify_inode_mark_entry *tmp_ientry;
- __u32 mask;
+ u32 start_wd;
int ret;
- /* don't allow invalid bits: we don't want flags set */
- mask = inotify_arg_to_mask(arg);
if (unlikely(!mask))
return -EINVAL;
@@ -547,6 +552,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
tmp_ientry->fsn_entry.mask = mask;
tmp_ientry->wd = -1;
+ tmp_ientry->path.dentry = NULL;
+ tmp_ientry->path.mnt = NULL;
ret = -ENOSPC;
if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
@@ -556,13 +563,16 @@ retry:
if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
goto out_err;
+ if (wd == -1)
+ start_wd = group->inotify_data.last_wd + 1;
+ else
+ start_wd = wd;
/* we are putting the mark on the idr, take a reference */
fsnotify_get_mark(&tmp_ientry->fsn_entry);
spin_lock(&group->inotify_data.idr_lock);
ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
- group->inotify_data.last_wd+1,
- &tmp_ientry->wd);
+ start_wd, &tmp_ientry->wd);
spin_unlock(&group->inotify_data.idr_lock);
if (ret) {
/* we didn't get on the idr, drop the idr reference */
@@ -574,8 +584,15 @@ retry:
goto out_err;
}
+ if (wd != -1 && tmp_ientry->wd != wd) {
+ ret = -EBUSY;
+ fsnotify_put_mark(&tmp_ientry->fsn_entry);
+ inotify_remove_from_idr(group, tmp_ientry);
+ goto out_err;
+ }
+
/* we are on the idr, now get on the inode */
- ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+ ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, path->dentry->d_inode);
if (ret) {
/* we failed to get on the inode, get off the idr */
inotify_remove_from_idr(group, tmp_ientry);
@@ -588,6 +605,12 @@ retry:
/* increment the number of watches the user has */
atomic_inc(&group->inotify_data.user->inotify_watches);
+ if (!ve_is_super(get_exec_env())) {
+ tmp_ientry->path.dentry = dget(path->dentry);
+ mnt_pin(path->mnt);
+ tmp_ientry->path.mnt = path->mnt;
+ }
+
/* return the watch descriptor for this new entry */
ret = tmp_ientry->wd;
@@ -604,17 +627,24 @@ out_err:
return ret;
}
+EXPORT_SYMBOL(__inotify_new_watch);
+
+static int inotify_new_watch(struct fsnotify_group *group,
+ struct path *path, u32 arg)
+{
+ return __inotify_new_watch(group, path, inotify_arg_to_mask(arg), -1);
+}
-static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+static int inotify_update_watch(struct fsnotify_group *group, struct path *path, u32 arg)
{
int ret = 0;
retry:
/* try to update an existing watch with the new arg */
- ret = inotify_update_existing_watch(group, inode, arg);
+ ret = inotify_update_existing_watch(group, path->dentry->d_inode, arg);
/* no mark present, try to add a new one */
if (ret == -ENOENT)
- ret = inotify_new_watch(group, inode, arg);
+ ret = inotify_new_watch(group, path, arg);
/*
* inotify_new_watch could race with another thread which did an
* inotify_new_watch between the update_existing and the add watch
@@ -714,12 +744,12 @@ SYSCALL_DEFINE0(inotify_init)
{
return sys_inotify_init1(0);
}
+EXPORT_SYMBOL(sys_inotify_init);
SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
u32, mask)
{
struct fsnotify_group *group;
- struct inode *inode;
struct path path;
struct file *filp;
int ret, fput_needed;
@@ -744,12 +774,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (ret)
goto fput_and_out;
- /* inode held in place by reference to path; group by fget on fd */
- inode = path.dentry->d_inode;
group = filp->private_data;
/* create/update an inode mark */
- ret = inotify_update_watch(group, inode, mask);
+ ret = inotify_update_watch(group, &path, mask);
if (unlikely(ret))
goto path_put_and_out;
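
__inotify_new_watch() above is split out so that a caller can request a specific watch descriptor (wd == -1 keeps the usual "next free" behaviour, any other value must be granted exactly or the call fails with -EBUSY), and inside a container the watched object's struct path is pinned (dget() plus mnt_pin()). Both look aimed at checkpoint/restore, which has to re-create watches with their old descriptors; that purpose is an inference, the export itself is what the hunk shows. Ordinary user space keeps using the stock API:

/* inotify-demo.c - watch descriptors as user space normally sees them */
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(void)
{
	int fd, wd1, wd2;

	fd = inotify_init1(IN_NONBLOCK);
	if (fd < 0) {
		perror("inotify_init1");
		return 1;
	}
	/* descriptors are handed out in order; a restore tool would need to
	 * reproduce these exact numbers, which is what the wd argument of
	 * __inotify_new_watch() makes possible */
	wd1 = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	wd2 = inotify_add_watch(fd, "/", IN_CREATE);
	if (wd1 < 0 || wd2 < 0)
		perror("inotify_add_watch");
	else
		printf("wd for /tmp: %d, wd for /: %d\n", wd1, wd2);
	close(fd);
	return 0;
}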
diff --git a/fs/open.c b/fs/open.c
index 4f01e06..77f73fc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
+#include <linux/faudit.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
@@ -52,7 +53,21 @@ int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
EXPORT_SYMBOL(vfs_statfs);
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+int faudit_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+ struct faudit_statfs_arg arg;
+
+ arg.sb = sb;
+ arg.stat = buf;
+
+ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg)
+ != NOTIFY_DONE)
+ return arg.err;
+ return 0;
+}
+
+static int vfs_statfs_native(struct dentry *dentry, struct vfsmount *mnt,
+ struct statfs *buf)
{
struct kstatfs st;
int retval;
@@ -61,6 +76,10 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
if (retval)
return retval;
+ retval = faudit_statfs(mnt->mnt_sb, &st);
+ if (retval)
+ return retval;
+
if (sizeof(*buf) == sizeof(st))
memcpy(buf, &st, sizeof(st));
else {
@@ -96,7 +115,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
return 0;
}
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+static int vfs_statfs64(struct dentry *dentry, struct vfsmount *mnt,
+ struct statfs64 *buf)
{
struct kstatfs st;
int retval;
@@ -105,6 +125,10 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
if (retval)
return retval;
+ retval = faudit_statfs(mnt->mnt_sb, &st);
+ if (retval)
+ return retval;
+
if (sizeof(*buf) == sizeof(st))
memcpy(buf, &st, sizeof(st));
else {
@@ -131,7 +155,7 @@ SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, b
error = user_path(pathname, &path);
if (!error) {
struct statfs tmp;
- error = vfs_statfs_native(path.dentry, &tmp);
+ error = vfs_statfs_native(path.dentry, path.mnt, &tmp);
if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
error = -EFAULT;
path_put(&path);
@@ -149,7 +173,7 @@ SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct stat
error = user_path(pathname, &path);
if (!error) {
struct statfs64 tmp;
- error = vfs_statfs64(path.dentry, &tmp);
+ error = vfs_statfs64(path.dentry, path.mnt, &tmp);
if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
error = -EFAULT;
path_put(&path);
@@ -167,7 +191,7 @@ SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
file = fget(fd);
if (!file)
goto out;
- error = vfs_statfs_native(file->f_path.dentry, &tmp);
+ error = vfs_statfs_native(file->f_path.dentry, file->f_path.mnt, &tmp);
if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
error = -EFAULT;
fput(file);
@@ -188,7 +212,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user
file = fget(fd);
if (!file)
goto out;
- error = vfs_statfs64(file->f_path.dentry, &tmp);
+ error = vfs_statfs64(file->f_path.dentry, file->f_path.mnt, &tmp);
if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
error = -EFAULT;
fput(file);
@@ -630,14 +654,20 @@ out:
return err;
}
-SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
+static int do_fchmodat(int dfd, const char __user *filename, mode_t mode, int flag)
{
struct path path;
struct inode *inode;
int error;
struct iattr newattrs;
+ int follow;
- error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+ error = -EINVAL;
+ if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+ goto out;
+
+ follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ error = user_path_at(dfd, filename, follow, &path);
if (error)
goto out;
inode = path.dentry->d_inode;
@@ -659,9 +689,19 @@ out:
return error;
}
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
+{
+ return do_fchmodat(dfd, filename, mode, 0);
+}
+
SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
{
- return sys_fchmodat(AT_FDCWD, filename, mode);
+ return do_fchmodat(AT_FDCWD, filename, mode, 0);
+}
+
+SYSCALL_DEFINE2(lchmod, const char __user *, filename, mode_t, mode)
+{
+ return do_fchmodat(AT_FDCWD, filename, mode, AT_SYMLINK_NOFOLLOW);
}
static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
@@ -707,6 +747,7 @@ out_release:
out:
return error;
}
+EXPORT_SYMBOL_GPL(sys_chown);
SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
gid_t, group, int, flag)
@@ -948,6 +989,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
return filp;
}
+int odirect_enable = 0;
/*
* dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an
* error.
@@ -972,6 +1014,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
return ERR_PTR(-EINVAL);
}
+ if (!capable(CAP_SYS_RAWIO) && !odirect_enable)
+ flags &= ~O_DIRECT;
+
error = -ENFILE;
f = get_empty_filp();
if (f == NULL) {
@@ -1062,6 +1107,7 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
asmlinkage_protect(3, ret, filename, flags, mode);
return ret;
}
+EXPORT_SYMBOL_GPL(sys_open);
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
int, mode)
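
Two user-visible things change in fs/open.c above: statfs results are passed through a VITYPE_FAUDIT notifier (faudit_statfs()), which lets the container's view of filesystem statistics be rewritten, and chmod grows a no-follow variant (do_fchmodat() with AT_SYMLINK_NOFOLLOW, exposed as a new lchmod syscall rather than through a flags argument of fchmodat). The statfs side is easy to observe with the ordinary API; which numbers actually get rewritten inside a container is an assumption drawn from the hook, not something the hunk spells out:

/* statfs-demo.c - print the (possibly container-filtered) fs statistics */
#include <stdio.h>
#include <sys/statfs.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/";
	struct statfs st;

	if (statfs(path, &st) < 0) {
		perror("statfs");
		return 1;
	}
	/* on an OpenVZ kernel these values may have passed through the
	 * VITYPE_FAUDIT notifier; on mainline they come straight from the
	 * filesystem */
	printf("blocks: total %llu free %llu avail %llu (bsize %lu)\n",
	       (unsigned long long)st.f_blocks,
	       (unsigned long long)st.f_bfree,
	       (unsigned long long)st.f_bavail,
	       (unsigned long)st.f_bsize);
	printf("inodes: total %llu free %llu\n",
	       (unsigned long long)st.f_files,
	       (unsigned long long)st.f_ffree);
	return 0;
}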
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e1..20da630 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -20,6 +20,7 @@
#include <linux/ctype.h>
#include <linux/genhd.h>
#include <linux/blktrace_api.h>
+#include <linux/sysfs.h>
#include "check.h"
@@ -132,6 +133,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
return buf;
}
+EXPORT_SYMBOL(disk_name);
const char *bdevname(struct block_device *bdev, char *buf)
{
@@ -483,14 +485,16 @@ void register_disk(struct gendisk *disk)
if (device_add(ddev))
return;
-#ifndef CONFIG_SYSFS_DEPRECATED
- err = sysfs_create_link(block_depr, &ddev->kobj,
- kobject_name(&ddev->kobj));
- if (err) {
- device_del(ddev);
- return;
+
+ if (!sysfs_deprecated) {
+ err = sysfs_create_link(block_depr, &ddev->kobj,
+ kobject_name(&ddev->kobj));
+ if (err) {
+ device_del(ddev);
+ return;
+ }
}
-#endif
+
disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
@@ -672,8 +676,7 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
disk->driverfs_dev = NULL;
-#ifndef CONFIG_SYSFS_DEPRECATED
- sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
-#endif
+ if (!sysfs_deprecated)
+ sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
device_del(disk_to_dev(disk));
}
diff --git a/fs/pipe.c b/fs/pipe.c
index ae17d02..1cb5f83 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -22,6 +22,8 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <bc/kmem.h>
+
/*
* We use a start+len construction, which provides full use of the
* allocated memory.
@@ -526,7 +528,7 @@ redo1:
int error, atomic = 1;
if (!page) {
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_HIGHUSER | __GFP_UBC);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
@@ -875,7 +877,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
struct pipe_inode_info *pipe;
- pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+ pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC);
if (pipe) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
@@ -1090,6 +1092,7 @@ int do_pipe_flags(int *fd, int flags)
free_write_pipe(fw);
return error;
}
+EXPORT_SYMBOL_GPL(do_pipe_flags);
/*
* sys_pipe() is the normal C calling standard for creating
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650..7de3905 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -83,6 +83,8 @@
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <bc/beancounter.h>
+
#include <asm/pgtable.h>
#include <asm/processor.h>
#include "internal.h"
@@ -154,6 +156,18 @@ static inline const char *get_task_state(struct task_struct *tsk)
return *p;
}
+static int task_virtual_pid(struct task_struct *t)
+{
+ struct pid *pid;
+
+ pid = task_pid(t);
+ /*
+ * this will give wrong result for tasks,
+ * that failed to enter VE, but that's OK
+ */
+ return pid ? pid->numbers[pid->level].nr : 0;
+}
+
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
@@ -161,7 +175,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
int g;
struct fdtable *fdt = NULL;
const struct cred *cred;
- pid_t ppid, tpid;
+ pid_t ppid, tpid, vpid;
rcu_read_lock();
ppid = pid_alive(p) ?
@@ -172,6 +186,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
if (tracer)
tpid = task_pid_nr_ns(tracer, ns);
}
+ vpid = task_virtual_pid(p);
cred = get_cred((struct cred *) __task_cred(p));
seq_printf(m,
"State:\t%s\n"
@@ -205,6 +220,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
put_cred(cred);
seq_printf(m, "\n");
+
+ seq_printf(m, "envID:\t%d\nVPid:\t%d\n",
+ p->ve_task_info.owner_env->veid, vpid);
+ seq_printf(m, "PNState:\t%u\nStopState:\t%u\n",
+ p->pn_state, p->stopped_state);
}
static void render_sigset_t(struct seq_file *m, const char *header,
@@ -244,10 +264,10 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
}
}
-static inline void task_sig(struct seq_file *m, struct task_struct *p)
+void task_sig(struct seq_file *m, struct task_struct *p)
{
unsigned long flags;
- sigset_t pending, shpending, blocked, ignored, caught;
+ sigset_t pending, shpending, blocked, ignored, caught, saved;
int num_threads = 0;
unsigned long qsize = 0;
unsigned long qlim = 0;
@@ -257,11 +277,13 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
sigemptyset(&blocked);
sigemptyset(&ignored);
sigemptyset(&caught);
+ sigemptyset(&saved);
if (lock_task_sighand(p, &flags)) {
pending = p->pending.signal;
shpending = p->signal->shared_pending.signal;
blocked = p->blocked;
+ saved = p->saved_sigmask;
collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = atomic_read(&p->signal->count);
qsize = atomic_read(&__task_cred(p)->user->sigpending);
@@ -278,6 +300,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
render_sigset_t(m, "SigBlk:\t", &blocked);
render_sigset_t(m, "SigIgn:\t", &ignored);
render_sigset_t(m, "SigCgt:\t", &caught);
+ render_sigset_t(m, "SigSvd:\t", &saved);
}
static void render_cap_t(struct seq_file *m, const char *header,
@@ -312,6 +335,20 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
render_cap_t(m, "CapBnd:\t", &cap_bset);
}
+#ifdef CONFIG_BEANCOUNTERS
+static inline void ub_dump_task_info(struct task_struct *tsk,
+ char *stsk, int ltsk, char *smm, int lmm)
+{
+ print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk);
+ task_lock(tsk);
+ if (tsk->mm)
+ print_ub_uid(tsk->mm->mm_ub, smm, lmm);
+ else
+ strncpy(smm, "N/A", lmm);
+ task_unlock(tsk);
+}
+#endif
+
static inline void task_context_switch_counts(struct seq_file *m,
struct task_struct *p)
{
@@ -325,6 +362,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
struct mm_struct *mm = get_task_mm(task);
+#ifdef CONFIG_BEANCOUNTERS
+ char tsk_ub_info[64], mm_ub_info[64];
+#endif
task_name(m, task);
task_state(m, ns, pid, task);
@@ -340,6 +380,14 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_show_regs(m, task);
#endif
task_context_switch_counts(m, task);
+#ifdef CONFIG_BEANCOUNTERS
+ ub_dump_task_info(task,
+ tsk_ub_info, sizeof(tsk_ub_info),
+ mm_ub_info, sizeof(mm_ub_info));
+
+ seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info);
+ seq_printf(m, "MMUB:\t%s\n", mm_ub_info);
+#endif
return 0;
}
@@ -363,6 +411,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
unsigned long rsslim = 0;
char tcomm[sizeof(task->comm)];
unsigned long flags;
+#ifdef CONFIG_BEANCOUNTERS
+ char ub_task_info[64];
+ char ub_mm_info[64];
+#endif
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -444,6 +496,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
priority = task_prio(task);
nice = task_nice(task);
+#ifndef CONFIG_VE
/* Temporary variable needed for gcc-2.96 */
/* convert timespec -> nsec*/
start_time =
@@ -451,10 +504,25 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
+ task->real_start_time.tv_nsec;
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
+#else
+ start_time = ve_relative_clock(&task->start_time);
+#endif
+
+#ifdef CONFIG_BEANCOUNTERS
+ ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info),
+ ub_mm_info, sizeof(ub_mm_info));
+#endif
seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld"
+#ifdef CONFIG_VE
+ " 0 0 0 0 0 0 0 %d %u"
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+ " %s %s"
+#endif
+ "\n",
pid_nr_ns(pid, ns),
tcomm,
state,
@@ -501,7 +569,16 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
task->policy,
(unsigned long long)delayacct_blkio_ticks(task),
cputime_to_clock_t(gtime),
- cputime_to_clock_t(cgtime));
+ cputime_to_clock_t(cgtime)
+#ifdef CONFIG_VE
+ , task_pid_vnr(task),
+ VEID(VE_TASK_INFO(task)->owner_env)
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+ , ub_task_info,
+ ub_mm_info
+#endif
+ );
if (mm)
mmput(mm);
return 0;
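
The fs/proc/array.c hunks extend the per-process files: /proc/<pid>/status gains envID, VPid, PNState, StopState, SigSvd, TaskUB and MMUB lines, and /proc/<pid>/stat grows extra per-VE and beancounter columns at the end. The extra status lines are plain "Name:\tvalue" records, so picking them out needs nothing special; on a non-OpenVZ kernel the sketch below simply prints nothing:

/* status-demo.c - show the OpenVZ-specific lines of /proc/self/status */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "envID:", 6) ||
		    !strncmp(line, "VPid:", 5) ||
		    !strncmp(line, "PNState:", 8) ||
		    !strncmp(line, "StopState:", 10) ||
		    !strncmp(line, "TaskUB:", 7) ||
		    !strncmp(line, "MMUB:", 5))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}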
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1bb0f6..ef6ee19 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -49,6 +49,7 @@
#include <asm/uaccess.h>
+#include <linux/module.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
@@ -156,10 +157,14 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
fs = task->fs;
if (fs) {
read_lock(&fs->lock);
- *path = root ? fs->root : fs->pwd;
- path_get(path);
- read_unlock(&fs->lock);
result = 0;
+ if (!root)
+ result = d_root_check(&fs->pwd);
+ if (result == 0) {
+ *path = root ? fs->root : fs->pwd;
+ path_get(path);
+ }
+ read_unlock(&fs->lock);
}
task_unlock(task);
return result;
@@ -550,17 +555,31 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
static int proc_fd_access_allowed(struct inode *inode)
{
struct task_struct *task;
- int allowed = 0;
+ int err;
+
/* Allow access to a task's file descriptors if it is us or we
* may use ptrace attach to the process and find out that
* information.
*/
+ err = -ENOENT;
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+ if (ptrace_may_access(task, PTRACE_MODE_READ))
+ err = 0;
+ else
+ /*
+ * This clever ptrace_may_access() check may play a trick
+ * on us. If the task is a zombie it will consider the
+ * task not dumpable at all and will deny any ptracing
+ * inside a VE. Not a big deal for ptrace(), but following
+ * the link would then fail with -EACCES. Some software
+ * cannot stand such a swindle and refuses to work :(
+ */
+ err = (task->mm ? -EACCES : -ENOENT);
put_task_struct(task);
}
- return allowed;
+ return err;
}
static int proc_setattr(struct dentry *dentry, struct iattr *attr)
@@ -1039,6 +1058,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
oom_adjust != OOM_DISABLE)
return -EINVAL;
+ if (oom_adjust == OOM_DISABLE && !ve_is_super(get_exec_env()))
+ return -EPERM;
task = get_proc_task(file->f_path.dentry->d_inode);
if (!task)
@@ -1295,6 +1316,7 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
mm->exe_file = new_exe_file;
mm->num_exe_file_vmas = 0;
}
+EXPORT_SYMBOL(set_mm_exe_file);
struct file *get_mm_exe_file(struct mm_struct *mm)
{
@@ -1333,10 +1355,15 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path)
exe_file = get_mm_exe_file(mm);
mmput(mm);
if (exe_file) {
- *exe_path = exe_file->f_path;
- path_get(&exe_file->f_path);
+ int result;
+
+ result = d_root_check(&exe_file->f_path);
+ if (result == 0) {
+ *exe_path = exe_file->f_path;
+ path_get(&exe_file->f_path);
+ }
fput(exe_file);
- return 0;
+ return result;
} else
return -ENOENT;
}
@@ -1344,13 +1371,14 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path)
static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
- int error = -EACCES;
+ int error;
/* We don't need a base pointer in the /proc filesystem */
path_put(&nd->path);
/* Are we allowed to snoop on the tasks file descriptors? */
- if (!proc_fd_access_allowed(inode))
+ error = proc_fd_access_allowed(inode);
+ if (error < 0)
goto out;
error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
@@ -1385,12 +1413,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
{
- int error = -EACCES;
+ int error;
struct inode *inode = dentry->d_inode;
struct path path;
/* Are we allowed to snoop on the tasks file descriptors? */
- if (!proc_fd_access_allowed(inode))
+ error = proc_fd_access_allowed(inode);
+ if (error < 0)
goto out;
error = PROC_I(inode)->op.proc_get_link(inode, &path);
@@ -1641,6 +1670,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
struct files_struct *files = NULL;
struct file *file;
int fd = proc_fd(inode);
+ int err = -ENOENT;
if (task) {
files = get_files_struct(task);
@@ -1653,7 +1683,8 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
*/
spin_lock(&files->file_lock);
file = fcheck_files(files, fd);
- if (file) {
+ err = -EACCES;
+ if (file && !d_root_check(&file->f_path)) {
if (path) {
*path = file->f_path;
path_get(&file->f_path);
@@ -1671,7 +1702,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
spin_unlock(&files->file_lock);
put_files_struct(files);
}
- return -ENOENT;
+ return err;
}
static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2458,7 +2489,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
struct task_struct *t = task;
task_io_accounting_add(&acct, &task->signal->ioac);
- while_each_thread(task, t)
+ while_each_thread_ve(task, t)
task_io_accounting_add(&acct, &t->ioac);
unlock_task_sighand(task, &flags);
@@ -3161,3 +3192,35 @@ static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.readdir = proc_task_readdir,
};
+
+/* Check whether dentry belongs to a task that already died */
+int proc_dentry_of_dead_task(struct dentry *dentry)
+{
+ if (dentry->d_inode->i_fop == &dummy_proc_pid_file_operations)
+ return 1;
+
+ return (dentry->d_op == &pid_dentry_operations &&
+ proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first == NULL);
+}
+EXPORT_SYMBOL(proc_dentry_of_dead_task);
+
+/* Place it here to avoid use vzrst module count */
+static ssize_t dummy_proc_pid_read(struct file * file, char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ return -ESRCH;
+}
+
+static ssize_t dummy_proc_pid_write(struct file * file, const char * buf,
+ size_t count, loff_t *ppos)
+{
+ return -ESRCH;
+}
+
+struct file_operations dummy_proc_pid_file_operations = {
+ .read = dummy_proc_pid_read,
+ .write = dummy_proc_pid_write,
+};
+
+EXPORT_SYMBOL(dummy_proc_pid_file_operations);
+
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 82676e3..2ad657d 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -2,10 +2,12 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/sched.h>
static int cmdline_proc_show(struct seq_file *m, void *v)
{
- seq_printf(m, "%s\n", saved_command_line);
+ seq_printf(m, "%s\n",
+ ve_is_super(get_exec_env()) ? saved_command_line : "quiet");
return 0;
}
@@ -23,7 +25,7 @@ static const struct file_operations cmdline_proc_fops = {
static int __init proc_cmdline_init(void)
{
- proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+ proc_create("cmdline", 0, &glob_proc_root, &cmdline_proc_fops);
return 0;
}
module_init(proc_cmdline_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 5a1e539..f7d84b5 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -18,7 +18,7 @@ static const struct file_operations proc_cpuinfo_operations = {
static int __init proc_cpuinfo_init(void)
{
- proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+ proc_create("cpuinfo", 0, &glob_proc_root, &proc_cpuinfo_operations);
return 0;
}
module_init(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index 59ee7da..d485f24 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -2,6 +2,7 @@
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/sched.h>
static int devinfo_show(struct seq_file *f, void *v)
{
@@ -25,6 +26,9 @@ static int devinfo_show(struct seq_file *f, void *v)
static void *devinfo_start(struct seq_file *f, loff_t *pos)
{
+ if (!ve_is_super(get_exec_env()))
+ return NULL;
+
if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
return pos;
return NULL;
@@ -64,7 +68,7 @@ static const struct file_operations proc_devinfo_operations = {
static int __init proc_devices_init(void)
{
- proc_create("devices", 0, NULL, &proc_devinfo_operations);
+ proc_create("devices", 0, &glob_proc_root, &proc_devinfo_operations);
return 0;
}
module_init(proc_devices_init);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index fa678ab..a66517d 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -255,6 +255,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
struct proc_dir_entry *de = PDE(inode);
int error;
+ if ((iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) &&
+ LPDE(inode) == PDE(inode))
+ return -EPERM;
+
error = inode_change_ok(inode, iattr);
if (error)
goto out;
@@ -263,9 +267,12 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
if (error)
goto out;
- de->uid = inode->i_uid;
- de->gid = inode->i_gid;
- de->mode = inode->i_mode;
+ if (iattr->ia_valid & ATTR_UID)
+ de->uid = inode->i_uid;
+ if (iattr->ia_valid & ATTR_GID)
+ de->gid = inode->i_gid;
+ if (iattr->ia_valid & ATTR_MODE)
+ de->mode = inode->i_mode;
out:
return error;
}
@@ -274,11 +281,22 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- struct proc_dir_entry *de = PROC_I(inode)->pde;
- if (de && de->nlink)
- inode->i_nlink = de->nlink;
+ struct proc_dir_entry *de = PDE(inode);
+ struct proc_dir_entry *lde = LPDE(inode);
generic_fillattr(inode, stat);
+
+ if (de && de->nlink)
+ stat->nlink = de->nlink;
+ /* if the dentry is found in both trees and is a directory,
+ * the inode's nlink count must be adjusted, because the local
+ * and global subtrees may differ.
+ * on the other hand, they may intersect, so the exact nlink
+ * value is hard to calculate - an upper estimate is used
+ * instead.
+ */
+ if (lde && lde != de && lde->nlink > 1)
+ stat->nlink += lde->nlink - 2;
return 0;
}
@@ -411,28 +429,60 @@ static const struct dentry_operations proc_dentry_operations =
.d_delete = proc_delete_dentry,
};
+static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir,
+ const char *name, int namelen)
+{
+ struct proc_dir_entry *de;
+
+ for (de = dir->subdir; de ; de = de->next) {
+ if (de->namelen != namelen)
+ continue;
+ if (memcmp(de->name, name, namelen))
+ continue;
+ break;
+ }
+ return de;
+}
+
/*
* Don't create negative dentries here, return -ENOENT by hand
* instead.
*/
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
- struct dentry *dentry)
+struct dentry *proc_lookup_de(struct proc_dir_entry *de,
+ struct proc_dir_entry *lde,
+ struct inode *dir, struct dentry *dentry)
{
struct inode *inode = NULL;
int error = -ENOENT;
spin_lock(&proc_subdir_lock);
- for (de = de->subdir; de ; de = de->next) {
- if (de->namelen != dentry->d_name.len)
- continue;
- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
+ de = __proc_lookup(de, dentry->d_name.name, dentry->d_name.len);
+ if (lde != NULL)
+ lde = __proc_lookup(lde, dentry->d_name.name,
+ dentry->d_name.len);
+
+ if (de == NULL)
+ de = lde;
+
+ if (de != NULL) {
+ /*
+ * de lde meaning inode(g,l)
+ * ------------------------------------
+ * NULL NULL -ENOENT *
+ * X NULL global X NULL
+ * NULL X local X X
+ * X Y both X Y
+ */
+ {
unsigned int ino;
ino = de->low_ino;
de_get(de);
+ if (lde != NULL)
+ de_get(lde);
spin_unlock(&proc_subdir_lock);
error = -EINVAL;
- inode = proc_get_inode(dir->i_sb, ino, de);
+ inode = proc_get_inode(dir->i_sb, ino, de, lde);
goto out_unlock;
}
}
@@ -446,13 +496,15 @@ out_unlock:
}
if (de)
de_put(de);
+ if (lde)
+ de_put(lde);
return ERR_PTR(error);
}
struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
- return proc_lookup_de(PDE(dir), dir, dentry);
+ return proc_lookup_de(PDE(dir), LPDE(dir), dir, dentry);
}
/*
@@ -464,13 +516,14 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
* value of the readdir() call, as long as it's non-negative
* for success..
*/
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
- filldir_t filldir)
+int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lde,
+ struct file *filp, void *dirent, filldir_t filldir)
{
unsigned int ino;
int i;
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = 0;
+ struct proc_dir_entry *ode = de, *fde = NULL;
ino = inode->i_ino;
i = filp->f_pos;
@@ -491,25 +544,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
/* fall through */
default:
spin_lock(&proc_subdir_lock);
- de = de->subdir;
i -= 2;
- for (;;) {
- if (!de) {
- ret = 1;
- spin_unlock(&proc_subdir_lock);
- goto out;
- }
- if (!i)
- break;
- de = de->next;
- i--;
- }
-
- do {
+repeat:
+ de = de->subdir;
+ while (de != NULL) {
struct proc_dir_entry *next;
- /* filldir passes info to user space */
de_get(de);
+ if (i-- > 0 || (fde != NULL &&
+ __proc_lookup(fde,
+ de->name, de->namelen)))
+ goto skip;
+
+ /* filldir passes info to user space */
spin_unlock(&proc_subdir_lock);
if (filldir(dirent, de->name, de->namelen, filp->f_pos,
de->low_ino, de->mode >> 12) < 0) {
@@ -518,10 +565,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
}
spin_lock(&proc_subdir_lock);
filp->f_pos++;
+skip:
next = de->next;
de_put(de);
de = next;
- } while (de);
+ }
+
+ if (fde == NULL && lde != NULL && lde != ode) {
+ de = lde;
+ fde = ode;
+ goto repeat;
+ }
spin_unlock(&proc_subdir_lock);
}
ret = 1;
@@ -533,7 +587,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
struct inode *inode = filp->f_path.dentry->d_inode;
- return proc_readdir_de(PDE(inode), filp, dirent, filldir);
+ return proc_readdir_de(PDE(inode), LPDE(inode), filp, dirent, filldir);
}
/*
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d78ade3..3693efa 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -442,7 +442,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
#endif
struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
- struct proc_dir_entry *de)
+ struct proc_dir_entry *de, struct proc_dir_entry *lde)
{
struct inode * inode;
@@ -453,6 +453,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
PROC_I(inode)->fd = 0;
PROC_I(inode)->pde = de;
+#ifdef CONFIG_VE
+ PROC_I(inode)->lpde = lde;
+#endif
if (de->mode) {
inode->i_mode = de->mode;
@@ -494,9 +497,11 @@ int proc_fill_super(struct super_block *s)
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
s->s_time_gran = 1;
-
- de_get(&proc_root);
- root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
+
+ de_get(get_exec_env()->proc_root);
+ de_get(&glob_proc_root);
+ root_inode = proc_get_inode(s, PROC_ROOT_INO,
+ &glob_proc_root, get_exec_env()->proc_root);
if (!root_inode)
goto out_no_root;
root_inode->i_uid = 0;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 753ca37..8f9249e 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -12,6 +12,12 @@
#include <linux/proc_fs.h>
extern struct proc_dir_entry proc_root;
+#ifdef CONFIG_VE
+extern struct proc_dir_entry glob_proc_root;
+#else
+#define glob_proc_root proc_root
+#endif
+
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
#else
@@ -80,10 +86,11 @@ static inline int proc_fd(struct inode *inode)
return PROC_I(inode)->fd;
}
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
+struct dentry *proc_lookup_de(struct proc_dir_entry *de,
+ struct proc_dir_entry *lpde, struct inode *ino,
struct dentry *dentry);
-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
- filldir_t filldir);
+int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lpde,
+ struct file *filp, void *dirent, filldir_t filldir);
struct pde_opener {
struct inode *inode;
@@ -106,7 +113,8 @@ void de_put(struct proc_dir_entry *de);
extern struct vfsmount *proc_mnt;
int proc_fill_super(struct super_block *);
-struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+struct inode *proc_get_inode(struct super_block *, unsigned int,
+ struct proc_dir_entry *, struct proc_dir_entry *);
/*
* These are generic /proc routines that use the internal
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index 7ca7834..2f7da10 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -12,6 +12,10 @@
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/fs.h>
+#include <linux/veprintk.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -41,19 +45,20 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
static unsigned int kmsg_poll(struct file *file, poll_table *wait)
{
- poll_wait(file, &log_wait, wait);
+ poll_wait(file, &ve_log_wait, wait);
if (do_syslog(9, NULL, 0))
return POLLIN | POLLRDNORM;
return 0;
}
-static const struct file_operations proc_kmsg_operations = {
+const struct file_operations proc_kmsg_operations = {
.read = kmsg_read,
.poll = kmsg_poll,
.open = kmsg_open,
.release = kmsg_release,
};
+EXPORT_SYMBOL_GPL(proc_kmsg_operations);
static int __init proc_kmsg_init(void)
{
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 1afa4dd..cfd8d8b 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -13,14 +13,25 @@
static int loadavg_proc_show(struct seq_file *m, void *v)
{
unsigned long avnrun[3];
+ long running, threads;
+ struct ve_struct *ve;
- get_avenrun(avnrun, FIXED_1/200, 0);
+ ve = get_exec_env();
+ if (ve_is_super(ve)) {
+ get_avenrun(avnrun, FIXED_1/200, 0);
+ running = nr_running();
+ threads = nr_threads;
+ } else {
+ get_avenrun_ve(ve, avnrun, FIXED_1/200, 0);
+ running = nr_running_ve(ve);
+ threads = atomic_read(&ve->pcounter);
+ }
- seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+ seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%ld %d\n",
LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
- nr_running(), nr_threads,
+ running, threads,
task_active_pid_ns(current)->last_pid);
return 0;
}
@@ -39,7 +50,7 @@ static const struct file_operations loadavg_proc_fops = {
static int __init proc_loadavg_init(void)
{
- proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+ proc_create("loadavg", 0, &glob_proc_root, &loadavg_proc_fops);
return 0;
}
module_init(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a65239c..76206c4 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -10,6 +10,7 @@
#include <linux/seq_file.h>
#include <linux/swap.h>
#include <linux/vmstat.h>
+#include <linux/virtinfo.h>
#include <asm/atomic.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -19,9 +20,28 @@ void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
{
}
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
+static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi)
+{
+ seq_printf(m,
+ "MemTotal: %8lu kB\n"
+ "MemFree: %8lu kB\n"
+ "SwapTotal: %8lu kB\n"
+ "SwapFree: %8lu kB\n",
+ K(mi->si.totalram),
+ K(mi->si.freeram),
+ K(mi->si.totalswap),
+ K(mi->si.freeswap));
+
+ return 0;
+}
+
static int meminfo_proc_show(struct seq_file *m, void *v)
{
+ int ret;
struct sysinfo i;
+ struct meminfo mi;
unsigned long committed;
unsigned long allowed;
struct vmalloc_info vmi;
@@ -29,12 +49,19 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
unsigned long pages[NR_LRU_LISTS];
int lru;
+ si_meminfo(&i);
+ si_swapinfo(&i);
+ mi.si = i;
+
+ ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi);
+ if (ret & NOTIFY_FAIL)
+ return 0;
+ if (ret & NOTIFY_OK)
+ return meminfo_proc_show_mi(m, &mi);
+
/*
* display in kilobytes.
*/
-#define K(x) ((x) << (PAGE_SHIFT - 10))
- si_meminfo(&i);
- si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100) + total_swap_pages;
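Inside a container, when the virtinfo notifier above answers NOTIFY_OK, /proc/meminfo is reduced to the four fields printed by meminfo_proc_show_mi(); with purely hypothetical numbers the output would look roughly like:

	MemTotal:       262144 kB
	MemFree:        131072 kB
	SwapTotal:      524288 kB
	SwapFree:       524288 kB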
@@ -175,7 +202,7 @@ static const struct file_operations meminfo_proc_fops = {
static int __init proc_meminfo_init(void)
{
- proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+ proc_create("meminfo", 0, &glob_proc_root, &meminfo_proc_fops);
return 0;
}
module_init(proc_meminfo_init);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270..7dfab0b 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -126,7 +126,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
de = ERR_PTR(-ENOENT);
net = get_proc_task_net(dir);
if (net != NULL) {
- de = proc_lookup_de(net->proc_net, dir, dentry);
+ de = proc_lookup_de(net->proc_net, NULL, dir, dentry);
put_net(net);
}
return de;
@@ -164,7 +164,8 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
ret = -EINVAL;
net = get_proc_task_net(filp->f_path.dentry->d_inode);
if (net != NULL) {
- ret = proc_readdir_de(net->proc_net, filp, dirent, filldir);
+ ret = proc_readdir_de(net->proc_net, NULL,
+ filp, dirent, filldir);
put_net(net);
}
return ret;
@@ -234,7 +235,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
int __init proc_net_init(void)
{
- proc_symlink("net", NULL, "self/net");
+ proc_symlink("net", &glob_proc_root, "self/net");
return register_pernet_subsys(&proc_net_ns_ops);
}
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f667e8a..e3c13c1 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -406,7 +406,7 @@ int __init proc_sys_init(void)
{
struct proc_dir_entry *proc_sys_root;
- proc_sys_root = proc_mkdir("sys", NULL);
+ proc_sys_root = proc_mkdir("sys", &glob_proc_root);
proc_sys_root->proc_iops = &proc_sys_dir_operations;
proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 83adcc8..e9dce9e 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/tty.h>
#include <linux/seq_file.h>
+#include <linux/sched.h>
#include <linux/bitops.h>
/*
@@ -70,6 +71,9 @@ static int show_tty_driver(struct seq_file *m, void *v)
dev_t from = MKDEV(p->major, p->minor_start);
dev_t to = from + p->num;
+ if (!ve_accessible_strict(p->owner_env, get_exec_env()))
+ goto out;
+
if (&p->tty_drivers == tty_drivers.next) {
/* pseudo-drivers first */
seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty");
@@ -97,6 +101,7 @@ static int show_tty_driver(struct seq_file *m, void *v)
}
if (from != to)
show_tty_range(m, p, from, to - from);
+out:
return 0;
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b080b79..36f59af 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -42,6 +42,9 @@ static int proc_get_sb(struct file_system_type *fs_type,
struct super_block *sb;
struct pid_namespace *ns;
struct proc_inode *ei;
+#ifdef CONFIG_VE
+ struct vfsmount *proc_mnt = fs_type->owner_env->proc_mnt;
+#endif
if (proc_mnt) {
/* Seed the root directory with a pid so it doesn't need
@@ -95,11 +98,12 @@ static void proc_kill_sb(struct super_block *sb)
put_pid_ns(ns);
}
-static struct file_system_type proc_fs_type = {
+struct file_system_type proc_fs_type = {
.name = "proc",
.get_sb = proc_get_sb,
.kill_sb = proc_kill_sb,
};
+EXPORT_SYMBOL(proc_fs_type);
void __init proc_root_init(void)
{
@@ -109,6 +113,11 @@ void __init proc_root_init(void)
err = register_filesystem(&proc_fs_type);
if (err)
return;
+
+#ifdef CONFIG_VE
+ get_ve0()->proc_root = &proc_root;
+#endif
+
proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
err = PTR_ERR(proc_mnt);
if (IS_ERR(proc_mnt)) {
@@ -116,16 +125,21 @@ void __init proc_root_init(void)
return;
}
- proc_symlink("mounts", NULL, "self/mounts");
+ proc_symlink("mounts", &glob_proc_root, "self/mounts");
+#ifdef CONFIG_VE
+ get_ve0()->proc_mnt = proc_mnt;
+#endif
proc_net_init();
#ifdef CONFIG_SYSVIPC
- proc_mkdir("sysvipc", NULL);
+ proc_mkdir("sysvipc", &glob_proc_root);
#endif
- proc_mkdir("fs", NULL);
+ proc_mkdir("fs", &glob_proc_root);
+ proc_mkdir("fs", NULL); /* care about proc_mkdir("fs/xxx", NULL); */
+
proc_mkdir("driver", NULL);
- proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
+ proc_mkdir("fs/nfsd", &glob_proc_root); /* somewhere for the nfsd filesystem to be mounted */
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
/* just give it a mountpoint */
proc_mkdir("openprom", NULL);
@@ -141,8 +155,19 @@ void __init proc_root_init(void)
static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat
)
{
+ struct ve_struct *ve = get_exec_env();
+
generic_fillattr(dentry->d_inode, stat);
- stat->nlink = proc_root.nlink + nr_processes();
+ stat->nlink = glob_proc_root.nlink;
+ if (ve_is_super(ve))
+ stat->nlink += nr_processes();
+#ifdef CONFIG_VE
+ else
+ /* thread count, not really a process count */
+ stat->nlink += atomic_read(&ve->pcounter);
+ /* the same logic as in the proc_getattr */
+ stat->nlink += ve->proc_root->nlink - 2;
+#endif
return 0;
}
@@ -205,6 +230,22 @@ struct proc_dir_entry proc_root = {
.parent = &proc_root,
};
+#ifdef CONFIG_VE
+struct proc_dir_entry glob_proc_root = {
+ .low_ino = PROC_ROOT_INO,
+ .namelen = 5,
+ .name = "/proc",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ .nlink = 2,
+ .count = ATOMIC_INIT(1),
+ .proc_iops = &proc_root_inode_operations,
+ .proc_fops = &proc_root_operations,
+ .parent = &glob_proc_root,
+};
+
+EXPORT_SYMBOL(glob_proc_root);
+#endif
+
int pid_ns_prepare_proc(struct pid_namespace *ns)
{
struct vfsmount *mnt;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 7cc726c..f2dd1fc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,6 +22,62 @@
#define arch_idle_time(cpu) 0
#endif
+static int show_stat_ve(struct seq_file *p, struct ve_struct *ve, unsigned long jif)
+{
+ int i;
+ u64 user, nice, system;
+ cycles_t idle, iowait;
+ cpumask_t ve_cpus;
+
+ ve_cpu_online_map(ve, &ve_cpus);
+
+ user = nice = system = idle = iowait = 0;
+ for_each_cpu_mask(i, ve_cpus) {
+ user += VE_CPU_STATS(ve, i)->user;
+ nice += VE_CPU_STATS(ve, i)->nice;
+ system += VE_CPU_STATS(ve, i)->system;
+ idle += ve_sched_get_idle_time(ve, i);
+ iowait += ve_sched_get_iowait_time(ve, i);
+ }
+
+ seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n",
+ (unsigned long long)cputime64_to_clock_t(user),
+ (unsigned long long)cputime64_to_clock_t(nice),
+ (unsigned long long)cputime64_to_clock_t(system),
+ (unsigned long long)cycles_to_clocks(idle),
+ (unsigned long long)cycles_to_clocks(iowait));
+
+ for_each_cpu_mask(i, ve_cpus) {
+ user = VE_CPU_STATS(ve, i)->user;
+ nice = VE_CPU_STATS(ve, i)->nice;
+ system = VE_CPU_STATS(ve, i)->system;
+ idle = ve_sched_get_idle_time(ve, i);
+ iowait = ve_sched_get_iowait_time(ve, i);
+ seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n",
+ i,
+ (unsigned long long)cputime64_to_clock_t(user),
+ (unsigned long long)cputime64_to_clock_t(nice),
+ (unsigned long long)cputime64_to_clock_t(system),
+ (unsigned long long)cycles_to_clocks(idle),
+ (unsigned long long)cycles_to_clocks(iowait));
+ }
+ seq_printf(p, "intr 0\nswap 0 0\n");
+
+ seq_printf(p,
+ "\nctxt %llu\n"
+ "btime %lu\n"
+ "processes %lu\n"
+ "procs_running %lu\n"
+ "procs_blocked %lu\n",
+ nr_context_switches(),
+ (unsigned long)jif + ve->start_timespec.tv_sec,
+ total_forks,
+ nr_running_ve(ve),
+ nr_iowait_ve(ve));
+
+ return 0;
+}
+
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
@@ -33,12 +89,18 @@ static int show_stat(struct seq_file *p, void *v)
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
struct timespec boottime;
unsigned int per_irq_sum;
+ struct ve_struct *ve;
+
+ getboottime(&boottime);
+ jif = boottime.tv_sec;
+
+ ve = get_exec_env();
+ if (!ve_is_super(ve))
+ return show_stat_ve(p, ve, jif);
user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
guest = cputime64_zero;
- getboottime(&boottime);
- jif = boottime.tv_sec;
for_each_possible_cpu(i) {
user = cputime64_add(user, kstat_cpu(i).cpustat.user);
@@ -166,7 +228,7 @@ static const struct file_operations proc_stat_operations = {
static int __init proc_stat_init(void)
{
- proc_create("stat", 0, NULL, &proc_stat_operations);
+ proc_create("stat", 0, &glob_proc_root, &proc_stat_operations);
return 0;
}
module_init(proc_stat_init);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 766b1d4..47acfbf 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -19,6 +19,13 @@ static int uptime_proc_show(struct seq_file *m, void *v)
do_posix_clock_monotonic_gettime(&uptime);
monotonic_to_bootbased(&uptime);
+#ifdef CONFIG_VE
+ if (!ve_is_super(get_exec_env())) {
+ set_normalized_timespec(&uptime,
+ uptime.tv_sec - get_exec_env()->start_timespec.tv_sec,
+ uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
+ }
+#endif
cputime_to_timespec(idletime, &idle);
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
(unsigned long) uptime.tv_sec,
@@ -42,7 +49,7 @@ static const struct file_operations uptime_proc_fops = {
static int __init proc_uptime_init(void)
{
- proc_create("uptime", 0, NULL, &uptime_proc_fops);
+ proc_create("uptime", 0, &glob_proc_root, &uptime_proc_fops);
return 0;
}
module_init(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 76817a6..e78b783 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -28,7 +28,7 @@ static const struct file_operations version_proc_fops = {
static int __init proc_version_init(void)
{
- proc_create("version", 0, NULL, &version_proc_fops);
+ proc_create("version", 0, &glob_proc_root, &version_proc_fops);
return 0;
}
module_init(proc_version_init);
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 8047e01..7a92faf 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -26,13 +26,22 @@ config QUOTA_NETLINK_INTERFACE
config PRINT_QUOTA_WARNING
bool "Print quota warnings to console (OBSOLETE)"
depends on QUOTA
- default y
+ default n
help
If you say Y here, quota warnings (about exceeding softlimit, reaching
hardlimit, etc.) will be printed to the process' controlling terminal.
Note that this behavior is currently deprecated and may go away in
future. Please use notification via netlink socket instead.
+config QUOTA_COMPAT
+ bool "Compatibility with older quotactl interface"
+ depends on QUOTA
+ help
+ This option enables a compatibility layer for an older version
+ of the quotactl interface with byte granularity (QUOTAON at 0x0100,
+ GETQUOTA at 0x0D00). Interface versions older than that one,
+ which used block granularity, are still not supported.
+
# Generic support for tree structured quota files. Selected when needed.
config QUOTA_TREE
tristate
@@ -53,6 +62,31 @@ config QFMT_V2
This quota format allows using quotas with 32-bit UIDs/GIDs. If you
need this functionality say Y here.
+config VZ_QUOTA
+ tristate "Virtuozzo Disk Quota support"
+ select QUOTA
+ select QUOTA_COMPAT
+ select VZ_DEV
+ default m
+ help
+ Virtuozzo Disk Quota imposes a disk quota on a directory together
+ with all of its files and subdirectories. Such a disk quota is used
+ to account for and limit disk usage by a Virtuozzo VPS, but it may
+ also be used separately.
+
+config VZ_QUOTA_UNLOAD
+ bool "Unloadable Virtuozzo Disk Quota module"
+ depends on VZ_QUOTA=m
+ default n
+ help
+ Make the Virtuozzo Disk Quota module unloadable.
+ This does not work reliably yet.
+
+config VZ_QUOTA_UGID
+ bool "Per-user and per-group quota in Virtuozzo quota partitions"
+ depends on VZ_QUOTA!=n
+ default y
+
config QUOTACTL
bool
depends on XFS_QUOTA || QUOTA
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index 68d4f6d..4c2159c 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -3,3 +3,5 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o
+
+obj-y += vzdquota/
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 4fdb0eb..e7aff07 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -170,8 +170,9 @@ static struct quota_format_type *find_quota_format(int id)
struct quota_format_type *actqf;
spin_lock(&dq_list_lock);
- for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
- actqf = actqf->qf_next)
+ for (actqf = quota_formats;
+ actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL);
+ actqf = actqf->qf_next)
;
if (!actqf || !try_module_get(actqf->qf_owner)) {
int qm;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 95c5b42..7d9d4b4 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -18,6 +18,7 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/types.h>
+#include <linux/device_cgroup.h>
/* Check validity of generic quotactl commands */
static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
@@ -83,11 +84,11 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
if (cmd == Q_GETQUOTA) {
if (((type == USRQUOTA && current_euid() != id) ||
(type == GRPQUOTA && !in_egroup_p(id))) &&
- !capable(CAP_SYS_ADMIN))
+ !capable(CAP_VE_SYS_ADMIN))
return -EPERM;
}
else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO)
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
return -EPERM;
return 0;
@@ -135,10 +136,10 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd,
if (cmd == Q_XGETQUOTA) {
if (((type == XQM_USRQUOTA && current_euid() != id) ||
(type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
- !capable(CAP_SYS_ADMIN))
+ !capable(CAP_VE_SYS_ADMIN))
return -EPERM;
} else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
return -EPERM;
}
@@ -164,7 +165,7 @@ void sync_quota_sb(struct super_block *sb, int type)
{
int cnt;
- if (!sb->s_qcop->quota_sync)
+ if (!sb->s_qcop || !sb->s_qcop->quota_sync)
return;
sb->s_qcop->quota_sync(sb, type);
@@ -188,6 +189,8 @@ void sync_quota_sb(struct super_block *sb, int type)
continue;
if (!sb_has_quota_active(sb, cnt))
continue;
+ if (!sb_dqopt(sb)->files[cnt])
+ continue;
mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
I_MUTEX_QUOTA);
truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -361,6 +364,7 @@ static struct super_block *quotactl_block(const char __user *special)
struct block_device *bdev;
struct super_block *sb;
char *tmp = getname(special);
+ int error;
if (IS_ERR(tmp))
return ERR_CAST(tmp);
@@ -368,6 +372,13 @@ static struct super_block *quotactl_block(const char __user *special)
putname(tmp);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
+
+ error = devcgroup_inode_permission(bdev->bd_inode, MAY_QUOTACTL);
+ if (error) {
+ bdput(bdev);
+ return ERR_PTR(error);
+ }
+
sb = get_super(bdev);
bdput(bdev);
if (!sb)
@@ -379,6 +390,231 @@ static struct super_block *quotactl_block(const char __user *special)
#endif
}
+#ifdef CONFIG_QUOTA_COMPAT
+
+#define QC_QUOTAON 0x0100 /* enable quotas */
+#define QC_QUOTAOFF 0x0200 /* disable quotas */
+/* GETQUOTA, SETQUOTA and SETUSE, which were at 0x0300-0x0500, now have different parameters */
+#define QC_SYNC 0x0600 /* sync disk copy of a filesystems quotas */
+#define QC_SETQLIM 0x0700 /* set limits */
+/* GETSTATS at 0x0800 is now longer... */
+#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... */
+#define QC_SETINFO 0x0A00 /* set info about quotas */
+#define QC_SETGRACE 0x0B00 /* set inode and block grace */
+#define QC_SETFLAGS 0x0C00 /* set flags for quota */
+#define QC_GETQUOTA 0x0D00 /* get limits and usage */
+#define QC_SETQUOTA 0x0E00 /* set limits and usage */
+#define QC_SETUSE 0x0F00 /* set usage */
+/* 0x1000 used by old RSQUASH */
+#define QC_GETSTATS 0x1100 /* get collected stats */
+
+struct compat_dqblk {
+ unsigned int dqb_ihardlimit;
+ unsigned int dqb_isoftlimit;
+ unsigned int dqb_curinodes;
+ unsigned int dqb_bhardlimit;
+ unsigned int dqb_bsoftlimit;
+ qsize_t dqb_curspace;
+ __kernel_time_t dqb_btime;
+ __kernel_time_t dqb_itime;
+};
+
+#ifdef CONFIG_COMPAT
+
+struct compat_compat_dqblk {
+ compat_uint_t dqb_ihardlimit;
+ compat_uint_t dqb_isoftlimit;
+ compat_uint_t dqb_curinodes;
+ compat_uint_t dqb_bhardlimit;
+ compat_uint_t dqb_bsoftlimit;
+ compat_u64 dqb_curspace;
+ compat_time_t dqb_btime;
+ compat_time_t dqb_itime;
+};
+
+#endif
+
+struct compat_dqinfo {
+ unsigned int dqi_bgrace;
+ unsigned int dqi_igrace;
+ unsigned int dqi_flags;
+ unsigned int dqi_blocks;
+ unsigned int dqi_free_blk;
+ unsigned int dqi_free_entry;
+};
+
+struct compat_dqstats {
+ __u32 lookups;
+ __u32 drops;
+ __u32 reads;
+ __u32 writes;
+ __u32 cache_hits;
+ __u32 allocated_dquots;
+ __u32 free_dquots;
+ __u32 syncs;
+ __u32 version;
+};
+
+asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr);
+
+static long compat_quotactl(unsigned int cmds, unsigned int type,
+ const char __user *special, qid_t id,
+ void __user *addr)
+{
+ struct super_block *sb;
+ long ret;
+
+ sb = NULL;
+ switch (cmds) {
+ case QC_QUOTAON:
+ return sys_quotactl(QCMD(Q_QUOTAON, type),
+ special, id, addr);
+
+ case QC_QUOTAOFF:
+ return sys_quotactl(QCMD(Q_QUOTAOFF, type),
+ special, id, addr);
+
+ case QC_SYNC:
+ return sys_quotactl(QCMD(Q_SYNC, type),
+ special, id, addr);
+
+ case QC_GETQUOTA: {
+ struct if_dqblk idq;
+ struct compat_dqblk cdq;
+
+ sb = quotactl_block(special);
+ ret = PTR_ERR(sb);
+ if (IS_ERR(sb))
+ break;
+ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
+ if (ret)
+ break;
+ ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
+ if (ret)
+ break;
+ cdq.dqb_ihardlimit = idq.dqb_ihardlimit;
+ cdq.dqb_isoftlimit = idq.dqb_isoftlimit;
+ cdq.dqb_curinodes = idq.dqb_curinodes;
+ cdq.dqb_bhardlimit = idq.dqb_bhardlimit;
+ cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit;
+ cdq.dqb_curspace = idq.dqb_curspace;
+ cdq.dqb_btime = idq.dqb_btime;
+ cdq.dqb_itime = idq.dqb_itime;
+ ret = 0;
+ if (copy_to_user(addr, &cdq, sizeof(cdq)))
+ ret = -EFAULT;
+ break;
+ }
+
+ case QC_SETQUOTA:
+ case QC_SETUSE:
+ case QC_SETQLIM: {
+ struct if_dqblk idq;
+ struct compat_dqblk cdq;
+
+ sb = quotactl_block(special);
+ ret = PTR_ERR(sb);
+ if (IS_ERR(sb))
+ break;
+ ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id);
+ if (ret)
+ break;
+ ret = -EFAULT;
+ if (copy_from_user(&cdq, addr, sizeof(cdq)))
+ break;
+ idq.dqb_ihardlimit = cdq.dqb_ihardlimit;
+ idq.dqb_isoftlimit = cdq.dqb_isoftlimit;
+ idq.dqb_curinodes = cdq.dqb_curinodes;
+ idq.dqb_bhardlimit = cdq.dqb_bhardlimit;
+ idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit;
+ idq.dqb_curspace = cdq.dqb_curspace;
+ idq.dqb_valid = 0;
+ if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM)
+ idq.dqb_valid |= QIF_LIMITS;
+ if (cmds == QC_SETQUOTA || cmds == QC_SETUSE)
+ idq.dqb_valid |= QIF_USAGE;
+ ret = sb->s_qcop->set_dqblk(sb, type, id, &idq);
+ break;
+ }
+
+ case QC_GETINFO: {
+ struct if_dqinfo iinf;
+ struct compat_dqinfo cinf;
+
+ sb = quotactl_block(special);
+ ret = PTR_ERR(sb);
+ if (IS_ERR(sb))
+ break;
+ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
+ if (ret)
+ break;
+ ret = sb->s_qcop->get_info(sb, type, &iinf);
+ if (ret)
+ break;
+ cinf.dqi_bgrace = iinf.dqi_bgrace;
+ cinf.dqi_igrace = iinf.dqi_igrace;
+ cinf.dqi_flags = 0;
+ if (iinf.dqi_flags & DQF_INFO_DIRTY)
+ cinf.dqi_flags |= 0x0010;
+ cinf.dqi_blocks = 0;
+ cinf.dqi_free_blk = 0;
+ cinf.dqi_free_entry = 0;
+ ret = 0;
+ if (copy_to_user(addr, &cinf, sizeof(cinf)))
+ ret = -EFAULT;
+ break;
+ }
+
+ case QC_SETINFO:
+ case QC_SETGRACE:
+ case QC_SETFLAGS: {
+ struct if_dqinfo iinf;
+ struct compat_dqinfo cinf;
+
+ sb = quotactl_block(special);
+ ret = PTR_ERR(sb);
+ if (IS_ERR(sb))
+ break;
+ ret = check_quotactl_valid(sb, type, Q_SETINFO, id);
+ if (ret)
+ break;
+ ret = -EFAULT;
+ if (copy_from_user(&cinf, addr, sizeof(cinf)))
+ break;
+ iinf.dqi_bgrace = cinf.dqi_bgrace;
+ iinf.dqi_igrace = cinf.dqi_igrace;
+ iinf.dqi_flags = cinf.dqi_flags;
+ iinf.dqi_valid = 0;
+ if (cmds == QC_SETINFO || cmds == QC_SETGRACE)
+ iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE;
+ if (cmds == QC_SETINFO || cmds == QC_SETFLAGS)
+ iinf.dqi_valid |= IIF_FLAGS;
+ ret = sb->s_qcop->set_info(sb, type, &iinf);
+ break;
+ }
+
+ case QC_GETSTATS: {
+ struct compat_dqstats stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.version = 6*10000+5*100+0;
+ ret = 0;
+ if (copy_to_user(addr, &stat, sizeof(stat)))
+ ret = -EFAULT;
+ break;
+ }
+
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ if (sb && !IS_ERR(sb))
+ drop_super(sb);
+ return ret;
+}
+
+#endif
+
/*
* This is the system call interface. This communicates with
* the user-level programs. Currently this only supports diskquota
@@ -395,6 +631,11 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
cmds = cmd >> SUBCMDSHIFT;
type = cmd & SUBCMDMASK;
+#ifdef CONFIG_QUOTA_COMPAT
+ if (cmds >= 0x0100 && cmds < 0x3000)
+ return compat_quotactl(cmds, type, special, id, addr);
+#endif
+
if (cmds != Q_SYNC || special) {
sb = quotactl_block(special);
if (IS_ERR(sb))
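For orientation, a minimal userspace sketch (not part of the patch) of how an old byte-granularity request ends up in compat_quotactl(): the command encoding follows the cmds/type split performed by sys_quotactl() above, while the device path, the id and the flattened struct layout are illustrative assumptions only.

	#include <sys/quota.h>		/* quotactl(2), USRQUOTA */
	#include <time.h>

	#define QC_GETQUOTA	0x0D00	/* old-interface command, as defined in this patch */

	/* Layout adapted from the struct compat_dqblk added by this patch. */
	struct compat_dqblk {
		unsigned int dqb_ihardlimit, dqb_isoftlimit, dqb_curinodes;
		unsigned int dqb_bhardlimit, dqb_bsoftlimit;
		unsigned long long dqb_curspace;	/* qsize_t in the kernel */
		time_t dqb_btime, dqb_itime;		/* __kernel_time_t in the kernel */
	};

	int main(void)
	{
		struct compat_dqblk dq;

		/* The command value occupies bits 8..23 and the quota type bits 0..7,
		 * so sys_quotactl() computes cmds == 0x0D00.  That value falls into
		 * [0x0100, 0x3000) and the request is therefore routed to
		 * compat_quotactl() instead of the native handlers. */
		return quotactl((QC_GETQUOTA << 8) | USRQUOTA, "/dev/sda1", 0,
				(void *)&dq);
	}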
@@ -459,6 +700,11 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
compat_uint_t data;
u16 xdata;
long ret;
+#ifdef CONFIG_QUOTA_COMPAT
+ struct compat_dqblk __user *cdq;
+ struct compat_compat_dqblk __user *compat_cdq;
+ compat_time_t time;
+#endif
cmds = cmd >> SUBCMDSHIFT;
@@ -519,6 +765,43 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
break;
ret = 0;
break;
+#ifdef CONFIG_QUOTA_COMPAT
+ case QC_GETQUOTA:
+ cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+ compat_cdq = addr;
+ ret = sys_quotactl(cmd, special, id, cdq);
+ if (ret)
+ break;
+ ret = -EFAULT;
+ if (copy_in_user(compat_cdq, cdq, sizeof(struct compat_compat_dqblk) -
+ offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+ copy_in_user(&compat_cdq->dqb_curspace, &cdq->dqb_curspace,
+ sizeof(cdq->dqb_curspace)) ||
+ get_user(time, &cdq->dqb_btime) ||
+ put_user(time, &compat_cdq->dqb_btime) ||
+ get_user(time, &cdq->dqb_itime) ||
+ put_user(time, &compat_cdq->dqb_itime))
+ break;
+ ret = 0;
+ break;
+ case QC_SETQUOTA:
+ case QC_SETUSE:
+ case QC_SETQLIM:
+ cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+ compat_cdq = addr;
+ ret = -EFAULT;
+ if (copy_in_user(cdq, compat_cdq, sizeof(struct compat_compat_dqblk) -
+ offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+ copy_in_user(&cdq->dqb_curspace, &compat_cdq->dqb_curspace,
+ sizeof(cdq->dqb_curspace)) ||
+ get_user(time, &compat_cdq->dqb_btime) ||
+ put_user(time, &cdq->dqb_btime) ||
+ get_user(time, &compat_cdq->dqb_itime) ||
+ put_user(time, &cdq->dqb_itime))
+ break;
+ ret = sys_quotactl(cmd, special, id, cdq);
+ break;
+#endif
default:
ret = sys_quotactl(cmd, special, id, addr);
}
diff --git a/fs/quota/vzdquota/Makefile b/fs/quota/vzdquota/Makefile
new file mode 100644
index 0000000..03fdee3
--- /dev/null
+++ b/fs/quota/vzdquota/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_VZ_QUOTA) += vzdquota.o
+vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o
+vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o
+vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o
diff --git a/fs/quota/vzdquota/vzdq_file.c b/fs/quota/vzdquota/vzdq_file.c
new file mode 100644
index 0000000..3ac9f05
--- /dev/null
+++ b/fs/quota/vzdquota/vzdq_file.c
@@ -0,0 +1,956 @@
+/*
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file implements Virtuozzo quota files as proc entries.
+ * It is required for standard quota tools to work correctly, since they
+ * expect the aquota.user and aquota.group files.
+ */
+
+#include <linux/ctype.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+#include <linux/mount.h>
+#include <linux/mnt_namespace.h>
+#include "../quotaio_v2.h"
+#include "../quota_tree.h"
+#include <asm/uaccess.h>
+
+#include <linux/sched.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzdq_tree.h>
+#include <linux/vzquota.h>
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+/* ----------------------------------------------------------------------
+ *
+ * File read operation
+ *
+ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c,
+ * perhaps) abuse vz_quota_mutex.
+ * Taking a global mutex for lengthy and user-controlled operations inside
+ * VPSs is not a good idea in general.
+ * In this case, the reasons for taking this mutex are completely unclear,
+ * especially taking into account that the only function that has comments
+ * about the necessity to be called under this mutex
+ * (create_proc_quotafile) is actually called OUTSIDE it.
+ *
+ * --------------------------------------------------------------------- */
+
+#define DQBLOCK_SIZE 1024
+#define DQUOTBLKNUM 21U
+#define DQTREE_DEPTH 4
+#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1)
+#define ISINDBLOCK(num) ((num)%2 != 0)
+#define FIRST_DATABLK 2 /* first even number */
+#define LAST_IND_LEVEL (DQTREE_DEPTH - 1)
+#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS))
+#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \
+ & QUOTATREE_BMASK)
+
+#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH)
+#error xBITS and DQTREE_DEPTH do not correspond
+#endif
+
+#define BLOCK_NOT_FOUND 1
+
+/* data for quota file -- one per proc entry */
+struct quotatree_data {
+ struct list_head list;
+ struct vz_quota_master *qmblk;
+ int type; /* type of the tree */
+};
+
+/* serialized by vz_quota_mutex */
+static LIST_HEAD(qf_data_head);
+
+static const u_int32_t vzquota_magics[] = V2_INITQMAGICS;
+static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS;
+static const char aquota_user[] = "aquota.user";
+static const char aquota_group[] = "aquota.group";
+
+
+static inline loff_t get_depoff(int depth)
+{
+ loff_t res = 1;
+ while (depth) {
+ res += (1 << ((depth - 1)*QUOTAID_EBITS + 1));
+ depth--;
+ }
+ return res;
+}
+
+static inline loff_t get_blknum(loff_t num, int depth)
+{
+ loff_t res;
+ res = (num << 1) + get_depoff(depth);
+ return res;
+}
+
+static int get_depth(loff_t num)
+{
+ int i;
+ for (i = 0; i < DQTREE_DEPTH; i++) {
+ if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1
+ || num < get_depoff(i + 1)))
+ return i;
+ }
+ return -1;
+}
+
+static inline loff_t get_offset(loff_t num)
+{
+ loff_t res, tmp;
+
+ tmp = get_depth(num);
+ if (tmp < 0)
+ return -1;
+ num -= get_depoff(tmp);
+ BUG_ON(num < 0);
+ res = num >> 1;
+
+ return res;
+}
+
+static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level)
+{
+ /* return maximum available block num */
+ return tree->levels[level].freenum;
+}
+
+static inline loff_t get_block_num(struct quotatree_tree *tree)
+{
+ loff_t ind_blk_num, quot_blk_num, max_ind, max_quot;
+
+ quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1);
+ max_quot = TREENUM_2_BLKNUM(quot_blk_num);
+ ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1));
+ max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL)
+ : get_blknum(ind_blk_num, 0);
+
+ return (max_ind > max_quot) ? max_ind + 1 : max_quot + 1;
+}
+
+/* Fill in the quota file header */
+static int read_header(void *buf, struct quotatree_tree *tree,
+ struct dq_info *dq_ugid_info, int type)
+{
+ struct v2_disk_dqheader *dqh;
+ struct v2_disk_dqinfo *dq_disk_info;
+
+ dqh = buf;
+ dq_disk_info = buf + sizeof(struct v2_disk_dqheader);
+
+ dqh->dqh_magic = vzquota_magics[type];
+ dqh->dqh_version = vzquota_versions[type];
+
+ dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire;
+ dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire;
+ dq_disk_info->dqi_flags = 0; /* no flags */
+ dq_disk_info->dqi_blocks = get_block_num(tree);
+ dq_disk_info->dqi_free_blk = 0; /* first block in the file */
+ dq_disk_info->dqi_free_entry = FIRST_DATABLK;
+
+ return 0;
+}
+
+static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf)
+{
+ int i, j, lev_num;
+
+ lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1;
+ for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) {
+ struct quotatree_node *next, *parent;
+
+ parent = p;
+ next = p;
+ for (j = lev_num; j >= 0; j--) {
+ if (!next->blocks[GETLEVINDX(i,j)]) {
+ buf[i] = 0;
+ goto bad_branch;
+ }
+ parent = next;
+ next = next->blocks[GETLEVINDX(i,j)];
+ }
+ buf[i] = (depth == DQTREE_DEPTH - 1) ?
+ TREENUM_2_BLKNUM(parent->num)
+ : get_blknum(next->num, depth + 1);
+
+ bad_branch:
+ ;
+ }
+
+ return 0;
+}
+
+/*
+ * Write index block to disk (or buffer)
+ * @buf has length 256*sizeof(u_int32_t) bytes
+ */
+static int read_index_block(int num, u_int32_t *buf,
+ struct quotatree_tree *tree)
+{
+ struct quotatree_node *p;
+ u_int32_t index;
+ loff_t off;
+ int depth, res;
+
+ res = BLOCK_NOT_FOUND;
+ index = 0;
+ depth = get_depth(num);
+ off = get_offset(num);
+ if (depth < 0 || off < 0)
+ return -EINVAL;
+
+ list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh,
+ list) {
+ if (p->num >= off)
+ res = 0;
+ if (p->num != off)
+ continue;
+ get_block_child(depth, p, buf);
+ break;
+ }
+
+ return res;
+}
+
+static inline void convert_quot_format(struct v2_disk_dqblk *dq,
+ struct vz_quota_ugid *vzq)
+{
+ dq->dqb_id = vzq->qugid_id;
+ dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit;
+ dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit;
+ dq->dqb_curinodes = vzq->qugid_stat.icurrent;
+ dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE;
+ dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE;
+ dq->dqb_curspace = vzq->qugid_stat.bcurrent;
+ dq->dqb_btime = vzq->qugid_stat.btime;
+ dq->dqb_itime = vzq->qugid_stat.itime;
+}
+
+static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree)
+{
+ int res, i, entries = 0;
+ struct qt_disk_dqdbheader *dq_header;
+ struct quotatree_node *p;
+ struct v2_disk_dqblk *blk = buf + sizeof(struct qt_disk_dqdbheader);
+
+ res = BLOCK_NOT_FOUND;
+ dq_header = buf;
+ memset(dq_header, 0, sizeof(*dq_header));
+
+ list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh),
+ list) {
+ if (TREENUM_2_BLKNUM(p->num) >= num)
+ res = 0;
+ if (TREENUM_2_BLKNUM(p->num) != num)
+ continue;
+
+ for (i = 0; i < QUOTATREE_BSIZE; i++) {
+ if (!p->blocks[i])
+ continue;
+ convert_quot_format(blk + entries,
+ (struct vz_quota_ugid *)p->blocks[i]);
+ entries++;
+ res = 0;
+ }
+ break;
+ }
+ dq_header->dqdh_entries = entries;
+
+ return res;
+}
+
+static int read_block(int num, void *buf, struct quotatree_tree *tree,
+ struct dq_info *dq_ugid_info, int magic)
+{
+ int res;
+
+ memset(buf, 0, DQBLOCK_SIZE);
+ if (!num)
+ res = read_header(buf, tree, dq_ugid_info, magic);
+ else if (ISINDBLOCK(num))
+ res = read_index_block(num, (u_int32_t*)buf, tree);
+ else
+ res = read_dquot(num, buf, tree);
+
+ return res;
+}
+
+/*
+ * FIXME: this function can handle quota files up to 2GB only.
+ */
+static int read_proc_quotafile(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ off_t blk_num, blk_off, buf_off;
+ char *tmp;
+ size_t buf_size;
+ struct quotatree_data *qtd;
+ struct quotatree_tree *tree;
+ struct dq_info *dqi;
+ int res;
+
+ *start = NULL;
+ tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ qtd = data;
+ mutex_lock(&vz_quota_mutex);
+ mutex_lock(&qtd->qmblk->dq_mutex);
+
+ res = 0;
+ tree = QUGID_TREE(qtd->qmblk, qtd->type);
+ if (!tree) {
+ *eof = 1;
+ goto out_dq;
+ }
+
+ dqi = &qtd->qmblk->dq_ugid_info[qtd->type];
+
+ buf_off = 0;
+ buf_size = count;
+ blk_num = off / DQBLOCK_SIZE;
+ blk_off = off % DQBLOCK_SIZE;
+
+ while (buf_size > 0) {
+ off_t len;
+
+ len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size);
+ res = read_block(blk_num, tmp, tree, dqi, qtd->type);
+ if (res < 0)
+ goto out_err;
+ if (res == BLOCK_NOT_FOUND) {
+ *eof = 1;
+ break;
+ }
+ memcpy(page + buf_off, tmp + blk_off, len);
+
+ blk_num++;
+ buf_size -= len;
+ blk_off = 0;
+ buf_off += len;
+ }
+ res = buf_off;
+
+out_err:
+ *start += count;
+out_dq:
+ mutex_unlock(&qtd->qmblk->dq_mutex);
+ mutex_unlock(&vz_quota_mutex);
+ kfree(tmp);
+
+ return res;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * /proc/vz/vzaquota/QID/aquota.* files
+ *
+ * FIXME: this code lacks serialization of read/readdir/lseek.
+ * However, this should be addressed only after the mainstream issue of the
+ * apparently non-atomic read and update of the file position in sys_read is fixed.
+ *
+ * --------------------------------------------------------------------- */
+
+static inline unsigned long vzdq_aquot_getino(dev_t dev)
+{
+ return 0xec000000UL + dev;
+}
+
+static inline dev_t vzdq_aquot_getidev(struct inode *inode)
+{
+ return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link;
+}
+
+static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev)
+{
+ PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev;
+}
+
+static ssize_t vzdq_aquotf_read(struct file *file,
+ char __user *buf, size_t size, loff_t *ppos)
+{
+ char *page;
+ size_t bufsize;
+ ssize_t l, l2, copied;
+ char *start;
+ struct inode *inode;
+ struct block_device *bdev;
+ struct super_block *sb;
+ struct quotatree_data data;
+ int eof, err;
+
+ err = -ENOMEM;
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (page == NULL)
+ goto out_err;
+
+ err = -ENODEV;
+ inode = file->f_dentry->d_inode;
+ bdev = bdget(vzdq_aquot_getidev(inode));
+ if (bdev == NULL)
+ goto out_err;
+ sb = get_super(bdev);
+ bdput(bdev);
+ if (sb == NULL)
+ goto out_err;
+ data.qmblk = vzquota_find_qmblk(sb);
+ data.type = PROC_I(inode)->fd - 1;
+ drop_super(sb);
+ if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD)
+ goto out_err;
+
+ copied = 0;
+ l = l2 = 0;
+ while (1) {
+ bufsize = min(size, (size_t)PAGE_SIZE);
+ if (bufsize <= 0)
+ break;
+
+ l = read_proc_quotafile(page, &start, *ppos, bufsize,
+ &eof, &data);
+ if (l <= 0)
+ break;
+
+ l2 = copy_to_user(buf, page, l);
+ copied += l - l2;
+ if (l2)
+ break;
+
+ buf += l;
+ size -= l;
+ *ppos += (unsigned long)start;
+ l = l2 = 0;
+ }
+
+ qmblk_put(data.qmblk);
+ free_page((unsigned long)page);
+ if (copied)
+ return copied;
+ else if (l2) /* last copy_to_user failed */
+ return -EFAULT;
+ else /* read error or EOF */
+ return l;
+
+out_err:
+ if (page != NULL)
+ free_page((unsigned long)page);
+ return err;
+}
+
+static struct file_operations vzdq_aquotf_file_operations = {
+ .read = &vzdq_aquotf_read,
+};
+
+static struct inode_operations vzdq_aquotf_inode_operations = {
+};
+
+
+/* ----------------------------------------------------------------------
+ *
+ * /proc/vz/vzaquota/QID directory
+ *
+ * --------------------------------------------------------------------- */
+
+static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler)
+{
+ loff_t n;
+ int err;
+
+ n = file->f_pos;
+ for (err = 0; !err; n++) {
+ /* ppc32 can't cmp 2 long long's in switch, calls __cmpdi2() */
+ switch ((unsigned long)n) {
+ case 0:
+ err = (*filler)(data, ".", 1, n,
+ file->f_dentry->d_inode->i_ino,
+ DT_DIR);
+ break;
+ case 1:
+ err = (*filler)(data, "..", 2, n,
+ parent_ino(file->f_dentry), DT_DIR);
+ break;
+ case 2:
+ err = (*filler)(data, aquota_user,
+ sizeof(aquota_user)-1, n,
+ file->f_dentry->d_inode->i_ino
+ + USRQUOTA + 1,
+ DT_REG);
+ break;
+ case 3:
+ err = (*filler)(data, aquota_group,
+ sizeof(aquota_group)-1, n,
+ file->f_dentry->d_inode->i_ino
+ + GRPQUOTA + 1,
+ DT_REG);
+ break;
+ default:
+ goto out;
+ }
+ }
+out:
+ file->f_pos = n;
+ return err;
+}
+
+struct vzdq_aquotq_lookdata {
+ dev_t dev;
+ int type;
+ struct vz_quota_master *qmblk;
+};
+
+static int vzdq_aquotq_looktest(struct inode *inode, void *data)
+{
+ struct vzdq_aquotq_lookdata *d;
+
+ d = data;
+ return inode->i_op == &vzdq_aquotf_inode_operations &&
+ vzdq_aquot_getidev(inode) == d->dev &&
+ PROC_I(inode)->fd == d->type + 1;
+}
+
+static int vzdq_aquotq_lookset(struct inode *inode, void *data)
+{
+ struct vzdq_aquotq_lookdata *d;
+ struct super_block *sb;
+ struct quotatree_data qtd;
+ struct quotatree_tree *tree;
+
+ d = data;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1;
+ inode->i_mode = S_IFREG | S_IRUSR;
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ inode->i_nlink = 1;
+ inode->i_op = &vzdq_aquotf_inode_operations;
+ inode->i_fop = &vzdq_aquotf_file_operations;
+ PROC_I(inode)->fd = d->type + 1;
+ vzdq_aquot_setidev(inode, d->dev);
+
+ /* Setting size */
+ sb = user_get_super(d->dev);
+ if (sb == NULL)
+ return -ENODEV;
+ qtd.qmblk = vzquota_find_qmblk(sb);
+ drop_super(sb);
+
+ if (qtd.qmblk == NULL)
+ return -ESRCH;
+ if (qtd.qmblk == VZ_QUOTA_BAD)
+ return -EIO;
+
+ qtd.type = PROC_I(inode)->fd - 1;
+ tree = QUGID_TREE(qtd.qmblk, qtd.type);
+ inode->i_size = get_block_num(tree) * 1024;
+ return 0;
+}
+
+static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd)
+{
+ return 0;
+}
+
+static struct dentry_operations vzdq_aquotq_dentry_operations = {
+ .d_revalidate = &vzdq_aquotq_revalidate,
+};
+
+static struct vz_quota_master *find_qmblk_by_dev(dev_t dev)
+{
+ struct super_block *sb;
+ struct vz_quota_master *qmblk;
+
+ qmblk = NULL;
+ sb = user_get_super(dev);
+ if (sb != NULL) {
+ qmblk = vzquota_find_qmblk(sb);
+ drop_super(sb);
+
+ if (qmblk == VZ_QUOTA_BAD)
+ qmblk = NULL;
+ }
+
+ return qmblk;
+}
+
+static struct dentry *vzdq_aquotq_lookup(struct inode *dir,
+ struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct inode *inode;
+ struct vzdq_aquotq_lookdata d;
+ int k;
+
+ if (dentry->d_name.len == sizeof(aquota_user)-1) {
+ if (memcmp(dentry->d_name.name, aquota_user,
+ sizeof(aquota_user)-1))
+ goto out;
+ k = USRQUOTA;
+ } else if (dentry->d_name.len == sizeof(aquota_group)-1) {
+ if (memcmp(dentry->d_name.name, aquota_group,
+ sizeof(aquota_group)-1))
+ goto out;
+ k = GRPQUOTA;
+ } else
+ goto out;
+ d.dev = vzdq_aquot_getidev(dir);
+ d.type = k;
+ d.qmblk = find_qmblk_by_dev(d.dev);
+ if (d.qmblk == NULL)
+ goto out;
+
+ inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1,
+ vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d);
+ if (inode == NULL)
+ goto out;
+ unlock_new_inode(inode);
+ dentry->d_op = &vzdq_aquotq_dentry_operations;
+ d_add(dentry, inode);
+ return NULL;
+
+out:
+ return ERR_PTR(-ENOENT);
+}
+
+static struct file_operations vzdq_aquotq_file_operations = {
+ .read = &generic_read_dir,
+ .readdir = &vzdq_aquotq_readdir,
+};
+
+static struct inode_operations vzdq_aquotq_inode_operations = {
+ .lookup = &vzdq_aquotq_lookup,
+};
+
+
+/* ----------------------------------------------------------------------
+ *
+ * /proc/vz/vzaquota directory
+ *
+ * --------------------------------------------------------------------- */
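For illustration, on a host where a container's root filesystem lives on, say, device 8:1 (new_encode_dev() == 0x801), the tree synthesized by the readdir/lookup methods in this section would contain:

	/proc/vz/vzaquota/00000801/aquota.user
	/proc/vz/vzaquota/00000801/aquota.group

where the directory name is the "%08x" encoding produced by vzdq_aquotd_readdir() and the two per-type files come from vzdq_aquotq_readdir()/vzdq_aquotq_lookup().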
+
+struct vzdq_aquot_de {
+ struct list_head list;
+ struct vfsmount *mnt;
+};
+
+static int vzdq_aquot_buildmntlist(struct ve_struct *ve,
+ struct list_head *head)
+{
+ struct vfsmount *rmnt, *mnt;
+ struct vzdq_aquot_de *p;
+ int err;
+
+#ifdef CONFIG_VE
+ rmnt = mntget(ve->root_path.mnt);
+#else
+ read_lock(&current->fs->lock);
+ rmnt = mntget(current->fs->rootmnt);
+ read_unlock(&current->fs->lock);
+#endif
+ mnt = rmnt;
+ spin_lock(&vfsmount_lock);
+ while (1) {
+ list_for_each_entry(p, head, list) {
+ if (p->mnt->mnt_sb == mnt->mnt_sb)
+ goto skip;
+ }
+
+ err = -ENOMEM;
+ p = kmalloc(sizeof(*p), GFP_ATOMIC);
+ if (p == NULL)
+ goto out;
+ p->mnt = mntget(mnt);
+ list_add_tail(&p->list, head);
+
+skip:
+ err = 0;
+ if (list_empty(&mnt->mnt_mounts)) {
+ while (1) {
+ if (mnt == rmnt)
+ goto out;
+ if (mnt->mnt_child.next !=
+ &mnt->mnt_parent->mnt_mounts)
+ break;
+ mnt = mnt->mnt_parent;
+ }
+ mnt = list_entry(mnt->mnt_child.next,
+ struct vfsmount, mnt_child);
+ } else
+ mnt = list_entry(mnt->mnt_mounts.next,
+ struct vfsmount, mnt_child);
+ }
+out:
+ spin_unlock(&vfsmount_lock);
+ mntput(rmnt);
+ return err;
+}
+
+static void vzdq_aquot_releasemntlist(struct ve_struct *ve,
+ struct list_head *head)
+{
+ struct vzdq_aquot_de *p;
+
+ while (!list_empty(head)) {
+ p = list_entry(head->next, typeof(*p), list);
+ mntput(p->mnt);
+ list_del(&p->list);
+ kfree(p);
+ }
+}
+
+static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler)
+{
+ struct ve_struct *ve, *old_ve;
+ struct list_head mntlist;
+ struct vzdq_aquot_de *de;
+ struct super_block *sb;
+ struct vz_quota_master *qmblk;
+ loff_t i, n;
+ char buf[24];
+ int l, err;
+
+ i = 0;
+ n = file->f_pos;
+ ve = file->f_dentry->d_sb->s_type->owner_env;
+ old_ve = set_exec_env(ve);
+
+ INIT_LIST_HEAD(&mntlist);
+#ifdef CONFIG_VE
+ /*
+ * The only reason for disabling readdir for the host system is that
+ * this readdir can be slow and CPU-consuming with a large number of
+ * VPSs (or just mount points).
+ */
+ err = ve_is_super(ve);
+#else
+ err = 0;
+#endif
+ if (!err) {
+ err = vzdq_aquot_buildmntlist(ve, &mntlist);
+ if (err)
+ goto out_err;
+ }
+
+ if (i >= n) {
+ if ((*filler)(data, ".", 1, i,
+ file->f_dentry->d_inode->i_ino, DT_DIR))
+ goto out_fill;
+ }
+ i++;
+
+ if (i >= n) {
+ if ((*filler)(data, "..", 2, i,
+ parent_ino(file->f_dentry), DT_DIR))
+ goto out_fill;
+ }
+ i++;
+
+ list_for_each_entry (de, &mntlist, list) {
+ sb = de->mnt->mnt_sb;
+ if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL))
+ continue;
+
+ qmblk = vzquota_find_qmblk(sb);
+ if (qmblk == NULL || qmblk == VZ_QUOTA_BAD)
+ continue;
+
+ qmblk_put(qmblk);
+ i++;
+ if (i <= n)
+ continue;
+
+ l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev));
+ if ((*filler)(data, buf, l, i - 1,
+ vzdq_aquot_getino(sb->s_dev), DT_DIR))
+ break;
+ }
+
+out_fill:
+ err = 0;
+ file->f_pos = i;
+out_err:
+ vzdq_aquot_releasemntlist(ve, &mntlist);
+ (void)set_exec_env(old_ve);
+ return err;
+}
+
+static int vzdq_aquotd_looktest(struct inode *inode, void *data)
+{
+ return inode->i_op == &vzdq_aquotq_inode_operations &&
+ vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data;
+}
+
+static int vzdq_aquotd_lookset(struct inode *inode, void *data)
+{
+ dev_t dev;
+
+ dev = (dev_t)(unsigned long)data;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_ino = vzdq_aquot_getino(dev);
+ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ inode->i_nlink = 2;
+ inode->i_op = &vzdq_aquotq_inode_operations;
+ inode->i_fop = &vzdq_aquotq_file_operations;
+ vzdq_aquot_setidev(inode, dev);
+ return 0;
+}
+
+static struct dentry *vzdq_aquotd_lookup(struct inode *dir,
+ struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct ve_struct *ve, *old_ve;
+ const unsigned char *s;
+ int l;
+ dev_t dev;
+ struct inode *inode;
+
+ ve = dir->i_sb->s_type->owner_env;
+ old_ve = set_exec_env(ve);
+#ifdef CONFIG_VE
+ /*
+ * Lookup is much lighter than readdir, so it can be allowed for the
+ * host system. But it would be strange to allow lookup without
+ * allowing readdir...
+ */
+ if (ve_is_super(ve))
+ goto out;
+#endif
+
+ dev = 0;
+ l = dentry->d_name.len;
+ if (l <= 0)
+ goto out;
+ for (s = dentry->d_name.name; l > 0; s++, l--) {
+ if (!isxdigit(*s))
+ goto out;
+ if (dev & ~(~0UL >> 4))
+ goto out;
+ dev <<= 4;
+ if (isdigit(*s))
+ dev += *s - '0';
+ else if (islower(*s))
+ dev += *s - 'a' + 10;
+ else
+ dev += *s - 'A' + 10;
+ }
+ dev = new_decode_dev(dev);
+
+ if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL))
+ goto out;
+
+ inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev),
+ vzdq_aquotd_looktest, vzdq_aquotd_lookset,
+ (void *)(unsigned long)dev);
+ if (inode == NULL)
+ goto out;
+ unlock_new_inode(inode);
+
+ d_add(dentry, inode);
+ (void)set_exec_env(old_ve);
+ return NULL;
+
+out:
+ (void)set_exec_env(old_ve);
+ return ERR_PTR(-ENOENT);
+}
+
+static int vzdq_aquotd_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct ve_struct *ve, *old_ve;
+ struct list_head mntlist, *pos;
+
+ generic_fillattr(dentry->d_inode, stat);
+ ve = dentry->d_sb->s_type->owner_env;
+#ifdef CONFIG_VE
+ /*
+ * The only reason for disabling getattr for the host system is that
+ * this getattr can be slow and CPU-consuming with a large number of
+ * VPSs (or just mount points).
+ */
+ if (ve_is_super(ve))
+ return 0;
+#endif
+ INIT_LIST_HEAD(&mntlist);
+ old_ve = set_exec_env(ve);
+ if (!vzdq_aquot_buildmntlist(ve, &mntlist))
+ list_for_each(pos, &mntlist)
+ stat->nlink++;
+ vzdq_aquot_releasemntlist(ve, &mntlist);
+ (void)set_exec_env(old_ve);
+ return 0;
+}
+
+static struct file_operations vzdq_aquotd_file_operations = {
+ .read = &generic_read_dir,
+ .readdir = &vzdq_aquotd_readdir,
+};
+
+static struct inode_operations vzdq_aquotd_inode_operations = {
+ .lookup = &vzdq_aquotd_lookup,
+ .getattr = &vzdq_aquotd_getattr,
+};
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Initialization and deinitialization
+ *
+ * --------------------------------------------------------------------- */
+static int fake_data;
+static struct ctl_table fake_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = ".fake",
+ .mode = 0600,
+ .proc_handler = proc_dointvec,
+ .data = &fake_data,
+ .maxlen = sizeof(int),
+ },
+ { }
+};
+
+static struct ctl_path fake_path[] = {
+ { .ctl_name = CTL_FS, .procname = "fs", },
+ { .ctl_name = FS_DQSTATS, .procname = "quota", },
+ { }
+};
+
+/*
+ * FIXME: creation of proc entries here is unsafe with respect to module
+ * unloading.
+ */
+void vzaquota_init(void)
+{
+ struct proc_dir_entry *de;
+
+ de = proc_create("vzaquota", S_IFDIR | S_IRUSR | S_IXUSR,
+ glob_proc_vz_dir, &vzdq_aquotd_file_operations);
+ if (de != NULL)
+ de->proc_iops = &vzdq_aquotd_inode_operations;
+ else
+ printk("VZDQ: vz/vzaquota creation failed\n");
+
+ register_sysctl_glob_paths(fake_path, fake_table, 1);
+}
+
+void vzaquota_fini(void)
+{
+ remove_proc_entry("vz/vzaquota", NULL);
+}
diff --git a/fs/quota/vzdquota/vzdq_mgmt.c b/fs/quota/vzdquota/vzdq_mgmt.c
new file mode 100644
index 0000000..bd066de
--- /dev/null
+++ b/fs/quota/vzdquota/vzdq_mgmt.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/quota.h>
+#include <linux/vzctl_quota.h>
+#include <linux/vzquota.h>
+
+
+/* ----------------------------------------------------------------------
+ * Switching quota on.
+ * --------------------------------------------------------------------- */
+
+/*
+ * check limits copied from user
+ */
+int vzquota_check_sane_limits(struct dq_stat *qstat)
+{
+ int err;
+
+ err = -EINVAL;
+
+ /* softlimit must not exceed hardlimit */
+ if (qstat->bsoftlimit > qstat->bhardlimit)
+ goto out;
+
+ if (qstat->isoftlimit > qstat->ihardlimit)
+ goto out;
+
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * check usage values copied from user
+ */
+int vzquota_check_sane_values(struct dq_stat *qstat)
+{
+ int err;
+
+ err = -EINVAL;
+
+ /* expiration time must not be set if softlimit was not exceeded */
+ if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0)
+ goto out;
+
+ if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0)
+ goto out;
+
+ err = vzquota_check_sane_limits(qstat);
+out:
+ return err;
+}
+
+/*
+ * create new quota master block
+ * this function should:
+ * - copy limits and usage parameters from the user buffer;
+ * - allocate and initialize the quota block and insert it into the hash;
+ */
+static int vzquota_create(unsigned int quota_id,
+ struct vz_quota_stat __user *u_qstat, int compat)
+{
+ int err;
+ struct vz_quota_stat qstat;
+ struct vz_quota_master *qmblk;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -EFAULT;
+ if (!compat) {
+ if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
+ goto out;
+ } else {
+#ifdef CONFIG_COMPAT
+ struct compat_vz_quota_stat cqstat;
+ if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
+ goto out;
+ compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat);
+ compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info);
+#endif
+ }
+
+ err = -EINVAL;
+ if (quota_id == 0)
+ goto out;
+
+ if (vzquota_check_sane_values(&qstat.dq_stat))
+ goto out;
+ err = 0;
+ qmblk = vzquota_alloc_master(quota_id, &qstat);
+
+ if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */
+ err = PTR_ERR(qmblk);
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
+/**
+ * vzquota_on - turn quota on
+ *
+ * This function should:
+ * - find and get refcnt of directory entry for quota root and corresponding
+ * mountpoint;
+ * - find corresponding quota block and mark it with given path;
+ * - check quota tree;
+ * - initialize quota for the tree root.
+ */
+static int vzquota_on(unsigned int quota_id, const char __user *quota_root,
+ char __user *buf)
+{
+ int err;
+ struct path path;
+ struct vz_quota_master *qmblk;
+ struct super_block *dqsb;
+
+ dqsb = NULL;
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EBUSY;
+ if (qmblk->dq_state != VZDQ_STARTING)
+ goto out;
+
+ err = user_path(quota_root, &path);
+ if (err)
+ goto out;
+	/* the quota root path must be a directory */
+ err = -ENOTDIR;
+ if (!S_ISDIR(path.dentry->d_inode->i_mode))
+ goto out_path;
+
+ qmblk->dq_root_path = path;
+ qmblk->dq_sb = path.dentry->d_inode->i_sb;
+ err = vzquota_get_super(qmblk->dq_sb);
+ if (err)
+ goto out_super;
+
+ /*
+ * Serialization with quota initialization and operations is performed
+ * through generation check: generation is memorized before qmblk is
+ * found and compared under inode_qmblk_lock with assignment.
+ *
+ * Note that the dentry tree is shrunk only for high-level logical
+	 * serialization, purely as a courtesy to the user: to get consistent
+	 * quota statistics, files should be closed, etc., when quota is turned on.
+ */
+ err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_path.dentry->d_inode,
+ qmblk, buf);
+ if (err)
+ goto out_init;
+ qmblk->dq_state = VZDQ_WORKING;
+
+ mutex_unlock(&vz_quota_mutex);
+ return 0;
+
+out_init:
+ dqsb = qmblk->dq_sb;
+out_super:
+ /* clear for qmblk_put/quota_free_master */
+ qmblk->dq_sb = NULL;
+ qmblk->dq_root_path.dentry = NULL;
+ qmblk->dq_root_path.mnt = NULL;
+out_path:
+ path_put(&path);
+out:
+ if (dqsb)
+ vzquota_put_super(dqsb);
+ mutex_unlock(&vz_quota_mutex);
+ return err;
+}
+
+
+/* ----------------------------------------------------------------------
+ * Switching quota off.
+ * --------------------------------------------------------------------- */
+
+/*
+ * destroy quota block by ID
+ */
+static int vzquota_destroy(unsigned int quota_id)
+{
+ int err;
+ struct vz_quota_master *qmblk;
+ struct path root;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EBUSY;
+ if (qmblk->dq_state == VZDQ_WORKING)
+ goto out; /* quota_off first */
+
+ list_del_init(&qmblk->dq_hash);
+ root = qmblk->dq_root_path;
+ qmblk->dq_root_path.dentry = NULL;
+ qmblk->dq_root_path.mnt = NULL;
+
+ if (qmblk->dq_sb)
+ vzquota_put_super(qmblk->dq_sb);
+ mutex_unlock(&vz_quota_mutex);
+
+ qmblk_put(qmblk);
+ path_put(&root);
+ return 0;
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+ return err;
+}
+
+/**
+ * vzquota_off - turn quota off
+ */
+
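+/*
+ * Write back every inode currently linked to the quota master block.
+ * Called with inode_qmblk_lock held; the lock is dropped around
+ * sync_inode() and around rescheduling, and the processed qlnk entries
+ * are spliced back onto the original list before returning.
+ */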
+static int __vzquota_sync_list(struct list_head *lh,
+ struct vz_quota_master *qmblk,
+ enum writeback_sync_modes sync_mode)
+{
+ struct writeback_control wbc;
+ LIST_HEAD(list);
+ struct vz_quota_ilink *qlnk;
+ struct inode *inode;
+ int err, ret;
+
+ memset(&wbc, 0, sizeof(wbc));
+ wbc.sync_mode = sync_mode;
+
+ err = ret = 0;
+ while (!list_empty(lh)) {
+ if (need_resched()) {
+ inode_qmblk_unlock(qmblk->dq_sb);
+ schedule();
+ inode_qmblk_lock(qmblk->dq_sb);
+ continue;
+ }
+
+ qlnk = list_first_entry(lh, struct vz_quota_ilink, list);
+ list_move(&qlnk->list, &list);
+
+ inode = igrab(QLNK_INODE(qlnk));
+ if (!inode)
+ continue;
+
+ inode_qmblk_unlock(qmblk->dq_sb);
+
+ wbc.nr_to_write = LONG_MAX;
+ ret = sync_inode(inode, &wbc);
+ if (ret)
+ err = ret;
+ iput(inode);
+
+ inode_qmblk_lock(qmblk->dq_sb);
+ }
+
+ list_splice(&list, lh);
+ return err;
+}
+
+static int vzquota_sync_list(struct list_head *lh,
+ struct vz_quota_master *qmblk)
+{
+ (void)__vzquota_sync_list(lh, qmblk, WB_SYNC_NONE);
+ return __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL);
+}
+
+static int vzquota_sync_inodes(struct vz_quota_master *qmblk)
+{
+ int err;
+ LIST_HEAD(qlnk_list);
+
+ list_splice_init(&qmblk->dq_ilink_list, &qlnk_list);
+ err = vzquota_sync_list(&qlnk_list, qmblk);
+ if (!err && !list_empty(&qmblk->dq_ilink_list))
+ err = -EBUSY;
+ list_splice(&qlnk_list, &qmblk->dq_ilink_list);
+
+ return err;
+}
+
+static int vzquota_off(unsigned int quota_id, char __user *buf, int force)
+{
+ int err, ret;
+ struct vz_quota_master *qmblk;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EALREADY;
+ if (qmblk->dq_state != VZDQ_WORKING)
+ goto out;
+
+ inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */
+ ret = vzquota_sync_inodes(qmblk);
+ inode_qmblk_unlock(qmblk->dq_sb);
+
+ err = vzquota_off_qmblk(qmblk->dq_sb, qmblk, buf, force);
+ if (err)
+ goto out;
+
+ err = ret;
+ /* vzquota_destroy will free resources */
+ qmblk->dq_state = VZDQ_STOPING;
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
+
+/* ----------------------------------------------------------------------
+ * Other VZQUOTA ioctl's.
+ * --------------------------------------------------------------------- */
+
+/*
+ * this function should:
+ * - set new limits/buffer under quota master block lock
+ * - if the new softlimit is less than the usage, set the expiration time
+ * - no need to alloc ugid hash table - we'll do that on demand
+ */
+int vzquota_update_limit(struct dq_stat *_qstat,
+ struct dq_stat *qstat)
+{
+ int err;
+
+ err = -EINVAL;
+ if (vzquota_check_sane_limits(qstat))
+ goto out;
+
+ err = 0;
+
+ /* limits */
+ _qstat->bsoftlimit = qstat->bsoftlimit;
+ _qstat->bhardlimit = qstat->bhardlimit;
+	/*
+	 * If the soft limit is exceeded, the administrator can override the
+	 * moment when the grace period for exceeding the limit ends.
+	 * Specifying the moment may be useful if the soft limit is set to be
+	 * lower than the current usage.  In the latter case, if the grace
+	 * period end isn't specified, the grace period will start from the
+	 * moment of the first write operation.
+	 * There is a race with the user level.  The soft limit may already be
+	 * exceeded before the limit change, and the grace period end
+	 * calculated by the kernel will be overridden.  The user level may
+	 * check whether the limit is already exceeded, but the check and set
+	 * calls are not atomic.
+	 * This race isn't dangerous.  Under normal circumstances, the
+	 * difference between the grace period end calculated by the kernel
+	 * and by the user level should not be greater than the difference
+	 * between the moments of the check and set calls, i.e. no bigger
+	 * than the quota timer resolution of 1 second.
+	 */
+ if (qstat->btime != (time_t)0 &&
+ _qstat->bcurrent >= _qstat->bsoftlimit)
+ _qstat->btime = qstat->btime;
+
+ _qstat->isoftlimit = qstat->isoftlimit;
+ _qstat->ihardlimit = qstat->ihardlimit;
+ if (qstat->itime != (time_t)0 &&
+ _qstat->icurrent >= _qstat->isoftlimit)
+ _qstat->itime = qstat->itime;
+
+out:
+ return err;
+}
+
+/*
+ * set new quota limits.
+ * this function should:
+ * - copy new limits from user level
+ * - find quota block
+ * - set new limits and flags.
+ */
+static int vzquota_setlimit(unsigned int quota_id,
+ struct vz_quota_stat __user *u_qstat, int compat)
+{
+ int err;
+ struct vz_quota_stat qstat;
+ struct vz_quota_master *qmblk;
+
+ mutex_lock(&vz_quota_mutex); /* for hash list protection */
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EFAULT;
+ if (!compat) {
+ if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
+ goto out;
+ } else {
+#ifdef CONFIG_COMPAT
+ struct compat_vz_quota_stat cqstat;
+ if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat)))
+ goto out;
+ compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat);
+ compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info);
+#endif
+ }
+
+ qmblk_data_write_lock(qmblk);
+ err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat);
+ if (err == 0)
+ qmblk->dq_info = qstat.dq_info;
+ qmblk_data_write_unlock(qmblk);
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+ return err;
+}
+
+/*
+ * get quota limits.
+ * very simple - just return stat buffer to user
+ */
+static int vzquota_getstat(unsigned int quota_id,
+ struct vz_quota_stat __user *u_qstat, int compat)
+{
+ int err;
+ struct vz_quota_stat qstat;
+ struct vz_quota_master *qmblk;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ qmblk_data_read_lock(qmblk);
+ /* copy whole buffer under lock */
+ memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat));
+ memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info));
+ qmblk_data_read_unlock(qmblk);
+
+ if (!compat)
+ err = copy_to_user(u_qstat, &qstat, sizeof(qstat));
+ else {
+#ifdef CONFIG_COMPAT
+ struct compat_vz_quota_stat cqstat;
+ dqstat2compat_dqstat(&qstat.dq_stat, &cqstat.dq_stat);
+ dqinfo2compat_dqinfo(&qstat.dq_info, &cqstat.dq_info);
+ err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat));
+#endif
+ }
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+ return err;
+}
+
+/*
+ * This is a system call to turn per-VE disk quota on.
+ * Note this call is allowed to run ONLY from VE0
+ */
+long do_vzquotactl(int cmd, unsigned int quota_id,
+ struct vz_quota_stat __user *qstat, const char __user *ve_root,
+ int compat)
+{
+ int ret;
+ int force = 0;
+
+ ret = -EPERM;
+ /* access allowed only from root of VE0 */
+ if (!capable(CAP_SYS_RESOURCE) ||
+ !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ switch (cmd) {
+ case VZ_DQ_CREATE:
+ ret = vzquota_create(quota_id, qstat, compat);
+ break;
+ case VZ_DQ_DESTROY:
+ ret = vzquota_destroy(quota_id);
+ break;
+ case VZ_DQ_ON:
+ /*
+		 * qstat is just a pointer to a userspace buffer used to
+		 * store the paths of busy files in case vzquota_on() fails
+ */
+ ret = vzquota_on(quota_id, ve_root, (char *)qstat);
+ break;
+ case VZ_DQ_OFF_FORCED:
+ force = 1;
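+		/* fall through */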
+ case VZ_DQ_OFF:
+ /*
+		 * ve_root is just a pointer to a userspace buffer used to
+		 * store the paths of busy files in case vzquota_off() fails
+ */
+ ret = vzquota_off(quota_id, (char *)ve_root, force);
+ break;
+ case VZ_DQ_SETLIMIT:
+ ret = vzquota_setlimit(quota_id, qstat, compat);
+ break;
+ case VZ_DQ_GETSTAT:
+ ret = vzquota_getstat(quota_id, qstat, compat);
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+
+/* ----------------------------------------------------------------------
+ * Proc filesystem routines
+ * ---------------------------------------------------------------------*/
+
+#if defined(CONFIG_PROC_FS)
+
+#define QUOTA_UINT_LEN 15
+#define QUOTA_TIME_LEN_FMT_UINT "%11u"
+#define QUOTA_NUM_LEN_FMT_UINT "%15u"
+#define QUOTA_NUM_LEN_FMT_ULL "%15Lu"
+#define QUOTA_TIME_LEN_FMT_STR "%11s"
+#define QUOTA_NUM_LEN_FMT_STR "%15s"
+#define QUOTA_PROC_MAX_LINE_LEN 2048
+
+/*
+ * prints /proc/ve_dq header line
+ */
+static int print_proc_header(char * buffer)
+{
+ return sprintf(buffer,
+ "%-11s"
+ QUOTA_NUM_LEN_FMT_STR
+ QUOTA_NUM_LEN_FMT_STR
+ QUOTA_NUM_LEN_FMT_STR
+ QUOTA_TIME_LEN_FMT_STR
+ QUOTA_TIME_LEN_FMT_STR
+ "\n",
+ "qid: path",
+ "usage", "softlimit", "hardlimit", "time", "expire");
+}
+
+/*
+ * prints proc master record id, dentry path
+ */
+static int print_proc_master_id(char * buffer, char * path_buf,
+ struct vz_quota_master * qp)
+{
+ char *path;
+ int over;
+
+ path = NULL;
+ switch (qp->dq_state) {
+ case VZDQ_WORKING:
+ if (!path_buf) {
+ path = "";
+ break;
+ }
+ path = d_path(&qp->dq_root_path, path_buf, PAGE_SIZE);
+ if (IS_ERR(path)) {
+ path = "";
+ break;
+ }
+ /* do not print large path, truncate it */
+ over = strlen(path) -
+ (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 -
+ QUOTA_UINT_LEN);
+ if (over > 0) {
+ path += over - 3;
+			path[0] = path[1] = path[2] = '.';
+ }
+ break;
+ case VZDQ_STARTING:
+ path = "-- started --";
+ break;
+ case VZDQ_STOPING:
+ path = "-- stopped --";
+ break;
+ }
+
+ return sprintf(buffer, "%u: %s\n", qp->dq_id, path);
+}
+
+/*
+ * prints struct vz_quota_stat data
+ */
+static int print_proc_stat(char * buffer, struct dq_stat *qs,
+ struct dq_info *qi)
+{
+ return sprintf(buffer,
+ "%11s"
+ QUOTA_NUM_LEN_FMT_ULL
+ QUOTA_NUM_LEN_FMT_ULL
+ QUOTA_NUM_LEN_FMT_ULL
+ QUOTA_TIME_LEN_FMT_UINT
+ QUOTA_TIME_LEN_FMT_UINT
+ "\n"
+ "%11s"
+ QUOTA_NUM_LEN_FMT_UINT
+ QUOTA_NUM_LEN_FMT_UINT
+ QUOTA_NUM_LEN_FMT_UINT
+ QUOTA_TIME_LEN_FMT_UINT
+ QUOTA_TIME_LEN_FMT_UINT
+ "\n",
+ "1k-blocks",
+ (unsigned long long)qs->bcurrent >> 10,
+ (unsigned long long)qs->bsoftlimit >> 10,
+ (unsigned long long)qs->bhardlimit >> 10,
+ (unsigned int)qs->btime,
+ (unsigned int)qi->bexpire,
+ "inodes",
+ qs->icurrent,
+ qs->isoftlimit,
+ qs->ihardlimit,
+ (unsigned int)qs->itime,
+ (unsigned int)qi->iexpire);
+}
+
+
+/*
+ * for /proc filesystem output
+ */
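+/*
+ * Illustrative sample of the produced output (values and spacing are
+ * approximate; the exact widths follow the format macros above):
+ *
+ * qid: path            usage      softlimit      hardlimit       time     expire
+ * 1001: /vz/private/1001
+ *   1k-blocks          123456         900000        1000000          0     259200
+ *      inodes            4321          90000         100000          0     259200
+ */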
+static int vzquota_read_proc(char *page, char **start, off_t off, int count,
+ int *eof, void *data)
+{
+ int len, i;
+ off_t printed = 0;
+ char *p = page;
+ struct vz_quota_master *qp;
+ struct vz_quota_ilink *ql2;
+ struct list_head *listp;
+ char *path_buf;
+
+ path_buf = (char*)__get_free_page(GFP_KERNEL);
+ if (path_buf == NULL)
+ return -ENOMEM;
+
+ len = print_proc_header(p);
+ printed += len;
+ if (off < printed) /* keep header in output */ {
+ *start = p + off;
+ p += len;
+ }
+
+ mutex_lock(&vz_quota_mutex);
+
+ /* traverse master hash table for all records */
+ for (i = 0; i < vzquota_hash_size; i++) {
+ list_for_each(listp, &vzquota_hash_table[i]) {
+ qp = list_entry(listp,
+ struct vz_quota_master, dq_hash);
+
+ /* Skip other VE's information if not root of VE0 */
+ if ((!capable(CAP_SYS_ADMIN) ||
+ !capable(CAP_SYS_RESOURCE))) {
+ ql2 = INODE_QLNK(current->fs->root.dentry->d_inode);
+ if (ql2 == NULL || qp != ql2->qmblk)
+ continue;
+ }
+ /*
+ * Now print the next record
+ */
+ len = 0;
+ /* we print quotaid and path only in VE0 */
+ if (capable(CAP_SYS_ADMIN))
+ len += print_proc_master_id(p+len,path_buf, qp);
+ len += print_proc_stat(p+len, &qp->dq_stat,
+ &qp->dq_info);
+ printed += len;
+ /* skip unnecessary lines */
+ if (printed <= off)
+ continue;
+ p += len;
+ /* provide start offset */
+ if (*start == NULL)
+ *start = p + (off - printed);
+ /* have we printed all requested size? */
+ if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN ||
+ (p - *start) >= count)
+ goto out;
+ }
+ }
+
+ *eof = 1; /* checked all hash */
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ len = 0;
+ if (*start != NULL) {
+ len = (p - *start);
+ if (len > count)
+ len = count;
+ }
+
+ if (path_buf)
+ free_page((unsigned long) path_buf);
+
+ return len;
+}
+
+/*
+ * Register procfs read callback
+ */
+int vzquota_proc_init(void)
+{
+ struct proc_dir_entry *de;
+
+ de = proc_create("vzquota", S_IFREG|S_IRUSR, proc_vz_dir, NULL);
+ if (de == NULL)
+ return -EBUSY;
+
+ de->read_proc = vzquota_read_proc;
+ de->data = NULL;
+ return 0;
+}
+
+void vzquota_proc_release(void)
+{
+ /* Unregister procfs read callback */
+ remove_proc_entry("vzquota", proc_vz_dir);
+}
+
+#endif
diff --git a/fs/quota/vzdquota/vzdq_ops.c b/fs/quota/vzdquota/vzdq_ops.c
new file mode 100644
index 0000000..904ff5e
--- /dev/null
+++ b/fs/quota/vzdquota/vzdq_ops.c
@@ -0,0 +1,644 @@
+/*
+ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/vzquota.h>
+
+
+/* ----------------------------------------------------------------------
+ * Quota superblock operations - helper functions.
+ * --------------------------------------------------------------------- */
+
+static inline void vzquota_incr_inodes(struct dq_stat *dqstat,
+ unsigned long number)
+{
+ dqstat->icurrent += number;
+}
+
+static inline void vzquota_incr_space(struct dq_stat *dqstat,
+ __u64 number)
+{
+ dqstat->bcurrent += number;
+}
+
+static inline void vzquota_decr_inodes(struct dq_stat *dqstat,
+ unsigned long number)
+{
+ if (dqstat->icurrent > number)
+ dqstat->icurrent -= number;
+ else
+ dqstat->icurrent = 0;
+ if (dqstat->icurrent < dqstat->isoftlimit)
+ dqstat->itime = (time_t) 0;
+}
+
+static inline void vzquota_decr_space(struct dq_stat *dqstat,
+ __u64 number)
+{
+ if (dqstat->bcurrent > number)
+ dqstat->bcurrent -= number;
+ else
+ dqstat->bcurrent = 0;
+ if (dqstat->bcurrent < dqstat->bsoftlimit)
+ dqstat->btime = (time_t) 0;
+}
+
+/*
+ * TODO: print a better message, or provide a /proc/vzquotamsg interface
+ * similar to /proc/kmsg
+ */
+static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag,
+ const char *fmt)
+{
+ if (dq_info->flags & flag) /* warning already printed for this
+ masterblock */
+ return;
+ printk(fmt, dq_id);
+ dq_info->flags |= flag;
+}
+
+/*
+ * ignore_hardlimit -
+ *
+ * Intended to allow the superuser of VE0 to override hardlimits.
+ *
+ * ignore_hardlimit() has a very bad feature:
+ *
+ * a writepage() operation for a writable mapping of a file with holes
+ * may trigger get_block() with the wrong current task and, as a
+ * consequence, opens a possibility to overcommit hardlimits
+ */
+/* for the reason above, it is disabled now */
+static inline int ignore_hardlimit(struct dq_info *dqstat)
+{
+#if 0
+ return ve_is_super(get_exec_env()) &&
+ capable(CAP_SYS_RESOURCE) &&
+ (dqstat->options & VZ_QUOTA_OPT_RSQUASH);
+#else
+ return 0;
+#endif
+}
+
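+/*
+ * Check whether @number more inodes may be charged against this quota.
+ * Exceeding the hard limit refuses the allocation outright; exceeding
+ * the soft limit starts the grace period (itime) on the first offence
+ * and refuses allocations once that period has expired.
+ */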
+static int vzquota_check_inodes(struct dq_info *dq_info,
+ struct dq_stat *dqstat,
+ unsigned long number, int dq_id)
+{
+ if (number == 0)
+ return QUOTA_OK;
+
+ if (dqstat->icurrent + number > dqstat->ihardlimit &&
+ !ignore_hardlimit(dq_info)) {
+ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
+ "VZ QUOTA: file hardlimit reached for id=%d\n");
+ return NO_QUOTA;
+ }
+
+ if (dqstat->icurrent + number > dqstat->isoftlimit) {
+ if (dqstat->itime == (time_t)0) {
+ vzquota_warn(dq_info, dq_id, 0,
+ "VZ QUOTA: file softlimit exceeded "
+ "for id=%d\n");
+ dqstat->itime = CURRENT_TIME_SECONDS +
+ dq_info->iexpire;
+ } else if (CURRENT_TIME_SECONDS >= dqstat->itime &&
+ !ignore_hardlimit(dq_info)) {
+ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
+ "VZ QUOTA: file softlimit expired "
+ "for id=%d\n");
+ return NO_QUOTA;
+ }
+ }
+
+ return QUOTA_OK;
+}
+
+static int vzquota_check_space(struct dq_info *dq_info,
+ struct dq_stat *dqstat,
+ __u64 number, int dq_id, char prealloc)
+{
+ if (number == 0)
+ return QUOTA_OK;
+
+ if (prealloc == DQUOT_CMD_FORCE)
+ return QUOTA_OK;
+
+ if (dqstat->bcurrent + number > dqstat->bhardlimit &&
+ !ignore_hardlimit(dq_info)) {
+ if (!prealloc)
+ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
+ "VZ QUOTA: disk hardlimit reached "
+ "for id=%d\n");
+ return NO_QUOTA;
+ }
+
+ if (dqstat->bcurrent + number > dqstat->bsoftlimit) {
+ if (dqstat->btime == (time_t)0) {
+ if (!prealloc) {
+ vzquota_warn(dq_info, dq_id, 0,
+ "VZ QUOTA: disk softlimit exceeded "
+ "for id=%d\n");
+ dqstat->btime = CURRENT_TIME_SECONDS
+ + dq_info->bexpire;
+ } else {
+ /*
+ * Original Linux quota doesn't allow
+			 * preallocation to exceed the softlimit, so
+			 * exceeding will always be printed
+ */
+ return NO_QUOTA;
+ }
+ } else if (CURRENT_TIME_SECONDS >= dqstat->btime &&
+ !ignore_hardlimit(dq_info)) {
+ if (!prealloc)
+ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
+ "VZ QUOTA: disk quota "
+ "softlimit expired "
+ "for id=%d\n");
+ return NO_QUOTA;
+ }
+ }
+
+ return QUOTA_OK;
+}
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk,
+ struct vz_quota_ugid *qugid[],
+ int type, unsigned long number)
+{
+ struct dq_info *dqinfo;
+ struct dq_stat *dqstat;
+
+ if (qugid[type] == NULL)
+ return QUOTA_OK;
+ if (qugid[type] == VZ_QUOTA_UGBAD)
+ return NO_QUOTA;
+
+ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
+ return QUOTA_OK;
+ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
+ return QUOTA_OK;
+ if (number == 0)
+ return QUOTA_OK;
+
+ dqinfo = &qmblk->dq_ugid_info[type];
+ dqstat = &qugid[type]->qugid_stat;
+
+ if (dqstat->ihardlimit != 0 &&
+ dqstat->icurrent + number > dqstat->ihardlimit)
+ return NO_QUOTA;
+
+ if (dqstat->isoftlimit != 0 &&
+ dqstat->icurrent + number > dqstat->isoftlimit) {
+ if (dqstat->itime == (time_t)0)
+ dqstat->itime = CURRENT_TIME_SECONDS +
+ dqinfo->iexpire;
+ else if (CURRENT_TIME_SECONDS >= dqstat->itime)
+ return NO_QUOTA;
+ }
+
+ return QUOTA_OK;
+}
+
+static int vzquota_check_ugid_space(struct vz_quota_master *qmblk,
+ struct vz_quota_ugid *qugid[],
+ int type, __u64 number, char prealloc)
+{
+ struct dq_info *dqinfo;
+ struct dq_stat *dqstat;
+
+ if (prealloc == DQUOT_CMD_FORCE)
+ return QUOTA_OK;
+
+ if (qugid[type] == NULL)
+ return QUOTA_OK;
+ if (qugid[type] == VZ_QUOTA_UGBAD)
+ return NO_QUOTA;
+
+ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
+ return QUOTA_OK;
+ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
+ return QUOTA_OK;
+ if (number == 0)
+ return QUOTA_OK;
+
+ dqinfo = &qmblk->dq_ugid_info[type];
+ dqstat = &qugid[type]->qugid_stat;
+
+ if (dqstat->bhardlimit != 0 &&
+ dqstat->bcurrent + number > dqstat->bhardlimit)
+ return NO_QUOTA;
+
+ if (dqstat->bsoftlimit != 0 &&
+ dqstat->bcurrent + number > dqstat->bsoftlimit) {
+ if (dqstat->btime == (time_t)0) {
+ if (!prealloc)
+ dqstat->btime = CURRENT_TIME_SECONDS
+ + dqinfo->bexpire;
+ else
+ /*
+ * Original Linux quota doesn't allow
+				 * preallocation to exceed the softlimit, so
+				 * exceeding will always be printed
+ */
+ return NO_QUOTA;
+ } else if (CURRENT_TIME_SECONDS >= dqstat->btime)
+ return NO_QUOTA;
+ }
+
+ return QUOTA_OK;
+}
+#endif
+
+/* ----------------------------------------------------------------------
+ * Quota superblock operations
+ * --------------------------------------------------------------------- */
+
+/*
+ * S_NOQUOTA note.
+ * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for
+ * - quota file (absent in our case)
+ * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like
+ * filesystem-specific new_inode, before the inode gets outside links.
+ * For the latter case, the only quota operation where care about S_NOQUOTA
+ * might be required is vzquota_drop, but there S_NOQUOTA has already been
+ * checked in DQUOT_DROP().
+ * So, S_NOQUOTA may be ignored for now in the VZDQ code.
+ *
+ * The above note is not entirely correct.
+ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from
+ * delete_inode if new_inode fails (for example, because of inode quota
+ * limits), so S_NOQUOTA check is needed in free_inode.
+ * This seems to be a dark corner of the current quota API.
+ */
+
+/*
+ * Initialize quota operations for the specified inode.
+ */
+static int vzquota_initialize(struct inode *inode, int type)
+{
+ vzquota_inode_init_call(inode);
+ return 0; /* ignored by caller */
+}
+
+/*
+ * Release quota for the specified inode.
+ */
+static int vzquota_drop(struct inode *inode)
+{
+ vzquota_inode_drop_call(inode);
+ return 0; /* ignored by caller */
+}
+
+/*
+ * Allocate block callback.
+ *
+ * If (prealloc), the disk quota exceeding warning is not printed.
+ * See the Linux quota code to learn why.
+ *
+ * Return:
+ * QUOTA_OK == 0 on SUCCESS
+ * NO_QUOTA == 1 if allocation should fail
+ */
+static int vzquota_alloc_space(struct inode *inode,
+ qsize_t number, int prealloc)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_datast data;
+ int ret = QUOTA_OK;
+
+ qmblk = vzquota_inode_data(inode, &data);
+ if (qmblk == VZ_QUOTA_BAD)
+ return NO_QUOTA;
+ if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+ int cnt;
+ struct vz_quota_ugid * qugid[MAXQUOTAS];
+#endif
+
+ /* checking first */
+ ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat,
+ number, qmblk->dq_id, prealloc);
+ if (ret == NO_QUOTA)
+ goto no_quota;
+#ifdef CONFIG_VZ_QUOTA_UGID
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+ ret = vzquota_check_ugid_space(qmblk, qugid,
+ cnt, number, prealloc);
+ if (ret == NO_QUOTA)
+ goto no_quota;
+ }
+ /* check ok, may increment */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (qugid[cnt] == NULL)
+ continue;
+ vzquota_incr_space(&qugid[cnt]->qugid_stat, number);
+ }
+#endif
+ vzquota_incr_space(&qmblk->dq_stat, number);
+ vzquota_data_unlock(inode, &data);
+ }
+
+ inode_add_bytes(inode, number);
+ might_sleep();
+ return QUOTA_OK;
+
+no_quota:
+ vzquota_data_unlock(inode, &data);
+ return NO_QUOTA;
+}
+
+/*
+ * Allocate inodes callback.
+ *
+ * Return:
+ * QUOTA_OK == 0 on SUCCESS
+ * NO_QUOTA == 1 if allocation should fail
+ */
+static int vzquota_alloc_inode(const struct inode *inode, qsize_t number)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_datast data;
+ int ret = QUOTA_OK;
+
+ qmblk = vzquota_inode_data((struct inode *)inode, &data);
+ if (qmblk == VZ_QUOTA_BAD)
+ return NO_QUOTA;
+ if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+ int cnt;
+ struct vz_quota_ugid *qugid[MAXQUOTAS];
+#endif
+
+ /* checking first */
+ ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat,
+ number, qmblk->dq_id);
+ if (ret == NO_QUOTA)
+ goto no_quota;
+#ifdef CONFIG_VZ_QUOTA_UGID
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
+ ret = vzquota_check_ugid_inodes(qmblk, qugid,
+ cnt, number);
+ if (ret == NO_QUOTA)
+ goto no_quota;
+ }
+ /* check ok, may increment */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (qugid[cnt] == NULL)
+ continue;
+ vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number);
+ }
+#endif
+ vzquota_incr_inodes(&qmblk->dq_stat, number);
+ vzquota_data_unlock((struct inode *)inode, &data);
+ }
+
+ might_sleep();
+ return QUOTA_OK;
+
+no_quota:
+ vzquota_data_unlock((struct inode *)inode, &data);
+ return NO_QUOTA;
+}
+
+/*
+ * Free space callback.
+ */
+static int vzquota_free_space(struct inode *inode, qsize_t number)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_datast data;
+
+ qmblk = vzquota_inode_data(inode, &data);
+ if (qmblk == VZ_QUOTA_BAD)
+ return NO_QUOTA; /* isn't checked by the caller */
+ if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+ int cnt;
+ struct vz_quota_ugid * qugid;
+#endif
+
+ vzquota_decr_space(&qmblk->dq_stat, number);
+#ifdef CONFIG_VZ_QUOTA_UGID
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ qugid = INODE_QLNK(inode)->qugid[cnt];
+ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
+ continue;
+ vzquota_decr_space(&qugid->qugid_stat, number);
+ }
+#endif
+ vzquota_data_unlock(inode, &data);
+ }
+ inode_sub_bytes(inode, number);
+ might_sleep();
+ return QUOTA_OK;
+}
+
+/*
+ * Free inodes callback.
+ */
+static int vzquota_free_inode(const struct inode *inode, qsize_t number)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_datast data;
+
+ qmblk = vzquota_inode_data((struct inode *)inode, &data);
+ if (qmblk == VZ_QUOTA_BAD)
+ return NO_QUOTA;
+ if (qmblk != NULL) {
+#ifdef CONFIG_VZ_QUOTA_UGID
+ int cnt;
+ struct vz_quota_ugid * qugid;
+#endif
+
+ vzquota_decr_inodes(&qmblk->dq_stat, number);
+#ifdef CONFIG_VZ_QUOTA_UGID
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ qugid = INODE_QLNK(inode)->qugid[cnt];
+ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
+ continue;
+ vzquota_decr_inodes(&qugid->qugid_stat, number);
+ }
+#endif
+ vzquota_data_unlock((struct inode *)inode, &data);
+ }
+ might_sleep();
+ return QUOTA_OK;
+}
+
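+/*
+ * Detach an inode from quota accounting: mark it S_NOQUOTA and, if it
+ * was charged to a quota master block, subtract its byte and inode
+ * usage from the per-VE and per-ugid statistics.
+ */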
+void vzquota_inode_off(struct inode * inode)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_datast data;
+
+	/* The call is made through virtinfo; the inode may be one
+	 * not controlled by vzquota.
+ */
+ if (inode->i_sb->dq_op != &vz_quota_operations)
+ return;
+
+ qmblk = vzquota_inode_data(inode, &data);
+ if (qmblk == VZ_QUOTA_BAD)
+ return;
+
+ if (qmblk == NULL) {
+		/* Tricky place.  If qmblk == NULL, this inode is not in an
+		 * area controlled by vzquota (except for the rare case of
+		 * S_NOQUOTA already being set).  But we have to set
+		 * S_NOQUOTA in any case, because vzquota can be turned on
+		 * later, at which point this inode would be invalid from
+		 * vzquota's viewpoint.
+		 *
+		 * To be safe, we reacquire the vzquota lock.
+ * The assumption is that it would not hurt to call
+ * vzquota_inode_drop() more than once, but it must
+ * be called at least once after S_NOQUOTA is set.
+ */
+ inode_qmblk_lock(inode->i_sb);
+ inode->i_flags |= S_NOQUOTA;
+ inode_qmblk_unlock(inode->i_sb);
+ } else {
+ loff_t bytes = inode_get_bytes(inode);
+#ifdef CONFIG_VZ_QUOTA_UGID
+ int cnt;
+ struct vz_quota_ugid * qugid;
+#endif
+
+ inode->i_flags |= S_NOQUOTA;
+
+ vzquota_decr_space(&qmblk->dq_stat, bytes);
+ vzquota_decr_inodes(&qmblk->dq_stat, 1);
+#ifdef CONFIG_VZ_QUOTA_UGID
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ qugid = INODE_QLNK(inode)->qugid[cnt];
+ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
+ continue;
+ vzquota_decr_space(&qugid->qugid_stat, bytes);
+ vzquota_decr_inodes(&qugid->qugid_stat, 1);
+ }
+#endif
+
+ vzquota_data_unlock(inode, &data);
+ }
+ vzquota_inode_drop_call(inode);
+}
+
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+
+/*
+ * helper function for quota transfer:
+ * check that the inode can be added to this quota_id
+ */
+static int vzquota_transfer_check(struct vz_quota_master *qmblk,
+ struct vz_quota_ugid *qugid[],
+ unsigned int type, __u64 size)
+{
+ if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK ||
+ vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK)
+ return -1;
+ return 0;
+}
+
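+/*
+ * Move the inode's usage between ugid records on chown: first check
+ * that every affected new ugid can absorb the inode's space and one
+ * more inode, then uncharge the old records and charge the new ones,
+ * so a refused transfer leaves the statistics untouched.
+ */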
+int vzquota_transfer_usage(struct inode *inode,
+ int mask,
+ struct vz_quota_ilink *qlnk)
+{
+ struct vz_quota_ugid *qugid_old;
+ __u64 space;
+ int i;
+
+ space = inode_get_bytes(inode);
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!(mask & (1 << i)))
+ continue;
+ /*
+		 * Do not permit chowning a file if its owner does not have
+		 * a ugid record.  This might happen if we somehow exceeded
+		 * the UID/GID limit (e.g. uglimit set lower than the number
+		 * of users).
+ */
+ if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD)
+ return -1;
+ if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space))
+ return -1;
+ }
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!(mask & (1 << i)))
+ continue;
+ qugid_old = INODE_QLNK(inode)->qugid[i];
+ vzquota_decr_space(&qugid_old->qugid_stat, space);
+ vzquota_decr_inodes(&qugid_old->qugid_stat, 1);
+ vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space);
+ vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1);
+ }
+ return 0;
+}
+
+/*
+ * Transfer the inode between different user/group quotas.
+ */
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+ return vzquota_inode_transfer_call(inode, iattr) ?
+ NO_QUOTA : QUOTA_OK;
+}
+
+static void vzquota_swap_inode(struct inode *inode, struct inode *tmpl)
+{
+ vzquota_inode_swap_call(inode, tmpl);
+}
+
+
+#else /* CONFIG_VZ_QUOTA_UGID */
+
+static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
+{
+ return QUOTA_OK;
+}
+
+#endif
+
+/*
+ * Called under following semaphores:
+ * old_d->d_inode->i_sb->s_vfs_rename_sem
+ * old_d->d_inode->i_sem
+ * new_d->d_inode->i_sem
+ * [not verified --SAW]
+ */
+static int vzquota_rename(struct inode *inode,
+ struct inode *old_dir, struct inode *new_dir)
+{
+ return vzquota_rename_check(inode, old_dir, new_dir) ?
+ NO_QUOTA : QUOTA_OK;
+}
+
+extern void vzquota_shutdown_super(struct super_block *sb);
+
+/*
+ * Structure of superblock diskquota operations.
+ */
+struct dquot_operations vz_quota_operations = {
+ .initialize = vzquota_initialize,
+ .drop = vzquota_drop,
+ .alloc_space = vzquota_alloc_space,
+ .alloc_inode = vzquota_alloc_inode,
+ .free_space = vzquota_free_space,
+ .free_inode = vzquota_free_inode,
+ .transfer = vzquota_transfer,
+ .rename = vzquota_rename,
+
+ .swap_inode = vzquota_swap_inode,
+ .shutdown = vzquota_shutdown_super,
+};
diff --git a/fs/quota/vzdquota/vzdq_tree.c b/fs/quota/vzdquota/vzdq_tree.c
new file mode 100644
index 0000000..f4f2152
--- /dev/null
+++ b/fs/quota/vzdquota/vzdq_tree.c
@@ -0,0 +1,286 @@
+/*
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo quota tree implementation
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/vzdq_tree.h>
+
+struct quotatree_tree *quotatree_alloc(void)
+{
+ int l;
+ struct quotatree_tree *tree;
+
+ tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL);
+ if (tree == NULL)
+ goto out;
+
+ for (l = 0; l < QUOTATREE_DEPTH; l++) {
+ INIT_LIST_HEAD(&tree->levels[l].usedlh);
+ INIT_LIST_HEAD(&tree->levels[l].freelh);
+ tree->levels[l].freenum = 0;
+ }
+ tree->root = NULL;
+ tree->leaf_num = 0;
+out:
+ return tree;
+}
+
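+/*
+ * Descend the quota tree from the root towards @level, indexing each
+ * node by the corresponding bit slice of @id.  Returns the parent of
+ * the slot where the walk stopped; @st (if non-NULL) records that slot
+ * and the depth reached.
+ */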
+static struct quotatree_node *
+quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level,
+ struct quotatree_find_state *st)
+{
+ void **block;
+ struct quotatree_node *parent;
+ int l, index;
+
+ parent = NULL;
+ block = (void **)&tree->root;
+ l = 0;
+ while (l < level && *block != NULL) {
+ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
+ parent = *block;
+ block = parent->blocks + index;
+ l++;
+ }
+ if (st != NULL) {
+ st->block = block;
+ st->level = l;
+ }
+
+ return parent;
+}
+
+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
+ struct quotatree_find_state *st)
+{
+ quotatree_follow(tree, id, QUOTATREE_DEPTH, st);
+ if (st->level == QUOTATREE_DEPTH)
+ return *st->block;
+ else
+ return NULL;
+}
+
+void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index)
+{
+ int i, count;
+ struct quotatree_node *p;
+ void *leaf;
+
+ if (QTREE_LEAFNUM(tree) <= index)
+ return NULL;
+
+ count = 0;
+ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
+ for (i = 0; i < QUOTATREE_BSIZE; i++) {
+ leaf = p->blocks[i];
+ if (leaf == NULL)
+ continue;
+ if (count == index)
+ return leaf;
+ count++;
+ }
+ }
+ return NULL;
+}
+
+/* returns the data leaf (vz_quota_ugid) following the _existing_ ugid (@id)
+ * in the tree... */
+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id)
+{
+ int off;
+ struct quotatree_node *parent, *p;
+ struct list_head *lh;
+
+	/* get the parent referring to the correct last-level quota tree node */
+ parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL);
+ if (!parent)
+ return NULL;
+
+ off = (id & QUOTATREE_BMASK) + 1; /* next ugid */
+ lh = &parent->list;
+ do {
+ p = list_entry(lh, struct quotatree_node, list);
+ for ( ; off < QUOTATREE_BSIZE; off++)
+ if (p->blocks[off])
+ return p->blocks[off];
+ off = 0;
+ lh = lh->next;
+ } while (lh != &QTREE_LEAFLVL(tree)->usedlh);
+
+ return NULL;
+}
+
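+/*
+ * Insert @data for @id using the find state left by quotatree_find():
+ * allocate (or take from the per-level free list) any missing
+ * intermediate nodes down to the leaf level, then store @data in the
+ * leaf slot.
+ */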
+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
+ struct quotatree_find_state *st, void *data)
+{
+ struct quotatree_node *p;
+ int l, index;
+
+ while (st->level < QUOTATREE_DEPTH) {
+ l = st->level;
+ if (!list_empty(&tree->levels[l].freelh)) {
+ p = list_entry(tree->levels[l].freelh.next,
+ struct quotatree_node, list);
+ list_del(&p->list);
+ } else {
+ p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL);
+ if (p == NULL)
+ return -ENOMEM;
+			/* save the block number at level l;
+			 * it is used for quota file generation */
+ p->num = tree->levels[l].freenum++;
+ }
+ list_add(&p->list, &tree->levels[l].usedlh);
+ memset(p->blocks, 0, sizeof(p->blocks));
+ *st->block = p;
+
+ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
+ st->block = p->blocks + index;
+ st->level++;
+ }
+ tree->leaf_num++;
+ *st->block = data;
+
+ return 0;
+}
+
+static struct quotatree_node *
+quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id,
+ int level)
+{
+ struct quotatree_node *parent;
+ struct quotatree_find_state st;
+
+ parent = quotatree_follow(tree, id, level, &st);
+ if (st.level == QUOTATREE_DEPTH)
+ tree->leaf_num--;
+ *st.block = NULL;
+ return parent;
+}
+
+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id)
+{
+ struct quotatree_node *p;
+ int level, i;
+
+ p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH);
+ for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) {
+ for (i = 0; i < QUOTATREE_BSIZE; i++)
+ if (p->blocks[i] != NULL)
+ return;
+ list_move(&p->list, &tree->levels[level].freelh);
+ p = quotatree_remove_ptr(tree, id, level);
+ }
+}
+
+#if 0
+static void quotatree_walk(struct quotatree_tree *tree,
+ struct quotatree_node *node_start,
+ quotaid_t id_start,
+ int level_start, int level_end,
+ int (*callback)(struct quotatree_tree *,
+ quotaid_t id,
+ int level,
+ void *ptr,
+ void *data),
+ void *data)
+{
+ struct quotatree_node *p;
+ int l, shift, index;
+ quotaid_t id;
+ struct quotatree_find_state st;
+
+ p = node_start;
+ l = level_start;
+ shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+ id = id_start;
+ index = 0;
+
+ /*
+ * Invariants:
+ * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
+ * id & ((1 << shift) - 1) == 0
+ * p is l-level node corresponding to id
+ */
+ do {
+ if (!p)
+ break;
+
+ if (l < level_end) {
+ for (; index < QUOTATREE_BSIZE; index++)
+ if (p->blocks[index] != NULL)
+ break;
+ if (index < QUOTATREE_BSIZE) {
+ /* descend */
+ p = p->blocks[index];
+ l++;
+ shift -= QUOTAID_BBITS;
+ id += (quotaid_t)index << shift;
+ index = 0;
+ continue;
+ }
+ }
+
+ if ((*callback)(tree, id, l, p, data))
+ break;
+
+ /* ascend and to the next node */
+ p = quotatree_follow(tree, id, l, &st);
+
+ index = ((id >> shift) & QUOTATREE_BMASK) + 1;
+ l--;
+ shift += QUOTAID_BBITS;
+ id &= ~(((quotaid_t)1 << shift) - 1);
+ } while (l >= level_start);
+}
+#endif
+
+static void free_list(struct list_head *node_list)
+{
+ struct quotatree_node *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, node_list, list) {
+ list_del(&p->list);
+ kfree(p);
+ }
+}
+
+static inline void quotatree_free_nodes(struct quotatree_tree *tree)
+{
+ int i;
+
+ for (i = 0; i < QUOTATREE_DEPTH; i++) {
+ free_list(&tree->levels[i].usedlh);
+ free_list(&tree->levels[i].freelh);
+ }
+}
+
+static void quotatree_free_leafs(struct quotatree_tree *tree,
+ void (*dtor)(void *))
+{
+ int i;
+ struct quotatree_node *p;
+
+ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
+ for (i = 0; i < QUOTATREE_BSIZE; i++) {
+ if (p->blocks[i] == NULL)
+ continue;
+
+ dtor(p->blocks[i]);
+ }
+ }
+}
+
+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *))
+{
+ quotatree_free_leafs(tree, dtor);
+ quotatree_free_nodes(tree);
+ kfree(tree);
+}
diff --git a/fs/quota/vzdquota/vzdq_ugid.c b/fs/quota/vzdquota/vzdq_ugid.c
new file mode 100644
index 0000000..a3e9e8c
--- /dev/null
+++ b/fs/quota/vzdquota/vzdq_ugid.c
@@ -0,0 +1,1216 @@
+/*
+ * Copyright (C) 2002 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo UID/GID disk quota implementation
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/smp_lock.h>
+#include <linux/rcupdate.h>
+#include <asm/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/quota.h>
+#include "../quotaio_v2.h"
+#include <linux/virtinfo.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/mnt_namespace.h>
+#include <linux/vmalloc.h>
+
+#include <linux/vzctl.h>
+#include <linux/vzctl_quota.h>
+#include <linux/vzquota.h>
+
+/*
+ * XXX
+ * maybe something is needed for sb->s_dquot->info[]?
+ */
+
+#define USRQUOTA_MASK (1 << USRQUOTA)
+#define GRPQUOTA_MASK (1 << GRPQUOTA)
+#define QTYPE2MASK(type) (1 << (type))
+
+static struct kmem_cache *vz_quota_ugid_cachep;
+
+inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid)
+{
+ if (qugid != VZ_QUOTA_UGBAD)
+ atomic_inc(&qugid->qugid_count);
+ return qugid;
+}
+
+/* we don't limit users with zero limits */
+static inline int vzquota_fake_stat(struct dq_stat *stat)
+{
+ return stat->bhardlimit == 0 && stat->bsoftlimit == 0 &&
+ stat->ihardlimit == 0 && stat->isoftlimit == 0;
+}
+
+/* callback function for quotatree_free() */
+static inline void vzquota_free_qugid(void *ptr)
+{
+ kmem_cache_free(vz_quota_ugid_cachep, ptr);
+}
+
+/*
+ * destroy the ugid if it has zero refcount, limits and usage;
+ * must be called under qmblk->dq_mutex
+ */
+void vzquota_put_ugid(struct vz_quota_master *qmblk,
+ struct vz_quota_ugid *qugid)
+{
+ if (qugid == VZ_QUOTA_UGBAD)
+ return;
+ qmblk_data_read_lock(qmblk);
+ if (atomic_dec_and_test(&qugid->qugid_count) &&
+ (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 &&
+ vzquota_fake_stat(&qugid->qugid_stat) &&
+ qugid->qugid_stat.bcurrent == 0 &&
+ qugid->qugid_stat.icurrent == 0) {
+ quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type),
+ qugid->qugid_id);
+ qmblk->dq_ugid_count--;
+ vzquota_free_qugid(qugid);
+ }
+ qmblk_data_read_unlock(qmblk);
+}
+
+/*
+ * Get a ugid block by its index, as if the ugids were stored in an array.
+ * In reality this is not an array but the chain of the tree's leaves.
+ * Returns NULL if the index is out of range.
+ * The qmblk semaphore is required to protect the tree.
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type)
+{
+ return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index);
+}
+
+/*
+ * get the next element of the ugid "virtual array";
+ * the ugid must be in the current array, and the array may not change between
+ * two accesses (guaranteed by the "stopped" quota state and the quota semaphore).
+ * The qmblk semaphore is required to protect the tree.
+ */
+static inline struct vz_quota_ugid *
+vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid)
+{
+ return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type),
+ qugid->qugid_id);
+}
+
+/*
+ * requires dq_mutex
+ */
+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
+ unsigned int quota_id, int type, int flags)
+{
+ struct vz_quota_ugid *qugid;
+ struct quotatree_tree *tree;
+ struct quotatree_find_state st;
+
+ tree = QUGID_TREE(qmblk, type);
+ qugid = quotatree_find(tree, quota_id, &st);
+ if (qugid)
+ goto success;
+
+ /* caller does not want alloc */
+ if (flags & VZDQUG_FIND_DONT_ALLOC)
+ goto fail;
+
+ if (flags & VZDQUG_FIND_FAKE)
+ goto doit;
+
+ /* check limit */
+ if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max)
+ goto fail;
+
+ /* see comment at VZDQUG_FIXED_SET define */
+ if (qmblk->dq_flags & VZDQUG_FIXED_SET)
+ goto fail;
+
+doit:
+ /* alloc new structure */
+ qugid = kmem_cache_alloc(vz_quota_ugid_cachep,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (qugid == NULL)
+ goto fail;
+
+ /* initialize new structure */
+ qugid->qugid_id = quota_id;
+ memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat));
+ qugid->qugid_type = type;
+ atomic_set(&qugid->qugid_count, 0);
+
+ /* insert in tree */
+ if (quotatree_insert(tree, quota_id, &st, qugid) < 0)
+ goto fail_insert;
+ qmblk->dq_ugid_count++;
+
+success:
+ vzquota_get_ugid(qugid);
+ return qugid;
+
+fail_insert:
+ vzquota_free_qugid(qugid);
+fail:
+ return VZ_QUOTA_UGBAD;
+}
+
+/*
+ * takes dq_mutex, may schedule
+ */
+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
+ unsigned int quota_id, int type, int flags)
+{
+ struct vz_quota_ugid *qugid;
+
+ mutex_lock(&qmblk->dq_mutex);
+ qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags);
+ mutex_unlock(&qmblk->dq_mutex);
+
+ return qugid;
+}
+
+/*
+ * destroy all ugid records on given quota master
+ */
+void vzquota_kill_ugid(struct vz_quota_master *qmblk)
+{
+ BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) ||
+ (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL));
+
+ if (qmblk->dq_uid_tree != NULL) {
+ quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid);
+ quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid);
+ }
+}
+
+
+/* ----------------------------------------------------------------------
+ * Management interface to ugid quota for (super)users.
+ * --------------------------------------------------------------------- */
+
+static int vzquota_initialize2(struct inode *inode, int type)
+{
+ return QUOTA_OK;
+}
+
+static int vzquota_drop2(struct inode *inode)
+{
+ return QUOTA_OK;
+}
+
+static int vzquota_alloc_space2(struct inode *inode,
+ qsize_t number, int prealloc)
+{
+ inode_add_bytes(inode, number);
+ return QUOTA_OK;
+}
+
+static int vzquota_alloc_inode2(const struct inode *inode, qsize_t number)
+{
+ return QUOTA_OK;
+}
+
+static int vzquota_free_space2(struct inode *inode, qsize_t number)
+{
+ inode_sub_bytes(inode, number);
+ return QUOTA_OK;
+}
+
+static int vzquota_free_inode2(const struct inode *inode, qsize_t number)
+{
+ return QUOTA_OK;
+}
+
+static int vzquota_transfer2(struct inode *inode, struct iattr *iattr)
+{
+ return QUOTA_OK;
+}
+
+struct dquot_operations vz_quota_operations2 = {
+ .initialize = vzquota_initialize2,
+ .drop = vzquota_drop2,
+ .alloc_space = vzquota_alloc_space2,
+ .alloc_inode = vzquota_alloc_inode2,
+ .free_space = vzquota_free_space2,
+ .free_inode = vzquota_free_inode2,
+ .transfer = vzquota_transfer2,
+};
+
+
+asmlinkage long sys_unlink(const char __user * pathname);
+asmlinkage long sys_rename(const char __user * oldname,
+ const char __user * newname);
+asmlinkage long sys_symlink(const char __user * oldname,
+ const char __user * newname);
+
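+/*
+ * Re-point the quota file at @path to the matching
+ * /proc/vz/vzaquota/<dev>/aquota.{user,group} entry by creating
+ * "<path>.new" as a symlink and renaming it over @path.
+ */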
+/* called under sb->s_umount semaphore */
+static int vz_restore_symlink(struct super_block *sb, char *path, int type)
+{
+ mm_segment_t oldfs;
+ char *newpath;
+ char dest[64];
+ const char *names[] = {
+ [USRQUOTA] "aquota.user",
+ [GRPQUOTA] "aquota.group"
+ };
+ int err;
+
+ newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL);
+ if (newpath == NULL)
+ return -ENOMEM;
+
+ strcpy(newpath, path);
+ strcat(newpath, ".new");
+
+ sprintf(dest, "/proc/vz/vzaquota/%08x/%s",
+ new_encode_dev(sb->s_dev), names[type]);
+
+ /*
+	 * Lockdep will learn an unneeded dependency during unlink(2):
+ * ->s_umount => ->i_mutex/1 => ->i_mutex
+ * Reverse dependency is,
+ * open_namei() => ->i_mutex => lookup_hash() => __lookup_hash()
+ * => ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev()
+ * => user_get_super() => ->s_umount
+ *
+	 * However, the first set of ->i_mutex'es belongs to /, the second to /proc.
+	 * The right fix is to get rid of vz_restore_symlink(), of course.
+ */
+ up_read(&sb->s_umount);
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_unlink(newpath);
+ if (err < 0 && err != -ENOENT)
+ goto out_restore;
+ err = sys_symlink(dest, newpath);
+ if (err < 0)
+ goto out_restore;
+ err = sys_rename(newpath, path);
+out_restore:
+ set_fs(oldfs);
+
+ down_read(&sb->s_umount);
+ /* umounted meanwhile? */
+ if (err == 0 && !sb->s_root)
+ err = -ENODEV;
+
+ kfree(newpath);
+ return err;
+}
+
+/* called under sb->s_umount semaphore */
+static int vz_quota_on(struct super_block *sb, int type,
+ int format_id, char *path, int remount)
+{
+ struct vz_quota_master *qmblk;
+ int mask2;
+ int err;
+
+ qmblk = vzquota_find_qmblk(sb);
+ err = -ESRCH;
+ if (qmblk == NULL)
+ goto out;
+ err = -EIO;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out;
+
+ err = vz_restore_symlink(sb, path, type);
+ if (err < 0)
+ goto out_put;
+
+ mutex_lock(&vz_quota_mutex);
+ mask2 = 0;
+ sb->dq_op = &vz_quota_operations2;
+ sb->s_qcop = &vz_quotactl_operations;
+ if (type == USRQUOTA)
+ mask2 = VZDQ_USRQUOTA;
+ if (type == GRPQUOTA)
+ mask2 = VZDQ_GRPQUOTA;
+
+ err = -EBUSY;
+ if (qmblk->dq_flags & mask2)
+ goto out_sem;
+
+ err = 0;
+ qmblk->dq_flags |= mask2;
+ sb->s_dquot.flags |= dquot_state_flag(
+ DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, type);
+
+out_sem:
+ mutex_unlock(&vz_quota_mutex);
+out_put:
+ qmblk_put(qmblk);
+out:
+ return err;
+}
+
+static int vz_quota_off(struct super_block *sb, int type, int remount)
+{
+ struct vz_quota_master *qmblk;
+ int mask2;
+ int err;
+
+ qmblk = vzquota_find_qmblk(sb);
+ mutex_lock(&vz_quota_mutex);
+ err = -ESRCH;
+ if (qmblk == NULL)
+ goto out;
+ err = -EIO;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out;
+
+ mask2 = 0;
+ if (type == USRQUOTA)
+ mask2 = VZDQ_USRQUOTA;
+ if (type == GRPQUOTA)
+ mask2 = VZDQ_GRPQUOTA;
+ err = -EINVAL;
+ if (!(qmblk->dq_flags & mask2))
+ goto out;
+
+ qmblk->dq_flags &= ~mask2;
+ err = 0;
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+ qmblk_put(qmblk);
+ return err;
+}
+
+static int vz_quota_sync(struct super_block *sb, int type)
+{
+ return 0; /* vz quota is always uptodate */
+}
+
+static int vz_get_dqblk(struct super_block *sb, int type,
+ qid_t id, struct if_dqblk *di)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ugid *ugid;
+ int err;
+
+ qmblk = vzquota_find_qmblk(sb);
+ mutex_lock(&vz_quota_mutex);
+ err = -ESRCH;
+ if (qmblk == NULL)
+ goto out;
+ err = -EIO;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out;
+
+ err = 0;
+ ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC);
+ if (ugid != VZ_QUOTA_UGBAD) {
+ qmblk_data_read_lock(qmblk);
+ di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10;
+ di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10;
+ di->dqb_curspace = ugid->qugid_stat.bcurrent;
+ di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit;
+ di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit;
+ di->dqb_curinodes = ugid->qugid_stat.icurrent;
+ di->dqb_btime = ugid->qugid_stat.btime;
+ di->dqb_itime = ugid->qugid_stat.itime;
+ qmblk_data_read_unlock(qmblk);
+ di->dqb_valid = QIF_ALL;
+ vzquota_put_ugid(qmblk, ugid);
+ } else {
+ memset(di, 0, sizeof(*di));
+ di->dqb_valid = QIF_ALL;
+ }
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+ qmblk_put(qmblk);
+ return err;
+}
+
+/* must be called under vz_quota_mutex */
+static int __vz_set_dqblk(struct vz_quota_master *qmblk,
+ int type, qid_t id, struct if_dqblk *di)
+{
+ struct vz_quota_ugid *ugid;
+
+ ugid = vzquota_find_ugid(qmblk, id, type, 0);
+ if (ugid == VZ_QUOTA_UGBAD)
+ return -ESRCH;
+
+ qmblk_data_write_lock(qmblk);
+ /*
+ * Subtle compatibility breakage.
+ *
+	 * Some old non-vz kernel quota didn't start the grace period
+	 * if the new soft limit happened to be below the usage.
+	 * Non-vz kernel quota in 2.4.20 starts the grace period
+	 * (if it hasn't been started).
+	 * The current non-vz kernel performs even more complicated
+	 * manipulations...
+	 *
+	 * Also, current non-vz kernels have an inconsistency related to
+	 * the start of the grace time.  In regular operations the grace
+	 * period is started if the usage is greater than the soft limit
+	 * (and, strangely, is cancelled if the usage is less).
+	 * However, set_dqblk starts the grace period if the usage is
+	 * greater than or equal to the soft limit.
+ *
+ * Here we try to mimic the behavior of the current non-vz kernel.
+ */
+ if (di->dqb_valid & QIF_BLIMITS) {
+ ugid->qugid_stat.bhardlimit =
+ (__u64)di->dqb_bhardlimit << 10;
+ ugid->qugid_stat.bsoftlimit =
+ (__u64)di->dqb_bsoftlimit << 10;
+ if (di->dqb_bsoftlimit == 0 ||
+ ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit)
+ ugid->qugid_stat.btime = 0;
+ else if (!(di->dqb_valid & QIF_BTIME))
+ ugid->qugid_stat.btime = CURRENT_TIME_SECONDS
+ + qmblk->dq_ugid_info[type].bexpire;
+ else
+ ugid->qugid_stat.btime = di->dqb_btime;
+ }
+ if (di->dqb_valid & QIF_ILIMITS) {
+ ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit;
+ ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit;
+ if (di->dqb_isoftlimit == 0 ||
+ ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit)
+ ugid->qugid_stat.itime = 0;
+ else if (!(di->dqb_valid & QIF_ITIME))
+ ugid->qugid_stat.itime = CURRENT_TIME_SECONDS
+ + qmblk->dq_ugid_info[type].iexpire;
+ else
+ ugid->qugid_stat.itime = di->dqb_itime;
+ }
+ qmblk_data_write_unlock(qmblk);
+ vzquota_put_ugid(qmblk, ugid);
+
+ return 0;
+}
+
+static int vz_set_dqblk(struct super_block *sb, int type,
+ qid_t id, struct if_dqblk *di)
+{
+ struct vz_quota_master *qmblk;
+ int err;
+
+ qmblk = vzquota_find_qmblk(sb);
+ mutex_lock(&vz_quota_mutex);
+ err = -ESRCH;
+ if (qmblk == NULL)
+ goto out;
+ err = -EIO;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out;
+ err = __vz_set_dqblk(qmblk, type, id, di);
+out:
+ mutex_unlock(&vz_quota_mutex);
+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+ qmblk_put(qmblk);
+ return err;
+}
+
+static int vz_get_dqinfo(struct super_block *sb, int type,
+ struct if_dqinfo *ii)
+{
+ struct vz_quota_master *qmblk;
+ int err;
+
+ qmblk = vzquota_find_qmblk(sb);
+ mutex_lock(&vz_quota_mutex);
+ err = -ESRCH;
+ if (qmblk == NULL)
+ goto out;
+ err = -EIO;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out;
+
+ err = 0;
+ ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire;
+ ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire;
+ ii->dqi_flags = 0;
+ ii->dqi_valid = IIF_ALL;
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+ qmblk_put(qmblk);
+ return err;
+}
+
+/* must be called under vz_quota_mutex */
+static int __vz_set_dqinfo(struct vz_quota_master *qmblk,
+ int type, struct if_dqinfo *ii)
+{
+ if (ii->dqi_valid & IIF_FLAGS)
+ if (ii->dqi_flags & DQF_MASK)
+ return -EINVAL;
+
+ if (ii->dqi_valid & IIF_BGRACE)
+ qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace;
+ if (ii->dqi_valid & IIF_IGRACE)
+ qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace;
+ return 0;
+}
+
+static int vz_set_dqinfo(struct super_block *sb, int type,
+ struct if_dqinfo *ii)
+{
+ struct vz_quota_master *qmblk;
+ int err;
+
+ qmblk = vzquota_find_qmblk(sb);
+ mutex_lock(&vz_quota_mutex);
+ err = -ESRCH;
+ if (qmblk == NULL)
+ goto out;
+ err = -EIO;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out;
+ err = __vz_set_dqinfo(qmblk, type, ii);
+out:
+ mutex_unlock(&vz_quota_mutex);
+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+ qmblk_put(qmblk);
+ return err;
+}
+
+#ifdef CONFIG_QUOTA_COMPAT
+
+#define Q_GETQUOTI_SIZE 1024
+
+#define UGID2DQBLK(dst, src) \
+ do { \
+ (dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \
+ (dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \
+ (dst)->dqb_curinodes = (src)->qugid_stat.icurrent; \
+ /* in 1K blocks */ \
+ (dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \
+ /* in 1K blocks */ \
+ (dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \
+ /* in bytes, 64 bit */ \
+ (dst)->dqb_curspace = (src)->qugid_stat.bcurrent; \
+ (dst)->dqb_btime = (src)->qugid_stat.btime; \
+ (dst)->dqb_itime = (src)->qugid_stat.itime; \
+ } while (0)
+
+static int vz_get_quoti(struct super_block *sb, int type, qid_t idx,
+ struct v2_disk_dqblk __user *dqblk)
+{
+ struct vz_quota_master *qmblk;
+ struct v2_disk_dqblk *data, *kbuf;
+ struct vz_quota_ugid *ugid;
+ int count;
+ int err;
+
+ qmblk = vzquota_find_qmblk(sb);
+ err = -ESRCH;
+ if (qmblk == NULL)
+ goto out;
+ err = -EIO;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out;
+
+ err = -ENOMEM;
+ kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf));
+ if (!kbuf)
+ goto out;
+
+ mutex_lock(&vz_quota_mutex);
+ mutex_lock(&qmblk->dq_mutex);
+ for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0;
+ ugid != NULL && count < Q_GETQUOTI_SIZE;
+ count++)
+ {
+ data = kbuf + count;
+ qmblk_data_read_lock(qmblk);
+ UGID2DQBLK(data, ugid);
+ qmblk_data_read_unlock(qmblk);
+ data->dqb_id = ugid->qugid_id;
+
+ /* Find next entry */
+ ugid = vzquota_get_next(qmblk, ugid);
+ BUG_ON(ugid != NULL && ugid->qugid_type != type);
+ }
+ mutex_unlock(&qmblk->dq_mutex);
+ mutex_unlock(&vz_quota_mutex);
+
+ err = count;
+ if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf)))
+ err = -EFAULT;
+
+ vfree(kbuf);
+out:
+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+ qmblk_put(qmblk);
+
+ return err;
+}
+
+#endif
+
+struct quotactl_ops vz_quotactl_operations = {
+ .quota_on = vz_quota_on,
+ .quota_off = vz_quota_off,
+ .quota_sync = vz_quota_sync,
+ .get_info = vz_get_dqinfo,
+ .set_info = vz_set_dqinfo,
+ .get_dqblk = vz_get_dqblk,
+ .set_dqblk = vz_set_dqblk,
+#ifdef CONFIG_QUOTA_COMPAT
+ .get_quoti = vz_get_quoti,
+#endif
+};
+
+
+/* ----------------------------------------------------------------------
+ * Management interface for host system admins.
+ * --------------------------------------------------------------------- */
+
+static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size,
+ struct vz_quota_iface __user *u_ugid_buf, int compat)
+{
+ struct vz_quota_master *qmblk;
+ int ret;
+
+ mutex_lock(&vz_quota_mutex);
+
+ ret = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ ret = -EBUSY;
+ if (qmblk->dq_state != VZDQ_STARTING)
+ goto out; /* working quota doesn't accept new ugids */
+
+ ret = 0;
+ /* start to add ugids */
+ for (ret = 0; ret < ugid_size; ret++) {
+ struct vz_quota_iface ugid_buf;
+ struct vz_quota_ugid *ugid;
+
+ if (!compat) {
+ if (copy_from_user(&ugid_buf, u_ugid_buf,
+ sizeof(ugid_buf)))
+ break;
+ u_ugid_buf++; /* next user buffer */
+ } else {
+#ifdef CONFIG_COMPAT
+ struct compat_vz_quota_iface oqif;
+ if (copy_from_user(&oqif, u_ugid_buf,
+ sizeof(oqif)))
+ break;
+ ugid_buf.qi_id = oqif.qi_id;
+ ugid_buf.qi_type = oqif.qi_type;
+ compat_dqstat2dqstat(&oqif.qi_stat, &ugid_buf.qi_stat);
+ u_ugid_buf = (struct vz_quota_iface __user *)
+ (((void *)u_ugid_buf) + sizeof(oqif));
+#endif
+ }
+
+ if (ugid_buf.qi_type >= MAXQUOTAS)
+ break; /* bad quota type - this is the only check */
+
+ ugid = vzquota_find_ugid(qmblk,
+ ugid_buf.qi_id, ugid_buf.qi_type, 0);
+ if (ugid == VZ_QUOTA_UGBAD) {
+ qmblk->dq_flags |= VZDQUG_FIXED_SET;
+ break; /* limit reached */
+ }
+
+ /* update usage/limits
+ * we can copy the data without the lock, because the data
+ * cannot be modified in VZDQ_STARTING state */
+ ugid->qugid_stat = ugid_buf.qi_stat;
+
+ vzquota_put_ugid(qmblk, ugid);
+ }
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return ret;
+}
+
+static int quota_ugid_setgrace(unsigned int quota_id,
+ struct dq_info __user u_dq_info[], int compat)
+{
+ struct vz_quota_master *qmblk;
+ struct dq_info dq_info[MAXQUOTAS];
+ struct dq_info *target;
+ int err, type;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EBUSY;
+ if (qmblk->dq_state != VZDQ_STARTING)
+ goto out; /* working quota doesn't accept changing options */
+
+ err = -EFAULT;
+ if (!compat) {
+ if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info)))
+ goto out;
+ } else {
+#ifdef CONFIG_COMPAT
+ struct compat_dq_info odqi[MAXQUOTAS];
+ if (copy_from_user(odqi, u_dq_info, sizeof(odqi)))
+ goto out;
+ for (type = 0; type < MAXQUOTAS; type++)
+ compat_dqinfo2dqinfo(&odqi[type], &dq_info[type]);
+#endif
+ }
+
+ err = 0;
+
+ /* update in qmblk */
+ for (type = 0; type < MAXQUOTAS; type++) {
+ target = &qmblk->dq_ugid_info[type];
+ target->bexpire = dq_info[type].bexpire;
+ target->iexpire = dq_info[type].iexpire;
+ }
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
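+/*
+ * do_quota_ugid_getstat - fill up to @size per-UGID stat entries into the
+ * kernel buffer @u_ugid_buf, starting at leaf position @index, where UID
+ * tree leaves are enumerated before GID tree leaves.
+ * Returns the number of entries filled.
+ * Called under vz_quota_mutex and qmblk->dq_mutex.
+ */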
+static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size,
+ struct vz_quota_iface *u_ugid_buf)
+{
+ int type, count;
+ struct vz_quota_ugid *ugid;
+
+ if (QTREE_LEAFNUM(qmblk->dq_uid_tree) +
+ QTREE_LEAFNUM(qmblk->dq_gid_tree)
+ <= index)
+ return 0;
+
+ count = 0;
+
+ type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA;
+ if (type == GRPQUOTA)
+ index -= QTREE_LEAFNUM(qmblk->dq_uid_tree);
+
+ /* loop through ugid and then qgid quota */
+repeat:
+ for (ugid = vzquota_get_byindex(qmblk, index, type);
+ ugid != NULL && count < size;
+ ugid = vzquota_get_next(qmblk, ugid), count++)
+ {
+ struct vz_quota_iface ugid_buf;
+
+		/* form interface buffer and send it to user-level */
+ qmblk_data_read_lock(qmblk);
+ memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat,
+ sizeof(ugid_buf.qi_stat));
+ qmblk_data_read_unlock(qmblk);
+ ugid_buf.qi_id = ugid->qugid_id;
+ ugid_buf.qi_type = ugid->qugid_type;
+
+ memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf));
+ u_ugid_buf++; /* next portion of user buffer */
+ }
+
+ if (type == USRQUOTA && count < size) {
+ type = GRPQUOTA;
+ index = 0;
+ goto repeat;
+ }
+
+ return count;
+}
+
+static int quota_ugid_getstat(unsigned int quota_id,
+ int index, int size, struct vz_quota_iface __user *u_ugid_buf,
+ int compat)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_iface *k_ugid_buf;
+ int err;
+
+ if (index < 0 || size < 0)
+ return -EINVAL;
+
+ if (size > INT_MAX / sizeof(struct vz_quota_iface))
+ return -EINVAL;
+
+ k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface));
+ if (k_ugid_buf == NULL)
+ return -ENOMEM;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ mutex_lock(&qmblk->dq_mutex);
+ err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf);
+ mutex_unlock(&qmblk->dq_mutex);
+ if (err < 0)
+ goto out;
+
+ if (!compat) {
+ if (copy_to_user(u_ugid_buf, k_ugid_buf,
+ err * sizeof(struct vz_quota_iface)))
+ err = -EFAULT;
+ } else {
+#ifdef CONFIG_COMPAT
+ struct compat_vz_quota_iface oqif;
+ int i;
+ for (i = 0; i < err; i++) {
+ oqif.qi_id = k_ugid_buf[i].qi_id;
+ oqif.qi_type = k_ugid_buf[i].qi_type;
+ dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat,
+ &oqif.qi_stat);
+ if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif)))
+ err = -EFAULT;
+ u_ugid_buf = (struct vz_quota_iface __user *)
+ (((void *)u_ugid_buf) + sizeof(oqif));
+ }
+#endif
+ }
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+ vfree(k_ugid_buf);
+ return err;
+}
+
+static int quota_ugid_getgrace(unsigned int quota_id,
+ struct dq_info __user u_dq_info[], int compat)
+{
+ struct vz_quota_master *qmblk;
+ struct dq_info dq_info[MAXQUOTAS];
+ struct dq_info *target;
+ int err, type;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = 0;
+ /* update from qmblk */
+ for (type = 0; type < MAXQUOTAS; type ++) {
+ target = &qmblk->dq_ugid_info[type];
+ dq_info[type].bexpire = target->bexpire;
+ dq_info[type].iexpire = target->iexpire;
+ dq_info[type].flags = target->flags;
+ }
+
+ if (!compat) {
+ if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info)))
+ err = -EFAULT;
+ } else {
+#ifdef CONFIG_COMPAT
+ struct compat_dq_info odqi[MAXQUOTAS];
+ for (type = 0; type < MAXQUOTAS; type ++)
+ dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]);
+ if (copy_to_user(u_dq_info, odqi, sizeof(odqi)))
+ err = -EFAULT;
+#endif
+ }
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
+static int quota_ugid_getconfig(unsigned int quota_id,
+ struct vz_quota_ugid_stat __user *info)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ugid_stat kinfo;
+ int err;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = 0;
+ kinfo.limit = qmblk->dq_ugid_max;
+ kinfo.count = qmblk->dq_ugid_count;
+ kinfo.flags = qmblk->dq_flags;
+
+ if (copy_to_user(info, &kinfo, sizeof(kinfo)))
+ err = -EFAULT;
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
+static int quota_ugid_setconfig(unsigned int quota_id,
+ struct vz_quota_ugid_stat __user *info)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ugid_stat kinfo;
+ int err;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ENOENT;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EFAULT;
+ if (copy_from_user(&kinfo, info, sizeof(kinfo)))
+ goto out;
+
+ err = 0;
+ qmblk->dq_ugid_max = kinfo.limit;
+ if (qmblk->dq_state == VZDQ_STARTING) {
+ qmblk->dq_flags = kinfo.flags;
+ if (qmblk->dq_flags & VZDQUG_ON)
+ qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA;
+ }
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
+static int quota_ugid_setlimit(unsigned int quota_id,
+ struct vz_quota_ugid_setlimit __user *u_lim)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ugid_setlimit lim;
+ int err;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ESRCH;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EFAULT;
+ if (copy_from_user(&lim, u_lim, sizeof(lim)))
+ goto out;
+
+ err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb);
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
+static int quota_ugid_setinfo(unsigned int quota_id,
+ struct vz_quota_ugid_setinfo __user *u_info)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ugid_setinfo info;
+ int err;
+
+ mutex_lock(&vz_quota_mutex);
+
+ err = -ESRCH;
+ qmblk = vzquota_find_master(quota_id);
+ if (qmblk == NULL)
+ goto out;
+
+ err = -EFAULT;
+ if (copy_from_user(&info, u_info, sizeof(info)))
+ goto out;
+
+ err = __vz_set_dqinfo(qmblk, info.type, &info.dqi);
+
+out:
+ mutex_unlock(&vz_quota_mutex);
+
+ return err;
+}
+
+/*
+ * This is a system call to maintain UGID quotas
+ * Note this call is allowed to run ONLY from VE0
+ */
+long do_vzquotaugidctl(int cmd, unsigned int quota_id,
+ unsigned int ugid_index, unsigned int ugid_size,
+ void *addr, int compat)
+{
+ int ret;
+
+ ret = -EPERM;
+ /* access allowed only from root of VE0 */
+ if (!capable(CAP_SYS_RESOURCE) ||
+ !capable(CAP_SYS_ADMIN))
+ goto out;
+
+ switch (cmd) {
+ case VZ_DQ_UGID_GETSTAT:
+ ret = quota_ugid_getstat(quota_id,
+ ugid_index, ugid_size,
+ (struct vz_quota_iface __user *)addr,
+ compat);
+ break;
+ case VZ_DQ_UGID_ADDSTAT:
+ ret = quota_ugid_addstat(quota_id, ugid_size,
+ (struct vz_quota_iface __user *) addr,
+ compat);
+ break;
+ case VZ_DQ_UGID_GETGRACE:
+ ret = quota_ugid_getgrace(quota_id,
+ (struct dq_info __user *)addr, compat);
+ break;
+ case VZ_DQ_UGID_SETGRACE:
+ ret = quota_ugid_setgrace(quota_id,
+ (struct dq_info __user *)addr, compat);
+ break;
+ case VZ_DQ_UGID_GETCONFIG:
+ ret = quota_ugid_getconfig(quota_id,
+ (struct vz_quota_ugid_stat __user *)
+ addr);
+ break;
+ case VZ_DQ_UGID_SETCONFIG:
+ ret = quota_ugid_setconfig(quota_id,
+ (struct vz_quota_ugid_stat __user *)
+ addr);
+ break;
+ case VZ_DQ_UGID_SETLIMIT:
+ ret = quota_ugid_setlimit(quota_id,
+ (struct vz_quota_ugid_setlimit __user *)
+ addr);
+ break;
+ case VZ_DQ_UGID_SETINFO:
+ ret = quota_ugid_setinfo(quota_id,
+ (struct vz_quota_ugid_setinfo __user *)
+ addr);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+out:
+ return ret;
+}
+
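+/*
+ * ugid_quota_on_sb - propagate UGID quota state to a virtual superblock.
+ *
+ * If @sb provides get_quota_root() and the underlying real superblock uses
+ * vz_quota_operations, install the VZ dquot/quotactl operations on @sb and
+ * mirror the usage/limits "enabled" flags from the quota master block.
+ */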
+static void ugid_quota_on_sb(struct super_block *sb)
+{
+ struct super_block *real_sb;
+ struct vz_quota_master *qmblk;
+
+ if (!sb->s_op->get_quota_root)
+ return;
+
+ real_sb = sb->s_op->get_quota_root(sb)->i_sb;
+ if (real_sb->dq_op != &vz_quota_operations)
+ return;
+
+ sb->dq_op = &vz_quota_operations2;
+ sb->s_qcop = &vz_quotactl_operations;
+ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
+ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
+
+ qmblk = vzquota_find_qmblk(sb);
+ if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD))
+ return;
+ mutex_lock(&vz_quota_mutex);
+ if (qmblk->dq_flags & VZDQ_USRQUOTA)
+ sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED, USRQUOTA);
+ if (qmblk->dq_flags & VZDQ_GRPQUOTA)
+ sb->s_dquot.flags |= dquot_state_flag(DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED, GRPQUOTA);
+ mutex_unlock(&vz_quota_mutex);
+ qmblk_put(qmblk);
+}
+
+static void ugid_quota_off_sb(struct super_block *sb)
+{
+ /* can't make quota off on mounted super block */
+ BUG_ON(sb->s_root != NULL);
+}
+
+static int ugid_notifier_call(struct vnotifier_block *self,
+ unsigned long n, void *data, int old_ret)
+{
+ struct virt_info_quota *viq;
+
+ viq = (struct virt_info_quota *)data;
+
+ switch (n) {
+ case VIRTINFO_QUOTA_ON:
+ ugid_quota_on_sb(viq->super);
+ break;
+ case VIRTINFO_QUOTA_OFF:
+ ugid_quota_off_sb(viq->super);
+ break;
+ case VIRTINFO_QUOTA_GETSTAT:
+ break;
+ default:
+ return old_ret;
+ }
+ return NOTIFY_OK;
+}
+
+static struct vnotifier_block ugid_notifier_block = {
+ .notifier_call = ugid_notifier_call,
+};
+
+/* ----------------------------------------------------------------------
+ * Init/exit.
+ * --------------------------------------------------------------------- */
+
+int vzquota_ugid_init(void)
+{
+ int err;
+
+ vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid",
+ sizeof(struct vz_quota_ugid),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (vz_quota_ugid_cachep == NULL)
+ goto err_slab;
+
+ err = register_quota_format(&vz_quota_empty_v2_format);
+ if (err)
+ goto err_reg;
+
+ virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block);
+ return 0;
+
+err_reg:
+ kmem_cache_destroy(vz_quota_ugid_cachep);
+ return err;
+
+err_slab:
+ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
+ return -ENOMEM;
+}
+
+void vzquota_ugid_release(void)
+{
+ virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block);
+ unregister_quota_format(&vz_quota_empty_v2_format);
+
+ kmem_cache_destroy(vz_quota_ugid_cachep);
+}
diff --git a/fs/quota/vzdquota/vzdquot.c b/fs/quota/vzdquota/vzdquot.c
new file mode 100644
index 0000000..f091943
--- /dev/null
+++ b/fs/quota/vzdquota/vzdquot.c
@@ -0,0 +1,1994 @@
+/*
+ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains the core of Virtuozzo disk quota implementation:
+ * maintenance of VZDQ information in inodes,
+ * external interfaces,
+ * module entry.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/quota.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+#include <linux/vzctl.h>
+#include <linux/vzctl_quota.h>
+#include <linux/vzquota.h>
+#include <linux/virtinfo.h>
+#include <linux/vzdq_tree.h>
+#include <linux/mount.h>
+#include <linux/quotaops.h>
+
+/* ----------------------------------------------------------------------
+ *
+ * Locking
+ *
+ * ---------------------------------------------------------------------- */
+
+/*
+ * Serializes on/off and all other do_vzquotactl operations.
+ * Protects qmblk hash.
+ */
+struct mutex vz_quota_mutex;
+
+/*
+ * Data access locks
+ * inode_qmblk
+ * protects qmblk pointers in all inodes and qlnk content in general
+ * (but not qmblk content);
+ * also protects related qmblk invalidation procedures;
+ * can't be per-inode because of vzquota_dtree_qmblk complications
+ * and problems with serialization with quota_on,
+ * but can be per-superblock;
+ * qmblk_data
+ * protects qmblk fields (such as current usage)
+ * quota_data
+ * protects charge/uncharge operations, thus, implies
+ * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock
+ * (to protect ugid pointers).
+ *
+ * Lock order:
+ * inode_qmblk_lock -> dcache_lock
+ * inode_qmblk_lock -> qmblk_data
+ */
+static DEFINE_SPINLOCK(vzdq_qmblk_lock);
+
+inline void inode_qmblk_lock(struct super_block *sb)
+{
+ spin_lock(&vzdq_qmblk_lock);
+}
+
+inline void inode_qmblk_unlock(struct super_block *sb)
+{
+ spin_unlock(&vzdq_qmblk_lock);
+}
+
+inline void qmblk_data_read_lock(struct vz_quota_master *qmblk)
+{
+ spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk)
+{
+ spin_unlock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_lock(struct vz_quota_master *qmblk)
+{
+ spin_lock(&qmblk->dq_data_lock);
+}
+
+inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk)
+{
+ spin_unlock(&qmblk->dq_data_lock);
+}
+
+struct quota_format_type vz_quota_empty_v2_format = {
+ .qf_fmt_id = QFMT_VFS_V0,
+ .qf_ops = NULL,
+ .qf_owner = THIS_MODULE,
+};
+
+/* ----------------------------------------------------------------------
+ *
+ * Master hash table handling.
+ *
+ * Not SMP safe, serialized by vz_quota_mutex within quota syscalls
+ *
+ * --------------------------------------------------------------------- */
+
+static struct kmem_cache *vzquota_cachep;
+
+/*
+ * Hash function.
+ */
+#define QHASH_BITS 6
+#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS)
+#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1)
+
+struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE];
+int vzquota_hash_size = VZ_QUOTA_HASH_SIZE;
+
+static inline int vzquota_hash_func(unsigned int qid)
+{
+ return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK);
+}
+
+/**
+ * vzquota_alloc_master - alloc and instantiate master quota record
+ *
+ * Returns:
+ * pointer to newly created record if SUCCESS
+ * -ENOMEM if out of memory
+ *	-EEXIST if a record with the given quota_id already exists
+ */
+struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
+ struct vz_quota_stat *qstat)
+{
+ int err;
+ struct vz_quota_master *qmblk;
+
+ err = -EEXIST;
+ if (vzquota_find_master(quota_id) != NULL)
+ goto out;
+
+ err = -ENOMEM;
+ qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
+ if (qmblk == NULL)
+ goto out;
+#ifdef CONFIG_VZ_QUOTA_UGID
+ qmblk->dq_uid_tree = quotatree_alloc();
+ if (!qmblk->dq_uid_tree)
+ goto out_free;
+
+ qmblk->dq_gid_tree = quotatree_alloc();
+ if (!qmblk->dq_gid_tree)
+ goto out_free_tree;
+#endif
+
+ qmblk->dq_state = VZDQ_STARTING;
+ mutex_init(&qmblk->dq_mutex);
+ spin_lock_init(&qmblk->dq_data_lock);
+
+ qmblk->dq_id = quota_id;
+ qmblk->dq_stat = qstat->dq_stat;
+ qmblk->dq_info = qstat->dq_info;
+ qmblk->dq_root_path.dentry = NULL;
+ qmblk->dq_root_path.mnt = NULL;
+ qmblk->dq_sb = NULL;
+ qmblk->dq_ugid_count = 0;
+ qmblk->dq_ugid_max = 0;
+ qmblk->dq_flags = 0;
+ memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info));
+ INIT_LIST_HEAD(&qmblk->dq_ilink_list);
+
+ atomic_set(&qmblk->dq_count, 1);
+
+ /* insert in hash chain */
+ list_add(&qmblk->dq_hash,
+ &vzquota_hash_table[vzquota_hash_func(quota_id)]);
+
+ /* success */
+ return qmblk;
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+out_free_tree:
+ quotatree_free(qmblk->dq_uid_tree, NULL);
+out_free:
+ kmem_cache_free(vzquota_cachep, qmblk);
+#endif
+out:
+ return ERR_PTR(err);
+}
+
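+/*
+ * vzquota_alloc_fake - allocate a placeholder qmblk marked VZDQ_NOQUOT.
+ *
+ * Such a block is stored in __VZ_QUOTA_NOQUOTA(sb) to mark inodes that are
+ * not under any real VZ quota; it is not inserted into the hash table.
+ */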
+static struct vz_quota_master *vzquota_alloc_fake(void)
+{
+ struct vz_quota_master *qmblk;
+
+ qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL);
+ if (qmblk == NULL)
+ return NULL;
+ memset(qmblk, 0, sizeof(*qmblk));
+ qmblk->dq_state = VZDQ_STOPING;
+ qmblk->dq_flags = VZDQ_NOQUOT;
+ spin_lock_init(&qmblk->dq_data_lock);
+ INIT_LIST_HEAD(&qmblk->dq_ilink_list);
+ atomic_set(&qmblk->dq_count, 1);
+ return qmblk;
+}
+
+/**
+ * vzquota_find_master - find master record with given id
+ *
+ * Returns qmblk without touching its refcounter.
+ * Called under vz_quota_mutex.
+ */
+struct vz_quota_master *vzquota_find_master(unsigned int quota_id)
+{
+ int i;
+ struct vz_quota_master *qp;
+
+ i = vzquota_hash_func(quota_id);
+ list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) {
+ if (qp->dq_id == quota_id)
+ return qp;
+ }
+ return NULL;
+}
+
+/**
+ * vzquota_free_master - release resources taken by qmblk, freeing memory
+ *
+ * qmblk is assumed to be already taken out from the hash.
+ * Should be called outside vz_quota_mutex.
+ */
+void vzquota_free_master(struct vz_quota_master *qmblk)
+{
+#ifdef CONFIG_VZ_QUOTA_UGID
+ vzquota_kill_ugid(qmblk);
+#endif
+ BUG_ON(!list_empty(&qmblk->dq_ilink_list));
+ kmem_cache_free(vzquota_cachep, qmblk);
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Passing quota information through current
+ *
+ * Used in inode -> qmblk lookup at inode creation stage (since at that
+ * time there are no links between the inode being created and its parent
+ * directory).
+ *
+ * --------------------------------------------------------------------- */
+
+#define VZDQ_CUR_MAGIC 0x57d0fee2
+
+static inline int vzquota_cur_qmblk_check(void)
+{
+ return current->magic == VZDQ_CUR_MAGIC;
+}
+
+static inline struct inode *vzquota_cur_qmblk_fetch(void)
+{
+ return current->ino;
+}
+
+static inline void vzquota_cur_qmblk_set(struct inode *data)
+{
+ struct task_struct *tsk;
+
+ tsk = current;
+ tsk->magic = VZDQ_CUR_MAGIC;
+ tsk->ino = data;
+}
+
+#if 0
+static inline void vzquota_cur_qmblk_reset(void)
+{
+ current->magic = 0;
+}
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Superblock quota operations
+ *
+ * --------------------------------------------------------------------- */
+
+/*
+ * Kernel structure abuse.
 * We use the files[0] pointer as an int variable:
 * a reference counter of how many quota blocks use this superblock.
 * files[1] is used for the generations structure, which helps us track
 * when traversing the dentries is really required.
+ */
+#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master
+#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\
+ &sb->s_dquot.dqio_mutex)
+
+#if defined(VZ_QUOTA_UNLOAD)
+
+#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count
+
+struct dquot_operations *orig_dq_op;
+struct quotactl_ops *orig_dq_cop;
+
+/**
+ * vzquota_get_super - account for a new quota tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas. We keep a counter of such subtrees and set VZ quota operations or
+ * reset the default ones.
+ *
+ * Called under vz_quota_mutex (from quota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+ if (sb->dq_op != &vz_quota_operations) {
+ down(&sb->s_dquot.dqonoff_sem);
+ if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) {
+ up(&sb->s_dquot.dqonoff_sem);
+ return -EEXIST;
+ }
+ if (orig_dq_op == NULL && sb->dq_op != NULL)
+ orig_dq_op = sb->dq_op;
+ sb->dq_op = &vz_quota_operations;
+ if (orig_dq_cop == NULL && sb->s_qcop != NULL)
+ orig_dq_cop = sb->s_qcop;
+ /* XXX this may race with sys_quotactl */
+#ifdef CONFIG_VZ_QUOTA_UGID
+ sb->s_qcop = &vz_quotactl_operations;
+#else
+ sb->s_qcop = NULL;
+#endif
+ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+
+ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
+ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
+ /*
+		 * To get quotaops.h to call us we need to mark superblock
+ * as having quota. These flags mark the moment when
+ * our dq_op start to be called.
+ *
+ * The ordering of dq_op and s_dquot.flags assignment
+ * needs to be enforced, but other CPUs do not do rmb()
+ * between s_dquot.flags and dq_op accesses.
+ */
+ wmb(); synchronize_sched();
+ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
+ __module_get(THIS_MODULE);
+ up(&sb->s_dquot.dqonoff_sem);
+ }
+ /* protected by vz_quota_mutex */
+ __VZ_QUOTA_SBREF(sb)++;
+ return 0;
+}
+
+/**
+ * vzquota_put_super - release the superblock when one quota tree goes away
+ *
+ * Called under vz_quota_mutex.
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+ int count;
+
+ count = --__VZ_QUOTA_SBREF(sb);
+ if (count == 0) {
+ down(&sb->s_dquot.dqonoff_sem);
+ sb->s_dquot.flags = 0;
+ wmb(); synchronize_sched();
+ sema_init(&sb->s_dquot.dqio_sem, 1);
+ sb->s_qcop = orig_dq_cop;
+ sb->dq_op = orig_dq_op;
+ inode_qmblk_lock(sb);
+ quota_gen_put(SB_QGEN(sb));
+ SB_QGEN(sb) = NULL;
+ /* release qlnk's without qmblk */
+ remove_inode_quota_links_list(&non_vzquota_inodes_lh,
+ sb, NULL);
+ /*
+ * Races with quota initialization:
+ * after this inode_qmblk_unlock all inode's generations are
+ * invalidated, quota_inode_qmblk checks superblock operations.
+ */
+ inode_qmblk_unlock(sb);
+ /*
+ * Module refcounting: in theory, this is the best place
+ * to call module_put(THIS_MODULE).
+ * In reality, it can't be done because we can't be sure that
+ * other CPUs do not enter our code segment through dq_op
+ * cached long time ago. Quotaops interface isn't supposed to
+ * go into modules currently (that is, into unloadable
+ * modules). By omitting module_put, our module isn't
+ * unloadable.
+ */
+ up(&sb->s_dquot.dqonoff_sem);
+ }
+}
+
+#else
+
+/**
+ * vzquota_shutdown_super - callback on umount
+ */
+void vzquota_shutdown_super(struct super_block *sb)
+{
+ struct vz_quota_master *qmblk;
+
+ qmblk = __VZ_QUOTA_NOQUOTA(sb);
+ __VZ_QUOTA_NOQUOTA(sb) = NULL;
+ if (qmblk != NULL)
+ qmblk_put(qmblk);
+}
+
+/**
+ * vzquota_get_super - account for a new quota tree under the superblock
+ *
+ * One superblock can have multiple directory subtrees with different VZ
+ * quotas.
+ *
+ * Called under vz_quota_mutex (from vzquota_on).
+ */
+int vzquota_get_super(struct super_block *sb)
+{
+ struct vz_quota_master *qnew;
+ int err;
+
+ mutex_lock(&sb->s_dquot.dqonoff_mutex);
+ err = -EEXIST;
+ if (sb_any_quota_loaded(sb) && sb->dq_op != &vz_quota_operations)
+ goto out_up;
+
+ /*
+ * This allocation code should be under sb->dq_op check below, but
+ * it doesn't really matter...
+ */
+ if (__VZ_QUOTA_NOQUOTA(sb) == NULL) {
+ qnew = vzquota_alloc_fake();
+ if (qnew == NULL)
+ goto out_up;
+ __VZ_QUOTA_NOQUOTA(sb) = qnew;
+ }
+
+ if (sb->dq_op != &vz_quota_operations) {
+ sb->dq_op = &vz_quota_operations;
+#ifdef CONFIG_VZ_QUOTA_UGID
+ sb->s_qcop = &vz_quotactl_operations;
+#else
+ sb->s_qcop = NULL;
+#endif
+ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
+
+ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+ /* these 2 list heads are checked in sync_dquots() */
+ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+ sb->s_dquot.info[USRQUOTA].dqi_format =
+ &vz_quota_empty_v2_format;
+ sb->s_dquot.info[GRPQUOTA].dqi_format =
+ &vz_quota_empty_v2_format;
+
+ /*
+ * To get quotaops.h to call us we need to mark superblock
+ * as having quota. These flags mark the moment when
+ * our dq_op start to be called.
+ *
+ * The ordering of dq_op and s_dquot.flags assignment
+ * needs to be enforced, but other CPUs do not do rmb()
+ * between s_dquot.flags and dq_op accesses.
+ */
+ wmb(); synchronize_sched();
+ sb->s_dquot.flags =
+ dquot_state_flag(DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED,
+ USRQUOTA) |
+ dquot_state_flag(DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED,
+ GRPQUOTA);
+ }
+ err = 0;
+
+out_up:
+ mutex_unlock(&sb->s_dquot.dqonoff_mutex);
+ return err;
+}
+
+/**
+ * vzquota_put_super - one quota tree less on this superblock
+ *
+ * Called under vz_quota_mutex.
+ */
+void vzquota_put_super(struct super_block *sb)
+{
+ /*
+ * Even if this put is the last one,
+ * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop
+ * won't be called and the remaining qmblk references won't be put.
+ */
+}
+
+#endif
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Helpers for inode -> qmblk link maintenance
+ *
+ * --------------------------------------------------------------------- */
+
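+/*
+ * __VZ_QUOTA_EMPTY marks a qlnk that has been initialized but not yet
+ * attached to any qmblk (see vzquota_qlnk_is_empty below); it is distinct
+ * from VZ_QUOTA_BAD, which marks a detected inconsistency.
+ */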
+#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd)
+#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT)
+#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops)
+extern struct inode_operations vfs_empty_iops;
+
+static int VZ_QUOTA_IS_ACTUAL(struct inode *inode)
+{
+ struct vz_quota_master *qmblk;
+
+ qmblk = INODE_QLNK(inode)->qmblk;
+ if (qmblk == VZ_QUOTA_BAD)
+ return 1;
+ if (qmblk == __VZ_QUOTA_EMPTY)
+ return 0;
+ if (qmblk->dq_flags & VZDQ_NOACT)
+ /* not actual (invalidated) qmblk */
+ return 0;
+ return 1;
+}
+
+static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk)
+{
+ return qlnk->qmblk == __VZ_QUOTA_EMPTY;
+}
+
+static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk,
+ unsigned char origin)
+{
+ qlnk->origin[0] = qlnk->origin[1];
+ qlnk->origin[1] = origin;
+}
+
+static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk)
+{
+ qlnk->qmblk = __VZ_QUOTA_EMPTY;
+ set_qlnk_origin(qlnk, VZ_QUOTAO_SETE);
+}
+
+void vzquota_qlnk_init(struct vz_quota_ilink *qlnk)
+{
+ memset(qlnk, 0, sizeof(*qlnk));
+ INIT_LIST_HEAD(&qlnk->list);
+ vzquota_qlnk_set_empty(qlnk);
+ set_qlnk_origin(qlnk, VZ_QUOTAO_INIT);
+}
+
+void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk)
+{
+ might_sleep();
+ if (vzquota_qlnk_is_empty(qlnk))
+ return;
+#if defined(CONFIG_VZ_QUOTA_UGID)
+ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) {
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ugid *quid, *qgid;
+ qmblk = qlnk->qmblk;
+ quid = qlnk->qugid[USRQUOTA];
+ qgid = qlnk->qugid[GRPQUOTA];
+ if (quid != NULL || qgid != NULL) {
+ mutex_lock(&qmblk->dq_mutex);
+ if (qgid != NULL)
+ vzquota_put_ugid(qmblk, qgid);
+ if (quid != NULL)
+ vzquota_put_ugid(qmblk, quid);
+ mutex_unlock(&qmblk->dq_mutex);
+ }
+ }
+#endif
+ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD)
+ qmblk_put(qlnk->qmblk);
+ set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR);
+}
+
+/**
+ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents
+ * @qlt: temporary
+ * @qli: inode's
+ *
+ * Locking is provided by the caller (depending on the context).
+ * After swap, @qli is inserted into the corresponding dq_ilink_list,
+ * @qlt list is reinitialized.
+ */
+static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt,
+ struct vz_quota_ilink *qli)
+{
+ struct vz_quota_master *qb;
+ struct vz_quota_ugid *qu;
+ int i;
+
+ qb = qlt->qmblk;
+ qlt->qmblk = qli->qmblk;
+ qli->qmblk = qb;
+ list_del_init(&qli->list);
+ if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD)
+ list_add(&qli->list, &qb->dq_ilink_list);
+ INIT_LIST_HEAD(&qlt->list);
+ set_qlnk_origin(qli, VZ_QUOTAO_SWAP);
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ qu = qlt->qugid[i];
+ qlt->qugid[i] = qli->qugid[i];
+ qli->qugid[i] = qu;
+ }
+}
+
+/**
+ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks
+ *
+ * Called under dcache_lock and inode_qmblk locks.
+ * Returns 1 if locks were dropped inside, 0 if atomic.
+ */
+static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk,
+ struct inode *inode)
+{
+ if (vzquota_qlnk_is_empty(qlnk))
+ return 0;
+ if (qlnk->qmblk == VZ_QUOTA_BAD) {
+ vzquota_qlnk_set_empty(qlnk);
+ set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK);
+ return 0;
+ }
+ spin_unlock(&dcache_lock);
+ inode_qmblk_unlock(inode->i_sb);
+ vzquota_qlnk_destroy(qlnk);
+ vzquota_qlnk_init(qlnk);
+ inode_qmblk_lock(inode->i_sb);
+ spin_lock(&dcache_lock);
+ return 1;
+}
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+/**
+ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content
+ *
+ * Similar to vzquota_qlnk_reinit_locked, called under different locks.
+ */
+static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk,
+ struct inode *inode,
+ struct vz_quota_master *qmblk)
+{
+ if (vzquota_qlnk_is_empty(qlnk))
+ return 0;
+ /* may be optimized if qlnk->qugid all NULLs */
+ qmblk_data_write_unlock(qmblk);
+ inode_qmblk_unlock(inode->i_sb);
+ vzquota_qlnk_destroy(qlnk);
+ vzquota_qlnk_init(qlnk);
+ inode_qmblk_lock(inode->i_sb);
+ qmblk_data_write_lock(qmblk);
+ return 1;
+}
+#endif
+
+/**
+ * vzquota_qlnk_fill - fill vz_quota_ilink content
+ * @qlnk: vz_quota_ilink to fill
+ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid)
+ * @qmblk: qmblk to which this @qlnk will belong
+ *
+ * Called under dcache_lock and inode_qmblk locks.
+ * Returns 1 if locks were dropped inside, 0 if atomic.
+ * @qlnk is expected to be empty.
+ */
+static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk,
+ struct inode *inode,
+ struct vz_quota_master *qmblk)
+{
+ if (qmblk != VZ_QUOTA_BAD)
+ qmblk_get(qmblk);
+ qlnk->qmblk = qmblk;
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+ if (qmblk != VZ_QUOTA_BAD &&
+ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
+ (qmblk->dq_flags & VZDQUG_ON)) {
+ struct vz_quota_ugid *quid, *qgid;
+
+ spin_unlock(&dcache_lock);
+ inode_qmblk_unlock(inode->i_sb);
+
+ mutex_lock(&qmblk->dq_mutex);
+ quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0);
+ qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0);
+ mutex_unlock(&qmblk->dq_mutex);
+
+ inode_qmblk_lock(inode->i_sb);
+ spin_lock(&dcache_lock);
+ qlnk->qugid[USRQUOTA] = quid;
+ qlnk->qugid[GRPQUOTA] = qgid;
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+/**
+ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid
+ *
+ * This function is a helper for vzquota_transfer, and differs from
+ * vzquota_qlnk_fill only by locking.
+ */
+static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
+ struct inode *inode,
+ struct iattr *iattr,
+ int mask,
+ struct vz_quota_master *qmblk)
+{
+ qmblk_get(qmblk);
+ qlnk->qmblk = qmblk;
+
+ if (mask) {
+ struct vz_quota_ugid *quid, *qgid;
+
+ quid = qgid = NULL; /* to make gcc happy */
+ if (!(mask & (1 << USRQUOTA)))
+ quid = vzquota_get_ugid(INODE_QLNK(inode)->
+ qugid[USRQUOTA]);
+ if (!(mask & (1 << GRPQUOTA)))
+ qgid = vzquota_get_ugid(INODE_QLNK(inode)->
+ qugid[GRPQUOTA]);
+
+ qmblk_data_write_unlock(qmblk);
+ inode_qmblk_unlock(inode->i_sb);
+
+ mutex_lock(&qmblk->dq_mutex);
+ if (mask & (1 << USRQUOTA))
+ quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
+ USRQUOTA, 0);
+ if (mask & (1 << GRPQUOTA))
+ qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
+ GRPQUOTA, 0);
+ mutex_unlock(&qmblk->dq_mutex);
+
+ inode_qmblk_lock(inode->i_sb);
+ qmblk_data_write_lock(qmblk);
+ qlnk->qugid[USRQUOTA] = quid;
+ qlnk->qugid[GRPQUOTA] = qgid;
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+/**
+ * __vzquota_inode_init - make sure inode's qlnk is initialized
+ *
+ * May be called if qlnk is already initialized, detects this situation itself.
+ * Called under inode_qmblk_lock.
+ */
+static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
+{
+ if (inode->i_dquot[USRQUOTA] == NULL) {
+ vzquota_qlnk_init(INODE_QLNK(inode));
+ inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NULL;
+ }
+ set_qlnk_origin(INODE_QLNK(inode), origin);
+}
+
+/**
+ * vzquota_inode_drop - destroy VZ quota information in the inode
+ *
+ * Inode must not be externally accessible or dirty.
+ */
+static void vzquota_inode_drop(struct inode *inode)
+{
+ struct vz_quota_ilink qlnk;
+
+ vzquota_qlnk_init(&qlnk);
+ inode_qmblk_lock(inode->i_sb);
+ vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
+ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL);
+ inode->i_dquot[USRQUOTA] = NULL;
+ inode_qmblk_unlock(inode->i_sb);
+ vzquota_qlnk_destroy(&qlnk);
+}
+
+/**
+ * vzquota_inode_qmblk_set - initialize inode's qlnk
+ * @inode: inode to be initialized
+ * @qmblk: quota master block to which this inode should belong (may be BAD)
+ * @qlnk: placeholder to store data to resolve locking issues
+ *
+ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise.
+ * Called under dcache_lock and inode_qmblk locks.
+ * @qlnk will be destroyed in the caller chain.
+ *
+ * It is not mandatory to restart parent checks since quota on/off currently
+ * shrinks the dentry tree and checks that there are no outside references.
+ * But if at some time that shrink is removed, restarts will be required.
+ * Additionally, the restarts prevent inconsistencies if the dentry tree
+ * changes (inode is moved). This is not a big deal, but anyway...
+ */
+static int vzquota_inode_qmblk_set(struct inode *inode,
+ struct vz_quota_master *qmblk,
+ struct vz_quota_ilink *qlnk)
+{
+ if (qmblk == NULL) {
+ printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, "
+ "dev %s, inode %lu, fs %s\n",
+ INODE_QLNK(inode)->origin[0],
+ INODE_QLNK(inode)->origin[1],
+ inode->i_sb->s_id, inode->i_ino,
+ inode->i_sb->s_type->name);
+ printk(KERN_ERR "current %d (%s), VE %d\n",
+ current->pid, current->comm,
+ VEID(get_exec_env()));
+ dump_stack();
+ qmblk = VZ_QUOTA_BAD;
+ }
+ while (1) {
+ if (vzquota_qlnk_is_empty(qlnk) &&
+ vzquota_qlnk_fill(qlnk, inode, qmblk))
+ return 1;
+ if (qlnk->qmblk == qmblk)
+ break;
+ if (vzquota_qlnk_reinit_locked(qlnk, inode))
+ return 1;
+ }
+ vzquota_qlnk_swap(qlnk, INODE_QLNK(inode));
+ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET);
+ return 0;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * vzquota_inode_qmblk (inode -> qmblk lookup) parts
+ *
+ * --------------------------------------------------------------------- */
+
+static int vzquota_dparents_check_attach(struct inode *inode)
+{
+ if (!list_empty(&inode->i_dentry))
+ return 0;
+ printk(KERN_ERR "VZDQ: no parent for "
+ "dev %s, inode %lu, fs %s\n",
+ inode->i_sb->s_id,
+ inode->i_ino,
+ inode->i_sb->s_type->name);
+ return -1;
+}
+
+static struct inode *vzquota_dparents_check_actual(struct inode *inode)
+{
+ struct dentry *de;
+
+ list_for_each_entry(de, &inode->i_dentry, d_alias) {
+ if (de->d_parent == de) /* detached dentry, perhaps */
+ continue;
+ /* first access to parent, make sure its qlnk initialized */
+ __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT);
+ if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode))
+ return de->d_parent->d_inode;
+ }
+ return NULL;
+}
+
+static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode)
+{
+ struct dentry *de;
+ struct vz_quota_master *qmblk;
+
+ qmblk = NULL;
+ list_for_each_entry(de, &inode->i_dentry, d_alias) {
+ if (de->d_parent == de) /* detached dentry, perhaps */
+ continue;
+ if (qmblk == NULL) {
+ qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk;
+ continue;
+ }
+ if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) {
+ printk(KERN_WARNING "VZDQ: multiple quotas for "
+ "dev %s, inode %lu, fs %s\n",
+ inode->i_sb->s_id,
+ inode->i_ino,
+ inode->i_sb->s_type->name);
+ qmblk = VZ_QUOTA_BAD;
+ break;
+ }
+ }
+ if (qmblk == NULL) {
+ printk(KERN_WARNING "VZDQ: not attached to tree, "
+ "dev %s, inode %lu, fs %s\n",
+ inode->i_sb->s_id,
+ inode->i_ino,
+ inode->i_sb->s_type->name);
+ qmblk = VZ_QUOTA_BAD;
+ }
+ return qmblk;
+}
+
+/* NFS root is disconnected dentry. */
+
+static int is_nfs_root(struct inode * inode)
+{
+ struct dentry *de;
+
+ if (inode->i_sb->s_magic != 0x6969)
+ return 0;
+
+ if (list_empty(&inode->i_dentry))
+ return 0;
+
+ list_for_each_entry(de, &inode->i_dentry, d_alias) {
+ if (de->d_parent != de)
+ return 0;
+ if (d_unhashed(de))
+ return 0;
+ if (!(de->d_flags & DCACHE_DISCONNECTED))
+ return 0;
+ }
+ return 1;
+}
+
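+/*
+ * vzquota_dbranch_actualize - make the qmblk link of @inode actual.
+ *
+ * If some parent of @inode is itself not actual, the walk restarts from that
+ * parent, so the topmost non-actual ancestor is handled first; callers loop
+ * until the inode they started from becomes actual.
+ * Drops and re-takes dcache_lock and @refinode's inode_qmblk lock inside.
+ */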
+static void vzquota_dbranch_actualize(struct inode *inode,
+ struct inode *refinode)
+{
+ struct inode *pinode;
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ilink qlnk;
+
+ vzquota_qlnk_init(&qlnk);
+
+start:
+ if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) {
+ /* filesystem root */
+ atomic_inc(&inode->i_count);
+ do {
+ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+ } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk));
+ goto out;
+ }
+
+ if (!vzquota_dparents_check_attach(inode)) {
+ pinode = vzquota_dparents_check_actual(inode);
+ if (pinode != NULL) {
+ inode = pinode;
+ goto start;
+ }
+ }
+
+ atomic_inc(&inode->i_count);
+ while (1) {
+ if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */
+ break;
+ /*
+ * Need to check parents again if we have slept inside
+ * vzquota_inode_qmblk_set() in the loop.
+ * If the state of parents is different, just return and repeat
+ * the actualizing process again from the inode passed to
+ * vzquota_inode_qmblk_recalc().
+ */
+ if (!vzquota_dparents_check_attach(inode)) {
+ if (vzquota_dparents_check_actual(inode) != NULL)
+ break;
+ qmblk = vzquota_dparents_check_same(inode);
+ } else
+ qmblk = VZ_QUOTA_BAD;
+ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */
+ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT);
+ break;
+ }
+ }
+
+out:
+ spin_unlock(&dcache_lock);
+ inode_qmblk_unlock(refinode->i_sb);
+ vzquota_qlnk_destroy(&qlnk);
+ iput(inode);
+ inode_qmblk_lock(refinode->i_sb);
+ spin_lock(&dcache_lock);
+}
+
+static void vzquota_dtree_qmblk_recalc(struct inode *inode,
+ struct vz_quota_ilink *qlnk)
+{
+ struct inode *pinode;
+ struct vz_quota_master *qmblk;
+
+ if (inode == inode->i_sb->s_root->d_inode || is_nfs_root(inode)) {
+ /* filesystem root */
+ do {
+ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+ } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk));
+ return;
+ }
+
+start:
+ if (VZ_QUOTA_IS_ACTUAL(inode))
+ return;
+ /*
+ * Here qmblk is (re-)initialized for all ancestors.
+ * This is not a very efficient procedure, but it guarantees that
+ * the quota tree is consistent (that is, the inode doesn't have two
+ * ancestors with different qmblk).
+ */
+ if (!vzquota_dparents_check_attach(inode)) {
+ pinode = vzquota_dparents_check_actual(inode);
+ if (pinode != NULL) {
+ vzquota_dbranch_actualize(pinode, inode);
+ goto start;
+ }
+ qmblk = vzquota_dparents_check_same(inode);
+ } else
+ qmblk = VZ_QUOTA_BAD;
+
+ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
+ goto start;
+ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE);
+}
+
+static void vzquota_det_qmblk_recalc(struct inode *inode,
+ struct vz_quota_ilink *qlnk)
+{
+ struct inode *parent;
+ struct vz_quota_master *qmblk;
+ char *msg;
+ int cnt;
+ time_t timeout;
+
+ cnt = 0;
+ parent = NULL;
+start:
+ /*
+ * qmblk of detached inodes shouldn't be considered as not actual.
+ * They are not in any dentry tree, so quota on/off shouldn't affect
+ * them.
+ */
+ if (!vzquota_qlnk_is_empty(INODE_QLNK(inode)))
+ return;
+
+ timeout = 3;
+ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
+ /*
+ * Scenario:
+ * open
+ * unlink
+ * quotaon
+ * generic_delete_inode
+ *
+ * This is the first time vzquota sees inode. inode is outside of
+ * vzquota area of interest, otherwise quotaon would have got -EBUSY
+ * due to shrink_dcache_parent().
+ * inode is almost completely destroyed, so don't intervene.
+ *
+ * dev@:
+ * However, there is a small race here...
+ * dput() first removes itself from all the lists,
+ * so shrink_dcache_parent() can succeed while dentry_iput is not
+ * done yet.
+ */
+ if (inode->i_state & I_FREEING)
+ goto set;
+
+ msg = "detached inode not in creation";
+ if (inode->i_op != VZ_QUOTA_EMPTY_IOPS)
+ goto fail;
+ qmblk = VZ_QUOTA_BAD;
+ msg = "unexpected creation context";
+ if (!vzquota_cur_qmblk_check())
+ goto fail;
+ timeout = 0;
+ parent = vzquota_cur_qmblk_fetch();
+ msg = "uninitialized parent";
+ if (vzquota_qlnk_is_empty(INODE_QLNK(parent)))
+ goto fail;
+ msg = "parent not in tree";
+ if (list_empty(&parent->i_dentry))
+ goto fail;
+ msg = "parent has 0 refcount";
+ if (!atomic_read(&parent->i_count))
+ goto fail;
+ msg = "parent has different sb";
+ if (parent->i_sb != inode->i_sb)
+ goto fail;
+ if (!VZ_QUOTA_IS_ACTUAL(parent)) {
+ vzquota_dbranch_actualize(parent, inode);
+ goto start;
+ }
+
+ qmblk = INODE_QLNK(parent)->qmblk;
+set:
+ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
+ goto start;
+ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET);
+ return;
+
+fail:
+ {
+ struct timeval tv, tvo;
+ do_gettimeofday(&tv);
+ memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo));
+ tv.tv_sec -= tvo.tv_sec;
+ if (tv.tv_usec < tvo.tv_usec) {
+ tv.tv_sec--;
+ tv.tv_usec += USEC_PER_SEC - tvo.tv_usec;
+ } else
+ tv.tv_usec -= tvo.tv_usec;
+ if (tv.tv_sec < timeout)
+ goto set;
+ printk(KERN_ERR "VZDQ: %s, orig {%u, %u},"
+ " dev %s, inode %lu, fs %s\n",
+ msg,
+ INODE_QLNK(inode)->origin[0],
+ INODE_QLNK(inode)->origin[1],
+ inode->i_sb->s_id, inode->i_ino,
+ inode->i_sb->s_type->name);
+ printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count));
+ printk(KERN_ERR "i_mode %o, ", inode->i_mode);
+ printk(KERN_ERR "i_state %lx, ", inode->i_state);
+ printk(KERN_ERR "i_flags %x\n", inode->i_flags);
+ printk(KERN_ERR "i_op %p, vfs_empty_iops %p, "
+ "i_fop %p, i_mapping %p\n",
+ inode->i_op, &vfs_empty_iops,
+ inode->i_fop, inode->i_mapping);
+ if (!cnt++) {
+ printk(KERN_ERR "current %d (%s), VE %d,"
+ " time %ld.%06ld\n",
+ current->pid, current->comm,
+ VEID(get_exec_env()),
+ tv.tv_sec, (long)tv.tv_usec);
+ dump_stack();
+ }
+ if (parent != NULL)
+ printk(KERN_ERR "VZDQ: parent of %lu is %lu\n",
+ inode->i_ino, parent->i_ino);
+ }
+ goto set;
+}
+
+static void vzquota_inode_qmblk_recalc(struct inode *inode,
+ struct vz_quota_ilink *qlnk)
+{
+ spin_lock(&dcache_lock);
+ if (!list_empty(&inode->i_dentry))
+ vzquota_dtree_qmblk_recalc(inode, qlnk);
+ else
+ vzquota_det_qmblk_recalc(inode, qlnk);
+ spin_unlock(&dcache_lock);
+}
+
+/**
+ * vzquota_inode_qmblk - obtain inode's qmblk
+ *
+ * Returns qmblk with refcounter taken, %NULL if not under
+ * VZ quota or %VZ_QUOTA_BAD.
+ *
+ * FIXME: This function should be removed when vzquota_find_qmblk /
+ * get_quota_root / vzquota_dstat code is cleaned up.
+ */
+struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ilink qlnk;
+
+ might_sleep();
+
+ if (inode->i_sb->dq_op != &vz_quota_operations)
+ return NULL;
+#if defined(VZ_QUOTA_UNLOAD)
+#error Make sure qmblk does not disappear
+#endif
+
+ vzquota_qlnk_init(&qlnk);
+ inode_qmblk_lock(inode->i_sb);
+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+
+ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
+ !VZ_QUOTA_IS_ACTUAL(inode))
+ vzquota_inode_qmblk_recalc(inode, &qlnk);
+
+ qmblk = INODE_QLNK(inode)->qmblk;
+ if (qmblk != VZ_QUOTA_BAD) {
+ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb))
+ qmblk_get(qmblk);
+ else
+ qmblk = NULL;
+ }
+
+ inode_qmblk_unlock(inode->i_sb);
+ vzquota_qlnk_destroy(&qlnk);
+ return qmblk;
+}
+
+/**
+ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems
+ *
+ * This function finds a quota master block corresponding to the root of
+ * a virtual filesystem.
+ * Returns a quota master block with reference taken, or %NULL if not under
+ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation
+ * operations will fail).
+ *
+ * Note: this function uses vzquota_inode_qmblk().
+ * The latter is a rather confusing function: it returns qmblk that used to be
+ * on the inode some time ago (without guarantee that it still has any
+ * relations to the inode). So, vzquota_find_qmblk() leaves it up to the
+ * caller to think whether the inode could have changed its qmblk and what to
+ * do in that case.
+ * Currently, the callers appear to not care :(
+ */
+struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb)
+{
+ struct inode *qrinode;
+ struct vz_quota_master *qmblk;
+
+ qmblk = NULL;
+ qrinode = NULL;
+ if (sb->s_op->get_quota_root != NULL)
+ qrinode = sb->s_op->get_quota_root(sb);
+ if (qrinode != NULL)
+ qmblk = vzquota_inode_qmblk(qrinode);
+ return qmblk;
+}
+
+/* ----------------------------------------------------------------------
+ *
+ * Calls from quota operations
+ *
+ * --------------------------------------------------------------------- */
+
+/**
+ * vzquota_inode_init_call - call from DQUOT_INIT
+ */
+void vzquota_inode_init_call(struct inode *inode)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_datast data;
+
+ /* initializes inode's quota inside */
+ qmblk = vzquota_inode_data(inode, &data);
+ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
+ vzquota_data_unlock(inode, &data);
+
+ /*
+ * The check is needed for repeated new_inode() calls from a single
+ * ext3 call like create or mkdir in case of -ENOSPC.
+ */
+ spin_lock(&dcache_lock);
+ if (!list_empty(&inode->i_dentry))
+ vzquota_cur_qmblk_set(inode);
+ spin_unlock(&dcache_lock);
+}
+
+void vzquota_inode_swap_call(struct inode *inode, struct inode *tmpl)
+{
+ struct vz_quota_master *qmblk;
+
+ __vzquota_inode_init(inode, VZ_QUOTAO_INIT);
+
+ might_sleep();
+
+ inode_qmblk_lock(tmpl->i_sb);
+ if (unlikely(tmpl->i_flags & S_NOQUOTA)) {
+ inode_qmblk_unlock(tmpl->i_sb);
+ return;
+ }
+ __vzquota_inode_init(tmpl, VZ_QUOTAO_INICAL);
+
+ qmblk = INODE_QLNK(tmpl)->qmblk;
+ if (qmblk != VZ_QUOTA_BAD) {
+ void * uq;
+ list_del_init(&INODE_QLNK(tmpl)->list);
+ vzquota_qlnk_swap(INODE_QLNK(tmpl), INODE_QLNK(inode));
+ uq = inode->i_dquot[USRQUOTA];
+ inode->i_dquot[USRQUOTA] = tmpl->i_dquot[USRQUOTA];
+ tmpl->i_dquot[USRQUOTA] = uq;
+ tmpl->i_flags |= S_NOQUOTA;
+ inode_qmblk_unlock(inode->i_sb);
+
+ vzquota_inode_drop(tmpl);
+ } else {
+ inode_qmblk_unlock(tmpl->i_sb);
+ }
+}
+
+
+/**
+ * vzquota_inode_drop_call - call from DQUOT_DROP
+ */
+void vzquota_inode_drop_call(struct inode *inode)
+{
+ vzquota_inode_drop(inode);
+}
+
+/**
+ * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs
+ * @inode: the inode
+ * @data: storage space
+ *
+ * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk.
+ * On return if qmblk is neither NULL nor VZ_QUOTA_BAD:
+ * qmblk in inode's qlnk is the same as returned,
+ * ugid pointers inside inode's qlnk are valid,
+ * some locks are taken (and should be released by vzquota_data_unlock).
+ * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken.
+ */
+struct vz_quota_master *vzquota_inode_data(struct inode *inode,
+ struct vz_quota_datast *data)
+{
+ struct vz_quota_master *qmblk;
+
+ might_sleep();
+
+ vzquota_qlnk_init(&data->qlnk);
+ inode_qmblk_lock(inode->i_sb);
+ if (unlikely(inode->i_flags & S_NOQUOTA)) {
+ inode_qmblk_unlock(inode->i_sb);
+ return NULL;
+ }
+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+
+ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
+ !VZ_QUOTA_IS_ACTUAL(inode))
+ vzquota_inode_qmblk_recalc(inode, &data->qlnk);
+
+ qmblk = INODE_QLNK(inode)->qmblk;
+ if (qmblk != VZ_QUOTA_BAD) {
+ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) {
+ /*
+ * Note that in the current implementation,
+ * inode_qmblk_lock can theoretically be dropped here.
+ * This place is serialized with quota_off because
+ * quota_off fails when there are extra dentry
+ * references and syncs inodes before removing quota
+ * information from them.
+ * However, quota usage information should stop being
+ * updated immediately after vzquota_off.
+ */
+ qmblk_data_write_lock(qmblk);
+ } else {
+ inode_qmblk_unlock(inode->i_sb);
+ qmblk = NULL;
+ }
+ } else {
+ inode_qmblk_unlock(inode->i_sb);
+ }
+ return qmblk;
+}
+
+void vzquota_data_unlock(struct inode *inode,
+ struct vz_quota_datast *data)
+{
+ qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk);
+ inode_qmblk_unlock(inode->i_sb);
+ vzquota_qlnk_destroy(&data->qlnk);
+}
+
+#if defined(CONFIG_VZ_QUOTA_UGID)
+/**
+ * vzquota_inode_transfer_call - call from vzquota_transfer
+ */
+int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_datast data;
+ struct vz_quota_ilink qlnew;
+ int mask;
+ int ret;
+
+ might_sleep();
+ vzquota_qlnk_init(&qlnew);
+start:
+ qmblk = vzquota_inode_data(inode, &data);
+ ret = NO_QUOTA;
+ if (qmblk == VZ_QUOTA_BAD)
+ goto out_destr;
+ ret = QUOTA_OK;
+ if (qmblk == NULL)
+ goto out_destr;
+ qmblk_get(qmblk);
+
+ ret = QUOTA_OK;
+ if (!(qmblk->dq_flags & VZDQUG_ON))
+ /* no ugid quotas */
+ goto out_unlock;
+
+ mask = 0;
+ if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid)
+ mask |= 1 << USRQUOTA;
+ if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid)
+ mask |= 1 << GRPQUOTA;
+ while (1) {
+ if (vzquota_qlnk_is_empty(&qlnew) &&
+ vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk))
+ break;
+ if (qlnew.qmblk == INODE_QLNK(inode)->qmblk &&
+ qlnew.qmblk == qmblk)
+ goto finish;
+ if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk))
+ break;
+ }
+
+ /* prepare for restart */
+ vzquota_data_unlock(inode, &data);
+ qmblk_put(qmblk);
+ goto start;
+
+finish:
+ /* all references obtained successfully */
+ ret = vzquota_transfer_usage(inode, mask, &qlnew);
+ if (!ret) {
+ vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode));
+ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS);
+ }
+out_unlock:
+ vzquota_data_unlock(inode, &data);
+ qmblk_put(qmblk);
+out_destr:
+ vzquota_qlnk_destroy(&qlnew);
+ return ret;
+}
+#endif
+
+int vzquota_rename_check(struct inode *inode,
+ struct inode *old_dir, struct inode *new_dir)
+{
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ilink qlnk1, qlnk2, qlnk3;
+ int c, ret;
+
+ if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb)
+ return -1;
+
+ might_sleep();
+
+ vzquota_qlnk_init(&qlnk1);
+ vzquota_qlnk_init(&qlnk2);
+ vzquota_qlnk_init(&qlnk3);
+ inode_qmblk_lock(inode->i_sb);
+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+ __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL);
+ __vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL);
+
+ do {
+ c = 0;
+ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
+ !VZ_QUOTA_IS_ACTUAL(inode)) {
+ vzquota_inode_qmblk_recalc(inode, &qlnk1);
+ c++;
+ }
+ if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) ||
+ !VZ_QUOTA_IS_ACTUAL(new_dir)) {
+ vzquota_inode_qmblk_recalc(new_dir, &qlnk2);
+ c++;
+ }
+ } while (c);
+
+ ret = 0;
+ qmblk = INODE_QLNK(inode)->qmblk;
+ if (qmblk != INODE_QLNK(new_dir)->qmblk) {
+ ret = -1;
+ while (vzquota_qlnk_is_empty(INODE_QLNK(old_dir)) ||
+ !VZ_QUOTA_IS_ACTUAL(old_dir))
+ vzquota_inode_qmblk_recalc(old_dir, &qlnk3);
+ if (qmblk != VZ_QUOTA_BAD &&
+ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
+ qmblk->dq_root_path.dentry->d_inode == inode &&
+ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk,
+ inode->i_sb) &&
+ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk,
+ inode->i_sb))
+ /* quota root rename is allowed */
+ ret = 0;
+ }
+
+ inode_qmblk_unlock(inode->i_sb);
+ vzquota_qlnk_destroy(&qlnk3);
+ vzquota_qlnk_destroy(&qlnk2);
+ vzquota_qlnk_destroy(&qlnk1);
+ return ret;
+}
+
+/*
+ * Scan parent subdirs and find busy dentries' names/paths
+ * @parent: parent dentry
+ * @buf: buffer to store path.
+ */
+static void vzdquota_read_busy_dentries(struct path *parent,
+ char *buf, int buflen)
+{
+ struct dentry *this_parent = parent->dentry;
+ struct list_head *next;
+ char *res, *end, *start;
+ struct path root, path;
+ int len;
+
+ if (!buf || buflen <= 0)
+ return;
+
+ path.mnt = parent->mnt;
+ /* From d_path() ... */
+ read_lock(&current->fs->lock);
+ path_get(&current->fs->root);
+ root = current->fs->root;
+ read_unlock(&current->fs->lock);
+
+ spin_lock(&dcache_lock);
+
+ end = buf + buflen;
+ start = buf;
+repeat:
+ next = this_parent->d_subdirs.next;
+resume:
+ while (next != &this_parent->d_subdirs) {
+ struct list_head *tmp = next;
+ struct dentry *dentry;
+ int subdirs;
+
+ dentry = list_entry(tmp, struct dentry, d_u.d_child);
+ next = tmp->next;
+ subdirs = !list_empty(&dentry->d_subdirs);
+
+ if (atomic_read(&dentry->d_count) && !subdirs) {
+ if (!buflen)
+ goto out;
+ /*
+ * Note: __d_path will store filename at the
+ * end of buf.
+ */
+ path.dentry = dentry;
+ res = __d_path(&path, &root, buf, buflen);
+ /* Exit if name is too long */
+ if (IS_ERR(res))
+ goto out;
+
+ /*
+ * Move the string obtained by __d_path,
+ * behind the last dentry path in buf.
+ */
+ len = end - res;
+ BUG_ON(len <= 0);
+
+ memmove(buf, res, len);
+
+ /* Trick: replace \0 by \n */
+ if (buf != start)
+ *(char *)(buf - 1) = '\n';
+
+ buf += len;
+ buflen -= len;
+ }
+
+ /*
+ * Descend a level if the d_subdirs list is non-empty.
+ */
+ if (subdirs) {
+ this_parent = dentry;
+ goto repeat;
+ }
+ }
+ /*
+ * All done at this level ... ascend and resume the search.
+ */
+ if (this_parent != parent->dentry) {
+ next = this_parent->d_u.d_child.next;
+ this_parent = this_parent->d_parent;
+ goto resume;
+ }
+out:
+ /* From d_path() ... */
+ spin_unlock(&dcache_lock);
+ path_put(&root);
+}
+
+/* ----------------------------------------------------------------------
+ *
+ * qmblk-related parts of on/off operations
+ *
+ * --------------------------------------------------------------------- */
+
+/**
+ * vzquota_check_dtree - check whether the dentry tree allows quota on/off
+ *
+ * This function doesn't allow quota to be turned on/off if some dentries in
+ * the tree have external references.
+ * In addition to technical reasons, it enforces user-space correctness:
+ * current usage (taken from or reported to the user space) can be meaningful
+ * and accurate only if the tree is not being modified.
+ * Side effect: additional vfsmount structures referencing the tree (bind
+ * mounts of tree nodes to some other places) are not allowed at on/off time.
+ *
+ * Stores busy dentries' paths in buf (if passed) in case the vzquota_off
+ * ioctl fails.
+ */
+int vzquota_check_dtree(struct vz_quota_master *qmblk, int off,
+ char *buf, int buflen)
+{
+ struct dentry *dentry;
+ int err, count;
+
+ err = -EBUSY;
+ dentry = qmblk->dq_root_path.dentry;
+
+ if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root)
+ goto unhashed;
+
+ /* attempt to shrink */
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&dcache_lock);
+ inode_qmblk_unlock(dentry->d_sb);
+ shrink_dcache_parent(dentry);
+ inode_qmblk_lock(dentry->d_sb);
+ spin_lock(&dcache_lock);
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&dcache_lock);
+ vzdquota_read_busy_dentries(&qmblk->dq_root_path,
+ buf, buflen);
+ spin_lock(&dcache_lock);
+ goto out;
+ }
+
+ count = 1;
+ if (dentry == dentry->d_sb->s_root)
+ count += 2; /* sb and mnt refs */
+ if (atomic_read(&dentry->d_count) < count) {
+ printk(KERN_ERR "%s: too small count %d vs %d.\n",
+ __FUNCTION__,
+ atomic_read(&dentry->d_count), count);
+ goto out;
+ }
+ if (atomic_read(&dentry->d_count) > count)
+ goto out;
+ }
+
+ err = 0;
+out:
+ return err;
+
+unhashed:
+ /*
+ * Quota root is removed.
+ * Allow to turn quota off, but not on.
+ */
+ if (off)
+ err = 0;
+ goto out;
+}
+
+int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
+ struct vz_quota_master *qmblk, char __user *ubuf)
+{
+ struct vz_quota_ilink qlnk;
+ struct vz_quota_master *qold, *qnew;
+ int err;
+ char *buf;
+
+ buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL;
+
+ might_sleep();
+
+ qold = NULL;
+ qnew = vzquota_alloc_fake();
+ if (qnew == NULL) {
+ free_page((unsigned long)buf);
+ return -ENOMEM;
+ }
+
+ vzquota_qlnk_init(&qlnk);
+ inode_qmblk_lock(sb);
+ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
+
+ spin_lock(&dcache_lock);
+ while (1) {
+ err = vzquota_check_dtree(qmblk, 0, buf, PAGE_SIZE);
+ if (err)
+ break;
+ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk))
+ break;
+ }
+ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON);
+ spin_unlock(&dcache_lock);
+
+ if (!err) {
+ qold = __VZ_QUOTA_NOQUOTA(sb);
+ qold->dq_flags |= VZDQ_NOACT;
+ __VZ_QUOTA_NOQUOTA(sb) = qnew;
+ }
+
+ inode_qmblk_unlock(sb);
+ vzquota_qlnk_destroy(&qlnk);
+ if (qold != NULL)
+ qmblk_put(qold);
+
+ if (buf) {
+ if (copy_to_user(ubuf, buf, PAGE_SIZE))
+ ;
+ free_page((unsigned long)buf);
+ }
+ return err;
+}
+
+int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk,
+ char __user *ubuf, int force)
+{
+ int ret;
+ char *buf;
+
+ buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL;
+
+ ret = 0;
+ inode_qmblk_lock(sb);
+
+ spin_lock(&dcache_lock);
+ if (vzquota_check_dtree(qmblk, 1, buf, PAGE_SIZE) && !force)
+ ret = -EBUSY;
+ spin_unlock(&dcache_lock);
+
+ if (!ret)
+ qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT;
+ inode_qmblk_unlock(sb);
+
+ if (buf) {
+ if (copy_to_user(ubuf, buf, PAGE_SIZE))
+ ;
+ free_page((unsigned long)buf);
+ }
+ return ret;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * External interfaces
+ *
+ * ---------------------------------------------------------------------*/
+
+static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ switch (cmd) {
+ case VZCTL_QUOTA_NEW_CTL: {
+ struct vzctl_quotactl qb;
+
+ err = -EFAULT;
+ if (copy_from_user(&qb, (void __user *)arg, sizeof(qb)))
+ break;
+ err = do_vzquotactl(qb.cmd, qb.quota_id,
+ qb.qstat, qb.ve_root, 0);
+ break;
+ }
+#ifdef CONFIG_VZ_QUOTA_UGID
+ case VZCTL_QUOTA_UGID_CTL: {
+ struct vzctl_quotaugidctl qub;
+
+ err = -EFAULT;
+ if (copy_from_user(&qub, (void __user *)arg, sizeof(qub)))
+ break;
+ err = do_vzquotaugidctl(qub.cmd, qub.quota_id,
+ qub.ugid_index, qub.ugid_size, qub.addr, 0);
+ break;
+ }
+#endif
+ default:
+ err = -ENOTTY;
+ }
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ switch (cmd) {
+ case VZCTL_COMPAT_QUOTA_CTL: {
+ struct compat_vzctl_quotactl cs;
+
+ err = -EFAULT;
+ if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+ break;
+ err = do_vzquotactl(cs.cmd, cs.quota_id,
+ compat_ptr(cs.qstat),
+ compat_ptr(cs.ve_root), 1);
+ break;
+ }
+#ifdef CONFIG_VZ_QUOTA_UGID
+ case VZCTL_COMPAT_QUOTA_UGID_CTL: {
+ struct compat_vzctl_quotaugidctl cs;
+
+ err = -EFAULT;
+ if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+ break;
+
+ err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index,
+ cs.ugid_size, compat_ptr(cs.addr), 1);
+ break;
+ }
+#endif
+ default:
+ err = -ENOIOCTLCMD;
+ }
+ return err;
+}
+#endif
+
+static struct vzioctlinfo vzdqcalls = {
+ .type = VZDQCTLTYPE,
+ .ioctl = vzquota_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = compat_vzquota_ioctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+/**
+ * vzquota_dstat - get quota usage info for virtual superblock
+ */
+static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat)
+{
+ struct vz_quota_master *qmblk;
+
+ qmblk = vzquota_find_qmblk(super);
+ if (qmblk == NULL)
+ return -ENOENT;
+ if (qmblk == VZ_QUOTA_BAD) {
+ memset(qstat, 0, sizeof(*qstat));
+ return 0;
+ }
+
+ qmblk_data_read_lock(qmblk);
+ memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat));
+ qmblk_data_read_unlock(qmblk);
+ qmblk_put(qmblk);
+ return 0;
+}
+
+
+/* ----------------------------------------------------------------------
+ *
+ * Init/exit helpers
+ *
+ * ---------------------------------------------------------------------*/
+
+static int vzquota_cache_init(void)
+{
+ int i;
+
+ vzquota_cachep = kmem_cache_create("vz_quota_master",
+ sizeof(struct vz_quota_master),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (vzquota_cachep == NULL) {
+ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
+ goto nomem2;
+ }
+ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&vzquota_hash_table[i]);
+
+ return 0;
+
+nomem2:
+ return -ENOMEM;
+}
+
+static void vzquota_cache_release(void)
+{
+ int i;
+
+ /* sanity check */
+ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
+ if (!list_empty(&vzquota_hash_table[i]))
+ BUG();
+
+ /* release caches */
+ kmem_cache_destroy(vzquota_cachep);
+ vzquota_cachep = NULL;
+}
+
+static int quota_notifier_call(struct vnotifier_block *self,
+ unsigned long n, void *data, int err)
+{
+ struct virt_info_quota *viq;
+ struct super_block *sb;
+
+ viq = (struct virt_info_quota *)data;
+ switch (n) {
+ case VIRTINFO_QUOTA_ON:
+ err = NOTIFY_BAD;
+ if (!try_module_get(THIS_MODULE))
+ break;
+ sb = viq->super;
+ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
+ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
+ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
+ err = NOTIFY_OK;
+ break;
+ case VIRTINFO_QUOTA_OFF:
+ module_put(THIS_MODULE);
+ err = NOTIFY_OK;
+ break;
+ case VIRTINFO_QUOTA_GETSTAT:
+ err = NOTIFY_BAD;
+ if (vzquota_dstat(viq->super, viq->qstat))
+ break;
+ err = NOTIFY_OK;
+ break;
+ case VIRTINFO_QUOTA_DISABLE:
+ err = NOTIFY_OK;
+ vzquota_inode_off((struct inode *)data);
+ break;
+ }
+ return err;
+}
+
+struct vnotifier_block quota_notifier_block = {
+ .notifier_call = quota_notifier_call,
+ .priority = INT_MAX,
+};
+
+/* ----------------------------------------------------------------------
+ *
+ * Init/exit procedures
+ *
+ * ---------------------------------------------------------------------*/
+
+static int __init vzquota_init(void)
+{
+ int err;
+
+ if ((err = vzquota_cache_init()) != 0)
+ goto out_cache;
+
+ if ((err = vzquota_proc_init()) != 0)
+ goto out_proc;
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+ if ((err = vzquota_ugid_init()) != 0)
+ goto out_ugid;
+#endif
+
+ mutex_init(&vz_quota_mutex);
+ vzioctl_register(&vzdqcalls);
+ virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block);
+#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS)
+ vzaquota_init();
+#endif
+
+ return 0;
+
+#ifdef CONFIG_VZ_QUOTA_UGID
+out_ugid:
+ vzquota_proc_release();
+#endif
+out_proc:
+ vzquota_cache_release();
+out_cache:
+ return err;
+}
+
+#if defined(VZ_QUOTA_UNLOAD)
+static void __exit vzquota_release(void)
+{
+ virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block);
+ vzioctl_unregister(&vzdqcalls);
+#ifdef CONFIG_VZ_QUOTA_UGID
+#ifdef CONFIG_PROC_FS
+ vzaquota_fini();
+#endif
+ vzquota_ugid_release();
+#endif
+ vzquota_proc_release();
+ vzquota_cache_release();
+}
+#endif
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo Disk Quota");
+MODULE_LICENSE("GPL v2");
+
+module_init(vzquota_init)
+#if defined(VZ_QUOTA_UNLOAD)
+module_exit(vzquota_release)
+#endif
diff --git a/fs/read_write.c b/fs/read_write.c
index 3ac2898..618603e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -21,6 +21,8 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
+#include <bc/beancounter.h>
+
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -369,6 +371,29 @@ static inline void file_pos_write(struct file *file, loff_t pos)
file->f_pos = pos;
}
+static inline void bc_acct_write(size_t bytes)
+{
+ struct user_beancounter *ub;
+
+ if (bytes > 0) {
+ ub = get_exec_ub();
+ ub_percpu_inc(ub, write);
+ ub_percpu_add(ub, wchar, bytes);
+ }
+}
+
+static inline void bc_acct_read(size_t bytes)
+{
+ struct user_beancounter *ub;
+
+ if (bytes > 0) {
+ ub = get_exec_ub();
+ ub_percpu_inc(ub, read);
+ ub_percpu_add(ub, rchar, bytes);
+ }
+}
+
+
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
struct file *file;
@@ -381,6 +406,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ret = vfs_read(file, buf, count, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
+
+ bc_acct_read(ret);
}
return ret;
@@ -399,6 +426,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ret = vfs_write(file, buf, count, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
+
+ bc_acct_write(ret);
}
return ret;
@@ -420,6 +449,8 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
if (file->f_mode & FMODE_PREAD)
ret = vfs_read(file, buf, count, &pos);
fput_light(file, fput_needed);
+
+ bc_acct_read(ret);
}
return ret;
@@ -449,6 +480,8 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
if (file->f_mode & FMODE_PWRITE)
ret = vfs_write(file, buf, count, &pos);
fput_light(file, fput_needed);
+
+ bc_acct_write(ret);
}
return ret;
@@ -702,6 +735,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_readv(file, vec, vlen, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
+
+ bc_acct_read(ret);
}
if (ret > 0)
@@ -723,6 +758,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_writev(file, vec, vlen, &pos);
file_pos_write(file, pos);
fput_light(file, fput_needed);
+
+ bc_acct_write(ret);
}
if (ret > 0)
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 2715791..b6fa0ea 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -826,6 +826,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
+ inode = dentry->d_inode;
+ vfs_dq_init(inode);
+
/* we will be doing 2 balancings and update 2 stat data, we change quotas
* of the owner of the directory and of the owner of the parent directory.
* The quota structure is possibly deleted only on last iput => outside
@@ -850,8 +853,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
goto end_rmdir;
}
- inode = dentry->d_inode;
-
reiserfs_update_inode_transaction(inode);
reiserfs_update_inode_transaction(dir);
@@ -915,6 +916,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
unsigned long savelink;
inode = dentry->d_inode;
+ vfs_dq_init(inode);
/* in this transaction we can be doing at max two balancings and update
* two stat datas, we change quotas of the owner of the directory and of
@@ -1228,6 +1230,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode = old_dentry->d_inode;
new_dentry_inode = new_dentry->d_inode;
+ if (new_dentry_inode)
+ vfs_dq_init(new_dentry_inode);
// make sure, that oldname still exists and points to an object we
// are going to rename
diff --git a/fs/select.c b/fs/select.c
index fd38ce2..9a9e8b0 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -29,6 +29,7 @@
#include <asm/uaccess.h>
+#include <bc/kmem.h>
/*
* Estimate expected accuracy in ns from a timeval.
@@ -551,7 +552,8 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
if (size > sizeof(stack_fds) / 6) {
/* Not enough space in on-stack array; must use kmalloc */
ret = -ENOMEM;
- bits = kmalloc(6 * size, GFP_KERNEL);
+ bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ?
+ GFP_KERNEL_UBC : GFP_KERNEL);
if (!bits)
goto out_nofds;
}
@@ -841,7 +843,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
len = min(todo, POLLFD_PER_PAGE);
size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
- walk = walk->next = kmalloc(size, GFP_KERNEL);
+ walk = walk->next = kmalloc(size, GFP_KERNEL_UBC);
if (!walk) {
err = -ENOMEM;
goto out_fds;
@@ -873,7 +875,7 @@ out_fds:
return err;
}
-static long do_restart_poll(struct restart_block *restart_block)
+long do_restart_poll(struct restart_block *restart_block)
{
struct pollfd __user *ufds = restart_block->poll.ufds;
int nfds = restart_block->poll.nfds;
@@ -894,6 +896,7 @@ static long do_restart_poll(struct restart_block *restart_block)
}
return ret;
}
+EXPORT_SYMBOL_GPL(do_restart_poll);
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
long, timeout_msecs)
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eae7d9d..d9a7043 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -32,7 +32,7 @@ int seq_open(struct file *file, const struct seq_operations *op)
struct seq_file *p = file->private_data;
if (!p) {
- p = kmalloc(sizeof(*p), GFP_KERNEL);
+ p = kmalloc(sizeof(*p), GFP_KERNEL_UBC);
if (!p)
return -ENOMEM;
file->private_data = p;
@@ -76,7 +76,7 @@ static int traverse(struct seq_file *m, loff_t offset)
return 0;
}
if (!m->buf) {
- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+ m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC);
if (!m->buf)
return -ENOMEM;
}
@@ -116,7 +116,7 @@ static int traverse(struct seq_file *m, loff_t offset)
Eoverflow:
m->op->stop(m, p);
kfree(m->buf);
- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+ m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC);
return !m->buf ? -ENOMEM : -EAGAIN;
}
@@ -169,7 +169,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
m->version = file->f_version;
/* grab buffer if we didn't have one */
if (!m->buf) {
- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+ m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC);
if (!m->buf)
goto Enomem;
}
@@ -210,7 +210,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
goto Fill;
m->op->stop(m, p);
kfree(m->buf);
- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+ m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC);
if (!m->buf)
goto Enomem;
m->count = 0;
@@ -435,6 +435,8 @@ int seq_path(struct seq_file *m, struct path *path, char *esc)
if (size) {
char *p = d_path(path, buf, size);
+ if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG)
+ return 0;
if (!IS_ERR(p)) {
char *end = mangle_path(buf, p, esc);
if (end)
@@ -551,7 +553,7 @@ static void single_stop(struct seq_file *p, void *v)
int single_open(struct file *file, int (*show)(struct seq_file *, void *),
void *data)
{
- struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
+ struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_UBC);
int res = -ENOMEM;
if (op) {
@@ -595,7 +597,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops,
void *private;
struct seq_file *seq;
- private = kzalloc(psize, GFP_KERNEL);
+ private = kzalloc(psize, GFP_KERNEL_UBC);
if (private == NULL)
goto out;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b07565c..5b872c3 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -28,10 +28,7 @@
#include <linux/anon_inodes.h>
#include <linux/signalfd.h>
#include <linux/syscalls.h>
-
-struct signalfd_ctx {
- sigset_t sigmask;
-};
+#include <linux/module.h>
static int signalfd_release(struct inode *inode, struct file *file)
{
@@ -199,17 +196,17 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
return total ? total: ret;
}
-static const struct file_operations signalfd_fops = {
+const struct file_operations signalfd_fops = {
.release = signalfd_release,
.poll = signalfd_poll,
.read = signalfd_read,
};
+EXPORT_SYMBOL(signalfd_fops);
SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
size_t, sizemask, int, flags)
{
sigset_t sigmask;
- struct signalfd_ctx *ctx;
/* Check the SFD_* constants for consistency. */
BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
@@ -224,12 +221,19 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
signotset(&sigmask);
+ return do_signalfd(ufd, &sigmask, flags);
+}
+
+long do_signalfd(int ufd, sigset_t *sigmask, int flags)
+{
+ struct signalfd_ctx *ctx;
+
if (ufd == -1) {
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
- ctx->sigmask = sigmask;
+ ctx->sigmask = *sigmask;
/*
* When we call this, the initialization must be complete, since
@@ -249,7 +253,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
return -EINVAL;
}
spin_lock_irq(&current->sighand->siglock);
- ctx->sigmask = sigmask;
+ ctx->sigmask = *sigmask;
spin_unlock_irq(&current->sighand->siglock);
wake_up(&current->sighand->signalfd_wqh);
@@ -258,6 +262,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
return ufd;
}
+EXPORT_SYMBOL_GPL(do_signalfd);
SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
size_t, sizemask)
diff --git a/fs/simfs.c b/fs/simfs.c
new file mode 100644
index 0000000..e21f911
--- /dev/null
+++ b/fs/simfs.c
@@ -0,0 +1,339 @@
+/*
+ * fs/simfs.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/vzquota.h>
+#include <linux/statfs.h>
+#include <linux/virtinfo.h>
+#include <linux/faudit.h>
+#include <linux/genhd.h>
+#include <linux/reiserfs_fs.h>
+
+#include <asm/unistd.h>
+#include <asm/uaccess.h>
+
+#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb
+
+static struct super_operations sim_super_ops;
+
+static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct super_block *sb;
+ struct inode *inode;
+
+ inode = dentry->d_inode;
+ if (!inode->i_op->getattr) {
+ generic_fillattr(inode, stat);
+ if (!stat->blksize) {
+ unsigned blocks;
+
+ sb = inode->i_sb;
+ blocks = (stat->size + sb->s_blocksize-1) >>
+ sb->s_blocksize_bits;
+ stat->blocks = (sb->s_blocksize / 512) * blocks;
+ stat->blksize = sb->s_blocksize;
+ }
+ } else {
+ int err;
+
+ err = inode->i_op->getattr(mnt, dentry, stat);
+ if (err)
+ return err;
+ }
+
+ if (!mnt)
+ return 0;
+ sb = mnt->mnt_sb;
+ if (sb->s_op == &sim_super_ops)
+ stat->dev = sb->s_dev;
+ return 0;
+}
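A worked example of the fallback arithmetic above, using hypothetical numbers:

/*
 * Illustration only: with stat->size = 5000 and a 4096-byte block size
 * (s_blocksize_bits = 12), blocks = (5000 + 4095) >> 12 = 2 filesystem
 * blocks, so stat->blocks = (4096 / 512) * 2 = 16 512-byte sectors and
 * stat->blksize = 4096.
 */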
+
+static void quota_get_stat(struct super_block *sb, struct kstatfs *buf)
+{
+ int err;
+ struct dq_stat qstat;
+ struct virt_info_quota q;
+ long free_file, adj_file;
+ s64 blk, free_blk, adj_blk;
+ int bsize_bits;
+
+ q.super = sb;
+ q.qstat = &qstat;
+ err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q);
+ if (err != NOTIFY_OK)
+ return;
+
+ bsize_bits = ffs(buf->f_bsize) - 1;
+
+ if (qstat.bsoftlimit > qstat.bcurrent)
+ free_blk = (qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits;
+ else
+ free_blk = 0;
+ /*
+ * In the regular case, we always set buf->f_bfree and buf->f_blocks to
+ * the values reported by quota. In case of real disk space shortage,
+ * we adjust the values. We want this adjustment to look as if the
+ * total disk space were reduced, not as if the usage were increased.
+ * -- SAW
+ */
+ adj_blk = 0;
+ if (buf->f_bfree < free_blk)
+ adj_blk = free_blk - buf->f_bfree;
+ buf->f_bfree = free_blk - adj_blk;
+
+ if (free_blk < buf->f_bavail)
+ buf->f_bavail = free_blk;
+
+ blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk;
+ buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk;
+
+ free_file = qstat.isoftlimit - qstat.icurrent;
+ if (free_file < 0)
+ free_file = 0;
+ if (buf->f_type == REISERFS_SUPER_MAGIC)
+ /*
+ * reiserfs doesn't initialize f_ffree and f_files values of
+ * kstatfs because it doesn't have an inode limit.
+ */
+ buf->f_ffree = free_file;
+ adj_file = 0;
+ if (buf->f_ffree < free_file)
+ adj_file = free_file - buf->f_ffree;
+ buf->f_ffree = free_file - adj_file;
+ buf->f_files = qstat.isoftlimit - adj_file;
+}
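To make the adjustment concrete, a worked example with hypothetical numbers:

/*
 * Illustration only: suppose quota would still allow 1000 blocks
 * (free_blk = 1000) but the underlying filesystem reports only 600 free
 * blocks (buf->f_bfree = 600).  Then adj_blk = 400, f_bfree stays at 600,
 * and f_blocks is reduced by the same 400 blocks, so the shortage shows up
 * as a smaller total disk rather than as inflated usage.
 */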
+
+static int sim_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+ int err;
+ struct super_block *lsb;
+ struct kstatfs statbuf;
+
+ err = 0;
+ if (sb->s_op != &sim_super_ops)
+ return 0;
+
+ memset(&statbuf, 0, sizeof(statbuf));
+ lsb = SIMFS_GET_LOWER_FS_SB(sb);
+
+ err = -ENOSYS;
+ if (lsb && lsb->s_op && lsb->s_op->statfs)
+ err = lsb->s_op->statfs(sb->s_root, &statbuf);
+ if (err)
+ return err;
+
+ quota_get_stat(sb, &statbuf);
+
+ buf->f_files = statbuf.f_files;
+ buf->f_ffree = statbuf.f_ffree;
+ buf->f_blocks = statbuf.f_blocks;
+ buf->f_bfree = statbuf.f_bfree;
+ buf->f_bavail = statbuf.f_bavail;
+ return 0;
+}
+
+static int sim_systemcall(struct vnotifier_block *me, unsigned long n,
+ void *d, int old_ret)
+{
+ int err;
+
+ switch (n) {
+ case VIRTINFO_FAUDIT_STAT: {
+ struct faudit_stat_arg *arg;
+
+ arg = (struct faudit_stat_arg *)d;
+ err = sim_getattr(arg->mnt, arg->dentry, arg->stat);
+ arg->err = err;
+ }
+ break;
+ case VIRTINFO_FAUDIT_STATFS: {
+ struct faudit_statfs_arg *arg;
+
+ arg = (struct faudit_statfs_arg *)d;
+ err = sim_statfs(arg->sb, arg->stat);
+ arg->err = err;
+ }
+ break;
+ default:
+ return old_ret;
+ }
+ return (err ? NOTIFY_BAD : NOTIFY_OK);
+}
+
+#ifdef CONFIG_QUOTA
+static struct inode *sim_quota_root(struct super_block *sb)
+{
+ return sb->s_root->d_inode;
+}
+#endif
+
+/*
+ * NOTE: We need to set up the s_bdev field on the super block, since
+ * sys_quotactl() does lookup_bdev() and get_super(), which compare against
+ * sb->s_bdev.  This is a MUST if we want an unmodified sys_quotactl()
+ * to work correctly on /dev/simfs inside a VE.
+ */
+static int sim_init_blkdev(struct super_block *sb)
+{
+ static struct hd_struct fake_hd;
+ struct block_device *blkdev;
+
+ blkdev = bdget(sb->s_dev);
+ if (blkdev == NULL)
+ return -ENOMEM;
+
+ blkdev->bd_part = &fake_hd; /* required for bdev_read_only() */
+ sb->s_bdev = blkdev;
+
+ return 0;
+}
+
+static void sim_free_blkdev(struct super_block *sb)
+{
+ /* set bd_part back to NULL */
+ sb->s_bdev->bd_part = NULL;
+ bdput(sb->s_bdev);
+}
+
+static void sim_quota_init(struct super_block *sb)
+{
+ struct virt_info_quota viq;
+
+ viq.super = sb;
+ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq);
+}
+
+static void sim_quota_free(struct super_block *sb)
+{
+ struct virt_info_quota viq;
+
+ viq.super = sb;
+ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq);
+}
+
+static struct super_operations sim_super_ops = {
+#ifdef CONFIG_QUOTA
+ .get_quota_root = sim_quota_root,
+#endif
+};
+
+static int sim_fill_super(struct super_block *s, void *data)
+{
+ int err;
+ struct nameidata *nd;
+
+ err = set_anon_super(s, NULL);
+ if (err)
+ goto out;
+
+ err = 0;
+ nd = (struct nameidata *)data;
+ s->s_fs_info = mntget(nd->path.mnt);
+ s->s_root = dget(nd->path.dentry);
+ s->s_op = &sim_super_ops;
+out:
+ return err;
+}
+
+static int sim_get_sb(struct file_system_type *type, int flags,
+ const char *dev_name, void *opt, struct vfsmount *mnt)
+{
+ int err;
+ struct nameidata nd;
+ struct super_block *sb;
+
+ err = -EINVAL;
+ if (opt == NULL)
+ goto out;
+
+ err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+ if (err)
+ goto out;
+
+ sb = sget(type, NULL, sim_fill_super, &nd);
+ err = PTR_ERR(sb);
+ if (IS_ERR(sb))
+ goto out_path;
+
+ err = sim_init_blkdev(sb);
+ if (err)
+ goto out_killsb;
+
+ sim_quota_init(sb);
+
+ path_put(&nd.path);
+ simple_set_mnt(mnt, sb);
+ return 0;
+
+out_killsb:
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+out_path:
+ path_put(&nd.path);
+out:
+ return err;
+}
+
+static void sim_kill_sb(struct super_block *sb)
+{
+ dput(sb->s_root);
+ sb->s_root = NULL;
+ mntput((struct vfsmount *)(sb->s_fs_info));
+
+ sim_quota_free(sb);
+ sim_free_blkdev(sb);
+
+ kill_anon_super(sb);
+}
+
+static struct file_system_type sim_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "simfs",
+ .get_sb = sim_get_sb,
+ .kill_sb = sim_kill_sb,
+ .fs_flags = FS_MANGLE_PROC,
+};
+
+static struct vnotifier_block sim_syscalls = {
+ .notifier_call = sim_systemcall,
+};
+
+static int __init init_simfs(void)
+{
+ int err;
+
+ err = register_filesystem(&sim_fs_type);
+ if (err)
+ return err;
+
+ virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls);
+ return 0;
+}
+
+static void __exit exit_simfs(void)
+{
+ virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls);
+ unregister_filesystem(&sim_fs_type);
+}
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System");
+MODULE_LICENSE("GPL v2");
+
+module_init(init_simfs);
+module_exit(exit_simfs);
diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c
index e37fe4d..1992fc0 100644
--- a/fs/smbfs/sock.c
+++ b/fs/smbfs/sock.c
@@ -99,6 +99,7 @@ smb_close_socket(struct smb_sb_info *server)
VERBOSE("closing socket %p\n", sock);
sock->sk->sk_data_ready = server->data_ready;
+ sock->sk->sk_user_data = NULL;
server->sock_file = NULL;
fput(file);
}
diff --git a/fs/stat.c b/fs/stat.c
index c4ecd52..37e6cd9 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -14,6 +14,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
+#include <linux/faudit.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -41,11 +42,19 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
int retval;
+ struct faudit_stat_arg arg;
retval = security_inode_getattr(mnt, dentry);
if (retval)
return retval;
+ arg.mnt = mnt;
+ arg.dentry = dentry;
+ arg.stat = stat;
+ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg)
+ != NOTIFY_DONE)
+ return arg.err;
+
if (inode->i_op->getattr)
return inode->i_op->getattr(mnt, dentry, stat);
diff --git a/fs/super.c b/fs/super.c
index aff046b..cce99ab 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,12 +37,15 @@
#include <linux/kobject.h>
#include <linux/mutex.h>
#include <linux/file.h>
+#include <linux/ve_proto.h>
#include <asm/uaccess.h>
#include "internal.h"
LIST_HEAD(super_blocks);
+EXPORT_SYMBOL_GPL(super_blocks);
DEFINE_SPINLOCK(sb_lock);
+EXPORT_SYMBOL_GPL(sb_lock);
/**
* alloc_super - create new superblock
@@ -69,13 +72,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
INIT_LIST_HEAD(&s->s_dentry_lru);
init_rwsem(&s->s_umount);
mutex_init(&s->s_lock);
- lockdep_set_class(&s->s_umount, &type->s_umount_key);
+ lockdep_set_class(&s->s_umount,
+ &type->proto->s_umount_key);
/*
* The locking rules for s_lock are up to the
* filesystem. For example ext3fs has different
* lock ordering than usbfs:
*/
- lockdep_set_class(&s->s_lock, &type->s_lock_key);
+ lockdep_set_class(&s->s_lock,
+ &type->proto->s_lock_key);
/*
* sget() can have s_umount recursion.
*
@@ -307,11 +312,13 @@ void generic_shutdown_super(struct super_block *sb)
/* bad name - it should be evict_inodes() */
invalidate_inodes(sb);
+ if (sb->dq_op && sb->dq_op->shutdown)
+ sb->dq_op->shutdown(sb);
if (sop->put_super)
sop->put_super(sb);
/* Forget any remaining inodes */
- if (invalidate_inodes(sb)) {
+ if (invalidate_inodes_check(sb, 1)) {
printk("VFS: Busy inodes after unmount of %s. "
"Self-destruct in 5 seconds. Have a nice day...\n",
sb->s_id);
@@ -531,17 +538,26 @@ rescan:
spin_unlock(&sb_lock);
return NULL;
}
+EXPORT_SYMBOL(user_get_super);
SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
{
+ dev_t kdev;
struct super_block *s;
struct ustat tmp;
struct kstatfs sbuf;
- int err = -EINVAL;
+ int err;
+
+ kdev = new_decode_dev(dev);
+ err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ);
+ if (err)
+ goto out;
+
+ err = -EINVAL;
+ s = user_get_super(kdev);
+ if (s == NULL)
+ goto out;
- s = user_get_super(new_decode_dev(dev));
- if (s == NULL)
- goto out;
err = vfs_statfs(s->s_root, &sbuf);
drop_super(s);
if (err)
@@ -653,6 +669,13 @@ static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
static int unnamed_dev_start = 0; /* don't bother trying below it */
+/* for compatibility with coreutils still unaware of new minor sizes */
+int unnamed_dev_majors[] = {
+ 0, 144, 145, 146, 242, 243, 244, 245,
+ 246, 247, 248, 249, 250, 251, 252, 253
+};
+EXPORT_SYMBOL(unnamed_dev_majors);
+
int set_anon_super(struct super_block *s, void *data)
{
int dev;
@@ -672,7 +695,7 @@ int set_anon_super(struct super_block *s, void *data)
else if (error)
return -EAGAIN;
- if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
+ if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) {
spin_lock(&unnamed_dev_lock);
ida_remove(&unnamed_dev_ida, dev);
if (unnamed_dev_start > dev)
@@ -680,7 +703,7 @@ int set_anon_super(struct super_block *s, void *data)
spin_unlock(&unnamed_dev_lock);
return -EMFILE;
}
- s->s_dev = MKDEV(0, dev & MINORMASK);
+ s->s_dev = make_unnamed_dev(dev);
return 0;
}
@@ -688,8 +711,9 @@ EXPORT_SYMBOL(set_anon_super);
void kill_anon_super(struct super_block *sb)
{
- int slot = MINOR(sb->s_dev);
+ int slot;
+ slot = unnamed_dev_idx(sb->s_dev);
generic_shutdown_super(sb);
spin_lock(&unnamed_dev_lock);
ida_remove(&unnamed_dev_ida, slot);
diff --git a/fs/sync.c b/fs/sync.c
index d104591..d23ae15 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -15,6 +15,8 @@
#include <linux/buffer_head.h>
#include "internal.h"
+#include <bc/beancounter.h>
+
#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
SYNC_FILE_RANGE_WAIT_AFTER)
@@ -127,11 +129,18 @@ restart:
*/
SYSCALL_DEFINE0(sync)
{
+ struct user_beancounter *ub;
+
+ ub = get_exec_ub();
+ ub_percpu_inc(ub, sync);
+
wakeup_flusher_threads(0);
sync_filesystems(0);
sync_filesystems(1);
if (unlikely(laptop_mode))
laptop_sync_completion();
+
+ ub_percpu_inc(ub, sync_done);
return 0;
}
@@ -207,6 +216,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
const struct file_operations *fop;
struct address_space *mapping;
int err, ret;
+ struct user_beancounter *ub;
/*
* Get mapping and operations from the file in case we have
@@ -226,6 +236,12 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
goto out;
}
+ ub = get_exec_ub();
+ if (datasync)
+ ub_percpu_inc(ub, fdsync);
+ else
+ ub_percpu_inc(ub, fsync);
+
ret = filemap_write_and_wait_range(mapping, start, end);
/*
@@ -238,6 +254,10 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
ret = err;
mutex_unlock(&mapping->host->i_mutex);
+ if (datasync)
+ ub_percpu_inc(ub, fdsync_done);
+ else
+ ub_percpu_inc(ub, fsync_done);
out:
return ret;
}
@@ -444,12 +464,16 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
loff_t endbyte, unsigned int flags)
{
int ret;
+ struct user_beancounter *ub;
if (!mapping) {
ret = -EINVAL;
- goto out;
+ goto out_noacct;
}
+ ub = get_exec_ub();
+ ub_percpu_inc(ub, frsync);
+
ret = 0;
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
ret = wait_on_page_writeback_range(mapping,
@@ -472,6 +496,8 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
endbyte >> PAGE_CACHE_SHIFT);
}
out:
+ ub_percpu_inc(ub, frsync_done);
+out_noacct:
return ret;
}
EXPORT_SYMBOL_GPL(do_sync_mapping_range);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 60c702b..8c2e052 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -398,6 +398,9 @@ static int open(struct inode * inode, struct file * file)
struct bin_buffer *bb = NULL;
int error;
+ if (!ve_sysfs_alowed())
+ return 0;
+
/* binary file operations requires both @sd and its parent */
if (!sysfs_get_active_two(attr_sd))
return -ENODEV;
@@ -485,6 +488,9 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd)
int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
{
+ if (!ve_sysfs_alowed())
+ return 0;
+
BUG_ON(!kobj || !kobj->sd || !attr);
return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
@@ -499,6 +505,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
{
+ if (!ve_sysfs_alowed())
+ return;
sysfs_hash_and_remove(kobj->sd, attr->attr.name);
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e020183..3af502c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -539,6 +539,9 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd)
struct inode *inode;
struct dentry *dentry;
+ if (!ve_sysfs_alowed())
+ return;
+
inode = ilookup(sysfs_sb, sd->s_ino);
if (!inode)
return;
@@ -712,12 +715,15 @@ int sysfs_create_dir(struct kobject * kobj)
struct sysfs_dirent *parent_sd, *sd;
int error = 0;
+ if (!ve_sysfs_alowed())
+ return 0;
+
BUG_ON(!kobj);
if (kobj->parent)
parent_sd = kobj->parent->sd;
else
- parent_sd = &sysfs_root;
+ parent_sd = ve_sysfs_root;
error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
if (!error)
@@ -819,6 +825,9 @@ void sysfs_remove_dir(struct kobject * kobj)
{
struct sysfs_dirent *sd = kobj->sd;
+ if (!ve_sysfs_alowed())
+ return;
+
spin_lock(&sysfs_assoc_lock);
kobj->sd = NULL;
spin_unlock(&sysfs_assoc_lock);
@@ -834,6 +843,9 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
const char *dup_name = NULL;
int error;
+ if (!ve_sysfs_alowed())
+ return 0;
+
mutex_lock(&sysfs_rename_mutex);
error = 0;
@@ -899,7 +911,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
mutex_lock(&sysfs_rename_mutex);
BUG_ON(!sd->s_parent);
new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
- new_parent_kobj->sd : &sysfs_root;
+ new_parent_kobj->sd : ve_sysfs_root;
error = 0;
if (sd->s_parent == new_parent_sd)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f5ea468..b135ba6 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -536,6 +536,8 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
{
+ if (!ve_sysfs_alowed())
+ return 0;
BUG_ON(!kobj || !kobj->sd || !attr);
return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
@@ -634,6 +636,8 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
{
+ if (!ve_sysfs_alowed())
+ return;
sysfs_hash_and_remove(kobj->sd, attr->name);
}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index fe61194..628afe3 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -62,6 +62,8 @@ static int internal_create_group(struct kobject *kobj, int update,
struct sysfs_dirent *sd;
int error;
+ if (!ve_sysfs_alowed())
+ return 0;
BUG_ON(!kobj || (!update && !kobj->sd));
/* Updates may happen before the object has been instantiated */
@@ -131,6 +133,9 @@ void sysfs_remove_group(struct kobject * kobj,
struct sysfs_dirent *dir_sd = kobj->sd;
struct sysfs_dirent *sd;
+ if (!ve_sysfs_alowed())
+ return;
+
if (grp->name) {
sd = sysfs_get_dirent(dir_sd, grp->name);
if (!sd) {
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 02a022a..7a68ae0 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -22,8 +22,6 @@
#include <linux/security.h>
#include "sysfs.h"
-extern struct super_block * sysfs_sb;
-
static const struct address_space_operations sysfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 4974995..2b3c67e 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -22,8 +22,22 @@
#include "sysfs.h"
-static struct vfsmount *sysfs_mount;
+#ifndef CONFIG_VE
+struct vfsmount *sysfs_mount;
struct super_block * sysfs_sb = NULL;
+#endif
+
+#ifdef CONFIG_SYSFS_DEPRECATED_DYN
+unsigned sysfs_deprecated __read_mostly;
+
+static int __init sysfs_init_deprecated(char *str)
+{
+ sysfs_deprecated = 1;
+ return 1;
+}
+__setup("old_sysfs", sysfs_init_deprecated);
+#endif
+
struct kmem_cache *sysfs_dir_cachep;
static const struct super_operations sysfs_ops = {
@@ -40,6 +54,13 @@ struct sysfs_dirent sysfs_root = {
.s_ino = 1,
};
+static void init_ve0_sysfs_root(void)
+{
+#ifdef CONFIG_VE
+ get_ve0()->_sysfs_root = &sysfs_root;
+#endif
+}
+
static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
@@ -54,7 +75,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
/* get root inode, initialize and unlock it */
mutex_lock(&sysfs_mutex);
- inode = sysfs_get_inode(&sysfs_root);
+ inode = sysfs_get_inode(ve_sysfs_root);
mutex_unlock(&sysfs_mutex);
if (!inode) {
pr_debug("sysfs: could not get root inode\n");
@@ -68,7 +89,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
iput(inode);
return -ENOMEM;
}
- root->d_fsdata = &sysfs_root;
+ root->d_fsdata = ve_sysfs_root;
sb->s_root = root;
return 0;
}
@@ -79,16 +100,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
}
-static struct file_system_type sysfs_fs_type = {
+struct file_system_type sysfs_fs_type = {
.name = "sysfs",
.get_sb = sysfs_get_sb,
.kill_sb = kill_anon_super,
};
+EXPORT_SYMBOL(sysfs_fs_type);
+
int __init sysfs_init(void)
{
int err = -ENOMEM;
+ init_ve0_sysfs_root();
sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
sizeof(struct sysfs_dirent),
0, 0, NULL);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c5081ad..c9dc0b0 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -29,10 +29,13 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
struct sysfs_addrm_cxt acxt;
int error;
+ if (!ve_sysfs_alowed())
+ return 0;
+
BUG_ON(!name);
if (!kobj)
- parent_sd = &sysfs_root;
+ parent_sd = ve_sysfs_root;
else
parent_sd = kobj->sd;
@@ -115,8 +118,11 @@ void sysfs_remove_link(struct kobject * kobj, const char * name)
{
struct sysfs_dirent *parent_sd = NULL;
+	if (!ve_sysfs_alowed())
+ return;
+
if (!kobj)
- parent_sd = &sysfs_root;
+ parent_sd = ve_sysfs_root;
else
parent_sd = kobj->sd;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index af4c4e7..561271d 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -10,74 +10,17 @@
#include <linux/fs.h>
-struct sysfs_open_dirent;
-
-/* type-specific structures for sysfs_dirent->s_* union members */
-struct sysfs_elem_dir {
- struct kobject *kobj;
- /* children list starts here and goes through sd->s_sibling */
- struct sysfs_dirent *children;
-};
-
-struct sysfs_elem_symlink {
- struct sysfs_dirent *target_sd;
-};
-
-struct sysfs_elem_attr {
- struct attribute *attr;
- struct sysfs_open_dirent *open;
-};
-
-struct sysfs_elem_bin_attr {
- struct bin_attribute *bin_attr;
- struct hlist_head buffers;
-};
-
-struct sysfs_inode_attrs {
- struct iattr ia_iattr;
- void *ia_secdata;
- u32 ia_secdata_len;
-};
-
-/*
- * sysfs_dirent - the building block of sysfs hierarchy. Each and
- * every sysfs node is represented by single sysfs_dirent.
- *
- * As long as s_count reference is held, the sysfs_dirent itself is
- * accessible. Dereferencing s_elem or any other outer entity
- * requires s_active reference.
- */
-struct sysfs_dirent {
- atomic_t s_count;
- atomic_t s_active;
- struct sysfs_dirent *s_parent;
- struct sysfs_dirent *s_sibling;
- const char *s_name;
-
- union {
- struct sysfs_elem_dir s_dir;
- struct sysfs_elem_symlink s_symlink;
- struct sysfs_elem_attr s_attr;
- struct sysfs_elem_bin_attr s_bin_attr;
- };
-
- unsigned int s_flags;
- ino_t s_ino;
- umode_t s_mode;
- struct sysfs_inode_attrs *s_iattr;
-};
-
-#define SD_DEACTIVATED_BIAS INT_MIN
-
-#define SYSFS_TYPE_MASK 0x00ff
-#define SYSFS_DIR 0x0001
-#define SYSFS_KOBJ_ATTR 0x0002
-#define SYSFS_KOBJ_BIN_ATTR 0x0004
-#define SYSFS_KOBJ_LINK 0x0008
-#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
-
-#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
-#define SYSFS_FLAG_REMOVED 0x0200
+#ifndef CONFIG_VE
+extern struct vfsmount *sysfs_mount;
+extern struct super_block *sysfs_sb;
+#define ve_sysfs_alowed() 1
+#else
+#include <linux/sched.h>
+#include <linux/ve.h>
+#define sysfs_mount (get_exec_env()->sysfs_mnt)
+#define sysfs_sb (get_exec_env()->sysfs_sb)
+#define ve_sysfs_alowed() (sysfs_sb != NULL)
+#endif
static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
{
@@ -97,8 +40,12 @@ struct sysfs_addrm_cxt {
/*
* mount.c
*/
+#ifdef CONFIG_VE
+#define ve_sysfs_root (get_exec_env()->_sysfs_root)
+#else
extern struct sysfs_dirent sysfs_root;
-extern struct super_block *sysfs_sb;
+#define ve_sysfs_root (&sysfs_root)
+#endif
extern struct kmem_cache *sysfs_dir_cachep;
/*
diff --git a/fs/utimes.c b/fs/utimes.c
index e4c75db..86a62a1 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -40,6 +40,20 @@ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
#endif
+SYSCALL_DEFINE2(lutime, char __user *, filename, struct utimbuf __user *, times)
+{
+ struct timespec tv[2];
+
+ if (times) {
+ if (get_user(tv[0].tv_sec, &times->actime) ||
+ get_user(tv[1].tv_sec, &times->modtime))
+ return -EFAULT;
+ tv[0].tv_nsec = 0;
+ tv[1].tv_nsec = 0;
+ }
+ return do_utimes(AT_FDCWD, filename, times ? tv : NULL, AT_SYMLINK_NOFOLLOW);
+}
+
static bool nsec_valid(long nsec)
{
if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
diff --git a/fs/xattr.c b/fs/xattr.c
index 6d4f6d3..3243bd7 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -115,6 +115,15 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
struct inode *inode = dentry->d_inode;
int error;
+#if defined(CONFIG_VE) && defined(CONFIG_SYSCTL)
+ if (!ve_is_super(get_exec_env())) {
+ if (ve_xattr_policy == VE_XATTR_POLICY_IGNORE)
+ return 0;
+ else if (ve_xattr_policy == VE_XATTR_POLICY_REJECT)
+ return -EPERM;
+ }
+#endif
+
error = xattr_permission(inode, name, MAY_WRITE);
if (error)
return error;
diff --git a/include/asm-generic/mman.h b/include/asm-generic/mman.h
index 32c8bd6..cb151a4 100644
--- a/include/asm-generic/mman.h
+++ b/include/asm-generic/mman.h
@@ -12,6 +12,7 @@
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
+#define MAP_EXECPRIO 0x80000 /* soft ubc charge */
#define MCL_CURRENT 1 /* lock all current mappings */
#define MCL_FUTURE 2 /* lock all future mappings */
diff --git a/include/bc/beancounter.h b/include/bc/beancounter.h
new file mode 100644
index 0000000..7ba4c77
--- /dev/null
+++ b/include/bc/beancounter.h
@@ -0,0 +1,453 @@
+/*
+ * include/bc/beancounter.h
+ *
+ * Copyright (C) 1999-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * Andrey Savochkin saw@sw-soft.com
+ *
+ */
+
+#ifndef _LINUX_BEANCOUNTER_H
+#define _LINUX_BEANCOUNTER_H
+
+/*
+ * Generic ratelimiting stuff.
+ */
+
+struct ub_rate_info {
+ int burst;
+ int interval; /* jiffy_t per event */
+ int bucket; /* kind of leaky bucket */
+ unsigned long last; /* last event */
+};
+
+/* Return true if rate limit permits. */
+int ub_ratelimit(struct ub_rate_info *);
+
+
+/*
+ * This magic is used to distinguish the user beancounter from the page
+ * beancounter in struct page. page_ub and page_bc are placed in a union,
+ * and the magic ensures that we don't use a pbc as a ubc in
+ * ub_page_uncharge().
+ */
+#define UB_MAGIC 0x62756275
+
+/*
+ * Resource list.
+ */
+
+#define UB_KMEMSIZE 0 /* Unswappable kernel memory size including
+ * struct task, page directories, etc.
+ */
+#define UB_LOCKEDPAGES 1 /* Mlock()ed pages. */
+#define UB_PRIVVMPAGES 2 /* Total number of pages, counting potentially
+ * private pages as private and used.
+ */
+#define UB_SHMPAGES 3 /* IPC SHM segment size. */
+#define UB_DUMMY 4 /* Dummy resource (compatibility) */
+#define UB_NUMPROC 5 /* Number of processes. */
+#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. */
+#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation,
+ * checked against PRIVVMPAGES.
+ */
+#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill.
+ * Only limit is used, no accounting.
+ */
+#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */
+#define UB_NUMFLOCK 10 /* Number of file locks. */
+#define UB_NUMPTY 11 /* Number of PTYs. */
+#define UB_NUMSIGINFO 12 /* Number of siginfos. */
+#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */
+#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */
+#define UB_OTHERSOCKBUF 15 /* Total size of other socket
+ * send buffers (all buffers for PF_UNIX).
+ */
+#define UB_DGRAMRCVBUF 16 /* Total size of other socket
+ * receive buffers.
+ */
+#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */
+#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */
+#define UB_NUMFILE 19 /* Number of open files. */
+
+#define UB_RESOURCES_COMPAT 24
+
+/* Add new resources here */
+
+#define UB_NUMXTENT 23
+#define UB_SWAPPAGES 24
+#define UB_RESOURCES 25
+
+#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0)
+#define UB_TMPFSPAGES (UB_RESOURCES + 1)
+#define UB_HELDPAGES (UB_RESOURCES + 2)
+
+struct ubparm {
+ /*
+	 * A barrier over which resource allocations fail gracefully.
+	 * If the amount of consumed memory is over the barrier, further sbrk()
+	 * or mmap() calls fail; the existing processes are not killed.
+ */
+ unsigned long barrier;
+ /* hard resource limit */
+ unsigned long limit;
+ /* consumed resources */
+ unsigned long held;
+ /* maximum amount of consumed resources through the last period */
+ unsigned long maxheld;
+ /* minimum amount of consumed resources through the last period */
+ unsigned long minheld;
+ /* count of failed charges */
+ unsigned long failcnt;
+};
+
+/*
+ * Kernel internal part.
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/percpu_counter.h>
+#include <bc/debug.h>
+#include <bc/decl.h>
+#include <asm/atomic.h>
+
+/*
+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
+ */
+#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1)
+
+
+/*
+ * Resource management structures
+ * Serialization issues:
+ * beancounter list management is protected via ub_hash_lock
+ * task pointers are set only for current task and only once
+ * refcount is managed atomically
+ * value and limit comparison and change are protected by per-ub spinlock
+ */
+
+struct page_beancounter;
+struct task_beancounter;
+struct sock_beancounter;
+
+struct page_private {
+ unsigned long ubp_unused_privvmpages;
+ unsigned long ubp_tmpfs_respages;
+ unsigned long ubp_pbcs;
+ unsigned long long ubp_held_pages;
+};
+
+struct sock_private {
+ unsigned long ubp_rmem_thres;
+ unsigned long ubp_wmem_pressure;
+ unsigned long ubp_maxadvmss;
+ unsigned long ubp_rmem_pressure;
+ int ubp_tw_count;
+#define UB_RMEM_EXPAND 0
+#define UB_RMEM_KEEP 1
+#define UB_RMEM_SHRINK 2
+ struct list_head ubp_other_socks;
+ struct list_head ubp_tcp_socks;
+ struct percpu_counter ubp_orphan_count;
+};
+
+struct ub_percpu_struct {
+ unsigned long unmap;
+ unsigned long swapin;
+#ifdef CONFIG_BC_IO_ACCOUNTING
+ unsigned long long bytes_wrote;
+ unsigned long long bytes_read;
+ unsigned long long bytes_cancelled;
+#endif
+#ifdef CONFIG_BC_DEBUG_KMEM
+ long pages_charged;
+ long vmalloc_charged;
+#endif
+ unsigned long sync;
+ unsigned long sync_done;
+
+ unsigned long fsync;
+ unsigned long fsync_done;
+
+ unsigned long fdsync;
+ unsigned long fdsync_done;
+
+ unsigned long frsync;
+ unsigned long frsync_done;
+
+ unsigned long write;
+ unsigned long read;
+ unsigned long long wchar;
+ unsigned long long rchar;
+};
+
+struct user_beancounter
+{
+ unsigned long ub_magic;
+ atomic_t ub_refcount;
+ struct list_head ub_list;
+ struct hlist_node ub_hash;
+
+ union {
+ struct rcu_head rcu;
+ struct execute_work cleanup;
+ };
+
+ spinlock_t ub_lock;
+ uid_t ub_uid;
+ unsigned int ub_cookie;
+
+ struct ub_rate_info ub_limit_rl;
+ int ub_oom_noproc;
+
+ struct page_private ppriv;
+#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages
+#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages
+#define ub_held_pages ppriv.ubp_held_pages
+#define ub_pbcs ppriv.ubp_pbcs
+ struct sock_private spriv;
+#define ub_rmem_thres spriv.ubp_rmem_thres
+#define ub_maxadvmss spriv.ubp_maxadvmss
+#define ub_rmem_pressure spriv.ubp_rmem_pressure
+#define ub_wmem_pressure spriv.ubp_wmem_pressure
+#define ub_tcp_sk_list spriv.ubp_tcp_socks
+#define ub_other_sk_list spriv.ubp_other_socks
+#define ub_orphan_count spriv.ubp_orphan_count
+#define ub_tw_count spriv.ubp_tw_count
+
+ struct user_beancounter *parent;
+ int ub_childs;
+ void *private_data;
+ unsigned long ub_aflags;
+
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc;
+#endif
+
+ /* resources statistic and settings */
+ struct ubparm ub_parms[UB_RESOURCES];
+ /* resources statistic for last interval */
+ struct ubparm ub_store[UB_RESOURCES];
+
+ struct ub_percpu_struct *ub_percpu;
+#ifdef CONFIG_BC_IO_ACCOUNTING
+ /* these are protected with pb_lock */
+ unsigned long long bytes_wrote;
+ unsigned long long bytes_dirtied;
+ unsigned long long bytes_dirty_missed;
+ unsigned long io_pb_held;
+#endif
+#ifdef CONFIG_BC_DEBUG_KMEM
+ struct list_head ub_cclist;
+#endif
+};
+
+extern int ub_count;
+
+enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE };
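For orientation, a minimal sketch of how the three severities are assumed to interact with the barrier/limit pair of struct ubparm. The authoritative logic lives in __charge_beancounter_locked(), which is only declared in this header, so the helper below is an assumption for illustration, not the real implementation.

static inline int ub_charge_would_fail(const struct ubparm *p,
		unsigned long val, enum ub_severity strict)
{
	unsigned long held = p->held + val;

	switch (strict) {
	case UB_HARD:		/* assumed: graceful failures start at the barrier */
		return held > p->barrier;
	case UB_SOFT:		/* assumed: only the hard limit stops the charge */
		return held > p->limit;
	case UB_FORCE:		/* assumed: never refused */
	default:
		return 0;
	}
}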
+
+#define UB_AFLAG_NOTIF_PAGEIN 0
+
+static inline
+struct user_beancounter *top_beancounter(struct user_beancounter *ub)
+{
+ while (ub->parent != NULL)
+ ub = ub->parent;
+ return ub;
+}
+
+static inline int ub_barrier_hit(struct user_beancounter *ub, int resource)
+{
+ return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier;
+}
+
+static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource)
+{
+ return (ub->ub_parms[resource].held >
+ ((ub->ub_parms[resource].barrier) >> 1));
+}
+
+static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource)
+{
+ struct ubparm *p;
+ p = ub->ub_parms + resource;
+ return p->held <= (p->barrier >> 3);
+}
+
+static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource)
+{
+ struct ubparm *p;
+ p = ub->ub_parms + resource;
+ return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024;
+}
+
+#ifndef CONFIG_BEANCOUNTERS
+
+#define ub_percpu_add(ub, f, v) do { } while (0)
+#define ub_percpu_sub(ub, f, v) do { } while (0)
+#define ub_percpu_inc(ub, f) do { } while (0)
+#define ub_percpu_dec(ub, f) do { } while (0)
+
+#define mm_ub(mm) (NULL)
+
+extern inline struct user_beancounter *get_beancounter_byuid
+ (uid_t uid, int create) { return NULL; }
+extern inline struct user_beancounter *get_beancounter
+ (struct user_beancounter *ub) { return NULL; }
+extern inline void put_beancounter(struct user_beancounter *ub) { }
+
+static inline void ub_init_late(void) { };
+static inline void ub_init_early(void) { };
+
+static inline int charge_beancounter(struct user_beancounter *ub,
+ int resource, unsigned long val,
+ enum ub_severity strict) { return 0; }
+static inline void uncharge_beancounter(struct user_beancounter *ub,
+ int resource, unsigned long val) { }
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define ub_percpu_add(ub, field, v) do { \
+ per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v); \
+ put_cpu(); \
+ } while (0)
+#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1)
+
+#define ub_percpu_sub(ub, field, v) do { \
+ per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v); \
+ put_cpu(); \
+ } while (0)
+#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1)
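A usage sketch for the macros above, mirroring the fs/sync.c hunk earlier in this patch; get_exec_ub() is used there and is assumed to come from the beancounter task headers.

static inline void example_account_sync(void)
{
	struct user_beancounter *ub = get_exec_ub();

	ub_percpu_inc(ub, sync);	/* sync() entered */
	/* ... the actual sync work ... */
	ub_percpu_inc(ub, sync_done);	/* sync() finished */
}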
+
+#define mm_ub(mm) ((mm)->mm_ub)
+/*
+ * Charge/uncharge operations
+ */
+
+extern int __charge_beancounter_locked(struct user_beancounter *ub,
+ int resource, unsigned long val, enum ub_severity strict);
+
+extern void __uncharge_beancounter_locked(struct user_beancounter *ub,
+ int resource, unsigned long val);
+
+extern void put_beancounter_safe(struct user_beancounter *ub);
+extern void __put_beancounter(struct user_beancounter *ub);
+
+extern void uncharge_warn(struct user_beancounter *ub, int resource,
+ unsigned long val, unsigned long held);
+
+extern const char *ub_rnames[];
+/*
+ * Put a beancounter reference
+ */
+
+static inline void put_beancounter(struct user_beancounter *ub)
+{
+ if (unlikely(ub == NULL))
+ return;
+
+ /* FIXME - optimize not to disable interrupts and make call */
+ __put_beancounter(ub);
+}
+
+/* fast put, refcount can't reach zero */
+static inline void __put_beancounter_batch(struct user_beancounter *ub, int n)
+{
+ atomic_sub(n, &ub->ub_refcount);
+}
+
+static inline void put_beancounter_batch(struct user_beancounter *ub, int n)
+{
+ if (n > 1)
+ __put_beancounter_batch(ub, n - 1);
+ __put_beancounter(ub);
+}
+
+/*
+ * Create a new beancounter reference
+ */
+extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create);
+
+static inline
+struct user_beancounter *get_beancounter(struct user_beancounter *ub)
+{
+ if (unlikely(ub == NULL))
+ return NULL;
+
+ atomic_inc(&ub->ub_refcount);
+ return ub;
+}
+
+static inline
+struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub)
+{
+ return atomic_inc_not_zero(&ub->ub_refcount) ? ub : NULL;
+}
+
+static inline void get_beancounter_batch(struct user_beancounter *ub, int n)
+{
+ atomic_add(n, &ub->ub_refcount);
+}
+
+extern struct user_beancounter *get_subbeancounter_byid(
+ struct user_beancounter *,
+ int id, int create);
+
+extern void ub_init_late(void);
+extern void ub_init_early(void);
+
+extern int print_ub_uid(struct user_beancounter *ub, char *buf, int size);
+
+/*
+ * Resource charging
+ * Change user's account and compare against limits
+ */
+
+static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource)
+{
+ if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
+ ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
+ if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
+ ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
+}
+
+int charge_beancounter(struct user_beancounter *ub, int resource,
+ unsigned long val, enum ub_severity strict);
+void uncharge_beancounter(struct user_beancounter *ub, int resource,
+ unsigned long val);
+void __charge_beancounter_notop(struct user_beancounter *ub, int resource,
+ unsigned long val);
+void __uncharge_beancounter_notop(struct user_beancounter *ub, int resource,
+ unsigned long val);
+
+static inline void charge_beancounter_notop(struct user_beancounter *ub,
+ int resource, unsigned long val)
+{
+ if (ub->parent != NULL)
+ __charge_beancounter_notop(ub, resource, val);
+}
+
+static inline void uncharge_beancounter_notop(struct user_beancounter *ub,
+ int resource, unsigned long val)
+{
+ if (ub->parent != NULL)
+ __uncharge_beancounter_notop(ub, resource, val);
+}
+
+#endif /* CONFIG_BEANCOUNTERS */
+
+#ifndef CONFIG_BC_RSS_ACCOUNTING
+static inline void ub_init_pbc(void) { }
+#else
+extern void ub_init_pbc(void);
+#endif
+#endif /* __KERNEL__ */
+#endif /* _LINUX_BEANCOUNTER_H */
diff --git a/include/bc/dcache.h b/include/bc/dcache.h
new file mode 100644
index 0000000..5ebefff
--- /dev/null
+++ b/include/bc/dcache.h
@@ -0,0 +1,47 @@
+/*
+ * include/bc/dcache.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_DCACHE_H_
+#define __BC_DCACHE_H_
+
+#include <bc/decl.h>
+
+/*
+ * UB_DCACHESIZE accounting
+ */
+
+struct dentry_beancounter
+{
+ /*
+ * d_inuse =
+ * <number of external refs> +
+	 *		<number of 'used' children>
+ *
+ * d_inuse == -1 means that dentry is unused
+ * state change -1 => 0 causes charge
+ * state change 0 => -1 causes uncharge
+ */
+ atomic_t d_inuse;
+ /* charged size, including name length if name is not inline */
+ unsigned long d_ubsize;
+ struct user_beancounter *d_ub;
+};
+
+#ifdef CONFIG_BEANCOUNTERS
+#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse))
+#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse))
+#define INUSE_INIT 0
+
+extern int ub_dentry_on;
+#else
+#define ub_dget_testone(d) (0)
+#define ub_dput_testzero(d) (0)
+#endif
+#endif
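A hypothetical caller, for illustration only, showing how the d_inuse state machine above is meant to be driven; ub_dentry_charge_nofail() and ub_dentry_uncharge_locked() come from include/bc/dcache_op.h, added later in this patch.

static inline void example_dget_accounting(struct dentry *d)
{
	if (ub_dget_testone(d))		/* -1 -> 0: dentry became used, charge */
		ub_dentry_charge_nofail(d);
}

static inline void example_dput_accounting(struct dentry *d)
{
	if (ub_dput_testzero(d))	/* 0 -> -1: dentry became unused, uncharge */
		ub_dentry_uncharge_locked(d);
}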
diff --git a/include/bc/dcache_op.h b/include/bc/dcache_op.h
new file mode 100644
index 0000000..23306e9
--- /dev/null
+++ b/include/bc/dcache_op.h
@@ -0,0 +1,102 @@
+/*
+ * include/bc/dcache_op.h
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_DCACHE_OP_H_
+#define __BC_DCACHE_OP_H_
+
+struct dentry;
+
+#ifdef CONFIG_BEANCOUNTERS
+
+#include <linux/spinlock.h>
+#include <bc/dcache.h>
+#include <bc/task.h>
+
+extern int ub_dentry_alloc_barrier;
+extern spinlock_t dcache_lock;
+
+static inline int ub_dentry_alloc(struct dentry *d)
+{
+ extern int __ub_dentry_alloc(struct dentry *);
+
+ if (!ub_dentry_on)
+ return 0;
+ return __ub_dentry_alloc(d);
+}
+
+static inline void ub_dentry_alloc_start(void)
+{
+ extern void __ub_dentry_alloc_start(void);
+
+ if (ub_dentry_alloc_barrier)
+ __ub_dentry_alloc_start();
+}
+
+static inline void ub_dentry_alloc_end(void)
+{
+ extern void __ub_dentry_alloc_end(void);
+
+ if (current->task_bc.dentry_alloc)
+ __ub_dentry_alloc_end();
+}
+
+static inline int ub_dentry_charge(struct dentry *d)
+{
+ extern int __ub_dentry_charge(struct dentry *);
+
+ if (!ub_dentry_on)
+ return 0;
+ return __ub_dentry_charge(d);
+}
+
+static inline void ub_dentry_charge_nofail(struct dentry *d)
+{
+ extern void __ub_dentry_charge_nofail(struct dentry *);
+
+ if (!ub_dentry_on)
+ return;
+ __ub_dentry_charge_nofail(d);
+}
+
+static inline void ub_dentry_uncharge_locked(struct dentry *d)
+{
+ extern void __ub_dentry_uncharge(struct dentry *);
+
+ if (!ub_dentry_on)
+ return;
+ __ub_dentry_uncharge(d);
+}
+
+static inline void ub_dentry_uncharge(struct dentry *d)
+{
+ extern void __ub_dentry_uncharge(struct dentry *);
+
+ if (!ub_dentry_on)
+ return;
+ spin_lock(&dcache_lock);
+ __ub_dentry_uncharge(d);
+ spin_unlock(&dcache_lock);
+}
+
+void uncharge_dcache(struct user_beancounter *ub, unsigned long size);
+#else /* CONFIG_BEANCOUNTERS */
+
+static inline int ub_dentry_alloc(struct dentry *d) { return 0; }
+static inline void ub_dentry_alloc_start(void) { }
+static inline void ub_dentry_alloc_end(void) { }
+static inline int ub_dentry_charge(struct dentry *d) { return 0; }
+static inline void ub_dentry_charge_nofail(struct dentry *d) { }
+static inline void ub_dentry_uncharge_locked(struct dentry *d) { }
+static inline void ub_dentry_uncharge(struct dentry *d) { }
+static inline void uncharge_dcache(struct user_beancounter *ub, unsigned long size) { }
+
+#endif /* CONFIG_BEANCOUNTERS */
+
+#endif /* __BC_DCACHE_OP_H_ */
diff --git a/include/bc/debug.h b/include/bc/debug.h
new file mode 100644
index 0000000..58c64f3
--- /dev/null
+++ b/include/bc/debug.h
@@ -0,0 +1,103 @@
+/*
+ * include/bc/debug.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_DEBUG_H_
+#define __BC_DEBUG_H_
+
+/*
+ * general debugging
+ */
+
+#define UBD_ALLOC 0x1
+#define UBD_CHARGE 0x2
+#define UBD_LIMIT 0x4
+#define UBD_TRACE 0x8
+
+/*
+ * ub_net debugging
+ */
+
+#define UBD_NET_SOCKET 0x10
+#define UBD_NET_SLEEP 0x20
+#define UBD_NET_SEND 0x40
+#define UBD_NET_RECV 0x80
+
+/*
+ * Main routines
+ */
+
+#define UB_DEBUG (0)
+#define DEBUG_RESOURCE (0ULL)
+
+#define ub_dbg_cond(__cond, __str, args...) \
+ do { \
+ if ((__cond) != 0) \
+ printk(__str, ##args); \
+ } while(0)
+
+#define ub_debug(__section, __str, args...) \
+ ub_dbg_cond(UB_DEBUG & (__section), __str, ##args)
+
+#define ub_debug_resource(__resource, __str, args...) \
+ ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \
+ (DEBUG_RESOURCE & (1 << (__resource))), \
+ __str, ##args)
+
+#if UB_DEBUG & UBD_TRACE
+#define ub_debug_trace(__cond, __b, __r) \
+ do { \
+ static struct ub_rate_info ri = { __b, __r }; \
+ if ((__cond) != 0 && ub_ratelimit(&ri)) \
+ dump_stack(); \
+ } while(0)
+#else
+#define ub_debug_trace(__cond, __burst, __rate)
+#endif
+
+#ifdef CONFIG_BC_DEBUG_KMEM
+#include <linux/list.h>
+
+struct user_beancounter;
+struct ub_cache_counter {
+ struct list_head ulist;
+ struct ub_cache_counter *next;
+ struct user_beancounter *ub;
+ struct kmem_cache *cachep;
+ unsigned long counter;
+};
+
+extern spinlock_t cc_lock;
+extern void init_cache_counters(void);
+extern void ub_free_counters(struct user_beancounter *);
+extern void ub_kmemcache_free(struct kmem_cache *cachep);
+
+struct vm_struct;
+#define inc_vmalloc_charged(vm, flags) do { \
+ if (flags & __GFP_UBC) \
+ ub_percpu_add(get_exec_ub(), vmalloc_charged, \
+ vm->nr_pages); \
+ } while (0)
+#define dec_vmalloc_charged(vm) do { \
+ struct user_beancounter *ub; \
+ ub = page_ub(vm->pages[0]); \
+ if (ub != NULL) \
+ ub_percpu_sub(ub, vmalloc_charged, \
+ vm->nr_pages); \
+ } while (0)
+#else
+#define init_cache_counters() do { } while (0)
+#define inc_vmalloc_charged(vm, f) do { } while (0)
+#define dec_vmalloc_charged(vm) do { } while (0)
+
+#define ub_free_counters(ub) do { } while (0)
+#define ub_kmemcache_free(cachep) do { } while (0)
+#endif
+
+#endif
diff --git a/include/bc/decl.h b/include/bc/decl.h
new file mode 100644
index 0000000..6dd4cb9
--- /dev/null
+++ b/include/bc/decl.h
@@ -0,0 +1,41 @@
+/*
+ * include/bc/decl.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_DECL_H_
+#define __BC_DECL_H_
+
+#ifdef __KERNEL__
+
+/*
+ * Naming convention:
+ * ub_<section|object>_<operation>
+ */
+
+#ifdef CONFIG_BEANCOUNTERS
+
+#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl;
+#define UB_DECLARE_VOID_FUNC(decl) extern void decl;
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define UB_DECLARE_FUNC(ret_type, decl) \
+ static inline ret_type decl \
+ { \
+ return (ret_type)0; \
+ }
+#define UB_DECLARE_VOID_FUNC(decl) \
+ static inline void decl \
+ { \
+ }
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif
+
+#endif
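The UB_DECLARE_FUNC()/UB_DECLARE_VOID_FUNC() macros above let every bc header declare its interface once, producing real extern prototypes when CONFIG_BEANCOUNTERS is set and zero-cost inline stubs when it is not. A small standalone sketch (illustration only; ub_file_charge() is just borrowed as an example prototype) of the two expansions:

#include <stdio.h>

/* Toggle to get the real extern declaration instead of the stub; a
 * definition must then be provided elsewhere, as the rest of this
 * patch does in-kernel. */
/* #define CONFIG_BEANCOUNTERS 1 */

#ifdef CONFIG_BEANCOUNTERS
#define UB_DECLARE_FUNC(ret_type, decl)	extern ret_type decl;
#else
#define UB_DECLARE_FUNC(ret_type, decl)	\
	static inline ret_type decl	\
	{				\
		return (ret_type)0;	\
	}
#endif

struct file;	/* opaque here; only the prototype matters */

UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))	/* no-op stub when BC is off */

int main(void)
{
	printf("%d\n", ub_file_charge(NULL));	/* prints 0 in the stub configuration */
	return 0;
}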
diff --git a/include/bc/hash.h b/include/bc/hash.h
new file mode 100644
index 0000000..b2afb69
--- /dev/null
+++ b/include/bc/hash.h
@@ -0,0 +1,36 @@
+/*
+ * include/bc/hash.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _LINUX_UBHASH_H
+#define _LINUX_UBHASH_H
+
+#ifdef __KERNEL__
+
+#define UB_HASH_SIZE 256
+
+extern struct hlist_head ub_hash[];
+extern spinlock_t ub_hash_lock;
+extern struct list_head ub_list_head;
+
+#ifdef CONFIG_BEANCOUNTERS
+
+/*
+ * Iterate over beancounters
+ * @__ubp - beancounter ptr
+ * Can use break :)
+ */
+#define for_each_beancounter(__ubp) \
+ list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list) \
+
+#define bc_hash_entry(ptr) hlist_entry(ptr, struct user_beancounter, ub_hash)
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif /* __KERNEL__ */
+#endif /* _LINUX_UBHASH_H */
diff --git a/include/bc/io_acct.h b/include/bc/io_acct.h
new file mode 100644
index 0000000..361b26c
--- /dev/null
+++ b/include/bc/io_acct.h
@@ -0,0 +1,113 @@
+/*
+ * include/bc/io_acct.h
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#ifndef __UB_IO_ACCT_H_
+#define __UB_IO_ACCT_H_
+
+#define PAGE_IO_MARK (0x1UL)
+
+#ifdef CONFIG_BC_IO_ACCOUNTING
+#include <bc/beancounter.h>
+#include <bc/rss_pages.h>
+
+#define page_iopb(page) ({ \
+ struct page_beancounter *pb; \
+ pb = page_pbc(page); \
+ rmb(); \
+ pb; \
+ })
+
+/*
+ * The IO ub is needed in task context only, so if exec_ub is set
+ * to NULL it means the caller doesn't need to charge any
+ * resources.  Nevertheless IO activity must be accounted, so we
+ * account it to the current task's beancounter.
+ */
+
+static inline struct user_beancounter *get_io_ub(void)
+{
+ struct user_beancounter *ub;
+
+ ub = get_exec_ub();
+ if (unlikely(ub == NULL))
+ ub = get_task_ub(current);
+
+ return top_beancounter(ub);
+}
+
+extern struct page_beancounter **page_pblist(struct page *);
+
+extern void ub_io_save_context(struct page *, size_t);
+extern void ub_io_release_context(struct page *pg, size_t size);
+
+static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb)
+{
+ if (!((unsigned long)pb & PAGE_IO_MARK))
+ return NULL;
+
+ return (struct page_beancounter *)((unsigned long)pb & ~PAGE_IO_MARK);
+}
+
+static inline void ub_io_account_read(size_t bytes)
+{
+ ub_percpu_add(get_io_ub(), bytes_read, bytes);
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+ ub_percpu_add(get_io_ub(), bytes_wrote, bytes);
+}
+
+static inline void ub_io_account_dirty(struct page *page, size_t bytes)
+{
+ ub_io_save_context(page, bytes);
+}
+
+static inline void ub_io_account_write_cancelled(size_t bytes)
+{
+ ub_percpu_add(get_io_ub(), bytes_cancelled, bytes);
+}
+
+void ub_init_io(struct kmem_cache *);
+#else /* BC_IO_ACCOUNTING */
+#define page_iopb(page) (NULL)
+#define page_pblist(page) (&page_pbc(page))
+
+static inline void ub_io_release_context(struct page *pg, size_t bytes)
+{
+}
+
+static inline void ub_io_account_dirty(struct page *p, size_t bytes)
+{
+}
+
+static inline void ub_io_account_read(size_t bytes)
+{
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+}
+
+static inline void ub_io_account_write_cancelled(size_t bytes)
+{
+}
+
+static inline void ub_init_io(struct kmem_cache *pb_cachep) { };
+#endif
+
+#ifdef CONFIG_BC_DEBUG_IO
+extern void ub_io_release_debug(struct page *pg);
+#else
+#define ub_io_release_debug(pg) do { } while (0)
+#endif
+#endif
diff --git a/include/bc/kmem.h b/include/bc/kmem.h
new file mode 100644
index 0000000..e6a31f2
--- /dev/null
+++ b/include/bc/kmem.h
@@ -0,0 +1,69 @@
+/*
+ * include/bc/kmem.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_SLAB_H_
+#define __UB_SLAB_H_
+
+#include <bc/beancounter.h>
+#include <bc/decl.h>
+
+/*
+ * UB_KMEMSIZE accounting
+ */
+
+#ifdef CONFIG_BC_DEBUG_ITEMS
+#define CHARGE_ORDER(__o) (1 << (__o))
+#define CHARGE_SIZE(__s) 1
+#else
+#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o))
+#define CHARGE_SIZE(__s) (__s)
+#endif
+
+#ifdef CONFIG_BEANCOUNTERS
+#define page_ub(__page) ((__page)->bc.page_ub)
+#else
+#define page_ub(__page) NULL
+#endif
+
+struct mm_struct;
+struct page;
+struct kmem_cache;
+
+UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj))
+UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj))
+
+UB_DECLARE_FUNC(int, ub_kmemsize_charge(struct user_beancounter *ub,
+ unsigned long size, enum ub_severity strict))
+UB_DECLARE_VOID_FUNC(ub_kmemsize_uncharge(struct user_beancounter *ub,
+ unsigned long size))
+
+UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, gfp_t mask))
+UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order))
+UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep,
+ void *objp, gfp_t flags))
+UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj))
+
+#ifdef CONFIG_BEANCOUNTERS
+static inline int should_charge(unsigned long cflags, gfp_t flags)
+{
+ if (!(cflags & SLAB_UBC))
+ return 0;
+ if ((cflags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC))
+ return 0;
+ return 1;
+}
+
+#define should_uncharge(cflags) should_charge(cflags, __GFP_UBC)
+#else
+#define should_charge(cflags, f) 0
+#define should_uncharge(cflags) 0
+#endif
+
+#endif /* __UB_SLAB_H_ */
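should_charge() above combines two per-cache flags with one per-allocation flag: the cache must be marked SLAB_UBC at all, and if it is additionally marked SLAB_NO_CHARGE the caller must explicitly pass __GFP_UBC. A tiny userspace truth-table sketch (illustration only; the flag values are made up, the real SLAB_*/__GFP_UBC bits are defined elsewhere in this patch):

#include <stdio.h>

#define SLAB_UBC	0x1	/* made-up value: cache is accountable at all      */
#define SLAB_NO_CHARGE	0x2	/* made-up value: ...but only when the caller asks */
#define __GFP_UBC	0x4	/* made-up value: caller asks for accounting       */

static int should_charge(unsigned long cflags, unsigned flags)
{
	if (!(cflags & SLAB_UBC))
		return 0;		/* cache is never charged */
	if ((cflags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC))
		return 0;		/* opt-in cache, caller did not opt in */
	return 1;
}

int main(void)
{
	printf("%d\n", should_charge(SLAB_UBC, 0));				/* 1 */
	printf("%d\n", should_charge(SLAB_UBC | SLAB_NO_CHARGE, 0));		/* 0 */
	printf("%d\n", should_charge(SLAB_UBC | SLAB_NO_CHARGE, __GFP_UBC));	/* 1 */
	printf("%d\n", should_charge(0, __GFP_UBC));				/* 0 */
	return 0;
}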
diff --git a/include/bc/misc.h b/include/bc/misc.h
new file mode 100644
index 0000000..84082b2
--- /dev/null
+++ b/include/bc/misc.h
@@ -0,0 +1,55 @@
+/*
+ * include/bc/misc.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_MISC_H_
+#define __BC_MISC_H_
+
+#include <bc/decl.h>
+
+struct tty_struct;
+struct file;
+struct file_lock;
+struct sigqueue;
+
+UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))
+UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f))
+UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard))
+UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl))
+UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q,
+ struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q))
+UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent,
+ struct task_struct *task))
+UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task))
+UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task))
+UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty))
+UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty))
+
+#ifdef CONFIG_BEANCOUNTERS
+#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0)
+#define unset_flock_charged(fl) do { \
+ WARN_ON((fl)->fl_charged == 0); \
+ (fl)->fl_charged = 0; \
+ } while (0)
+#define set_mm_ub(mm, tsk) do { \
+ (mm)->mm_ub = get_beancounter(tsk != current ? \
+ tsk->task_bc.task_ub : get_exec_ub()); \
+ } while (0)
+#define put_mm_ub(mm) do { \
+ put_beancounter((mm)->mm_ub); \
+ (mm)->mm_ub = NULL; \
+ } while (0)
+#else
+#define set_flock_charged(fl) do { } while (0)
+#define unset_flock_charged(fl) do { } while (0)
+#define set_mm_ub(mm, tsk) do { } while (0)
+#define put_mm_ub(mm) do { } while (0)
+#endif
+#endif
diff --git a/include/bc/net.h b/include/bc/net.h
new file mode 100644
index 0000000..32f33b9
--- /dev/null
+++ b/include/bc/net.h
@@ -0,0 +1,213 @@
+/*
+ * include/bc/net.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_NET_H_
+#define __BC_NET_H_
+
+/*
+ * UB_NUMXXXSOCK, UB_XXXBUF accounting
+ */
+
+#include <bc/decl.h>
+#include <bc/sock.h>
+#include <bc/beancounter.h>
+
+#define bid2sid(__bufid) \
+ ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK)
+
+#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \
+ ~(SMP_CACHE_BYTES-1)))
+#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE)
+
+static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ memset(skb_bc(skb), 0, sizeof(struct skb_beancounter));
+#endif
+ return 0;
+}
+
+static inline void ub_skb_free_bc(struct sk_buff *skb)
+{
+}
+
+#define IS_TCP_SOCK(__family, __type) \
+ (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM)
+
+/* number of sockets */
+UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type))
+UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk))
+UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk))
+UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk))
+
+/* management of queue for send space */
+UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo,
+ unsigned long size))
+UB_DECLARE_FUNC(int, ub_sock_snd_queue_add(struct sock *sk, int resource,
+ unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk))
+
+/* send space */
+UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid,
+ unsigned long size))
+UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid,
+ unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid,
+ unsigned long size, unsigned long ressize))
+UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk,
+ struct sk_buff *skb, enum ub_severity strict))
+UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk))
+UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk))
+
+UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk))
+
+/* receive space */
+UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb))
+UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk,
+ struct sk_buff *skb, enum ub_severity strict))
+
+/* skb destructor */
+UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb))
+
+static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size)
+{
+ return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size);
+}
+
+static inline int ub_sock_makewres_tcp(struct sock *sk, unsigned long size)
+{
+ return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size);
+}
+
+UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk,
+ unsigned long size))
+
+static inline int ub_sock_getwres_tcp(struct sock *sk, unsigned long size)
+{
+ return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size);
+}
+
+UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk,
+ unsigned long size, unsigned long ressize))
+
+static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size,
+ unsigned long ressize)
+{
+ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize);
+}
+
+static inline int ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz)
+{
+ return ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz);
+}
+
+static inline int ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz)
+{
+ return ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz);
+}
+
+static inline int ub_tcpsndbuf_charge(struct sock *sk,
+ struct sk_buff *skb)
+{
+ return ub_sock_tcp_chargesend(sk, skb, UB_HARD);
+}
+
+static inline int ub_tcpsndbuf_charge_forced(struct sock *sk,
+ struct sk_buff *skb)
+{
+ return ub_sock_tcp_chargesend(sk, skb, UB_FORCE);
+}
+
+static inline int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb)
+{
+ return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT);
+}
+
+static inline int ub_tcprcvbuf_charge_forced(struct sock *sk,
+ struct sk_buff *skb)
+{
+ return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE);
+}
+
+/* Charge size */
+static inline unsigned long skb_charge_datalen(unsigned long chargesize)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ unsigned long slabsize;
+
+ chargesize -= sizeof(struct sk_buff);
+ slabsize = 64;
+ do {
+ slabsize <<= 1;
+ } while (slabsize <= chargesize);
+
+ slabsize >>= 1;
+ return (slabsize - sizeof(struct skb_shared_info)) &
+ ~(SMP_CACHE_BYTES-1);
+#else
+ return 0;
+#endif
+}
+
+static inline unsigned long skb_charge_size_gen(unsigned long size)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ unsigned int slabsize;
+
+ size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info);
+ slabsize = 32; /* min size is 64 because of skb_shared_info */
+ do {
+ slabsize <<= 1;
+ } while (slabsize < size);
+
+ return slabsize + sizeof(struct sk_buff);
+#else
+ return 0;
+#endif
+
+}
+
+static inline unsigned long skb_charge_size_const(unsigned long size)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ unsigned int ret;
+ if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64)
+ ret = 64 + sizeof(struct sk_buff);
+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128)
+ ret = 128 + sizeof(struct sk_buff);
+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256)
+ ret = 256 + sizeof(struct sk_buff);
+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512)
+ ret = 512 + sizeof(struct sk_buff);
+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024)
+ ret = 1024 + sizeof(struct sk_buff);
+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048)
+ ret = 2048 + sizeof(struct sk_buff);
+ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096)
+ ret = 4096 + sizeof(struct sk_buff);
+ else
+ ret = skb_charge_size_gen(size);
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+
+#define skb_charge_size(__size) \
+ (__builtin_constant_p(__size) ? \
+ skb_charge_size_const(__size) : \
+ skb_charge_size_gen(__size))
+
+UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb))
+UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb,
+ struct sock *sk, unsigned long size, int res))
+
+#endif
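skb_charge_size_gen() above charges an skb for the whole slab object its data will occupy: the data size plus struct skb_shared_info is rounded up to the next power-of-two slab size, and struct sk_buff itself is added on top (skb_charge_datalen() is the inverse). A worked userspace sketch (illustration only; the overhead constants are assumed stand-ins for the real structure sizes, and SKB_DATA_ALIGN() is omitted for brevity):

#include <stdio.h>

/* Assumed stand-in constants; the real values are sizeof(struct sk_buff)
 * and sizeof(struct skb_shared_info). */
#define SKB_HEAD	240
#define SHINFO		320

static unsigned long charge_size(unsigned long size)
{
	unsigned long slabsize = 32;

	size += SHINFO;			/* data and skb_shared_info share one slab object */
	do {
		slabsize <<= 1;		/* round up to the next power-of-two slab size */
	} while (slabsize < size);

	return slabsize + SKB_HEAD;	/* plus the struct sk_buff head itself */
}

int main(void)
{
	/* 1000 data bytes + 320 = 1320, rounded up to a 2048-byte slab,
	 * so the socket beancounter is charged 2048 + 240 = 2288 bytes. */
	printf("%lu\n", charge_size(1000));
	return 0;
}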
diff --git a/include/bc/oom_kill.h b/include/bc/oom_kill.h
new file mode 100644
index 0000000..c07608f
--- /dev/null
+++ b/include/bc/oom_kill.h
@@ -0,0 +1,26 @@
+#include <bc/decl.h>
+#include <bc/task.h>
+
+UB_DECLARE_FUNC(int, ub_oom_lock(void))
+UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void))
+UB_DECLARE_VOID_FUNC(ub_oom_mm_killed(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_oom_unlock(void))
+UB_DECLARE_VOID_FUNC(ub_out_of_memory(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_oom_task_dead(struct task_struct *tsk))
+UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub,
+ struct task_struct *tsk))
+
+#ifdef CONFIG_BEANCOUNTERS
+extern int oom_generation;
+extern int oom_kill_counter;
+#define ub_oom_start() do { \
+ current->task_bc.oom_generation = oom_generation; \
+ } while (0)
+#define ub_oom_task_killed(p) do { \
+ oom_kill_counter++; \
+ wake_up_process(p); \
+ } while (0)
+#else
+#define ub_oom_start() do { } while (0)
+#define ub_oom_task_killed(p) do { } while (0)
+#endif
diff --git a/include/bc/proc.h b/include/bc/proc.h
new file mode 100644
index 0000000..f244523
--- /dev/null
+++ b/include/bc/proc.h
@@ -0,0 +1,40 @@
+/*
+ * include/bc/proc.h
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_PROC_H_
+#define __UB_PROC_H_
+
+#include <linux/seq_file.h>
+
+struct bc_proc_entry {
+ char *name;
+ union {
+ int (*show)(struct seq_file *, void *);
+ struct file_operations *fops;
+ } u;
+ struct bc_proc_entry *next;
+ int cookie;
+};
+
+struct user_beancounter;
+
+void bc_register_proc_entry(struct bc_proc_entry *);
+void bc_register_proc_root_entry(struct bc_proc_entry *);
+
+static inline struct user_beancounter *seq_beancounter(struct seq_file *f)
+{
+ return (struct user_beancounter *)(f->private);
+}
+
+extern const char *bc_proc_lu_fmt;
+extern const char *bc_proc_lu_lfmt;
+extern const char *bc_proc_llu_fmt;
+extern const char *bc_proc_lu_lu_fmt;
+#endif
diff --git a/include/bc/rss_pages.h b/include/bc/rss_pages.h
new file mode 100644
index 0000000..b195961
--- /dev/null
+++ b/include/bc/rss_pages.h
@@ -0,0 +1,57 @@
+/*
+ * include/bc/rss_pages.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __RSS_PAGES_H_
+#define __RSS_PAGES_H_
+
+/*
+ * Page_beancounters
+ */
+
+struct page;
+struct user_beancounter;
+
+#define PB_MAGIC 0x62700001UL
+
+struct page_beancounter {
+ unsigned long pb_magic;
+ struct page *page;
+ struct user_beancounter *ub;
+ union {
+ struct page_beancounter *next_hash;
+ struct page_beancounter *page_pb_list;
+ };
+ union {
+ unsigned refcount;
+ unsigned io_debug;
+ };
+ union {
+ struct list_head page_list;
+ struct list_head io_list;
+ };
+};
+
+#define PB_REFCOUNT_BITS 24
+#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS)
+#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS))
+#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS))
+#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1))
+#define PB_COUNT_INC(c) ((c)++)
+#define PB_COUNT_DEC(c) ((c)--)
+#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c))
+
+#define page_pbc(__page) ((__page)->bc.page_pb)
+
+extern spinlock_t pb_lock;
+
+struct address_space;
+extern int is_shmem_mapping(struct address_space *);
+
+#endif
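The PB_* macros above pack two values into a single word of struct page_beancounter: the low PB_REFCOUNT_BITS (24) bits hold a reference count and the remaining high bits hold a shift value. A short self-checking sketch (illustration only) of the packing and unpacking:

#include <assert.h>

#define PB_REFCOUNT_BITS	24
#define PB_SHIFT_GET(c)		((c) >> PB_REFCOUNT_BITS)
#define PB_COUNT_GET(c)		((c) & ((1 << PB_REFCOUNT_BITS) - 1))
#define PB_REFCOUNT_MAKE(s, c)	(((s) << PB_REFCOUNT_BITS) + (c))

int main(void)
{
	unsigned v = PB_REFCOUNT_MAKE(3u, 100u);	/* shift 3, refcount 100 */

	assert(PB_SHIFT_GET(v) == 3);	/* high bits hold the shift value */
	assert(PB_COUNT_GET(v) == 100);	/* low 24 bits hold the ref count */
	return 0;
}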
diff --git a/include/bc/sock.h b/include/bc/sock.h
new file mode 100644
index 0000000..b314c9b
--- /dev/null
+++ b/include/bc/sock.h
@@ -0,0 +1,47 @@
+/*
+ * include/bc/sock.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_SOCK_H_
+#define __BC_SOCK_H_
+
+#include <bc/task.h>
+
+struct sock;
+struct sk_buff;
+
+struct skb_beancounter {
+ struct user_beancounter *ub;
+ unsigned long charged:27, resource:5;
+};
+
+struct sock_beancounter {
+ struct user_beancounter *ub;
+ /*
+ * poll_reserv accounts space already charged for future sends.
+ * It is required to make poll agree with sendmsg.
+ * Additionally, it makes real charges (with taking bc spinlock)
+ * in the send path rarer, speeding networking up.
+ * For TCP (only): changes are protected by socket lock (not bc!)
+ * For all proto: may be read without serialization in poll.
+ */
+ unsigned long poll_reserv;
+ unsigned long forw_space;
+ /* fields below are protected by bc spinlock */
+ unsigned long ub_waitspc; /* space waiting for */
+ unsigned long ub_wcharged;
+ struct list_head ub_sock_list;
+};
+
+#define sock_bc(__sk) (&(__sk)->sk_bc)
+#define skb_bc(__skb) (&(__skb)->skb_bc)
+#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc))
+#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL)
+
+#endif
diff --git a/include/bc/sock_orphan.h b/include/bc/sock_orphan.h
new file mode 100644
index 0000000..b19a316
--- /dev/null
+++ b/include/bc/sock_orphan.h
@@ -0,0 +1,104 @@
+/*
+ * include/bc/sock_orphan.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_SOCK_ORPHAN_H_
+#define __BC_SOCK_ORPHAN_H_
+
+#include <net/tcp.h>
+
+#include "bc/beancounter.h"
+#include "bc/net.h"
+
+
+static inline struct percpu_counter *__ub_get_orphan_count_ptr(struct sock *sk)
+{
+ if (sock_has_ubc(sk))
+ return &sock_bc(sk)->ub->ub_orphan_count;
+ return sk->sk_prot->orphan_count;
+}
+
+static inline void ub_inc_orphan_count(struct sock *sk)
+{
+ percpu_counter_inc(__ub_get_orphan_count_ptr(sk));
+}
+
+static inline void ub_dec_orphan_count(struct sock *sk)
+{
+ percpu_counter_dec(__ub_get_orphan_count_ptr(sk));
+}
+
+static inline int ub_get_orphan_count(struct sock *sk)
+{
+ return percpu_counter_sum_positive(__ub_get_orphan_count_ptr(sk));
+}
+
+extern int __ub_too_many_orphans(struct sock *sk, int count);
+static inline int ub_too_many_orphans(struct sock *sk, int count)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ if (__ub_too_many_orphans(sk, count))
+ return 1;
+#endif
+ return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans ||
+ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]));
+}
+
+#include <bc/kmem.h>
+
+struct inet_timewait_sock;
+
+static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *ub;
+
+ ub = slab_ub(tw);
+ if (ub != NULL)
+ ub->ub_tw_count += incdec;
+#endif
+}
+
+static inline int __ub_timewait_check(struct sock *sk)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *ub;
+ unsigned long mem_max, mem;
+ int tw_count;
+
+ ub = sock_bc(sk)->ub;
+ if (ub == NULL)
+ return 1;
+
+ tw_count = ub->ub_tw_count;
+ mem_max = sysctl_tcp_max_tw_kmem_fraction *
+ ((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1);
+ mem = kmem_cache_objuse(sk->sk_prot_creator->twsk_prot->twsk_slab);
+ mem *= tw_count;
+ return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max;
+#else
+ return 1;
+#endif
+}
+
+#define ub_timewait_inc(tw, twdr) do { \
+ if ((twdr)->ub_managed) \
+ ub_timewait_mod(tw, 1); \
+ } while (0)
+
+#define ub_timewait_dec(tw, twdr) do { \
+ if ((twdr)->ub_managed) \
+ ub_timewait_mod(tw, -1); \
+ } while (0)
+
+#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \
+ __ub_timewait_check(sk))
+
+#endif
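__ub_timewait_check() above bounds the kernel memory that timewait sockets may pin per beancounter: the allowance is sysctl_tcp_max_tw_kmem_fraction times (the UB_KMEMSIZE limit in KB, plus one), and the current usage is the per-object size of the timewait slab times ub_tw_count. A small arithmetic sketch (illustration only; every number below is assumed, not taken from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long kmem_limit = 16UL << 20;	/* UB_KMEMSIZE limit: 16 MB         */
	unsigned long tw_fraction = 384;	/* sysctl_tcp_max_tw_kmem_fraction  */
	unsigned long tw_objuse = 256;		/* bytes pinned per timewait socket */
	unsigned long tw_count = 10000;		/* current ub_tw_count              */

	unsigned long mem_max = tw_fraction * ((kmem_limit >> 10) + 1);
	unsigned long mem = tw_objuse * tw_count;

	/* A new timewait bucket is allowed only while mem stays below mem_max
	 * (and tw_count below sysctl_tcp_max_tw_buckets_ub, not modelled here). */
	printf("mem=%lu mem_max=%lu allowed=%d\n", mem, mem_max, mem < mem_max);
	return 0;
}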
diff --git a/include/bc/statd.h b/include/bc/statd.h
new file mode 100644
index 0000000..9dafc5e
--- /dev/null
+++ b/include/bc/statd.h
@@ -0,0 +1,70 @@
+/*
+ * include/bc/statd.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_STATD_H_
+#define __BC_STATD_H_
+
+/* sys_ubstat commands list */
+#define UBSTAT_READ_ONE 0x010000
+#define UBSTAT_READ_ALL 0x020000
+#define UBSTAT_READ_FULL 0x030000
+#define UBSTAT_UBLIST 0x040000
+#define UBSTAT_UBPARMNUM 0x050000
+#define UBSTAT_GETTIME 0x060000
+
+#define UBSTAT_CMD(func) ((func) & 0xF0000)
+#define UBSTAT_PARMID(func) ((func) & 0x0FFFF)
+
+#define TIME_MAX_SEC (LONG_MAX / HZ)
+#define TIME_MAX_JIF (TIME_MAX_SEC * HZ)
+
+typedef unsigned long ubstattime_t;
+
+typedef struct {
+ ubstattime_t start_time;
+ ubstattime_t end_time;
+ ubstattime_t cur_time;
+} ubgettime_t;
+
+typedef struct {
+ long maxinterval;
+ int signum;
+} ubnotifrq_t;
+
+typedef struct {
+ unsigned long maxheld;
+ unsigned long failcnt;
+} ubstatparm_t;
+
+typedef struct {
+ unsigned long barrier;
+ unsigned long limit;
+ unsigned long held;
+ unsigned long maxheld;
+ unsigned long minheld;
+ unsigned long failcnt;
+ unsigned long __unused1;
+ unsigned long __unused2;
+} ubstatparmf_t;
+
+typedef struct {
+ ubstattime_t start_time;
+ ubstattime_t end_time;
+ ubstatparmf_t param[0];
+} ubstatfull_t;
+
+#ifdef __KERNEL__
+struct ub_stat_notify {
+ struct list_head list;
+ struct task_struct *task;
+ int signum;
+};
+#endif
+#endif
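The sys_ubstat command word above packs the operation into bits 16 and up and the parameter id into the low 16 bits, which is what UBSTAT_CMD() and UBSTAT_PARMID() extract. A minimal sketch (illustration only):

#include <assert.h>

#define UBSTAT_READ_ONE		0x010000
#define UBSTAT_CMD(func)	((func) & 0xF0000)
#define UBSTAT_PARMID(func)	((func) & 0x0FFFF)

int main(void)
{
	unsigned func = UBSTAT_READ_ONE | 7;	/* "read one parameter", parameter id 7 */

	assert(UBSTAT_CMD(func) == UBSTAT_READ_ONE);
	assert(UBSTAT_PARMID(func) == 7);
	return 0;
}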
diff --git a/include/bc/task.h b/include/bc/task.h
new file mode 100644
index 0000000..f5a2915
--- /dev/null
+++ b/include/bc/task.h
@@ -0,0 +1,69 @@
+/*
+ * include/bc/task.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_TASK_H_
+#define __BC_TASK_H_
+
+struct user_beancounter;
+
+
+#ifdef CONFIG_BEANCOUNTERS
+struct task_beancounter {
+ struct user_beancounter *exec_ub;
+ struct user_beancounter *saved_ub;
+ struct user_beancounter *task_ub;
+ struct user_beancounter *fork_sub;
+ unsigned long file_precharged, file_quant, file_count;
+ unsigned long kmem_precharged;
+ char dentry_alloc, pgfault_handle;
+ void *task_fnode, *task_freserv;
+ unsigned long oom_generation;
+ unsigned long task_data[4];
+ unsigned long pgfault_allot;
+};
+
+#define get_task_ub(__task) ((__task)->task_bc.task_ub)
+
+extern struct user_beancounter ub0;
+#define get_ub0() (&ub0)
+
+#define ub_save_context(t) do { \
+ t->task_bc.saved_ub = t->task_bc.exec_ub; \
+ t->task_bc.exec_ub = get_ub0(); \
+ } while (0)
+#define ub_restore_context(t) do { \
+ t->task_bc.exec_ub = t->task_bc.saved_ub; \
+ } while (0)
+
+#define get_exec_ub() (current->task_bc.exec_ub)
+#define set_exec_ub(__newub) \
+({ \
+ struct user_beancounter *old; \
+ struct task_beancounter *tbc; \
+ \
+ tbc = &current->task_bc; \
+ old = tbc->exec_ub; \
+ tbc->exec_ub = __newub; \
+ old; \
+})
+
+void ub_init_task_bc(struct task_beancounter *);
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define get_ub0() (NULL)
+#define get_exec_ub() (NULL)
+#define get_task_ub(task) (NULL)
+#define set_exec_ub(__ub) (NULL)
+#define ub_save_context(t) do { } while (0)
+#define ub_restore_context(t) do { } while (0)
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif /* __BC_TASK_H_ */
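set_exec_ub() above switches the accounting context of the current task and hands back the previous beancounter, so callers are expected to pair it with a second call that restores the old value (ub_save_context()/ub_restore_context() follow the same pattern using ub0). A userspace model (illustration only; a global pointer stands in for current->task_bc.exec_ub):

#include <assert.h>

struct user_beancounter { int id; };

/* Stand-in for current->task_bc.exec_ub. */
static struct user_beancounter *exec_ub;

static struct user_beancounter *set_exec_ub(struct user_beancounter *new_ub)
{
	struct user_beancounter *old = exec_ub;	/* remember the previous context */

	exec_ub = new_ub;			/* charges now go to new_ub */
	return old;
}

int main(void)
{
	struct user_beancounter ub0 = { 0 }, ve_ub = { 101 };
	struct user_beancounter *old;

	exec_ub = &ub0;
	old = set_exec_ub(&ve_ub);	/* enter the container's accounting context */
	assert(exec_ub->id == 101);	/* ... work done here is charged to ve_ub ... */
	set_exec_ub(old);		/* restore the caller's context on the way out */
	assert(exec_ub == &ub0);
	return 0;
}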
diff --git a/include/bc/tcp.h b/include/bc/tcp.h
new file mode 100644
index 0000000..d2bf748
--- /dev/null
+++ b/include/bc/tcp.h
@@ -0,0 +1,76 @@
+/*
+ * include/bc/tcp.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __BC_TCP_H_
+#define __BC_TCP_H_
+
+/*
+ * UB_NUMXXXSOCK, UB_XXXBUF accounting
+ */
+
+#include <bc/sock.h>
+#include <bc/beancounter.h>
+
+static inline void ub_tcp_update_maxadvmss(struct sock *sk)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ if (!sock_has_ubc(sk))
+ return;
+ if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss)
+ return;
+
+ sock_bc(sk)->ub->ub_maxadvmss =
+ skb_charge_size(MAX_HEADER + sizeof(struct iphdr)
+ + sizeof(struct tcphdr) + tcp_sk(sk)->advmss);
+#endif
+}
+
+static inline int ub_tcp_rmem_allows_expand(struct sock *sk)
+{
+ if (tcp_memory_pressure)
+ return 0;
+#ifdef CONFIG_BEANCOUNTERS
+ if (sock_has_ubc(sk)) {
+ struct user_beancounter *ub;
+
+ ub = sock_bc(sk)->ub;
+ if (ub->ub_rmem_pressure == UB_RMEM_EXPAND)
+ return 1;
+ if (ub->ub_rmem_pressure == UB_RMEM_SHRINK)
+ return 0;
+ return sk->sk_rcvbuf <= ub->ub_rmem_thres;
+ }
+#endif
+ return 1;
+}
+
+static inline int ub_tcp_memory_pressure(struct sock *sk)
+{
+ if (tcp_memory_pressure)
+ return 1;
+#ifdef CONFIG_BEANCOUNTERS
+ if (sock_has_ubc(sk))
+ return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND;
+#endif
+ return 0;
+}
+
+static inline int ub_tcp_shrink_rcvbuf(struct sock *sk)
+{
+ if (tcp_memory_pressure)
+ return 1;
+#ifdef CONFIG_BEANCOUNTERS
+ if (sock_has_ubc(sk))
+ return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK;
+#endif
+ return 0;
+}
+
+#endif
diff --git a/include/bc/vmpages.h b/include/bc/vmpages.h
new file mode 100644
index 0000000..09642e3
--- /dev/null
+++ b/include/bc/vmpages.h
@@ -0,0 +1,152 @@
+/*
+ * include/bc/vmpages.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __UB_PAGES_H_
+#define __UB_PAGES_H_
+
+#include <linux/linkage.h>
+#include <bc/beancounter.h>
+#include <bc/decl.h>
+
+/*
+ * Check whether vma has private or copy-on-write mapping.
+ * Should match checks in ub_protected_charge().
+ */
+#define VM_UB_PRIVATE(__flags, __file) \
+ ( ((__flags) & VM_WRITE) ? \
+ (__file) == NULL || !((__flags) & VM_SHARED) : \
+ 0 \
+ )
+
+/* Mprotect charging result */
+#define PRIVVM_ERROR -1
+#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */
+#define PRIVVM_TO_PRIVATE 1
+#define PRIVVM_TO_SHARED 2
+
+UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm,
+ unsigned long size,
+ unsigned long newflags,
+ struct vm_area_struct *vma))
+
+UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long num))
+#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1)
+UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long num))
+#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1)
+
+UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm,
+ long sz))
+
+UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm,
+ unsigned long size,
+ unsigned vm_flags,
+ struct file *vm_file,
+ int strict))
+UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm,
+ unsigned long size,
+ unsigned vm_flags,
+ struct file *vm_file))
+
+struct shmem_inode_info;
+UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i,
+ unsigned long sz))
+UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i,
+ unsigned long sz))
+UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi))
+UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
+ unsigned long size))
+#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1)
+
+#ifdef CONFIG_BEANCOUNTERS
+#define shmi_ub_set(shi, ub) do { \
+ (shi)->shmi_ub = get_beancounter(ub); \
+ } while (0)
+#define shmi_ub_put(shi) do { \
+ put_beancounter((shi)->shmi_ub); \
+ (shi)->shmi_ub = NULL; \
+ } while (0)
+#else
+#define shmi_ub_set(shi, ub) do { } while (0)
+#define shmi_ub_put(shi) do { } while (0)
+#endif
+
+UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm,
+ unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm,
+ unsigned long size))
+UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
+ unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
+ unsigned long size))
+
+UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end))
+#define pages_in_vma(vma) (pages_in_vma_range(vma, \
+ vma->vm_start, vma->vm_end))
+
+#define UB_PAGE_WEIGHT_SHIFT 24
+#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT)
+
+struct page_beancounter;
+#define PBC_COPY_SAME ((struct page_beancounter *) 1)
+
+/* Mprotect charging result */
+#define PRIVVM_ERROR -1
+#define PRIVVM_NO_CHARGE 0
+#define PRIVVM_TO_PRIVATE 1
+#define PRIVVM_TO_SHARED 2
+
+extern void __ub_update_physpages(struct user_beancounter *ub);
+extern void __ub_update_oomguarpages(struct user_beancounter *ub);
+extern void __ub_update_privvm(struct user_beancounter *ub);
+
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl)
+#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl)
+#else
+#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;}
+#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { }
+#endif
+
+PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc))
+PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num))
+PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc))
+PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page,
+ struct mm_struct *mm,
+ struct page_beancounter **pbc))
+PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page,
+ struct mm_struct *mm,
+ struct page_beancounter **pbc))
+PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb))
+PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb))
+PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page,
+ struct mm_struct *mm))
+
+PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page))
+#endif
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl)
+#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl)
+#else
+#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;}
+#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { }
+#endif
+
+struct swap_info_struct;
+SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n))
+SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si))
+SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n,
+ struct user_beancounter *ub))
+SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n))
diff --git a/include/linux/aio.h b/include/linux/aio.h
index aea219d..89cab9b 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -234,4 +234,8 @@ static inline struct kiocb *list_kiocb(struct list_head *h)
extern unsigned long aio_nr;
extern unsigned long aio_max_nr;
+void wait_for_all_aios(struct kioctx *ctx);
+extern struct kmem_cache *kioctx_cachep;
+extern void aio_kick_handler(struct work_struct *);
+
#endif /* __LINUX__AIO_H */
diff --git a/include/linux/capability.h b/include/linux/capability.h
index c8f2a5f..301d709 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -197,12 +197,9 @@ struct cpu_vfs_cap_data {
#define CAP_NET_BROADCAST 11
-/* Allow interface configuration */
/* Allow administration of IP firewall, masquerading and accounting */
/* Allow setting debug option on sockets */
/* Allow modification of routing tables */
-/* Allow setting arbitrary process / process group ownership on
- sockets */
/* Allow binding to any address for transparent proxying */
/* Allow setting TOS (type of service) */
/* Allow setting promiscuous mode */
@@ -232,6 +229,7 @@ struct cpu_vfs_cap_data {
#define CAP_SYS_MODULE 16
/* Allow ioperm/iopl access */
+/* Allow O_DIRECT access */
/* Allow sending USB messages to any device via /proc/bus/usb */
#define CAP_SYS_RAWIO 17
@@ -250,24 +248,19 @@ struct cpu_vfs_cap_data {
/* Allow configuration of the secure attention key */
/* Allow administration of the random device */
-/* Allow examination and configuration of disk quotas */
/* Allow configuring the kernel's syslog (printk behaviour) */
/* Allow setting the domainname */
/* Allow setting the hostname */
/* Allow calling bdflush() */
-/* Allow mount() and umount(), setting up new smb connection */
+/* Allow setting up new smb connection */
/* Allow some autofs root ioctls */
/* Allow nfsservctl */
/* Allow VM86_REQUEST_IRQ */
/* Allow to read/write pci config on alpha */
/* Allow irix_prctl on mips (setstacksize) */
/* Allow flushing all cache on m68k (sys_cacheflush) */
-/* Allow removing semaphores */
-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
- and shared memory */
/* Allow locking/unlocking of shared memory segment */
/* Allow turning swap on/off */
-/* Allow forged pids on socket credentials passing */
/* Allow setting readahead and flushing buffers on block devices */
/* Allow setting geometry in floppy driver */
/* Allow turning DMA on/off in xd driver */
@@ -340,6 +333,61 @@ struct cpu_vfs_cap_data {
#define CAP_SETFCAP 31
+#ifdef __KERNEL__
+/*
+ * Important note: VZ capabilities intersect with the CAP_AUDIT ones;
+ * this is for compatibility reasons and is harmless, since
+ * both VZ and Audit/SELinux caps are disabled in VPSs.
+ */
+
+/* Allow access to all information. Otherwise some structures are hidden
+ * to ensure that different Virtual Environments on the same node cannot
+ * interact (NOW OBSOLETE)
+ */
+#define CAP_SETVEID 29
+
+#define capable_setveid() ({ \
+ ve_is_super(get_exec_env()) && \
+ (capable(CAP_SYS_ADMIN) || \
+ capable(CAP_VE_ADMIN)); \
+ })
+
+/*
+ * coincides with CAP_AUDIT_CONTROL, but we don't care since
+ * audit is disabled in Virtuozzo
+ */
+#define CAP_VE_ADMIN 30
+
+#ifdef CONFIG_VE
+
+/* Replacement for CAP_NET_ADMIN:
+ rights delegated to the Virtual Environment for its network administration.
+ For now the following rights have been delegated:
+
+ Allow setting arbitrary process / process group ownership on sockets
+ Allow interface configuration
+ */
+#define CAP_VE_NET_ADMIN CAP_VE_ADMIN
+
+/* Replacement for CAP_SYS_ADMIN:
+ rights delegated to the Virtual Environment for its administration.
+ For now the following rights have been delegated:
+ */
+/* Allow mount/umount/remount */
+/* Allow examination and configuration of disk quotas */
+/* Allow removing semaphores */
+/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
+ and shared memory */
+/* Allow locking/unlocking of shared memory segment */
+/* Allow forged pids on socket credentials passing */
+
+#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN
+#else
+#define CAP_VE_NET_ADMIN CAP_NET_ADMIN
+#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN
+#endif
+#endif
+
/* Override MAC access.
The base kernel enforces no MAC policy.
An LSM may enforce a MAC policy, and if it does and it chooses
@@ -418,7 +466,16 @@ struct cpu_vfs_cap_data {
#define CAP_INIT_INH_SET CAP_EMPTY_SET
# define cap_clear(c) do { (c) = __cap_empty_set; } while (0)
+#ifndef CONFIG_VE
# define cap_set_full(c) do { (c) = __cap_full_set; } while (0)
+#else
+# define cap_set_full(c) do { \
+ if (ve_is_super(get_exec_env())) \
+ (c) = __cap_full_set; \
+ else \
+ (c) = get_exec_env()->ve_cap_bset;\
+ } while (0)
+#endif
# define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0)
#define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag))
@@ -536,6 +593,10 @@ extern const kernel_cap_t __cap_empty_set;
extern const kernel_cap_t __cap_full_set;
extern const kernel_cap_t __cap_init_eff_set;
+#include <linux/spinlock_types.h>
+
+extern spinlock_t task_capability_lock;
+
/**
* has_capability - Determine if a task has a superior capability available
* @t: The task in question
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0008dee..9665343 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -220,6 +220,8 @@ struct cgroup {
/* For RCU-protected deletion */
struct rcu_head rcu_head;
+
+ int cgroup_lite_id;
};
/*
@@ -525,6 +527,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);
+int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css);
/*
* CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..ccefff0 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -60,3 +60,9 @@ SUBSYS(net_cls)
#endif
/* */
+
+#ifdef CONFIG_BLK_CGROUP
+SUBSYS(blkio)
+#endif
+
+/* */
diff --git a/include/linux/compat.h b/include/linux/compat.h
index af931ee..499d84a 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -258,6 +258,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
extern int compat_printk(const char *fmt, ...);
+extern int ve_compat_printk(int dst, const char *fmt, ...);
extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
diff --git a/include/linux/cpt_export.h b/include/linux/cpt_export.h
new file mode 100644
index 0000000..be88d2d
--- /dev/null
+++ b/include/linux/cpt_export.h
@@ -0,0 +1,36 @@
+/*
+ *
+ * include/linux/cpt_export.h
+ *
+ * Copyright (C) 2008 Parallels
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __CPT_EXPORTS_H__
+#define __CPT_EXPORTS_H__
+
+struct cpt_context;
+
+struct cpt_ops {
+ void (*write)(const void *addr, size_t count, struct cpt_context *ctx);
+ void (*push_object)(loff_t *, struct cpt_context *);
+ void (*pop_object)(loff_t *, struct cpt_context *);
+ loff_t (*lookup_object)(int type, void *p, struct cpt_context *ctx);
+
+};
+
+extern struct cpt_ops cpt_ops;
+
+struct rst_ops {
+ int (*get_object)(int type, loff_t pos, void *tmp,
+ int size, struct cpt_context *ctx);
+ struct file *(*rst_file)(loff_t pos, int fd, struct cpt_context *ctx);
+};
+
+extern struct rst_ops rst_ops;
+
+#endif
+
diff --git a/include/linux/cpt_image.h b/include/linux/cpt_image.h
new file mode 100644
index 0000000..8185d4e
--- /dev/null
+++ b/include/linux/cpt_image.h
@@ -0,0 +1,1842 @@
+/*
+ *
+ * include/linux/cpt_image.h
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __CPT_IMAGE_H_
+#define __CPT_IMAGE_H_ 1
+
+#define CPT_NULL (~0ULL)
+#define CPT_NOINDEX (~0U)
+
+/*
+ * Image file layout.
+ *
+ * - major header
+ * - sections[]
+ *
+ * Each section is:
+ * - section header
+ * - array of objects
+ *
+ * All data records are arch independent, 64 bit aligned.
+ */
+
+enum _cpt_object_type
+{
+ CPT_OBJ_TASK = 0,
+ CPT_OBJ_MM,
+ CPT_OBJ_FS,
+ CPT_OBJ_FILES,
+ CPT_OBJ_FILE,
+ CPT_OBJ_SIGHAND_STRUCT,
+ CPT_OBJ_SIGNAL_STRUCT,
+ CPT_OBJ_TTY,
+ CPT_OBJ_SOCKET,
+ CPT_OBJ_SYSVSEM_UNDO,
+ CPT_OBJ_NAMESPACE,
+ CPT_OBJ_SYSV_SHM,
+ CPT_OBJ_INODE,
+ CPT_OBJ_UBC,
+ CPT_OBJ_SLM_SGREG,
+ CPT_OBJ_SLM_REGOBJ,
+ CPT_OBJ_SLM_MM,
+ CPT_OBJ_MAX,
+ /* The objects above are stored in memory while checkpointing */
+
+ CPT_OBJ_VMA = 1024,
+ CPT_OBJ_FILEDESC,
+ CPT_OBJ_SIGHANDLER,
+ CPT_OBJ_SIGINFO,
+ CPT_OBJ_LASTSIGINFO,
+ CPT_OBJ_SYSV_SEM,
+ CPT_OBJ_SKB,
+ CPT_OBJ_FLOCK,
+ CPT_OBJ_OPENREQ,
+ CPT_OBJ_VFSMOUNT,
+ CPT_OBJ_TRAILER,
+ CPT_OBJ_SYSVSEM_UNDO_REC,
+ CPT_OBJ_NET_DEVICE,
+ CPT_OBJ_NET_IFADDR,
+ CPT_OBJ_NET_ROUTE,
+ CPT_OBJ_NET_CONNTRACK,
+ CPT_OBJ_NET_CONNTRACK_EXPECT,
+ CPT_OBJ_AIO_CONTEXT,
+ CPT_OBJ_VEINFO,
+ CPT_OBJ_EPOLL,
+ CPT_OBJ_EPOLL_FILE,
+ CPT_OBJ_SKFILTER,
+ CPT_OBJ_SIGALTSTACK,
+ CPT_OBJ_SOCK_MCADDR,
+ CPT_OBJ_BIND_MNT,
+ CPT_OBJ_SYSVMSG,
+ CPT_OBJ_SYSVMSG_MSG,
+
+ CPT_OBJ_X86_REGS = 4096,
+ CPT_OBJ_X86_64_REGS,
+ CPT_OBJ_PAGES,
+ CPT_OBJ_COPYPAGES,
+ CPT_OBJ_REMAPPAGES,
+ CPT_OBJ_LAZYPAGES,
+ CPT_OBJ_NAME,
+ CPT_OBJ_BITS,
+ CPT_OBJ_REF,
+ CPT_OBJ_ITERPAGES,
+ CPT_OBJ_ITERYOUNGPAGES,
+ CPT_OBJ_VSYSCALL,
+ CPT_OBJ_IA64_REGS,
+ CPT_OBJ_INOTIFY,
+ CPT_OBJ_INOTIFY_WATCH,
+ CPT_OBJ_INOTIFY_EVENT,
+ CPT_OBJ_TASK_AUX,
+ CPT_OBJ_NET_TUNTAP,
+ CPT_OBJ_NET_HWADDR,
+ CPT_OBJ_NET_VETH,
+ CPT_OBJ_NET_STATS,
+ CPT_OBJ_NET_IPIP_TUNNEL,
+
+ /* 2.6.27-specific */
+ CPT_OBJ_NET_TAP_FILTER = 0x01000000,
+};
+
+#define CPT_ALIGN(n) (((n)+7)&~7)
+
+struct cpt_major_hdr
+{
+ __u8 cpt_signature[4]; /* Magic number */
+ __u16 cpt_hdrlen; /* Length of this header */
+ __u16 cpt_image_version; /* Format of this file */
+#define CPT_VERSION_MINOR(a) ((a) & 0xf)
+#define CPT_VERSION_8 0
+#define CPT_VERSION_9 0x100
+#define CPT_VERSION_9_1 0x101
+#define CPT_VERSION_9_2 0x102
+#define CPT_VERSION_16 0x200
+#define CPT_VERSION_18 0x300
+#define CPT_VERSION_18_1 0x301
+#define CPT_VERSION_18_2 0x302
+#define CPT_VERSION_18_3 0x303
+#define CPT_VERSION_20 0x400
+#define CPT_VERSION_24 0x500
+#define CPT_VERSION_26 0x600
+#define CPT_VERSION_27 0x700
+#define CPT_VERSION_27_3 0x703
+#define CPT_VERSION_32 0x800
+#define CPT_CURRENT_VERSION CPT_VERSION_32
+ __u16 cpt_os_arch; /* Architecture */
+#define CPT_OS_ARCH_I386 0
+#define CPT_OS_ARCH_EMT64 1
+#define CPT_OS_ARCH_IA64 2
+ __u16 __cpt_pad1;
+ __u32 cpt_ve_features; /* VE features */
+ __u32 cpt_ve_features2; /* VE features */
+ __u16 cpt_pagesize; /* Page size used by OS */
+ __u16 cpt_hz; /* HZ used by OS */
+ __u64 cpt_start_jiffies64; /* Jiffies */
+ __u32 cpt_start_sec; /* Seconds */
+ __u32 cpt_start_nsec; /* Nanoseconds */
+ __u32 cpt_cpu_caps[4]; /* CPU capabilities */
+ __u32 cpt_kernel_config[4]; /* Kernel config */
+ __u64 cpt_iptables_mask; /* Used netfilter modules */
+} __attribute__ ((aligned (8)));
+
+#define CPT_SIGNATURE0 0x79
+#define CPT_SIGNATURE1 0x1c
+#define CPT_SIGNATURE2 0x01
+#define CPT_SIGNATURE3 0x63
+
+/* CPU capabilities */
+#define CPT_CPU_X86_CMOV 0
+#define CPT_CPU_X86_FXSR 1
+#define CPT_CPU_X86_SSE 2
+#define CPT_CPU_X86_SSE2 3
+#define CPT_CPU_X86_MMX 4
+#define CPT_CPU_X86_3DNOW 5
+#define CPT_CPU_X86_3DNOW2 6
+#define CPT_CPU_X86_SEP 7
+#define CPT_CPU_X86_EMT64 8
+#define CPT_CPU_X86_IA64 9
+#define CPT_CPU_X86_SYSCALL 10
+#define CPT_CPU_X86_SYSCALL32 11
+#define CPT_CPU_X86_SEP32 12
+
+/* Unsupported features */
+#define CPT_EXTERNAL_PROCESS 16
+#define CPT_NAMESPACES 17
+#define CPT_SCHEDULER_POLICY 18
+#define CPT_PTRACED_FROM_VE0 19
+#define CPT_UNSUPPORTED_FSTYPE 20
+#define CPT_BIND_MOUNT 21
+#define CPT_UNSUPPORTED_NETDEV 22
+#define CPT_UNSUPPORTED_MISC 23
+#define CPT_SLM_DMPRST 24
+
+/* This mask is used to determine whether VE
+ has some unsupported features or not */
+#define CPT_UNSUPPORTED_MASK 0xffff0000UL
+
+#define CPT_KERNEL_CONFIG_PAE 0
+
+struct cpt_section_hdr
+{
+ __u64 cpt_next;
+ __u32 cpt_section;
+ __u16 cpt_hdrlen;
+ __u16 cpt_align;
+} __attribute__ ((aligned (8)));
+
+enum
+{
+ CPT_SECT_ERROR, /* Error section, content is string */
+ CPT_SECT_VEINFO,
+ CPT_SECT_FILES, /* Files. Content is array of file objects */
+ CPT_SECT_TASKS,
+ CPT_SECT_MM,
+ CPT_SECT_FILES_STRUCT,
+ CPT_SECT_FS,
+ CPT_SECT_SIGHAND_STRUCT,
+ CPT_SECT_TTY,
+ CPT_SECT_SOCKET,
+ CPT_SECT_NAMESPACE,
+ CPT_SECT_SYSVSEM_UNDO,
+ CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and
+ * deleted dentries with inodes not
+ * referenced inside dumped process.
+ */
+ CPT_SECT_SYSV_SHM,
+ CPT_SECT_SYSV_SEM,
+ CPT_SECT_ORPHANS,
+ CPT_SECT_NET_DEVICE,
+ CPT_SECT_NET_IFADDR,
+ CPT_SECT_NET_ROUTE,
+ CPT_SECT_NET_IPTABLES,
+ CPT_SECT_NET_CONNTRACK,
+ CPT_SECT_NET_CONNTRACK_VE0,
+ CPT_SECT_UTSNAME,
+ CPT_SECT_TRAILER,
+ CPT_SECT_UBC,
+ CPT_SECT_SLM_SGREGS,
+ CPT_SECT_SLM_REGOBJS,
+/* Due to silly mistake we cannot index sections beyond this value */
+#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1)
+ CPT_SECT_EPOLL,
+ CPT_SECT_VSYSCALL,
+ CPT_SECT_INOTIFY,
+ CPT_SECT_SYSV_MSG,
+ CPT_SECT_SNMP_STATS,
+ CPT_SECT_MAX
+};
+
+struct cpt_major_tail
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_lazypages;
+ __u32 cpt_64bit;
+ __u64 cpt_sections[CPT_SECT_MAX_INDEX];
+ __u32 cpt_nsect;
+ __u8 cpt_signature[4]; /* Magic number */
+} __attribute__ ((aligned (8)));
+
+
+/* Common object header. */
+struct cpt_object_hdr
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+} __attribute__ ((aligned (8)));
+
+enum _cpt_content_type {
+ CPT_CONTENT_VOID,
+ CPT_CONTENT_ARRAY,
+ CPT_CONTENT_DATA,
+ CPT_CONTENT_NAME,
+
+ CPT_CONTENT_STACK,
+ CPT_CONTENT_X86_FPUSTATE_OLD,
+ CPT_CONTENT_X86_FPUSTATE,
+ CPT_CONTENT_MM_CONTEXT,
+ CPT_CONTENT_SEMARRAY,
+ CPT_CONTENT_SEMUNDO,
+ CPT_CONTENT_NLMARRAY,
+ CPT_CONTENT_MAX
+};
+
+/* CPT_OBJ_BITS: encode array of bytes */
+struct cpt_obj_bits
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_size;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_REF: a reference to another object */
+struct cpt_obj_ref
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_pos;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_VEINFO: various ve specific data */
+struct cpt_veinfo_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ /* ipc ctls */
+ __u32 shm_ctl_max;
+ __u32 shm_ctl_all;
+ __u32 shm_ctl_mni;
+ __u32 msg_ctl_max;
+ __u32 msg_ctl_mni;
+ __u32 msg_ctl_mnb;
+ __u32 sem_ctl_arr[4];
+
+ /* start time */
+ __u64 start_timespec_delta;
+ __u64 start_jiffies_delta;
+
+ /* later extension */
+ __u32 last_pid;
+ __u32 rnd_va_space;
+ __u64 reserved[8];
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_FILE: one struct file */
+struct cpt_file_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_flags;
+ __u32 cpt_mode;
+ __u64 cpt_pos;
+ __u32 cpt_uid;
+ __u32 cpt_gid;
+
+ __u32 cpt_i_mode;
+ __u32 cpt_lflags;
+#define CPT_DENTRY_DELETED 1
+#define CPT_DENTRY_ROOT 2
+#define CPT_DENTRY_CLONING 4
+#define CPT_DENTRY_PROC 8
+#define CPT_DENTRY_EPOLL 0x10
+#define CPT_DENTRY_REPLACED 0x20
+#define CPT_DENTRY_INOTIFY 0x40
+#define CPT_DENTRY_FUTEX 0x80
+#define CPT_DENTRY_TUNTAP 0x100
+#define CPT_DENTRY_PROCPID_DEAD 0x200
+#define CPT_DENTRY_HARDLINKED 0x400
+#define CPT_DENTRY_SIGNALFD 0x800
+ __u64 cpt_inode;
+ __u64 cpt_priv;
+
+ __u32 cpt_fown_fd;
+ __u32 cpt_fown_pid;
+#define CPT_FOWN_STRAY_PID 0
+ __u32 cpt_fown_uid;
+ __u32 cpt_fown_euid;
+ __u32 cpt_fown_signo;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+/* Followed by file name, encoded as CPT_OBJ_NAME */
+
+struct cpt_epoll_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_file;
+} __attribute__ ((aligned (8)));
+/* Followed by array of struct cpt_epoll_file */
+
+struct cpt_epoll_file_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_file;
+ __u32 cpt_fd;
+ __u32 cpt_events;
+ __u64 cpt_data;
+ __u32 cpt_revents;
+ __u32 cpt_ready;
+} __attribute__ ((aligned (8)));
+
+struct cpt_inotify_wd_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_wd;
+ __u32 cpt_mask;
+} __attribute__ ((aligned (8)));
+/* Followed by cpt_file_image of inode to watch */
+
+struct cpt_inotify_ev_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_wd;
+ __u32 cpt_mask;
+ __u32 cpt_cookie;
+ __u32 cpt_namelen;
+} __attribute__ ((aligned (8)));
+/* Followed by name */
+
+struct cpt_inotify_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_file;
+ __u32 cpt_user;
+ __u32 cpt_max_events;
+ __u32 cpt_last_wd;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+/* Followed by array of struct cpt_inotify_wd_image and cpt_inotify_ev_image */
+
+
+/* CPT_OBJ_FILEDESC: one file descriptor */
+struct cpt_fd_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_fd;
+ __u32 cpt_flags;
+#define CPT_FD_FLAG_CLOSEEXEC 1
+ __u64 cpt_file;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_FILES: one files_struct */
+struct cpt_files_struct_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_index;
+ __u32 cpt_max_fds;
+ __u32 cpt_next_fd;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+/* Followed by array of cpt_fd_image */
+
+/* CPT_OBJ_FS: one fs_struct */
+struct cpt_fs_struct_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_umask;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */
+
+/* CPT_OBJ_INODE: one struct inode */
+struct cpt_inode_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_dev;
+ __u64 cpt_ino;
+ __u32 cpt_mode;
+ __u32 cpt_nlink;
+ __u32 cpt_uid;
+ __u32 cpt_gid;
+ __u64 cpt_rdev;
+ __u64 cpt_size;
+ __u64 cpt_blksize;
+ __u64 cpt_atime;
+ __u64 cpt_mtime;
+ __u64 cpt_ctime;
+ __u64 cpt_blocks;
+ __u32 cpt_sb;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+/* CPT_OBJ_VFSMOUNT: one vfsmount */
+struct cpt_vfsmount_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_mntflags;
+#define CPT_MNT_BIND 0x80000000
+#define CPT_MNT_EXT 0x40000000
+ __u32 cpt_flags;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_flock_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_owner;
+ __u32 cpt_pid;
+ __u64 cpt_start;
+ __u64 cpt_end;
+ __u32 cpt_flags;
+ __u32 cpt_type;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_tty_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_flags;
+ __u32 cpt_link;
+ __u32 cpt_index;
+ __u32 cpt_drv_type;
+ __u32 cpt_drv_subtype;
+ __u32 cpt_drv_flags;
+ __u8 cpt_packet;
+ __u8 cpt_stopped;
+ __u8 cpt_hw_stopped;
+ __u8 cpt_flow_stopped;
+
+ __u32 cpt_canon_data;
+ __u32 cpt_canon_head;
+ __u32 cpt_canon_column;
+ __u32 cpt_column;
+ __u8 cpt_ctrl_status;
+ __u8 cpt_erasing;
+ __u8 cpt_lnext;
+ __u8 cpt_icanon;
+ __u8 cpt_raw;
+ __u8 cpt_real_raw;
+ __u8 cpt_closing;
+ __u8 __cpt_pad1;
+ __u16 cpt_minimum_to_wake;
+ __u16 __cpt_pad2;
+ __u32 cpt_pgrp;
+ __u32 cpt_session;
+ __u32 cpt_c_line;
+ __u8 cpt_name[64];
+ __u16 cpt_ws_row;
+ __u16 cpt_ws_col;
+ __u16 cpt_ws_prow;
+ __u16 cpt_ws_pcol;
+ __u8 cpt_c_cc[32];
+ __u32 cpt_c_iflag;
+ __u32 cpt_c_oflag;
+ __u32 cpt_c_cflag;
+ __u32 cpt_c_lflag;
+ __u32 cpt_read_flags[4096/32];
+} __attribute__ ((aligned (8)));
+
+struct cpt_sock_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_file;
+ __u32 cpt_parent;
+ __u32 cpt_index;
+
+ __u64 cpt_ssflags;
+ __u16 cpt_type;
+ __u16 cpt_family;
+ __u8 cpt_sstate;
+ __u8 cpt_passcred;
+ __u8 cpt_state;
+ __u8 cpt_reuse;
+
+ __u8 cpt_zapped;
+ __u8 cpt_shutdown;
+ __u8 cpt_userlocks;
+ __u8 cpt_no_check;
+ __u8 cpt_debug;
+ __u8 cpt_rcvtstamp;
+ __u8 cpt_localroute;
+ __u8 cpt_protocol;
+
+ __u32 cpt_err;
+ __u32 cpt_err_soft;
+
+ __u16 cpt_max_ack_backlog;
+ __u16 __cpt_pad1;
+ __u32 cpt_priority;
+
+ __u32 cpt_rcvlowat;
+ __u32 cpt_bound_dev_if;
+
+ __u64 cpt_rcvtimeo;
+ __u64 cpt_sndtimeo;
+ __u32 cpt_rcvbuf;
+ __u32 cpt_sndbuf;
+ __u64 cpt_flags;
+ __u64 cpt_lingertime;
+ __u32 cpt_peer_pid;
+ __u32 cpt_peer_uid;
+
+ __u32 cpt_peer_gid;
+ __u32 cpt_laddrlen;
+ __u32 cpt_laddr[128/4];
+ __u32 cpt_raddrlen;
+ __u32 cpt_raddr[128/4];
+ /* AF_UNIX */
+ __u32 cpt_peer;
+
+ __u8 cpt_socketpair;
+ __u8 cpt_deleted;
+ __u16 __cpt_pad4;
+ __u32 __cpt_pad5;
+/*
+ struct sk_filter *sk_filter;
+ */
+
+ __u64 cpt_stamp;
+ __u32 cpt_daddr;
+ __u16 cpt_dport;
+ __u16 cpt_sport;
+
+ __u32 cpt_saddr;
+ __u32 cpt_rcv_saddr;
+
+ __u32 cpt_uc_ttl;
+ __u32 cpt_tos;
+
+ __u32 cpt_cmsg_flags;
+ __u32 cpt_mc_index;
+
+ __u32 cpt_mc_addr;
+/*
+ struct ip_options *opt;
+ */
+ __u8 cpt_hdrincl;
+ __u8 cpt_mc_ttl;
+ __u8 cpt_mc_loop;
+ __u8 cpt_pmtudisc;
+
+ __u8 cpt_recverr;
+ __u8 cpt_freebind;
+ __u16 cpt_idcounter;
+ __u32 cpt_cork_flags;
+
+ __u32 cpt_cork_fragsize;
+ __u32 cpt_cork_length;
+ __u32 cpt_cork_addr;
+ __u32 cpt_cork_saddr;
+ __u32 cpt_cork_daddr;
+ __u32 cpt_cork_oif;
+
+ __u32 cpt_udp_pending;
+ __u32 cpt_udp_corkflag;
+ __u16 cpt_udp_encap;
+ __u16 cpt_udp_len;
+ __u32 __cpt_pad7;
+
+ __u64 cpt_saddr6[2];
+ __u64 cpt_rcv_saddr6[2];
+ __u64 cpt_daddr6[2];
+ __u32 cpt_flow_label6;
+ __u32 cpt_frag_size6;
+ __u32 cpt_hop_limit6;
+ __u32 cpt_mcast_hops6;
+
+ __u32 cpt_mcast_oif6;
+ __u8 cpt_rxopt6;
+ __u8 cpt_mc_loop6;
+ __u8 cpt_recverr6;
+ __u8 cpt_sndflow6;
+
+ __u8 cpt_pmtudisc6;
+ __u8 cpt_ipv6only6;
+ __u8 cpt_mapped;
+ __u8 __cpt_pad8;
+ __u32 cpt_pred_flags;
+
+ __u32 cpt_rcv_nxt;
+ __u32 cpt_snd_nxt;
+
+ __u32 cpt_snd_una;
+ __u32 cpt_snd_sml;
+
+ __u32 cpt_rcv_tstamp;
+ __u32 cpt_lsndtime;
+
+ __u8 cpt_tcp_header_len;
+ __u8 cpt_ack_pending;
+ __u8 cpt_quick;
+ __u8 cpt_pingpong;
+ __u8 cpt_blocked;
+ __u8 __cpt_pad9;
+ __u16 __cpt_pad10;
+
+ __u32 cpt_ato;
+ __u32 cpt_ack_timeout;
+
+ __u32 cpt_lrcvtime;
+ __u16 cpt_last_seg_size;
+ __u16 cpt_rcv_mss;
+
+ __u32 cpt_snd_wl1;
+ __u32 cpt_snd_wnd;
+
+ __u32 cpt_max_window;
+ __u32 cpt_pmtu_cookie;
+
+ __u32 cpt_mss_cache;
+ __u16 cpt_mss_cache_std;
+ __u16 cpt_mss_clamp;
+
+ __u16 cpt_ext_header_len;
+ __u16 cpt_ext2_header_len;
+ __u8 cpt_ca_state;
+ __u8 cpt_retransmits;
+ __u8 cpt_reordering;
+ __u8 cpt_frto_counter;
+
+ __u32 cpt_frto_highmark;
+ __u8 cpt_adv_cong;
+ __u8 cpt_defer_accept;
+ __u8 cpt_backoff;
+ __u8 __cpt_pad11;
+
+ __u32 cpt_srtt;
+ __u32 cpt_mdev;
+
+ __u32 cpt_mdev_max;
+ __u32 cpt_rttvar;
+
+ __u32 cpt_rtt_seq;
+ __u32 cpt_rto;
+
+ __u32 cpt_packets_out;
+ __u32 cpt_left_out;
+
+ __u32 cpt_retrans_out;
+ __u32 cpt_snd_ssthresh;
+
+ __u32 cpt_snd_cwnd;
+ __u16 cpt_snd_cwnd_cnt;
+ __u16 cpt_snd_cwnd_clamp;
+
+ __u32 cpt_snd_cwnd_used;
+ __u32 cpt_snd_cwnd_stamp;
+
+ __u32 cpt_timeout;
+ __u32 cpt_ka_timeout;
+
+ __u32 cpt_rcv_wnd;
+ __u32 cpt_rcv_wup;
+
+ __u32 cpt_write_seq;
+ __u32 cpt_pushed_seq;
+
+ __u32 cpt_copied_seq;
+ __u8 cpt_tstamp_ok;
+ __u8 cpt_wscale_ok;
+ __u8 cpt_sack_ok;
+ __u8 cpt_saw_tstamp;
+
+ __u8 cpt_snd_wscale;
+ __u8 cpt_rcv_wscale;
+ __u8 cpt_nonagle;
+ __u8 cpt_keepalive_probes;
+ __u32 cpt_rcv_tsval;
+
+ __u32 cpt_rcv_tsecr;
+ __u32 cpt_ts_recent;
+
+ __u64 cpt_ts_recent_stamp;
+ __u16 cpt_user_mss;
+ __u8 cpt_dsack;
+ __u8 unused; /* was cpt_eff_sacks */
+ __u32 cpt_sack_array[2*5];
+ __u32 cpt_window_clamp;
+
+ __u32 cpt_rcv_ssthresh;
+ __u8 cpt_probes_out;
+ __u8 cpt_num_sacks;
+ __u16 cpt_advmss;
+
+ __u8 cpt_syn_retries;
+ __u8 cpt_ecn_flags;
+ __u16 cpt_prior_ssthresh;
+ __u32 cpt_lost_out;
+
+ __u32 cpt_sacked_out;
+ __u32 cpt_fackets_out;
+
+ __u32 cpt_high_seq;
+ __u32 cpt_retrans_stamp;
+
+ __u32 cpt_undo_marker;
+ __u32 cpt_undo_retrans;
+
+ __u32 cpt_urg_seq;
+ __u16 cpt_urg_data;
+ __u8 cpt_pending;
+ __u8 unused2; /* was cpt_urg_mode */
+
+ __u32 cpt_snd_up;
+ __u32 cpt_keepalive_time;
+
+ __u32 cpt_keepalive_intvl;
+ __u32 cpt_linger2;
+
+ __u32 cpt_rcvrtt_rtt;
+ __u32 cpt_rcvrtt_seq;
+
+ __u32 cpt_rcvrtt_time;
+ __u32 __cpt_pad12;
+} __attribute__ ((aligned (8)));
+
+struct cpt_sockmc_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u16 cpt_family;
+ __u16 cpt_mode;
+ __u32 cpt_ifindex;
+ __u32 cpt_mcaddr[4];
+} __attribute__ ((aligned (8)));
+/* Followed by array of source addresses, each zero padded to 16 bytes */
+
+struct cpt_openreq_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_rcv_isn;
+ __u32 cpt_snt_isn;
+
+ __u16 cpt_rmt_port;
+ __u16 cpt_mss;
+ __u8 cpt_family;
+ __u8 cpt_retrans;
+ __u8 cpt_snd_wscale;
+ __u8 cpt_rcv_wscale;
+
+ __u8 cpt_tstamp_ok;
+ __u8 cpt_sack_ok;
+ __u8 cpt_wscale_ok;
+ __u8 cpt_ecn_ok;
+ __u8 cpt_acked;
+ __u8 __cpt_pad1;
+ __u16 __cpt_pad2;
+
+ __u32 cpt_window_clamp;
+ __u32 cpt_rcv_wnd;
+ __u32 cpt_ts_recent;
+ __u32 cpt_iif;
+ __u64 cpt_expires;
+
+ __u64 cpt_loc_addr[2];
+ __u64 cpt_rmt_addr[2];
+/*
+ struct ip_options *opt;
+ */
+
+} __attribute__ ((aligned (8)));
+
+struct cpt_skb_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_owner;
+ __u32 cpt_queue;
+#define CPT_SKB_NQ 0
+#define CPT_SKB_RQ 1
+#define CPT_SKB_WQ 2
+#define CPT_SKB_OFOQ 3
+
+ __u64 cpt_stamp;
+ __u32 cpt_len;
+ __u32 cpt_hspace;
+ __u32 cpt_tspace;
+ __u32 cpt_h;
+ __u32 cpt_nh;
+ __u32 cpt_mac;
+
+ __u64 cpt_cb[5];
+ __u32 cpt_mac_len;
+ __u32 cpt_csum;
+ __u8 cpt_local_df;
+ __u8 cpt_pkt_type;
+ __u8 cpt_ip_summed;
+ __u8 __cpt_pad1;
+ __u32 cpt_priority;
+ __u16 cpt_protocol;
+ __u16 cpt_security;
+ __u16 cpt_gso_segs;
+ __u16 cpt_gso_size;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_sysvshm_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_key;
+ __u64 cpt_uid;
+ __u64 cpt_gid;
+ __u64 cpt_cuid;
+ __u64 cpt_cgid;
+ __u64 cpt_mode;
+ __u64 cpt_seq;
+
+ __u32 cpt_id;
+ __u32 cpt_mlockuser;
+ __u64 cpt_segsz;
+ __u64 cpt_atime;
+ __u64 cpt_ctime;
+ __u64 cpt_dtime;
+ __u64 cpt_creator;
+ __u64 cpt_last;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_sysvsem_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_key;
+ __u64 cpt_uid;
+ __u64 cpt_gid;
+ __u64 cpt_cuid;
+ __u64 cpt_cgid;
+ __u64 cpt_mode;
+ __u64 cpt_seq;
+ __u32 cpt_id;
+ __u32 __cpt_pad1;
+
+ __u64 cpt_otime;
+ __u64 cpt_ctime;
+} __attribute__ ((aligned (8)));
+/* Content is array of pairs semval/sempid */
+
+struct cpt_sysvsem_undo_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_id;
+ __u32 cpt_nsem;
+} __attribute__ ((aligned (8)));
+
+struct cpt_sysvmsg_msg_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_type;
+ __u64 cpt_size;
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_sysvmsg_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_key;
+ __u64 cpt_uid;
+ __u64 cpt_gid;
+ __u64 cpt_cuid;
+ __u64 cpt_cgid;
+ __u64 cpt_mode;
+ __u64 cpt_seq;
+ __u32 cpt_id;
+ __u32 __cpt_pad1;
+
+ __u64 cpt_stime;
+ __u64 cpt_rtime;
+ __u64 cpt_ctime;
+ __u64 cpt_last_sender;
+ __u64 cpt_last_receiver;
+ __u64 cpt_qbytes;
+} __attribute__ ((aligned (8)));
+/* Content is array of sysv msg */
+
+
+struct cpt_mm_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_start_code;
+ __u64 cpt_end_code;
+ __u64 cpt_start_data;
+ __u64 cpt_end_data;
+ __u64 cpt_start_brk;
+ __u64 cpt_brk;
+ __u64 cpt_start_stack;
+ __u64 cpt_start_arg;
+ __u64 cpt_end_arg;
+ __u64 cpt_start_env;
+ __u64 cpt_end_env;
+ __u64 cpt_def_flags;
+ __u64 cpt_mmub;
+ __u8 cpt_dumpable;
+ __u8 cpt_vps_dumpable;
+ __u8 cpt_used_hugetlb;
+ __u8 __cpt_pad;
+ __u32 cpt_vdso;
+} __attribute__ ((aligned (8)));
+
+struct cpt_page_block
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+} __attribute__ ((aligned (8)));
+
+struct cpt_remappage_block
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+ __u64 cpt_pgoff;
+} __attribute__ ((aligned (8)));
+
+struct cpt_copypage_block
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+ __u64 cpt_source;
+} __attribute__ ((aligned (8)));
+
+struct cpt_lazypage_block
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+ __u64 cpt_index;
+} __attribute__ ((aligned (8)));
+
+struct cpt_iterpage_block
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+} __attribute__ ((aligned (8)));
+/* Followed by array of PFNs */
+
+struct cpt_vma_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_file;
+ __u32 cpt_type;
+#define CPT_VMA_TYPE_0 0
+#define CPT_VMA_TYPE_SHM 1
+#define CPT_VMA_VDSO 2
+ __u32 cpt_anonvma;
+ __u64 cpt_anonvmaid;
+
+ __u64 cpt_start;
+ __u64 cpt_end;
+ __u64 cpt_flags;
+ __u64 cpt_pgprot;
+ __u64 cpt_pgoff;
+} __attribute__ ((aligned (8)));
+
+struct cpt_aio_ctx_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_max_reqs;
+ __u32 cpt_ring_pages;
+ __u32 cpt_tail;
+ __u32 cpt_nr;
+ __u64 cpt_mmap_base;
+ /* Data (io_event's) and struct aio_ring are stored in user space VM */
+} __attribute__ ((aligned (8)));
+
+
+/* Format of MM section.
+ *
+ * It is an array of MM objects (mm_struct). Each MM object is a
+ * header, encoding mm_struct, followed by an array of VMA objects.
+ * Each VMA consists of a VMA header, encoding vm_area_struct, and,
+ * if the VMA contains copied pages, the header is followed by an
+ * array of start-end tuples, each followed by data.
+ *
+ * ATTN: no block/page alignment, only 64-bit alignment. This might not be good.
+ */
+
+struct cpt_restart_block {
+ __u64 fn;
+#define CPT_RBL_0 0
+#define CPT_RBL_NANOSLEEP 1
+#define CPT_RBL_COMPAT_NANOSLEEP 2
+#define CPT_RBL_POLL 3
+#define CPT_RBL_FUTEX_WAIT 4
+ __u64 arg0;
+ __u64 arg1;
+ __u64 arg2;
+ __u64 arg3;
+} __attribute__ ((aligned (8)));
+
+struct cpt_siginfo_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_qflags;
+ __u32 cpt_signo;
+ __u32 cpt_errno;
+ __u32 cpt_code;
+
+ __u64 cpt_sigval;
+ __u32 cpt_pid;
+ __u32 cpt_uid;
+ __u64 cpt_utime;
+ __u64 cpt_stime;
+
+ __u64 cpt_user;
+} __attribute__ ((aligned (8)));
+
+/* Portable presentations for segment registers */
+
+#define CPT_SEG_ZERO 0
+#define CPT_SEG_TLS1 1
+#define CPT_SEG_TLS2 2
+#define CPT_SEG_TLS3 3
+#define CPT_SEG_USER32_DS 4
+#define CPT_SEG_USER32_CS 5
+#define CPT_SEG_USER64_DS 6
+#define CPT_SEG_USER64_CS 7
+#define CPT_SEG_LDT 256
+
+struct cpt_x86_regs
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_debugreg[8];
+ __u32 cpt_fs;
+ __u32 cpt_gs;
+
+ __u32 cpt_ebx;
+ __u32 cpt_ecx;
+ __u32 cpt_edx;
+ __u32 cpt_esi;
+ __u32 cpt_edi;
+ __u32 cpt_ebp;
+ __u32 cpt_eax;
+ __u32 cpt_xds;
+ __u32 cpt_xes;
+ __u32 cpt_orig_eax;
+ __u32 cpt_eip;
+ __u32 cpt_xcs;
+ __u32 cpt_eflags;
+ __u32 cpt_esp;
+ __u32 cpt_xss;
+ __u32 cpt_ugs;
+};
+
+struct cpt_x86_64_regs
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_debugreg[8];
+
+ __u64 cpt_fsbase;
+ __u64 cpt_gsbase;
+ __u32 cpt_fsindex;
+ __u32 cpt_gsindex;
+ __u32 cpt_ds;
+ __u32 cpt_es;
+
+ __u64 cpt_r15;
+ __u64 cpt_r14;
+ __u64 cpt_r13;
+ __u64 cpt_r12;
+ __u64 cpt_rbp;
+ __u64 cpt_rbx;
+ __u64 cpt_r11;
+ __u64 cpt_r10;
+ __u64 cpt_r9;
+ __u64 cpt_r8;
+ __u64 cpt_rax;
+ __u64 cpt_rcx;
+ __u64 cpt_rdx;
+ __u64 cpt_rsi;
+ __u64 cpt_rdi;
+ __u64 cpt_orig_rax;
+ __u64 cpt_rip;
+ __u64 cpt_cs;
+ __u64 cpt_eflags;
+ __u64 cpt_rsp;
+ __u64 cpt_ss;
+};
+
+struct cpt_ia64_regs
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 gr[128];
+ __u64 fr[256];
+ __u64 br[8];
+ __u64 nat[2];
+
+ __u64 ar_bspstore;
+ __u64 num_regs;
+ __u64 loadrs;
+ __u64 ar_bsp;
+ __u64 ar_unat;
+ __u64 ar_pfs;
+ __u64 ar_ccv;
+ __u64 ar_fpsr;
+ __u64 ar_csd;
+ __u64 ar_ssd;
+ __u64 ar_ec;
+ __u64 ar_lc;
+ __u64 ar_rsc;
+ __u64 ar_rnat;
+
+ __u64 cr_iip;
+ __u64 cr_ipsr;
+
+ __u64 cfm;
+ __u64 pr;
+
+ __u64 ibr[8];
+ __u64 dbr[8];
+};
+
+
+struct cpt_task_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_state;
+ __u64 cpt_flags;
+#define CPT_TASK_FLAGS_MASK (PF_EXITING | PF_FORKNOEXEC | \
+ PF_SUPERPRIV | PF_DUMPCORE | PF_SIGNALED)
+ __u64 cpt_ptrace;
+ __u32 cpt_prio;
+ __u32 cpt_static_prio;
+ __u32 cpt_policy;
+ __u32 cpt_rt_priority;
+
+ /* struct thread_info */
+ __u64 cpt_exec_domain;
+ __u64 cpt_thrflags;
+ __u64 cpt_thrstatus;
+ __u64 cpt_addr_limit;
+
+ __u64 cpt_personality;
+
+ __u64 cpt_mm;
+ __u64 cpt_files;
+ __u64 cpt_fs;
+ __u64 cpt_signal;
+ __u64 cpt_sighand;
+ __u64 cpt_sigblocked;
+ __u64 cpt_sigrblocked;
+ __u64 cpt_sigpending;
+ __u64 cpt_namespace;
+ __u64 cpt_sysvsem_undo;
+ __u32 cpt_pid;
+ __u32 cpt_tgid;
+ __u32 cpt_ppid;
+ __u32 cpt_rppid;
+ __u32 cpt_pgrp;
+ __u32 cpt_session;
+ __u32 cpt_old_pgrp;
+ __u32 __cpt_pad;
+ __u32 cpt_leader;
+ __u8 cpt_pn_state;
+ __u8 cpt_stopped_state;
+ __u8 cpt_sigsuspend_state;
+ __u8 cpt_64bit;
+ __u64 cpt_set_tid;
+ __u64 cpt_clear_tid;
+ __u32 cpt_exit_code;
+ __u32 cpt_exit_signal;
+ __u32 cpt_pdeath_signal;
+ __u32 cpt_user;
+ __u32 cpt_uid;
+ __u32 cpt_euid;
+ __u32 cpt_suid;
+ __u32 cpt_fsuid;
+ __u32 cpt_gid;
+ __u32 cpt_egid;
+ __u32 cpt_sgid;
+ __u32 cpt_fsgid;
+ __u32 cpt_ngids;
+ __u32 cpt_gids[32];
+ __u8 cpt_prctl_uac;
+ __u8 cpt_prctl_fpemu;
+ __u16 __cpt_pad1;
+ __u64 cpt_ecap;
+ __u64 cpt_icap;
+ __u64 cpt_pcap;
+ __u8 cpt_comm[16];
+ __u64 cpt_tls[3];
+ struct cpt_restart_block cpt_restart;
+ __u64 cpt_it_real_value; /* V8: jiffies, V9..: nsec */
+ __u64 cpt_it_real_incr; /* V8: jiffies, V9..: nsec */
+ __u64 cpt_it_prof_value;
+ __u64 cpt_it_prof_incr;
+ __u64 cpt_it_virt_value;
+ __u64 cpt_it_virt_incr;
+
+ __u16 cpt_used_math;
+ __u8 cpt_keepcap;
+ __u8 cpt_did_exec;
+ __u32 cpt_ptrace_message;
+
+ __u64 cpt_utime;
+ __u64 cpt_stime;
+ __u64 cpt_starttime; /* V8: jiffies, V9...: timespec */
+ __u64 cpt_nvcsw;
+ __u64 cpt_nivcsw;
+ __u64 cpt_min_flt;
+ __u64 cpt_maj_flt;
+
+ __u64 cpt_sigsuspend_blocked;
+ __u64 cpt_cutime, cpt_cstime;
+ __u64 cpt_cnvcsw, cpt_cnivcsw;
+ __u64 cpt_cmin_flt, cpt_cmaj_flt;
+
+#define CPT_RLIM_NLIMITS 16
+ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS];
+ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS];
+
+ __u64 cpt_task_ub;
+ __u64 cpt_exec_ub;
+ __u64 cpt_mm_ub;
+ __u64 cpt_fork_sub;
+} __attribute__ ((aligned (8)));
+
+struct cpt_sigaltstack_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_stack;
+ __u32 cpt_stacksize;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+struct cpt_task_aux_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_robust_list;
+ __u64 __cpt_future[16];
+} __attribute__ ((aligned (8)));
+
+
+struct cpt_signal_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_leader;
+ __u8 cpt_pgrp_type;
+ __u8 cpt_old_pgrp_type;
+ __u8 cpt_session_type;
+#define CPT_PGRP_NORMAL 0
+#define CPT_PGRP_ORPHAN 1
+#define CPT_PGRP_STRAY 2
+ __u8 __cpt_pad1;
+ __u64 cpt_pgrp;
+ __u64 cpt_old_pgrp;
+ __u64 cpt_session;
+ __u64 cpt_sigpending;
+ __u64 cpt_ctty;
+
+ __u32 cpt_curr_target;
+ __u32 cpt_group_exit;
+ __u32 cpt_group_exit_code;
+ __u32 cpt_group_exit_task;
+ __u32 cpt_notify_count;
+ __u32 cpt_group_stop_count;
+ __u32 cpt_stop_state;
+ __u32 __cpt_pad2;
+
+ __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime;
+ __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw;
+ __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt;
+
+ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS];
+ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS];
+} __attribute__ ((aligned (8)));
+/* Followed by list of posix timers. */
+
+struct cpt_sighand_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+} __attribute__ ((aligned (8)));
+/* Followed by list of sighandles. */
+
+struct cpt_sighandler_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_signo;
+ __u32 __cpt_pad1;
+ __u64 cpt_handler;
+ __u64 cpt_restorer;
+ __u64 cpt_flags;
+ __u64 cpt_mask;
+} __attribute__ ((aligned (8)));
+
+struct cpt_netdev_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_index;
+ __u32 cpt_flags;
+ __u8 cpt_name[16];
+} __attribute__ ((aligned (8)));
+
+struct cpt_tuntap_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_owner;
+ __u32 unused; /* was cpt_attached */
+ __u64 cpt_flags;
+ __u64 cpt_bindfile;
+ __u64 cpt_if_flags;
+ __u8 cpt_dev_addr[6];
+ __u16 cpt_pad;
+ __u32 cpt_chr_filter[2];
+ __u32 cpt_net_filter[2];
+} __attribute__ ((aligned (8)));
+
+struct cpt_tap_filter_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_count;
+ __u32 cpt_mask[2];
+ __u8 cpt_addr[8][6];
+} __attribute__ ((aligned (8)));
+
+struct cpt_veth_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_allow_mac_change;
+ __u32 __cpt_pad;
+} __attribute__ ((aligned (8)));
+
+struct cpt_tunnel_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_tnl_flags;
+#define CPT_TUNNEL_FBDEV 0x1
+#define CPT_TUNNEL_SIT 0x2
+#define CPT_TUNNEL_GRE 0x4
+ __u16 cpt_i_flags;
+ __u16 cpt_o_flags;
+ __u32 cpt_i_key;
+ __u32 cpt_o_key;
+ __u32 cpt_iphdr[5];
+ __u32 cpt_i_seqno;
+ __u32 cpt_o_seqno;
+} __attribute__ ((aligned (8)));
+
+struct cpt_hwaddr_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u8 cpt_dev_addr[32];
+} __attribute__ ((aligned (8)));
+
+struct cpt_netstats_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_rx_packets;
+ __u64 cpt_tx_packets;
+ __u64 cpt_rx_bytes;
+ __u64 cpt_tx_bytes;
+ __u64 cpt_rx_errors;
+ __u64 cpt_tx_errors;
+ __u64 cpt_rx_dropped;
+ __u64 cpt_tx_dropped;
+ __u64 cpt_multicast;
+ __u64 cpt_collisions;
+ __u64 cpt_rx_length_errors;
+ __u64 cpt_rx_over_errors;
+ __u64 cpt_rx_crc_errors;
+ __u64 cpt_rx_frame_errors;
+ __u64 cpt_rx_fifo_errors;
+ __u64 cpt_rx_missed_errors;
+ __u64 cpt_tx_aborted_errors;
+ __u64 cpt_tx_carrier_errors;
+ __u64 cpt_tx_fifo_errors;
+ __u64 cpt_tx_heartbeat_errors;
+ __u64 cpt_tx_window_errors;
+ __u64 cpt_rx_compressed;
+ __u64 cpt_tx_compressed;
+ __u64 pad[4];
+} __attribute__ ((aligned (8)));
+
+struct cpt_ifaddr_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_index;
+ __u8 cpt_family;
+ __u8 cpt_masklen;
+ __u8 cpt_flags;
+ __u8 cpt_scope;
+ __u32 cpt_address[4];
+ __u32 cpt_peer[4];
+ __u32 cpt_broadcast[4];
+ __u8 cpt_label[16];
+ __u32 cpt_valid_lft;
+ __u32 cpt_prefered_lft;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ipct_tuple
+{
+ __u32 cpt_src;
+ __u16 cpt_srcport;
+ __u16 __cpt_pad1;
+
+ __u32 cpt_dst;
+ __u16 cpt_dstport;
+ __u8 cpt_protonum;
+ __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */
+} __attribute__ ((aligned (8)));
+
+struct cpt_nat_manip
+{
+ __u8 cpt_direction;
+ __u8 cpt_hooknum;
+ __u8 cpt_maniptype;
+ __u8 __cpt_pad1;
+
+ __u32 cpt_manip_addr;
+ __u16 cpt_manip_port;
+ __u16 __cpt_pad2;
+ __u32 __cpt_pad3;
+} __attribute__ ((aligned (8)));
+
+struct cpt_nat_seq
+{
+ __u32 cpt_correction_pos;
+ __u32 cpt_offset_before;
+ __u32 cpt_offset_after;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ip_connexpect_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_timeout;
+ __u32 cpt_sibling_conntrack; /* Index of child conntrack */
+ __u32 cpt_seq; /* id in 2.6.15 */
+
+ struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */
+ struct cpt_ipct_tuple cpt_tuple;
+ struct cpt_ipct_tuple cpt_mask;
+
+ /* union ip_conntrack_expect_help. Used by ftp, irc, amanda */
+ __u32 cpt_help[3]; /* NU 2.6.15 */
+ __u16 cpt_manip_proto;
+ __u8 cpt_dir;
+ __u8 cpt_flags;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ip_conntrack_image
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ struct cpt_ipct_tuple cpt_tuple[2];
+ __u64 cpt_status;
+ __u64 cpt_timeout;
+ __u32 cpt_index;
+ __u8 cpt_ct_helper;
+ __u8 cpt_nat_helper;
+ __u16 cpt_pad1;
+
+ /* union ip_conntrack_proto. Used by tcp and icmp. */
+ __u32 cpt_proto_data[12];
+
+ /* union ip_conntrack_help. Used by ftp and pptp helper.
+ * We do not support pptp...
+ */
+ __u32 cpt_help_data[6];
+
+ /* nat info */
+ __u32 cpt_initialized; /* NU 2.6.15 */
+ __u32 cpt_num_manips; /* NU 2.6.15 */
+ struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */
+
+ struct cpt_nat_seq cpt_nat_seq[2];
+
+ __u32 cpt_masq_index;
+ __u32 cpt_id;
+ __u32 cpt_mark;
+} __attribute__ ((aligned (8)));
+
+/* cpt_ip_conntrack_image struct from 2.6.9 kernel */
+struct cpt_ip_conntrack_image_compat
+{
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ struct cpt_ipct_tuple cpt_tuple[2];
+ __u64 cpt_status;
+ __u64 cpt_timeout;
+ __u32 cpt_index;
+ __u8 cpt_ct_helper;
+ __u8 cpt_nat_helper;
+ __u16 __cpt_pad1;
+
+ /* union ip_conntrack_proto. Used by tcp and icmp. */
+ __u32 cpt_proto_data[12];
+
+ /* union ip_conntrack_help. Used only by ftp helper. */
+ __u32 cpt_help_data[4];
+
+ /* nat info */
+ __u32 cpt_initialized;
+ __u32 cpt_num_manips;
+ struct cpt_nat_manip cpt_nat_manips[6];
+
+ struct cpt_nat_seq cpt_nat_seq[2];
+
+ __u32 cpt_masq_index;
+ __u32 __cpt_pad2;
+} __attribute__ ((aligned (8)));
+
+struct cpt_ubparm
+{
+ __u64 barrier;
+ __u64 limit;
+ __u64 held;
+ __u64 maxheld;
+ __u64 minheld;
+ __u64 failcnt;
+} __attribute__ ((aligned (8)));
+
+struct cpt_beancounter_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u64 cpt_parent;
+ __u32 cpt_id;
+ __u32 cpt_ub_resources;
+ struct cpt_ubparm cpt_parms[32 * 2];
+} __attribute__ ((aligned (8)));
+
+struct cpt_slm_sgreg_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_size;
+ __u32 __cpt_pad1;
+ __u32 cpt_id;
+ __u16 cpt_resource;
+ __u8 cpt_regname[32];
+ __u8 __cpt_pad2[2];
+} __attribute__ ((aligned (8)));
+
+struct cpt_slm_obj_image {
+ __u64 cpt_next;
+ __u32 cpt_object;
+ __u16 cpt_hdrlen;
+ __u16 cpt_content;
+
+ __u32 cpt_size;
+ __u32 __cpt_pad1;
+} __attribute__ ((aligned (8)));
+
+#ifdef __KERNEL__
+
+static inline void __user * cpt_ptr_import(__u64 ptr)
+{
+ return (void*)(unsigned long)ptr;
+}
+
+static inline __u64 cpt_ptr_export(void __user *ptr)
+{
+ return (__u64)(unsigned long)ptr;
+}
+
+static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr)
+{
+ memcpy(sig, &ptr, sizeof(*sig));
+}
+
+static inline __u64 cpt_sigset_export(sigset_t *sig)
+{
+ return *(__u64*)sig;
+}
+
+static inline __u64 cpt_timespec_export(struct timespec *tv)
+{
+ return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
+}
+
+static inline void cpt_timespec_import(struct timespec *tv, __u64 val)
+{
+ tv->tv_sec = val>>32;
+ tv->tv_nsec = (val&0xFFFFFFFF);
+}
+
+static inline __u64 cpt_timeval_export(struct timeval *tv)
+{
+ return (((u64)tv->tv_sec) << 32) + tv->tv_usec;
+}
+
+static inline void cpt_timeval_import(struct timeval *tv, __u64 val)
+{
+ tv->tv_sec = val>>32;
+ tv->tv_usec = (val&0xFFFFFFFF);
+}
+
+#endif
+
+#endif /* __CPT_IMAGE_H_ */
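/*
 * A minimal userspace sketch (illustration only, not part of the patch) of
 * the 64-bit time encoding used by the cpt_timespec_export()/
 * cpt_timespec_import() helpers above: tv_sec goes into the high 32 bits,
 * tv_nsec into the low 32 bits.  The helpers below simply mirror the kernel
 * inlines; note that tv_sec is truncated to 32 bits by this format.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t pack_timespec(const struct timespec *tv)
{
	return ((uint64_t)tv->tv_sec << 32) + (uint32_t)tv->tv_nsec;
}

static void unpack_timespec(struct timespec *tv, uint64_t val)
{
	tv->tv_sec = val >> 32;
	tv->tv_nsec = val & 0xFFFFFFFF;
}

int main(void)
{
	struct timespec in = { .tv_sec = 1276793146, .tv_nsec = 500000000 };
	struct timespec out;

	unpack_timespec(&out, pack_timespec(&in));
	printf("%ld.%09ld -> %ld.%09ld\n",
	       (long)in.tv_sec, in.tv_nsec, (long)out.tv_sec, out.tv_nsec);
	return 0;
}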
diff --git a/include/linux/cpt_ioctl.h b/include/linux/cpt_ioctl.h
new file mode 100644
index 0000000..f31b66c
--- /dev/null
+++ b/include/linux/cpt_ioctl.h
@@ -0,0 +1,45 @@
+/*
+ *
+ * include/linux/cpt_ioctl.h
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _CPT_IOCTL_H_
+#define _CPT_IOCTL_H_ 1
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define CPTCTLTYPE '-'
+#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int)
+#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int)
+#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int)
+#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int)
+#define CPT_SUSPEND _IO(CPTCTLTYPE, 5)
+#define CPT_DUMP _IO(CPTCTLTYPE, 6)
+#define CPT_UNDUMP _IO(CPTCTLTYPE, 7)
+#define CPT_RESUME _IO(CPTCTLTYPE, 8)
+#define CPT_KILL _IO(CPTCTLTYPE, 9)
+#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10)
+#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int)
+#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12)
+#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int)
+#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int)
+#define CPT_PAGEIND _IO(CPTCTLTYPE, 15)
+#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int)
+#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int)
+#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int)
+#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int)
+#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int)
+#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int)
+
+#define CPT_ITER _IOW(CPTCTLTYPE, 23, int)
+#define CPT_LINKDIR_ADD _IOW(CPTCTLTYPE, 24, int)
+#define CPT_HARDLNK_ON _IOW(CPTCTLTYPE, 25, int)
+
+#endif
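/*
 * A hedged userspace sketch (illustration only, not part of the patch) of
 * driving the checkpoint ioctls declared above.  The "/proc/cpt" device
 * path, the exact call sequence, and the assumption that this header is
 * visible to userspace builds are all taken from typical OpenVZ tooling,
 * not from this patch; in practice checkpointing is driven by vzctl.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cpt_ioctl.h>			/* the header added by this patch */

static int checkpoint_ve(unsigned int veid, const char *dumpfile)
{
	int cpt = open("/proc/cpt", O_RDWR);	/* assumed device node */
	int dump = open(dumpfile, O_CREAT | O_WRONLY | O_TRUNC, 0600);

	if (cpt < 0 || dump < 0)
		goto err;

	if (ioctl(cpt, CPT_SET_VEID, veid) < 0 ||	/* choose the container */
	    ioctl(cpt, CPT_SET_DUMPFD, dump) < 0 ||	/* where the image goes */
	    ioctl(cpt, CPT_SUSPEND, 0) < 0 ||		/* freeze its processes */
	    ioctl(cpt, CPT_DUMP, 0) < 0 ||		/* write the image */
	    ioctl(cpt, CPT_KILL, 0) < 0)		/* or CPT_RESUME to roll back */
		goto err;

	close(dump);
	close(cpt);
	return 0;
err:
	perror("checkpoint_ve");
	if (dump >= 0)
		close(dump);
	if (cpt >= 0)
		close(cpt);
	return -1;
}

int main(int argc, char **argv)
{
	if (argc != 3)
		return 1;
	return checkpoint_ve((unsigned int)atoi(argv[1]), argv[2]) ? 2 : 0;
}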
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 30b93b2..44c384a 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -8,6 +8,8 @@
#include <linux/cache.h>
#include <linux/rcupdate.h>
+#include <bc/dcache.h>
+
struct nameidata;
struct path;
struct vfsmount;
@@ -116,6 +118,9 @@ struct dentry {
struct super_block *d_sb; /* The root of the dentry tree */
void *d_fsdata; /* fs-specific data */
+#ifdef CONFIG_BEANCOUNTERS
+ struct dentry_beancounter dentry_bc;
+#endif
unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */
};
@@ -186,6 +191,10 @@ d_iput: no no no yes
#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */
+#define DCACHE_VIRTUAL 0x0100 /* ve accessible */
+
+extern void mark_tree_virtual(struct path *path);
+extern struct kmem_cache *dentry_cache;
extern spinlock_t dcache_lock;
extern seqlock_t rename_lock;
@@ -314,6 +323,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...);
extern char *__d_path(const struct path *path, struct path *root, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path(struct dentry *, char *, int);
+extern int d_root_check(struct path *path);
/* Allocation counts.. */
@@ -333,6 +343,12 @@ extern char *dentry_path(struct dentry *, char *, int);
static inline struct dentry *dget(struct dentry *dentry)
{
if (dentry) {
+#ifdef CONFIG_BEANCOUNTERS
+ preempt_disable();
+ if (ub_dentry_on && ub_dget_testone(dentry))
+ BUG();
+ preempt_enable_no_resched();
+#endif
BUG_ON(!atomic_read(&dentry->d_count));
atomic_inc(&dentry->d_count);
}
@@ -380,4 +396,5 @@ extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
extern int sysctl_vfs_cache_pressure;
+extern int check_area_access_ve(struct path *);
#endif /* __LINUX_DCACHE_H */
diff --git a/include/linux/device.h b/include/linux/device.h
index 2ea3e49..192db29 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -213,8 +213,16 @@ struct class_dev_iter {
const struct device_type *type;
};
+#ifndef CONFIG_VE
extern struct kobject *sysfs_dev_block_kobj;
extern struct kobject *sysfs_dev_char_kobj;
+#define ve_sysfs_dev_block_kobj sysfs_dev_block_kobj
+#define ve_sysfs_dev_char_kobj sysfs_dev_char_kobj
+#else
+#define ve_sysfs_dev_block_kobj (get_exec_env()->dev_block_kobj)
+#define ve_sysfs_dev_char_kobj (get_exec_env()->dev_char_kobj)
+#endif
+
extern int __must_check __class_register(struct class *class,
struct lock_class_key *key);
extern void class_unregister(struct class *class);
@@ -279,6 +287,15 @@ extern struct class * __must_check __class_create(struct module *owner,
struct lock_class_key *key);
extern void class_destroy(struct class *cls);
+extern struct class net_class;
+extern struct kset *class_kset;
+
+int classes_init(void);
+void classes_fini(void);
+
+int devices_init(void);
+void devices_fini(void);
+
/* This is a #define to keep the compiler from merging different
* instances of the __key variable */
#define class_create(owner, name) \
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index 5ce0e5f..2d0dfec 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -26,6 +26,7 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number);
/* unlink */
void devpts_pty_kill(struct tty_struct *tty);
+extern struct file_system_type devpts_fs_type;
#else
/* Dummy stubs in the no-pty case */
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 90a4ed0..ae84971 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -406,5 +406,7 @@ static inline int elf_coredump_extra_notes_write(struct file *file,
extern int elf_coredump_extra_notes_size(void);
extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset);
#endif
+extern int sysctl_at_vsyscall;
+
#endif /* __KERNEL__ */
#endif /* _LINUX_ELF_H */
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f6856a5..a7f552f 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -17,6 +17,7 @@
/* For O_CLOEXEC */
#include <linux/fcntl.h>
#include <linux/types.h>
+#include <linux/fs.h>
/* Flags for epoll_create1. */
#define EPOLL_CLOEXEC O_CLOEXEC
@@ -63,6 +64,94 @@ static inline void eventpoll_init_file(struct file *file)
INIT_LIST_HEAD(&file->f_ep_links);
}
+struct epoll_filefd {
+ struct file *file;
+ int fd;
+};
+
+/*
+ * This structure is stored inside the "private_data" member of the file
+ * structure and represents the main data structure for the eventpoll
+ * interface.
+ */
+struct eventpoll {
+ /* Protects access to this structure */
+ spinlock_t lock;
+
+ /*
+ * This mutex is used to ensure that files are not removed
+ * while epoll is using them. This is held during the event
+ * collection loop, the file cleanup path, the epoll file exit
+ * code and the ctl operations.
+ */
+ struct mutex mtx;
+
+ /* Wait queue used by sys_epoll_wait() */
+ wait_queue_head_t wq;
+
+ /* Wait queue used by file->poll() */
+ wait_queue_head_t poll_wait;
+
+ /* List of ready file descriptors */
+ struct list_head rdllist;
+
+ /* RB tree root used to store monitored fd structs */
+ struct rb_root rbr;
+
+ /*
+ * This is a single linked list that chains all the "struct epitem" that
+ * happened while transfering ready events to userspace w/out
+ * holding ->lock.
+ */
+ struct epitem *ovflist;
+
+ /* The user that created the eventpoll descriptor */
+ struct user_struct *user;
+};
+
+/*
+ * Each file descriptor added to the eventpoll interface will
+ * have an entry of this type linked to the "rbr" RB tree.
+ */
+struct epitem {
+ /* RB tree node used to link this structure to the eventpoll RB tree */
+ struct rb_node rbn;
+
+ /* List header used to link this structure to the eventpoll ready list */
+ struct list_head rdllink;
+
+ /*
+ * Works together with "struct eventpoll"->ovflist in keeping the
+ * singly linked chain of items.
+ */
+ struct epitem *next;
+
+ /* The file descriptor information this item refers to */
+ struct epoll_filefd ffd;
+
+ /* Number of active wait queue attached to poll operations */
+ int nwait;
+
+ /* List containing poll wait queues */
+ struct list_head pwqlist;
+
+ /* The "container" of this item */
+ struct eventpoll *ep;
+
+ /* List header used to link this item to the "struct file" items list */
+ struct list_head fllink;
+
+ /* The structure that describes the interested events and the source fd */
+ struct epoll_event event;
+
+ /* The user that created the eventpoll descriptor */
+ struct user_struct *user;
+};
+
+extern struct semaphore epsem;
+struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
+int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+ struct file *tfile, int fd);
/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);
@@ -95,6 +184,8 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
+extern struct mutex epmutex;
+
#else
static inline void eventpoll_init_file(struct file *file) {}
diff --git a/include/linux/fairsched.h b/include/linux/fairsched.h
new file mode 100644
index 0000000..521455c
--- /dev/null
+++ b/include/linux/fairsched.h
@@ -0,0 +1,92 @@
+/*
+ * Fair Scheduler
+ *
+ * Copyright (C) 2000-2008 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __LINUX_FAIRSCHED_H__
+#define __LINUX_FAIRSCHED_H__
+
+#define FAIRSCHED_SET_RATE 0
+#define FAIRSCHED_DROP_RATE 1
+#define FAIRSCHED_GET_RATE 2
+
+#ifdef __KERNEL__
+
+/* refcnt change protected with tasklist write lock */
+struct fairsched_node {
+ struct task_group *tg;
+ int refcnt;
+ unsigned id;
+ struct list_head nodelist;
+
+ unsigned weight;
+ unsigned char rate_limited;
+ unsigned rate;
+#ifdef CONFIG_VE
+ struct ve_struct *owner_env;
+#endif
+};
+
+#ifdef CONFIG_VZ_FAIRSCHED
+
+#define FAIRSCHED_INIT_NODE_ID INT_MAX
+
+extern struct fairsched_node fairsched_init_node;
+
+void fairsched_init_early(void);
+void fairsched_init_late(void);
+
+static inline int task_fairsched_node_id(struct task_struct *p)
+{
+ return p->fsched_node->id;
+}
+
+/* must be called with the tasklist write lock held */
+static inline void get_task_fairsched_node(struct task_struct *p)
+{
+ p->fsched_node->refcnt++;
+}
+static inline void put_task_fairsched_node(struct task_struct *p)
+{
+ p->fsched_node->refcnt--;
+}
+
+#define INIT_VZ_FAIRSCHED .fsched_node = &fairsched_init_node,
+
+#define FSCHWEIGHT_MAX ((1 << 16) - 1)
+#define FSCHRATE_SHIFT 10
+#define FSCH_TIMESLICE 16
+
+asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight,
+ unsigned int newid);
+asmlinkage int sys_fairsched_rmnod(unsigned int id);
+asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid);
+asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus);
+asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight);
+asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate);
+
+int fairsched_new_node(int id, unsigned int vcpus);
+void fairsched_drop_node(int id);
+
+#else /* CONFIG_VZ_FAIRSCHED */
+
+static inline void fairsched_init_early(void) { }
+static inline void fairsched_init_late(void) { }
+static inline int task_fairsched_node_id(struct task_struct *p) { return 0; }
+static inline void get_task_fairsched_node(struct task_struct *p) { }
+static inline void put_task_fairsched_node(struct task_struct *p) { }
+
+static inline int fairsched_new_node(int id, unsigned int vcpus) { return 0; }
+static inline void fairsched_drop_node(int id) { }
+
+#define INIT_VZ_FAIRSCHED
+
+#endif /* CONFIG_VZ_FAIRSCHED */
+#endif /* __KERNEL__ */
+
+#endif /* __LINUX_FAIRSCHED_H__ */
diff --git a/include/linux/faudit.h b/include/linux/faudit.h
new file mode 100644
index 0000000..631c42e
--- /dev/null
+++ b/include/linux/faudit.h
@@ -0,0 +1,45 @@
+/*
+ * include/linux/faudit.h
+ *
+ * Copyright (C) 2005 SWSoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __FAUDIT_H_
+#define __FAUDIT_H_
+
+#include <linux/virtinfo.h>
+
+struct vfsmount;
+struct dentry;
+struct super_block;
+struct kstatfs;
+struct kstat;
+struct pt_regs;
+
+struct faudit_regs_arg {
+ int err;
+ struct pt_regs *regs;
+};
+
+struct faudit_stat_arg {
+ int err;
+ struct vfsmount *mnt;
+ struct dentry *dentry;
+ struct kstat *stat;
+};
+
+struct faudit_statfs_arg {
+ int err;
+ struct super_block *sb;
+ struct kstatfs *stat;
+};
+
+#define VIRTINFO_FAUDIT (0)
+#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0)
+#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1)
+
+#endif
diff --git a/include/linux/file.h b/include/linux/file.h
index 335a0a5..8e31c51 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -41,4 +41,6 @@ extern void put_unused_fd(unsigned int fd);
extern void fd_install(unsigned int fd, struct file *file);
+extern struct kmem_cache *filp_cachep;
+
#endif /* __LINUX_FILE_H */
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index da7e52b..099191c 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -163,6 +163,8 @@ static inline void set_freezable_with_signal(void)
} while (try_to_freeze()); \
__retval; \
})
+
+extern atomic_t global_suspend;
#else /* !CONFIG_FREEZER */
static inline int frozen(struct task_struct *p) { return 0; }
static inline int freezing(struct task_struct *p) { return 0; }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9b67805..3fef9ef 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -53,6 +53,7 @@ struct inodes_stat_t {
#define MAY_APPEND 8
#define MAY_ACCESS 16
#define MAY_OPEN 32
+#define MAY_QUOTACTL 64 /* for devgroup-vs-openvz only */
/*
* flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond
@@ -78,6 +79,8 @@ struct inodes_stat_t {
/* File is opened using open(.., 3, ..) and is writeable only for ioctls
(special hack for floppy.c) */
#define FMODE_WRITE_IOCTL ((__force fmode_t)256)
+/* Can do sys_quotactl (for devperms) */
+#define FMODE_QUOTACTL ((__force fmode_t)512)
/*
* Don't update ctime and mtime.
@@ -175,6 +178,8 @@ struct inodes_stat_t {
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
+#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */
+#define FS_MANGLE_PROC 128 /* hide some /proc/mounts info inside VE */
#define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move()
* during rename() internally.
@@ -235,6 +240,9 @@ struct inodes_stat_t {
#define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE 512 /* Inode is fs-internal */
+/* VZ flags -- These are not upstream! */
+#define S_NOUNUSE (1 << 17) /* just destroy inode in cleanup */
+
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
* flags just means all the inodes inherit those flags by default. It might be
@@ -370,7 +378,6 @@ struct inodes_stat_t {
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
-#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/prio_tree.h>
@@ -405,6 +412,7 @@ extern int get_max_files(void);
extern int sysctl_nr_open;
extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time;
+extern int odirect_enable;
#ifdef CONFIG_DNOTIFY
extern int dir_notify_enable;
#endif
@@ -464,10 +472,15 @@ struct iattr {
struct file *ia_file;
};
+#include <linux/kobject.h>
+
/*
* Includes for diskquotas.
*/
#include <linux/quota.h>
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+#include <linux/vzquota_qlnk.h>
+#endif
/**
* enum positive_aop_returns - aop return codes with specific semantics
@@ -754,6 +767,9 @@ struct inode {
#ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
#endif
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+ struct vz_quota_ilink i_qlnk;
+#endif
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
@@ -809,6 +825,8 @@ enum inode_i_mutex_lock_class
I_MUTEX_QUOTA
};
+extern struct kmem_cache *inode_cachep;
+
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
@@ -929,6 +947,7 @@ struct file {
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;
+ struct user_beancounter *f_ub;
u64 f_version;
#ifdef CONFIG_SECURITY
@@ -945,6 +964,7 @@ struct file {
#ifdef CONFIG_DEBUG_WRITECOUNT
unsigned long f_mnt_write_state;
#endif
+ struct ve_struct *owner_env;
};
extern spinlock_t files_lock;
#define file_list_lock() spin_lock(&files_lock);
@@ -1063,6 +1083,9 @@ struct file_lock {
fl_owner_t fl_owner;
unsigned char fl_flags;
unsigned char fl_type;
+#ifdef CONFIG_BEANCOUNTERS
+ unsigned char fl_charged;
+#endif
unsigned int fl_pid;
struct pid *fl_nspid;
wait_queue_head_t fl_wait;
@@ -1509,6 +1532,7 @@ struct file_operations {
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **);
+ struct file * (*get_host)(struct file *);
};
struct inode_operations {
@@ -1578,6 +1602,7 @@ struct super_operations {
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+ struct inode *(*get_quota_root)(struct super_block *);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
};
@@ -1755,8 +1780,14 @@ struct file_system_type {
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
struct lock_class_key i_alloc_sem_key;
+
+ struct file_system_type *proto;
+ struct ve_struct *owner_env;
};
+void get_filesystem(struct file_system_type *fs);
+void put_filesystem(struct file_system_type *fs);
+
extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
int (*fill_super)(struct super_block *, void *, int),
struct vfsmount *mnt);
@@ -1800,13 +1831,20 @@ extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
#define kern_mount(type) kern_mount_data(type, NULL)
+extern int register_ve_fs_type(struct ve_struct *, struct file_system_type *,
+ struct file_system_type **, struct vfsmount **);
+extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *);
+extern void umount_ve_fs_type(struct file_system_type *local_fs_type);
+#define kern_umount mntput
extern int may_umount_tree(struct vfsmount *);
+extern struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root);
extern int may_umount(struct vfsmount *);
extern long do_mount(char *, char *, char *, unsigned long, void *);
extern struct vfsmount *collect_mounts(struct path *);
extern void drop_collected_mounts(struct vfsmount *);
extern int vfs_statfs(struct dentry *, struct kstatfs *);
+extern int faudit_statfs(struct super_block *, struct kstatfs *);
extern int current_umask(void);
@@ -2065,7 +2103,8 @@ extern int check_disk_change(struct block_device *);
extern int __invalidate_device(struct block_device *);
extern int invalidate_partition(struct gendisk *, int);
#endif
-extern int invalidate_inodes(struct super_block *);
+extern int invalidate_inodes_check(struct super_block *, int check);
+#define invalidate_inodes(sb) invalidate_inodes_check(sb, 0)
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end);
@@ -2478,6 +2517,17 @@ ssize_t simple_attr_read(struct file *file, char __user *buf,
ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos);
+static inline void *file_private(struct file *file)
+{
+ struct file *host = file;
+
+ while (host->f_op->get_host) {
+ host = host->f_op->get_host(host);
+ BUG_ON(host->f_mapping != file->f_mapping);
+ }
+ return host->private_data;
+}
+
struct ctl_table;
int proc_nr_files(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 4d6f47b..5baeae0 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -85,6 +85,7 @@ struct fsnotify_ops {
void (*free_group_priv)(struct fsnotify_group *group);
void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
void (*free_event_priv)(struct fsnotify_event_private_data *priv);
+ void (*detach_mnt)(struct fsnotify_mark_entry *e);
};
/*
@@ -348,6 +349,7 @@ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
extern void fsnotify_unmount_inodes(struct list_head *list);
+extern void fsnotify_unmount_mnt(struct vfsmount *mnt);
/* put here because inotify does some weird stuff when destroying watches */
extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
@@ -380,6 +382,7 @@ static inline u32 fsnotify_get_cookie(void)
static inline void fsnotify_unmount_inodes(struct list_head *list)
{}
+static inline void fsnotify_unmount_mnt(struct vfsmount *mnt) { }
#endif /* CONFIG_FSNOTIFY */
#endif /* __KERNEL __ */
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1e5a26d..40bce74 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -132,6 +132,7 @@ union ktime;
long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
u32 __user *uaddr2, u32 val2, u32 val3);
+long futex_wait_restart(struct restart_block *restart);
extern int
handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 557bdad..2c691f4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -52,6 +52,8 @@ struct vm_area_struct;
#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
#define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */
#define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
+#define __GFP_UBC ((__force gfp_t)0x100000u)/* charge kmem in buddy and slab */
+#define __GFP_SOFT_UBC ((__force gfp_t)0x400000u)/* use soft charging */
#ifdef CONFIG_KMEMCHECK
#define __GFP_NOTRACK ((__force gfp_t)0x200000u) /* Don't track with kmemcheck */
@@ -65,19 +67,22 @@ struct vm_area_struct;
*/
#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 23 /* Room for __GFP_FOO bits */
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/* This equals 0, but use constants in case they ever change */
#define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH)
/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC (__GFP_HIGH)
+#define GFP_ATOMIC_UBC (__GFP_HIGH | __GFP_UBC)
#define GFP_NOIO (__GFP_WAIT)
#define GFP_NOFS (__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
+#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC)
#define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \
__GFP_RECLAIMABLE)
#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
+#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC)
#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
__GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 6d527ee..f46e01f 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -10,6 +10,9 @@
#include <asm/hardirq.h>
#include <asm/system.h>
+#include <bc/task.h>
+#include <linux/ve_task.h>
+
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
@@ -150,6 +153,24 @@ extern void rcu_nmi_exit(void);
# define rcu_nmi_exit() do { } while (0)
#endif /* #if defined(CONFIG_NO_HZ) */
+#define save_context() do { \
+ struct task_struct *tsk; \
+ if (hardirq_count() == HARDIRQ_OFFSET) { \
+ tsk = current; \
+ ve_save_context(tsk); \
+ ub_save_context(tsk); \
+ } \
+ } while (0)
+
+#define restore_context() do { \
+ struct task_struct *tsk; \
+ if (hardirq_count() == HARDIRQ_OFFSET) { \
+ tsk = current; \
+ ve_restore_context(tsk); \
+ ub_restore_context(tsk); \
+ } \
+ } while (0)
+
/*
* It is safe to do non-atomic ops on ->hardirq_context,
* because NMI handlers may not preempt and the ops are
@@ -160,6 +181,7 @@ extern void rcu_nmi_exit(void);
do { \
account_system_vtime(current); \
add_preempt_count(HARDIRQ_OFFSET); \
+ save_context(); \
trace_hardirq_enter(); \
} while (0)
@@ -175,6 +197,7 @@ extern void irq_enter(void);
do { \
trace_hardirq_exit(); \
account_system_vtime(current); \
+ restore_context(); \
sub_preempt_count(HARDIRQ_OFFSET); \
} while (0)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 040b679..70658bc 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -416,6 +416,9 @@ extern long hrtimer_nanosleep(struct timespec *rqtp,
const enum hrtimer_mode mode,
const clockid_t clockid);
extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
+#ifdef CONFIG_COMPAT
+long compat_nanosleep_restart(struct restart_block *restart);
+#endif
extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
struct task_struct *tsk);
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 6badb3e..50c628d 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -42,6 +42,7 @@
#define BRCTL_SET_PORT_PRIORITY 16
#define BRCTL_SET_PATH_COST 17
#define BRCTL_GET_FDB_ENTRIES 18
+#define BRCTL_SET_VIA_ORIG_DEV 19
#define BR_STATE_DISABLED 0
#define BR_STATE_LISTENING 1
@@ -70,6 +71,7 @@ struct __bridge_info
__u32 tcn_timer_value;
__u32 topology_change_timer_value;
__u32 gc_timer_value;
+ __u8 via_phys_dev;
};
struct __port_info
@@ -104,9 +106,12 @@ struct __fdb_entry
#include <linux/netdevice.h>
+#define BR_ALREADY_SEEN 1
+
extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
struct sk_buff *skb);
+extern int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port);
extern int (*br_should_route_hook)(struct sk_buff *skb);
#endif
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 7ff9af1..7781691 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -84,6 +84,9 @@ struct vlan_group {
struct hlist_node hlist; /* linked list */
struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS];
struct rcu_head rcu;
+#ifdef CONFIG_VE
+ struct ve_struct *owner;
+#endif
};
static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 21a6f5d..18a050a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -11,6 +11,7 @@
#include <linux/user_namespace.h>
#include <linux/securebits.h>
#include <net/net_namespace.h>
+#include <linux/fairsched.h>
extern struct files_struct init_files;
extern struct fs_struct init_fs;
@@ -31,10 +32,17 @@ extern struct fs_struct init_fs;
}, \
}
+#ifdef CONFIG_VE
+/* one for ve0, one for init_task */
+#define INIT_NSPROXY_COUNT ATOMIC_INIT(2)
+#else
+#define INIT_NSPROXY_COUNT ATOMIC_INIT(1)
+#endif
+
extern struct nsproxy init_nsproxy;
#define INIT_NSPROXY(nsproxy) { \
.pid_ns = &init_pid_ns, \
- .count = ATOMIC_INIT(1), \
+ .count = INIT_NSPROXY_COUNT, \
.uts_ns = &init_uts_ns, \
.mnt_ns = NULL, \
INIT_NET_NS(net_ns) \
@@ -184,6 +192,7 @@ extern struct cred init_cred;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_VZ_FAIRSCHED \
}
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index 37ea289..c2540bc 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -101,6 +101,11 @@ struct inotify_operations {
void (*destroy_watch)(struct inotify_watch *);
};
+struct fsnotify_group;
+extern const struct file_operations inotify_fops;
+int __inotify_new_watch(struct fsnotify_group *group,
+ struct path *path, __u32 mask, int wd);
+
#ifdef CONFIG_INOTIFY
/* Kernel API for producing events */
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 4da4a75..d61b0b8 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -40,16 +40,11 @@ struct cfq_io_context {
struct io_context *ioc;
unsigned long last_end_request;
- sector_t last_request_pos;
unsigned long ttime_total;
unsigned long ttime_samples;
unsigned long ttime_mean;
- unsigned int seek_samples;
- u64 seek_total;
- sector_t seek_mean;
-
struct list_head queue_list;
struct hlist_node cic_list;
@@ -73,6 +68,10 @@ struct io_context {
unsigned short ioprio;
unsigned short ioprio_changed;
+#ifdef CONFIG_BLK_CGROUP
+ unsigned short cgroup_changed;
+#endif
+
/*
* For request batching
*/
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 76dad48..c699950 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -39,6 +39,7 @@ enum {
IOPRIO_WHO_PROCESS = 1,
IOPRIO_WHO_PGRP,
IOPRIO_WHO_USER,
+ IOPRIO_WHO_UBC = 1000,
};
/*
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 3b1594d..9ffdcb5 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -79,6 +79,7 @@ struct ipc_kludge {
#ifdef __KERNEL__
#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
@@ -98,6 +99,15 @@ struct kern_ipc_perm
void *security;
};
+struct ipc_ids;
+
+struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
+static inline void ipc_unlock(struct kern_ipc_perm *perm)
+{
+ spin_unlock(&perm->lock);
+ rcu_read_unlock();
+}
+
#endif /* __KERNEL__ */
#endif /* _LINUX_IPC_H */
diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h
index 2dacab8..91783a7 100644
--- a/include/linux/kdev_t.h
+++ b/include/linux/kdev_t.h
@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 dev)
return dev & 0x3ffff;
}
+#define UNNAMED_MAJOR_COUNT 16
+
+#if UNNAMED_MAJOR_COUNT > 1
+
+extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT];
+
+static inline dev_t make_unnamed_dev(int idx)
+{
+ /*
+ * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the
+ * unnamed device index into major number.
+ */
+ return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)],
+ idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8));
+}
+
+static inline int unnamed_dev_idx(dev_t dev)
+{
+ int i;
+ for (i = 0; i < UNNAMED_MAJOR_COUNT &&
+ MAJOR(dev) != unnamed_dev_majors[i]; i++);
+ return MINOR(dev) | (i << 8);
+}
+
+static inline int is_unnamed_dev(dev_t dev)
+{
+ int i;
+ for (i = 0; i < UNNAMED_MAJOR_COUNT &&
+ MAJOR(dev) != unnamed_dev_majors[i]; i++);
+ return i < UNNAMED_MAJOR_COUNT;
+}
+
+#else /* UNNAMED_MAJOR_COUNT */
+
+static inline dev_t make_unnamed_dev(int idx)
+{
+ return MKDEV(0, idx);
+}
+
+static inline int unnamed_dev_idx(dev_t dev)
+{
+ return MINOR(dev);
+}
+
+static inline int is_unnamed_dev(dev_t dev)
+{
+ return MAJOR(dev) == 0;
+}
+
+#endif /* UNNAMED_MAJOR_COUNT */
+
#else /* __KERNEL__ */
/*
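/*
 * A userspace replica (illustration only, not part of the patch) of the
 * make_unnamed_dev()/unnamed_dev_idx() arithmetic added above: bits 8..11
 * of an unnamed-device index pick one of the UNNAMED_MAJOR_COUNT majors,
 * the remaining bits become the minor, and the two helpers are exact
 * inverses.  The majors[] values below are hypothetical stand-ins for the
 * kernel's unnamed_dev_majors[] table.
 */
#include <stdio.h>

#define MINORBITS	20			/* mirrors the kernel kdev_t split */
#define MKDEV(ma, mi)	(((unsigned)(ma) << MINORBITS) | (unsigned)(mi))
#define MAJOR(dev)	((unsigned)(dev) >> MINORBITS)
#define MINOR(dev)	((unsigned)(dev) & ((1u << MINORBITS) - 1))

#define UNNAMED_MAJOR_COUNT 16

static const int majors[UNNAMED_MAJOR_COUNT] = {	/* hypothetical values */
	0, 130, 131, 132, 133, 134, 135, 136,
	137, 138, 139, 140, 141, 142, 143, 144,
};

static unsigned make_unnamed_dev(int idx)
{
	return MKDEV(majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)],
		     idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8));
}

static int unnamed_dev_idx(unsigned dev)
{
	int i;

	for (i = 0; i < UNNAMED_MAJOR_COUNT && MAJOR(dev) != (unsigned)majors[i]; i++)
		;
	return MINOR(dev) | (i << 8);
}

int main(void)
{
	int idx = 0x345;			/* majors[3], minor 0x45 */
	unsigned dev = make_unnamed_dev(idx);

	printf("idx %#x -> dev %u:%u -> idx %#x\n",
	       idx, MAJOR(dev), MINOR(dev), unnamed_dev_idx(dev));
	return 0;
}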
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f4e3184..1a56950 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -245,6 +245,12 @@ extern struct ratelimit_state printk_ratelimit_state;
extern int printk_ratelimit(void);
extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msec);
+asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
+ __attribute__ ((format (printf, 2, 0)));
+asmlinkage int ve_printk(int, const char * fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+void prepare_printk(void);
+
extern int printk_delay_msec;
@@ -272,6 +278,15 @@ static inline int printk_ratelimit(void) { return 0; }
static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
unsigned int interval_msec) \
{ return false; }
+static inline int ve_printk(int d, const char *s, ...)
+ __attribute__ ((format (printf, 2, 3)));
+static inline int ve_printk(int d, const char *s, ...)
+{
+ return 0;
+}
+static inline void prepare_printk(void)
+{
+}
/* No effect, but we still get type checking even in the !PRINTK case: */
#define printk_once(x...) printk(x)
@@ -289,9 +304,17 @@ extern void asmlinkage __attribute__((format(printf, 1, 2)))
unsigned long int_sqrt(unsigned long);
+#define VE0_LOG 1
+#define VE_LOG 2
+#define VE_LOG_BOTH (VE0_LOG | VE_LOG)
+extern int console_silence_loglevel;
+
static inline void console_silent(void)
{
- console_loglevel = 0;
+ if (console_loglevel > console_silence_loglevel) {
+ printk(KERN_EMERG "console shuts up ...\n");
+ console_loglevel = 0;
+ }
}
static inline void console_verbose(void)
@@ -305,6 +328,7 @@ extern void wake_up_klogd(void);
extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
extern int panic_timeout;
extern int panic_on_oops;
+extern int decode_call_traces;
extern int panic_on_unrecovered_nmi;
extern int panic_on_io_nmi;
extern const char *print_tainted(void);
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index 58ae8e0..092b14e 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -51,6 +51,8 @@ enum kobject_action {
KOBJ_REMOVE,
KOBJ_CHANGE,
KOBJ_MOVE,
+ KOBJ_START,
+ KOBJ_STOP,
KOBJ_ONLINE,
KOBJ_OFFLINE,
KOBJ_MAX
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index aabc8a1..2bdf77d 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -4,10 +4,19 @@
#include <linux/err.h>
#include <linux/sched.h>
-struct task_struct *kthread_create(int (*threadfn)(void *data),
+struct task_struct *kthread_create_ve(struct ve_struct *ve,
+ int (*threadfn)(void *data),
void *data,
const char namefmt[], ...)
- __attribute__((format(printf, 3, 4)));
+ __attribute__((format(printf, 4, 5)));
+
+#define kthread_create(threadfn, data, namefmt, ...) \
+({ \
+ struct task_struct *__k \
+ = kthread_create_ve(get_ve0(), threadfn, data, namefmt, \
+ ## __VA_ARGS__); \
+ __k; \
+})
/**
* kthread_run - create and wake a thread.
@@ -27,6 +36,17 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
__k; \
})
+/* Like kthread_run() but run a thread in VE context */
+#define kthread_run_ve(ve, threadfn, data, namefmt, ...) \
+({ \
+ struct task_struct *__k \
+ = kthread_create_ve(ve, threadfn, data, namefmt, \
+ ## __VA_ARGS__); \
+ if (!IS_ERR(__k)) \
+ wake_up_process(__k); \
+ __k; \
+})
+
void kthread_bind(struct task_struct *k, unsigned int cpu);
int kthread_stop(struct task_struct *k);
int kthread_should_stop(void);
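
Below is a sketch of starting a kernel thread inside a particular VE with the new helpers. The thread body, the start function and the "ve_flush/%d" name are hypothetical; only kthread_create_ve()/kthread_run_ve() come from the patch.

static int ve_flush_thread(void *data)
{
	struct ve_struct *ve = data;

	while (!kthread_should_stop()) {
		/* ... per-VE housekeeping for "ve" would go here ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

static int start_ve_flush(struct ve_struct *ve)
{
	struct task_struct *t;

	/* the thread is created in ve's context, not in VE0 */
	t = kthread_run_ve(ve, ve_flush_thread, ve, "ve_flush/%d", ve->veid);

	return IS_ERR(t) ? PTR_ERR(t) : 0;
}
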
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index a34dea4..cc6cbaf 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -66,6 +66,7 @@ struct nlm_host {
struct list_head h_reclaim; /* Locks in RECLAIM state */
struct nsm_handle *h_nsmhandle; /* NSM status handle */
char *h_addrbuf; /* address eyecatcher */
+ struct ve_struct * owner_env; /* VE owning the host */
};
/*
@@ -192,8 +193,10 @@ extern struct svc_procedure nlmsvc_procedures[];
#ifdef CONFIG_LOCKD_V4
extern struct svc_procedure nlmsvc_procedures4[];
#endif
-extern int nlmsvc_grace_period;
-extern unsigned long nlmsvc_timeout;
+
+#include <linux/ve_nfs.h>
+extern unsigned long _nlmsvc_timeout;
+
extern int nsm_use_hostnames;
extern u32 nsm_local_state;
diff --git a/include/linux/major.h b/include/linux/major.h
index 6a8ca98..ee562c3 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -174,4 +174,7 @@
#define BLOCK_EXT_MAJOR 259
#define SCSI_OSD_MAJOR 260 /* open-osd's OSD scsi device */
+#define UNNAMED_EXTRA_MAJOR 130
+#define UNNAMED_EXTRA_MAJOR_COUNT 120
+
#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 24c3956..7bb1cf3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -712,6 +712,7 @@ extern void pagefault_out_of_memory(void);
extern void show_free_areas(void);
int shmem_lock(struct file *file, int lock, struct user_struct *user);
+#define shmem_nopage filemap_nopage
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
int shmem_zero_setup(struct vm_area_struct *);
@@ -776,7 +777,9 @@ int walk_page_range(unsigned long addr, unsigned long end,
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma);
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
+int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma,
+ unsigned long addr, size_t size);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
@@ -832,7 +835,7 @@ int __set_page_dirty_nobuffers(struct page *page);
int __set_page_dirty_no_writeback(struct page *page);
int redirty_page_for_writepage(struct writeback_control *wbc,
struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
+int account_page_dirtied(struct page *page, struct address_space *mapping);
int set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);
int clear_page_dirty_for_io(struct page *page);
@@ -1294,7 +1297,12 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
-extern int randomize_va_space;
+extern int _randomize_va_space;
+#ifndef CONFIG_VE
+#define randomize_va_space _randomize_va_space
+#else
+#define randomize_va_space (get_exec_env()->_randomize_va_space)
+#endif
#endif
const char * arch_vma_name(struct vm_area_struct *vma);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 84a524a..8ecf0ec 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -106,6 +106,14 @@ struct page {
*/
void *shadow;
#endif
+#ifdef CONFIG_BEANCOUNTERS
+ /* FIXME: switch to mainline memcgroup */
+ union {
+ struct user_beancounter *page_ub;
+ struct page_beancounter *page_pb;
+ struct user_beancounter **slub_ubs;
+ } bc;
+#endif
};
/*
@@ -260,6 +268,12 @@ struct mm_struct {
unsigned long flags; /* Must use atomic bitops to access the bits */
+ unsigned int vps_dumpable:2;
+ unsigned int oom_killed:1;
+
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *mm_ub;
+#endif
struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 9872d6c..e6f415d 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -85,6 +85,9 @@ static inline unsigned long
calc_vm_flag_bits(unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
+#ifdef MAP_GROWSUP
+ _calc_vm_trans(flags, MAP_GROWSUP, VM_GROWSUP ) |
+#endif
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index d74785c..21551aa 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -26,6 +26,8 @@ struct fs_struct;
extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt);
extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
struct fs_struct *);
+extern struct rw_semaphore namespace_sem;
+
extern void put_mnt_ns(struct mnt_namespace *ns);
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5d52753..f4bf358 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -70,6 +70,7 @@ struct vfsmount {
#else
int mnt_writers;
#endif
+ unsigned owner;
};
static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
diff --git a/include/linux/msg.h b/include/linux/msg.h
index 56abf15..050f740 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -107,6 +107,14 @@ extern long do_msgsnd(int msqid, long mtype, void __user *mtext,
extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
size_t msgsz, long msgtyp, int msgflg);
+int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg);
+int sysvipc_setup_msg(key_t key, int msqid, int msgflg);
+int sysv_msg_store(struct msg_msg *msg,
+ int (*store)(void * src, int len, int offset, void * data),
+ int len, void * data);
+struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset,
+ void * data), int len, void * data);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MSG_H */
diff --git a/include/linux/namei.h b/include/linux/namei.h
index ec0f607..e8c5efa 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -56,6 +56,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
#define LOOKUP_CREATE 0x0200
#define LOOKUP_EXCL 0x0400
#define LOOKUP_RENAME_TARGET 0x0800
+#define LOOKUP_NOAREACHECK 0x1000 /* no area check on lookup */
+#define LOOKUP_STRICT 0x2000 /* no symlinks or other filesystems */
extern int user_path_at(int, const char __user *, unsigned, struct path *);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 812a5f3..94887e1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -300,6 +300,11 @@ enum netdev_state_t
__LINK_STATE_DORMANT,
};
+struct netdev_bc {
+ struct user_beancounter *exec_ub, *owner_ub;
+};
+
+#define netdev_bc(dev) (&(dev)->dev_bc)
/*
* This structure holds at boot time configured netdevice settings. They
@@ -485,6 +490,10 @@ struct netdev_queue {
unsigned long tx_dropped;
} ____cacheline_aligned_in_smp;
+struct cpt_context;
+struct cpt_ops;
+struct rst_ops;
+struct cpt_netdev_image;
/*
* This structure defines the management hooks for network devices.
@@ -636,8 +645,23 @@ struct net_device_ops {
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
u16 xid);
#endif
+ void (*ndo_cpt)(struct net_device *dev,
+ struct cpt_ops *,
+ struct cpt_context *);
+};
+
+struct netdev_rst {
+ int cpt_object;
+ int (*ndo_rst)(loff_t, struct cpt_netdev_image *,
+ struct rst_ops *,
+ struct cpt_context *);
+ struct list_head list;
};
+void register_netdev_rst(struct netdev_rst *ops);
+void unregister_netdev_rst(struct netdev_rst *ops);
+struct netdev_rst *netdev_find_rst(int cpt_object, struct netdev_rst *ops);
+
/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
@@ -708,6 +732,8 @@ struct net_device
#define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */
#define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */
#define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/
+#define NETIF_F_VENET (1 << 27) /* device is venet device */
+#define NETIF_F_VIRTUAL (1 << 28) /* can be registered inside VE */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
@@ -892,6 +918,9 @@ struct net_device
/* GARP */
struct garp_port *garp_port;
+ struct ve_struct *owner_env; /* Owner VE of the interface */
+ struct netdev_bc dev_bc;
+
/* class/net/name entry */
struct device dev;
/* space for optional statistics and wireless sysfs groups */
@@ -919,6 +948,20 @@ struct net_device
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
+#define NETDEV_HASHBITS 8
+#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
+
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+ unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+ return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+ return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
#define NETDEV_ALIGN 32
static inline
@@ -1493,6 +1536,8 @@ extern int dev_ethtool(struct net *net, struct ifreq *);
extern unsigned dev_get_flags(const struct net_device *);
extern int dev_change_flags(struct net_device *, unsigned);
extern int dev_change_name(struct net_device *, const char *);
+int __dev_change_net_namespace(struct net_device *, struct net *, const char *,
+ struct user_beancounter *exec_ub);
extern int dev_set_alias(struct net_device *, const char *, size_t);
extern int dev_change_net_namespace(struct net_device *,
struct net *, const char *);
@@ -1914,6 +1959,18 @@ unsigned long netdev_increment_features(unsigned long all, unsigned long one,
unsigned long mask);
unsigned long netdev_fix_features(unsigned long features, const char *name);
+#if defined(CONFIG_VE) && defined(CONFIG_NET)
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+ return !(dev->features & (NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL));
+}
+#else
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+ return 0;
+}
+#endif
+
static inline int net_gso_ok(int features, int gso_type)
{
int feature = gso_type << NETIF_F_GSO_SHIFT;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 6132b5e..56ec50d 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -353,5 +353,28 @@ extern void (*nf_ct_destroy)(struct nf_conntrack *);
static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
#endif
+#ifdef CONFIG_VE_IPTABLES
+#include <linux/vziptable_defs.h>
+
+#define net_ipt_permitted(netns, ipt) \
+ (mask_ipt_allow((netns)->owner_ve->ipt_mask, ipt))
+
+#define net_ipt_module_permitted(netns, ipt) \
+ (mask_ipt_allow((netns)->owner_ve->ipt_mask, ipt) && \
+ mask_ipt_allow((netns)->owner_ve->_iptables_modules, \
+ (ipt) & ~(ipt##_MOD)))
+
+#define net_ipt_module_set(netns, ipt) \
+ ({ \
+ (netns)->owner_ve->_iptables_modules |= ipt##_MOD; \
+ })
+#define net_is_ipt_module_set(netns, ipt) \
+ ((netns)->owner_ve->_iptables_modules & (ipt##_MOD))
+#else
+#define net_ipt_module_permitted(netns, ipt) (1)
+#define net_ipt_module_set(netns, ipt)
+#define net_is_ipt_module_set(netns, ipt) (1)
+#endif
+
#endif /*__KERNEL__*/
#endif /*__LINUX_NETFILTER_H*/
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 812cb15..4339ac7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -375,6 +375,7 @@ struct xt_table_info
{
/* Size per table */
unsigned int size;
+ unsigned int alloc_size;
/* Number of entries: FIXME. --RR */
unsigned int number;
/* Initial number of entries. Needed for module usage count */
@@ -605,6 +606,23 @@ extern int xt_compat_target_to_user(struct xt_entry_target *t,
void __user **dstptr, unsigned int *size);
#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_VE
+static inline bool ve_xt_table_forbidden(struct xt_table *xt)
+{
+ /*
+	 * The only purpose of having this check as a separate
+	 * helper is "grep"-ability.
+	 *
+	 * If this helper fires, it means that a VE has been
+	 * configured without support for the particular xt_table.
+ */
+ return xt == NULL;
+}
+#else
+static inline bool ve_xt_table_forbidden(struct xt_table *xt) { return true; }
+#endif
+
#endif /* __KERNEL__ */
#endif /* _X_TABLES_H */
diff --git a/include/linux/netfilter/xt_CONNMARK.h b/include/linux/netfilter/xt_CONNMARK.h
index 0a85458..7635c8f 100644
--- a/include/linux/netfilter/xt_CONNMARK.h
+++ b/include/linux/netfilter/xt_CONNMARK.h
@@ -18,6 +18,12 @@ enum {
XT_CONNMARK_RESTORE
};
+struct xt_connmark_target_info {
+ unsigned long mark;
+ unsigned long mask;
+ __u8 mode;
+};
+
struct xt_connmark_tginfo1 {
__u32 ctmark, ctmask, nfmask;
__u8 mode;
diff --git a/include/linux/netfilter/xt_MARK.h b/include/linux/netfilter/xt_MARK.h
index bc9561b..028304b 100644
--- a/include/linux/netfilter/xt_MARK.h
+++ b/include/linux/netfilter/xt_MARK.h
@@ -3,6 +3,23 @@
#include <linux/types.h>
+/* Version 0 */
+struct xt_mark_target_info {
+ unsigned long mark;
+};
+
+/* Version 1 */
+enum {
+ XT_MARK_SET=0,
+ XT_MARK_AND,
+ XT_MARK_OR,
+};
+
+struct xt_mark_target_info_v1 {
+ unsigned long mark;
+ __u8 mode;
+};
+
struct xt_mark_tginfo2 {
__u32 mark, mask;
};
diff --git a/include/linux/netfilter/xt_connmark.h b/include/linux/netfilter/xt_connmark.h
index 619e47c..571e266 100644
--- a/include/linux/netfilter/xt_connmark.h
+++ b/include/linux/netfilter/xt_connmark.h
@@ -12,6 +12,11 @@
* (at your option) any later version.
*/
+struct xt_connmark_info {
+ unsigned long mark, mask;
+ __u8 invert;
+};
+
struct xt_connmark_mtinfo1 {
__u32 mark, mask;
__u8 invert;
diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h
index 54f47a2..7ae0533 100644
--- a/include/linux/netfilter/xt_conntrack.h
+++ b/include/linux/netfilter/xt_conntrack.h
@@ -32,6 +32,42 @@ enum {
XT_CONNTRACK_DIRECTION = 1 << 12,
};
+/* This is exposed to userspace, so remains frozen in time. */
+struct ip_conntrack_old_tuple
+{
+ struct {
+ __be32 ip;
+ union {
+ __u16 all;
+ } u;
+ } src;
+
+ struct {
+ __be32 ip;
+ union {
+ __u16 all;
+ } u;
+
+ /* The protocol. */
+ __u16 protonum;
+ } dst;
+};
+
+struct xt_conntrack_info
+{
+ unsigned int statemask, statusmask;
+
+ struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX];
+ struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX];
+
+ unsigned long expires_min, expires_max;
+
+ /* Flags word */
+ __u8 flags;
+ /* Inverse flags */
+ __u8 invflags;
+};
+
struct xt_conntrack_mtinfo1 {
union nf_inet_addr origsrc_addr, origsrc_mask;
union nf_inet_addr origdst_addr, origdst_mask;
diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h
index b1925b5..65eaf2b 100644
--- a/include/linux/netfilter/xt_hashlimit.h
+++ b/include/linux/netfilter/xt_hashlimit.h
@@ -65,4 +65,11 @@ struct xt_hashlimit_mtinfo1 {
struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
};
+#ifdef __KERNEL__
+struct ve_xt_hashlimit {
+ struct hlist_head hashlimit_htables;
+ struct proc_dir_entry *hashlimit_procdir4;
+ struct proc_dir_entry *hashlimit_procdir6;
+};
+#endif
#endif /*_XT_HASHLIMIT_H*/
diff --git a/include/linux/netfilter/xt_mark.h b/include/linux/netfilter/xt_mark.h
index 6607c8f..6fa460a 100644
--- a/include/linux/netfilter/xt_mark.h
+++ b/include/linux/netfilter/xt_mark.h
@@ -3,6 +3,11 @@
#include <linux/types.h>
+struct xt_mark_info {
+ unsigned long mark, mask;
+ __u8 invert;
+};
+
struct xt_mark_mtinfo1 {
__u32 mark, mask;
__u8 invert;
diff --git a/include/linux/netfilter/xt_recent.h b/include/linux/netfilter/xt_recent.h
index d2c2766..8a12181 100644
--- a/include/linux/netfilter/xt_recent.h
+++ b/include/linux/netfilter/xt_recent.h
@@ -25,4 +25,15 @@ struct xt_recent_mtinfo {
__u8 side;
};
+#ifdef __KERNEL__
+struct ve_ipt_recent {
+ struct list_head tables;
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc_dir;
+#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+ struct proc_dir_entry *proc_old_dir;
+#endif
+#endif
+};
+#endif
#endif /* _LINUX_NETFILTER_XT_RECENT_H */
diff --git a/include/linux/netfilter_ipv4/ipt_TOS.h b/include/linux/netfilter_ipv4/ipt_TOS.h
new file mode 100644
index 0000000..6752240
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_TOS.h
@@ -0,0 +1,12 @@
+#ifndef _IPT_TOS_H_target
+#define _IPT_TOS_H_target
+
+#ifndef IPTOS_NORMALSVC
+#define IPTOS_NORMALSVC 0
+#endif
+
+struct ipt_tos_target_info {
+ u_int8_t tos;
+};
+
+#endif /*_IPT_TOS_H_target*/
diff --git a/include/linux/netfilter_ipv4/ipt_iprange.h b/include/linux/netfilter_ipv4/ipt_iprange.h
new file mode 100644
index 0000000..517e8b1
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_iprange.h
@@ -0,0 +1,23 @@
+#ifndef _IPT_IPRANGE_H
+#define _IPT_IPRANGE_H
+
+#define IPRANGE_SRC 0x01 /* Match source IP address */
+#define IPRANGE_DST 0x02 /* Match destination IP address */
+#define IPRANGE_SRC_INV 0x10 /* Negate the condition */
+#define IPRANGE_DST_INV 0x20 /* Negate the condition */
+
+struct ipt_iprange {
+ /* Inclusive: network order. */
+ u_int32_t min_ip, max_ip;
+};
+
+struct ipt_iprange_info
+{
+ struct ipt_iprange src;
+ struct ipt_iprange dst;
+
+ /* Flags from above */
+ u_int8_t flags;
+};
+
+#endif /* _IPT_IPRANGE_H */
diff --git a/include/linux/netfilter_ipv4/ipt_owner.h b/include/linux/netfilter_ipv4/ipt_owner.h
new file mode 100644
index 0000000..72ea6c3
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_owner.h
@@ -0,0 +1,20 @@
+#ifndef _IPT_OWNER_H
+#define _IPT_OWNER_H
+
+/* match and invert flags */
+#define IPT_OWNER_UID 0x01
+#define IPT_OWNER_GID 0x02
+#define IPT_OWNER_PID 0x04
+#define IPT_OWNER_SID 0x08
+#define IPT_OWNER_COMM 0x10
+
+struct ipt_owner_info {
+ uid_t uid;
+ gid_t gid;
+ pid_t pid;
+ pid_t sid;
+ char comm[16];
+ u_int8_t match, invert; /* flags */
+};
+
+#endif /*_IPT_OWNER_H*/
diff --git a/include/linux/netfilter_ipv4/ipt_tos.h b/include/linux/netfilter_ipv4/ipt_tos.h
new file mode 100644
index 0000000..a21f5df
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_tos.h
@@ -0,0 +1,13 @@
+#ifndef _IPT_TOS_H
+#define _IPT_TOS_H
+
+struct ipt_tos_info {
+ u_int8_t tos;
+ u_int8_t invert;
+};
+
+#ifndef IPTOS_NORMALSVC
+#define IPTOS_NORMALSVC 0
+#endif
+
+#endif /*_IPT_TOS_H*/
diff --git a/include/linux/netfilter_ipv6/ip6t_owner.h b/include/linux/netfilter_ipv6/ip6t_owner.h
new file mode 100644
index 0000000..e9f10ba
--- /dev/null
+++ b/include/linux/netfilter_ipv6/ip6t_owner.h
@@ -0,0 +1,18 @@
+#ifndef _IP6T_OWNER_H
+#define _IP6T_OWNER_H
+
+/* match and invert flags */
+#define IP6T_OWNER_UID 0x01
+#define IP6T_OWNER_GID 0x02
+#define IP6T_OWNER_PID 0x04
+#define IP6T_OWNER_SID 0x08
+
+struct ip6t_owner_info {
+ uid_t uid;
+ gid_t gid;
+ pid_t pid;
+ pid_t sid;
+ u_int8_t match, invert; /* flags */
+};
+
+#endif /*_IPT_OWNER_H*/
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d09db1b..5b36364 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -374,7 +374,7 @@ extern const struct address_space_operations nfs_file_aops;
static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
{
- return filp->private_data;
+ return file_private(filp);
}
static inline struct rpc_cred *nfs_file_cred(struct file *file)
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b26dc51..643e380 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -91,6 +91,7 @@ struct nfs_client {
#ifdef CONFIG_NFS_FSCACHE
struct fscache_cookie *fscache; /* client index cache cookie */
#endif
+ struct ve_struct *owner_env;
};
/*
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b752e80..ed9d975 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -47,4 +47,6 @@ static inline bool trigger_all_cpu_backtrace(void)
}
#endif
+extern void nmi_show_regs(struct pt_regs *regs, int in_nmi);
+extern int do_nmi_show_regs(struct pt_regs *regs, int cpu);
#endif
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 44428d2..a3a0a02 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -153,8 +153,9 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
#define NOTIFY_DONE 0x0000 /* Don't care */
#define NOTIFY_OK 0x0001 /* Suits me */
+#define NOTIFY_FAIL 0x0002 /* Reject */
#define NOTIFY_STOP_MASK 0x8000 /* Don't call further */
-#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002)
+#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL)
/* Bad/Veto action */
/*
* Clean way to return from the notifier and stop further calls.
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 7b370c7..fb93025 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -62,10 +62,11 @@ static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
return rcu_dereference(tsk->nsproxy);
}
-int copy_namespaces(unsigned long flags, struct task_struct *tsk);
+int copy_namespaces(unsigned long flags, struct task_struct *tsk, int force_admin);
void exit_task_namespaces(struct task_struct *tsk);
void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
void free_nsproxy(struct nsproxy *ns);
+struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk);
int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
struct fs_struct *);
@@ -76,9 +77,10 @@ static inline void put_nsproxy(struct nsproxy *ns)
}
}
-static inline void get_nsproxy(struct nsproxy *ns)
+static inline struct nsproxy *get_nsproxy(struct nsproxy *ns)
{
atomic_inc(&ns->count);
+ return ns;
}
#ifdef CONFIG_CGROUP_NS
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6b202b1..ef195a0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -209,6 +209,7 @@ __PAGEFLAG(Slab, slab)
PAGEFLAG(Checked, checked) /* Used by some filesystems */
PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
PAGEFLAG(SavePinned, savepinned); /* Xen */
+PAGEFLAG(Checkpointed, owner_priv_1)
PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 49f1c2f..e4de714 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -60,6 +60,9 @@ struct pid
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *ub;
+#endif
struct rcu_head rcu;
struct upid numbers[1];
};
@@ -96,6 +99,11 @@ extern void change_pid(struct task_struct *task, enum pid_type,
struct pid *pid);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
+extern void reattach_pid(struct task_struct *, enum pid_type, struct pid *);
+extern int alloc_pidmap(struct pid_namespace *pid_ns);
+extern int set_pidmap(struct pid_namespace *pid_ns, pid_t pid);
+
+extern spinlock_t pidmap_lock;
struct pid_namespace;
extern struct pid_namespace init_pid_ns;
@@ -119,8 +127,11 @@ extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
int next_pidmap(struct pid_namespace *pid_ns, int last);
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid);
extern void free_pid(struct pid *pid);
+extern int pid_ns_attach_init(struct pid_namespace *, struct task_struct *);
+extern int pid_ns_attach_task(struct pid_namespace *, struct task_struct *);
+pid_t pid_to_vpid(pid_t nr);
/*
* ns_of_pid() returns the pid namespace in which the specified pid was
@@ -185,7 +196,7 @@ pid_t pid_vnr(struct pid *pid);
do {
#define while_each_pid_thread(pid, type, task) \
- } while_each_thread(tg___, task); \
+ } while_each_thread_ve(tg___, task); \
task = tg___; \
} while_each_pid_task(pid, type, task)
#endif /* _LINUX_PID_H */
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..411c06d 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -16,6 +16,14 @@ struct pidmap {
struct bsd_acct_struct;
+/* pid namespace flags */
+
+/* if set, a newly created pid ns gets the PID_NS_HIDE_CHILD flag */
+#define PID_NS_HIDE_CHILD 0x00000001
+
+/* if set, newly created processes are invisible from the parent ns */
+#define PID_NS_HIDDEN 0x00000002
+
struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
@@ -24,6 +32,7 @@ struct pid_namespace {
struct kmem_cache *pid_cachep;
unsigned int level;
struct pid_namespace *parent;
+ unsigned flags;
#ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
#endif
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 6673743..977e52b 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -133,6 +133,7 @@ extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
fd_set __user *exp, struct timespec *end_time);
extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+long do_restart_poll(struct restart_block *restart_block);
#endif /* KERNEL */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..80bd26a 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -103,9 +103,14 @@ struct vmcore {
#ifdef CONFIG_PROC_FS
extern void proc_root_init(void);
+extern struct file_system_type proc_fs_type;
+extern const struct file_operations proc_kmsg_operations;
void proc_flush_task(struct task_struct *task);
+extern int proc_dentry_of_dead_task(struct dentry *dentry);
+extern struct file_operations dummy_proc_pid_file_operations;
+
extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
struct proc_dir_entry *parent);
struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
@@ -149,6 +154,8 @@ extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *);
extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
struct proc_dir_entry *parent);
+extern struct proc_dir_entry glob_proc_root;
+
static inline struct proc_dir_entry *proc_create(const char *name, mode_t mode,
struct proc_dir_entry *parent, const struct file_operations *proc_fops)
{
@@ -184,6 +191,8 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
static inline void proc_net_remove(struct net *net, const char *name) {}
+static inline int proc_dentry_of_dead_task(struct dentry *dentry) { return 0; }
+
static inline void proc_flush_task(struct task_struct *task)
{
}
@@ -268,6 +277,9 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
+#ifdef CONFIG_VE
+ struct proc_dir_entry *lpde;
+#endif
struct inode vfs_inode;
};
@@ -281,6 +293,15 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
return PROC_I(inode)->pde;
}
+static inline struct proc_dir_entry *LPDE(const struct inode *inode)
+{
+#ifdef CONFIG_VE
+ return PROC_I(inode)->lpde;
+#else
+ return NULL;
+#endif
+}
+
static inline struct net *PDE_NET(struct proc_dir_entry *pde)
{
return pde->parent->data;
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 8fd8efc..5fa291e 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -173,6 +173,10 @@ enum {
#include <linux/spinlock.h>
#include <linux/wait.h>
+#include <linux/spinlock.h>
+
+extern spinlock_t dq_data_lock;
+
#include <linux/dqblk_xfs.h>
#include <linux/dqblk_v1.h>
#include <linux/dqblk_v2.h>
@@ -291,6 +295,8 @@ struct quota_format_ops {
int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */
};
+struct inode;
+struct iattr;
/* Operations working with dquots */
struct dquot_operations {
int (*initialize) (struct inode *, int);
@@ -316,9 +322,14 @@ struct dquot_operations {
/* get reserved quota for delayed alloc, value returned is managed by
* quota code only */
qsize_t *(*get_reserved_space) (struct inode *);
+ int (*rename) (struct inode *, struct inode *, struct inode *);
+
+ void (*swap_inode) (struct inode *, struct inode *);
+ void (*shutdown) (struct super_block *);
};
/* Operations handling requests from userspace */
+struct v2_disk_dqblk;
struct quotactl_ops {
int (*quota_on)(struct super_block *, int, int, char *, int);
int (*quota_off)(struct super_block *, int, int);
@@ -331,6 +342,10 @@ struct quotactl_ops {
int (*set_xstate)(struct super_block *, unsigned int, int);
int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
+#ifdef CONFIG_QUOTA_COMPAT
+ int (*get_quoti)(struct super_block *, int, unsigned int,
+ struct v2_disk_dqblk __user *);
+#endif
};
struct quota_format_type {
@@ -385,6 +400,10 @@ struct quota_info {
struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */
struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */
struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+ struct vz_quota_master *vzdq_master;
+ int vzdq_count;
+#endif
};
int register_quota_format(struct quota_format_type *fmt);
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index a529d86..bdbe1f7 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -264,6 +264,19 @@ static inline void vfs_dq_free_inode(struct inode *inode)
inode->i_sb->dq_op->free_inode(inode, 1);
}
+static __inline__ int vfs_dq_rename(struct inode *inode,
+ struct inode *old_dir, struct inode *new_dir)
+{
+ const struct dquot_operations *q_op;
+
+ q_op = inode->i_sb->dq_op;
+ if (q_op && q_op->rename) {
+ if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA)
+ return 1;
+ }
+ return 0;
+}
+
/* Cannot be called inside a transaction */
static inline int vfs_dq_off(struct super_block *sb, int remount)
{
@@ -274,6 +287,35 @@ static inline int vfs_dq_off(struct super_block *sb, int remount)
return ret;
}
+static __inline__ void DQUOT_SWAP(struct inode *inode, struct inode *tmpl)
+{
+ if (sb_any_quota_active(tmpl->i_sb) &&
+ tmpl->i_sb->dq_op->swap_inode)
+ tmpl->i_sb->dq_op->swap_inode(inode, tmpl);
+}
+
+static __inline__ int DQUOT_CHECK_SPACE(struct inode *inode)
+{
+ if (vfs_dq_alloc_space_nodirty(inode, 512))
+ return -EDQUOT;
+ vfs_dq_free_space_nodirty(inode, 512);
+ return 0;
+}
+
+static __inline__ void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks)
+{
+ if (sb_any_quota_active(inode->i_sb)) {
+ if (blocks > inode->i_blocks)
+ inode->i_sb->dq_op->alloc_space(inode,
+ (qsize_t)(blocks-inode->i_blocks)*512,
+ 13 /*DQUOT_CMD_FORCE*/);
+ else if (blocks < inode->i_blocks)
+ inode->i_sb->dq_op->free_space(inode, (qsize_t)(inode->i_blocks-blocks)*512);
+ } else
+ inode->i_blocks = blocks;
+}
+
+
#else
static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
@@ -363,6 +405,12 @@ static inline int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
return 0;
}
+static inline int vfs_dq_rename(struct inode *inode, struct inode *old_dir,
+ struct inode *new_dir)
+{
+ return 0;
+}
+
static inline int vfs_dq_prealloc_space_nodirty(struct inode *inode, qsize_t nr)
{
inode_add_bytes(inode, nr);
@@ -416,6 +464,15 @@ static inline void vfs_dq_free_space(struct inode *inode, qsize_t nr)
mark_inode_dirty(inode);
}
+static inline void DQUOT_SWAP(struct inode *inode, struct inode *tmpl)
+{
+}
+
+static inline void DQUOT_SYNC_BLOCKS(struct inode *inode, blkcnt_t blocks)
+{
+ inode->i_blocks = blocks;
+}
+
#endif /* CONFIG_QUOTA */
static inline int vfs_dq_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
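
A brief sketch of where the new vfs_dq_rename() hook would sit in a filesystem's rename path; the exfs_* names are hypothetical, only the helper itself comes from the patch.

static int exfs_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry)
{
	struct inode *inode = old_dentry->d_inode;

	/* refuse the move if the target directory's quota would be exceeded */
	if (vfs_dq_rename(inode, old_dir, new_dir))
		return -EDQUOT;

	/* ... the usual rename work follows ... */
	return 0;
}
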
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index cb0ba70..b14f124 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -70,6 +70,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
void page_remove_rmap(struct page *);
+struct anon_vma *page_lock_anon_vma(struct page *page);
+void page_unlock_anon_vma(struct anon_vma *anon_vma);
static inline void page_dup_rmap(struct page *page)
{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b253434..1412d9a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -94,6 +94,8 @@ struct sched_param {
#include <asm/processor.h>
+#include <bc/task.h>
+
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
@@ -120,6 +122,8 @@ struct perf_event_context;
*/
extern unsigned long avenrun[]; /* Load averages */
extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+extern void get_avenrun_ve(struct ve_struct *ve, unsigned long *loads,
+ unsigned long offset, int shift);
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
@@ -133,15 +137,38 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
load += n*(FIXED_1-exp); \
load >>= FSHIFT;
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned long nr_running(void);
+extern unsigned long nr_sleeping(void);
+extern unsigned long nr_stopped(void);
extern unsigned long nr_uninterruptible(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(void);
extern unsigned long this_cpu_load(void);
+extern atomic_t nr_dead;
+extern unsigned long nr_zombie;
+
+#ifdef CONFIG_VE
+struct ve_struct;
+extern unsigned long nr_running_ve(struct ve_struct *);
+extern unsigned long nr_iowait_ve(struct ve_struct *);
+extern unsigned long nr_uninterruptible_ve(struct ve_struct *);
+extern cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu);
+extern cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu);
+void ve_sched_attach(struct ve_struct *envid);
+#else
+#define nr_running_ve(ve) 0
+#define nr_iowait_ve(ve) 0
+#define nr_uninterruptible_ve(ve) 0
+#define ve_sched_get_idle_time(ve, cpu) 0
+#define ve_sched_get_iowait_time(ve, cpu) 0
+#endif
extern void calc_global_load(void);
@@ -553,6 +580,9 @@ struct thread_group_cputimer {
spinlock_t lock;
};
+#include <linux/ve.h>
+#include <linux/ve_task.h>
+
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
@@ -1283,6 +1313,7 @@ struct task_struct {
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;
+ unsigned did_ve_enter:1;
/* Revert to default priority/policy when forking */
@@ -1498,6 +1529,14 @@ struct task_struct {
struct rcu_head rcu;
/*
+ * state tracking for suspend
+ * FIXME - ptrace is completely rewritten in this kernel
+	 * so set_pn_state() is not set correctly in many places
+ */
+ __u8 pn_state;
+ __u8 stopped_state:1;
+
+ /*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
@@ -1541,6 +1580,19 @@ struct task_struct {
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
unsigned long stack_start;
+#ifdef CONFIG_BEANCOUNTERS
+ struct task_beancounter task_bc;
+#endif
+#ifdef CONFIG_VE
+ struct ve_task_info ve_task_info;
+#endif
+#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
+ unsigned long magic;
+ struct inode *ino;
+#endif
+#ifdef CONFIG_VZ_FAIRSCHED
+ struct fairsched_node *fsched_node;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -1726,6 +1778,43 @@ extern cputime_t task_utime(struct task_struct *p);
extern cputime_t task_stime(struct task_struct *p);
extern cputime_t task_gtime(struct task_struct *p);
+#ifndef CONFIG_VE
+#define set_pn_state(tsk, state) do { } while(0)
+#define clear_pn_state(tsk) do { } while(0)
+#define set_stop_state(tsk) do { } while(0)
+#define clear_stop_state(tsk) do { } while(0)
+#else
+#define PN_STOP_TF 1 /* was not in 2.6.8 */
+#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */
+#define PN_STOP_ENTRY 3
+#define PN_STOP_FORK 4
+#define PN_STOP_VFORK 5
+#define PN_STOP_SIGNAL 6
+#define PN_STOP_EXIT 7
+#define PN_STOP_EXEC 8
+#define PN_STOP_LEAVE 9
+
+static inline void set_pn_state(struct task_struct *tsk, int state)
+{
+ tsk->pn_state = state;
+}
+
+static inline void clear_pn_state(struct task_struct *tsk)
+{
+ tsk->pn_state = 0;
+}
+
+static inline void set_stop_state(struct task_struct *tsk)
+{
+ tsk->stopped_state = 1;
+}
+
+static inline void clear_stop_state(struct task_struct *tsk)
+{
+ tsk->stopped_state = 0;
+}
+#endif
+
/*
* Per process flags
*/
@@ -1735,6 +1824,7 @@ extern cputime_t task_gtime(struct task_struct *p);
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
+#define PF_EXIT_RESTART 0x00000020 /* do_exit() restarted, see do_exit() */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
@@ -1871,6 +1961,21 @@ extern unsigned long long
task_sched_runtime(struct task_struct *task);
extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
+static inline unsigned long cycles_to_clocks(cycles_t cycles)
+{
+ extern unsigned long cycles_per_clock;
+ do_div(cycles, cycles_per_clock);
+ return cycles;
+}
+
+static inline u64 cycles_to_jiffies(cycles_t cycles)
+{
+ extern unsigned long cycles_per_jiffy;
+ do_div(cycles, cycles_per_jiffy);
+ return cycles;
+}
+
+
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
@@ -2150,6 +2255,13 @@ extern int disallow_signal(int);
extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern long do_fork_pid(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr,
+ long pid0);
struct task_struct *fork_idle(int);
extern void set_task_comm(struct task_struct *tsk, char *from);
@@ -2167,11 +2279,11 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
}
#endif
-#define next_task(p) \
+#define next_task_all(p) \
list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
-#define for_each_process(p) \
- for (p = &init_task ; (p = next_task(p)) != &init_task ; )
+#define for_each_process_all(p) \
+ for (p = &init_task ; (p = next_task_all(p)) != &init_task ; )
extern bool current_is_single_threaded(void);
@@ -2179,10 +2291,10 @@ extern bool current_is_single_threaded(void);
* Careful: do_each_thread/while_each_thread is a double loop so
* 'break' will not work as expected - use goto instead.
*/
-#define do_each_thread(g, t) \
- for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
+#define do_each_thread_all(g, t) \
+ for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do
-#define while_each_thread(g, t) \
+#define while_each_thread_all(g, t) \
while ((t = next_thread(t)) != g)
/* de_thread depends on thread_group_leader not being a pid based check */
@@ -2207,8 +2319,14 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
static inline struct task_struct *next_thread(const struct task_struct *p)
{
- return list_entry_rcu(p->thread_group.next,
+ struct task_struct *tsk;
+ tsk = list_entry_rcu(p->thread_group.next,
struct task_struct, thread_group);
+#ifdef CONFIG_VE
+ /* all threads should belong to ONE ve! */
+ BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env);
+#endif
+ return tsk;
}
static inline int thread_group_empty(struct task_struct *p)
@@ -2253,6 +2371,98 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
}
+#ifndef CONFIG_VE
+
+#define for_each_process_ve(p) for_each_process_all(p)
+#define do_each_thread_ve(g, t) do_each_thread_all(g, t)
+#define while_each_thread_ve(g, t) while_each_thread_all(g, t)
+#define first_task_ve() next_task_ve(&init_task)
+#define __first_task_ve(owner) next_task_ve(&init_task)
+#define __next_task_ve(owner, p) next_task_ve(p)
+#define next_task_ve(p) \
+ (next_task_all(p) != &init_task ? next_task_all(p) : NULL)
+
+#define ve_is_super(env) 1
+#define ve_accessible(target, owner) 1
+#define ve_accessible_strict(target, owner) 1
+#define ve_accessible_veid(target, owner) 1
+#define ve_accessible_strict_veid(target, owner) 1
+
+#define VEID(ve) 0
+
+#else /* CONFIG_VE */
+
+#include <linux/ve.h>
+
+#define ve_is_super(env) ((env) == get_ve0())
+
+#define ve_accessible_strict(target, owner) ((target) == (owner))
+static inline int ve_accessible(struct ve_struct *target,
+ struct ve_struct *owner)
+{
+ return ve_is_super(owner) || ve_accessible_strict(target, owner);
+}
+
+#define ve_accessible_strict_veid(target, owner) ((target) == (owner))
+static inline int ve_accessible_veid(envid_t target, envid_t owner)
+{
+ return get_ve0()->veid == owner ||
+ ve_accessible_strict_veid(target, owner);
+}
+
+#define VEID(ve) (ve->veid)
+
+static inline struct task_struct *ve_lh2task(struct ve_struct *ve,
+ struct list_head *lh)
+{
+ return lh == &ve->vetask_lh ? NULL :
+ list_entry(lh, struct task_struct, ve_task_info.vetask_list);
+}
+
+static inline struct task_struct *__first_task_ve(struct ve_struct *ve)
+{
+ struct task_struct *tsk;
+
+ if (unlikely(ve_is_super(ve))) {
+ tsk = next_task_all(&init_task);
+ if (tsk == &init_task)
+ tsk = NULL;
+ } else {
+ tsk = ve_lh2task(ve, rcu_dereference(ve->vetask_lh.next));
+ }
+ return tsk;
+}
+
+static inline struct task_struct *__next_task_ve(struct ve_struct *ve,
+ struct task_struct *tsk)
+{
+ if (unlikely(ve_is_super(ve))) {
+ tsk = next_task_all(tsk);
+ if (tsk == &init_task)
+ tsk = NULL;
+ } else {
+ BUG_ON(tsk->ve_task_info.owner_env != ve);
+ tsk = ve_lh2task(ve, rcu_dereference(tsk->
+ ve_task_info.vetask_list.next));
+ }
+ return tsk;
+}
+
+#define first_task_ve() __first_task_ve(get_exec_env())
+#define next_task_ve(p) __next_task_ve(get_exec_env(), p)
+/* no one uses prev_task_ve(), copy next_task_ve() if needed */
+
+#define for_each_process_ve(p) \
+ for (p = first_task_ve(); p != NULL ; p = next_task_ve(p))
+
+#define do_each_thread_ve(g, t) \
+ for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do
+
+#define while_each_thread_ve(g, t) \
+ while ((t = next_thread(t)) != g)
+
+#endif /* CONFIG_VE */
+
#ifndef __HAVE_THREAD_FUNCTIONS
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
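
To show how the VE-scoped iterators above are meant to be used, a sketch that counts runnable tasks visible to the current execution environment; the helper itself is hypothetical.

static int nr_running_in_exec_ve(void)
{
	struct task_struct *p;
	int nr = 0;

	read_lock(&tasklist_lock);
	for_each_process_ve(p) {
		if (p->state == TASK_RUNNING)
			nr++;
	}
	read_unlock(&tasklist_lock);

	return nr;
}

Like for_each_process_all(), this walks processes rather than individual threads; the per-thread walk is what do_each_thread_ve()/while_each_thread_ve() above provide.
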
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 1b191c1..64f30a9 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -154,6 +154,9 @@ static inline void exit_sem(struct task_struct *tsk)
}
#endif
+int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg);
+int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg);
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SEM_H */
diff --git a/include/linux/shm.h b/include/linux/shm.h
index eca6235..c2b3bb5 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -83,6 +83,22 @@ struct shm_info {
};
#ifdef __KERNEL__
+
+#include <linux/ipc_namespace.h>
+
+#define IPC_SEM_IDS 0
+#define IPC_MSG_IDS 1
+#define IPC_SHM_IDS 2
+
+struct shm_file_data {
+ int id;
+ struct ipc_namespace *ns;
+ struct file *file;
+ const struct vm_operations_struct *vm_ops;
+};
+#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
+#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS])
+
struct shmid_kernel /* private to the kernel */
{
struct kern_ipc_perm shm_perm;
@@ -97,6 +113,23 @@ struct shmid_kernel /* private to the kernel */
struct user_struct *mlock_user;
};
+/*
+ * shm_lock_(check_) routines are called in the paths where the rw_mutex
+ * is not held.
+ */
+static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
+{
+ struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
+
+ if (IS_ERR(ipcp))
+ return (struct shmid_kernel *)ipcp;
+
+ return container_of(ipcp, struct shmid_kernel, shm_perm);
+}
+
+#define shm_unlock(shp) \
+ ipc_unlock(&(shp)->shm_perm)
+
/* shm_mode upper byte flags */
#define SHM_DEST 01000 /* segment will be destroyed on last detach */
#define SHM_LOCKED 02000 /* segment will not be swapped */
@@ -118,6 +151,12 @@ static inline int is_file_shm_hugepages(struct file *file)
}
#endif
+int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg);
+struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg);
+extern const struct file_operations shmem_file_operations;
+extern const struct file_operations shm_file_operations;
+
+extern struct file_system_type tmpfs_fs_type;
#endif /* __KERNEL__ */
#endif /* _LINUX_SHM_H_ */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index deee7af..2f00bb3 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -18,6 +18,9 @@ struct shmem_inode_info {
struct page *i_indirect; /* top indirect blocks page */
swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */
struct list_head swaplist; /* chain of maybes on swap */
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *shmi_ub;
+#endif
struct inode vfs_inode;
};
@@ -57,4 +60,7 @@ static inline int shmem_acl_init(struct inode *inode, struct inode *dir)
}
#endif /* CONFIG_TMPFS_POSIX_ACL */
+int shmem_insertpage(struct inode * inode, unsigned long index,
+ swp_entry_t swap);
+
#endif
diff --git a/include/linux/signal.h b/include/linux/signal.h
index ab9272c..0acba1f 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -6,6 +6,8 @@
#ifdef __KERNEL__
#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
/*
* Real Time signals may be queued.
@@ -16,6 +18,9 @@ struct sigqueue {
int flags;
siginfo_t info;
struct user_struct *user;
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *sig_ub;
+#endif
};
/* flags values. */
@@ -376,6 +381,8 @@ int unhandled_signal(struct task_struct *tsk, int sig);
void signals_init(void);
+extern struct kmem_cache *sigqueue_cachep;
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SIGNAL_H */
diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index b363b91..d04dcb5 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -60,6 +60,12 @@ static inline void signalfd_notify(struct task_struct *tsk, int sig)
wake_up(&tsk->sighand->signalfd_wqh);
}
+struct signalfd_ctx {
+ sigset_t sigmask;
+};
+
+extern long do_signalfd(int ufd, sigset_t *sigmask, int flags);
+
#else /* CONFIG_SIGNALFD */
static inline void signalfd_notify(struct task_struct *tsk, int sig) { }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bcdd660..109cf6c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -310,6 +310,8 @@ typedef unsigned char *sk_buff_data_t;
* @vlan_tci: vlan tag control information
*/
+#include <bc/sock.h>
+
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next;
@@ -357,6 +359,13 @@ struct sk_buff {
__be16 protocol:16;
kmemcheck_bitfield_end(flags1);
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+ __u8 brmark;
+#endif
+#ifdef CONFIG_VE
+ unsigned int accounted:1;
+ unsigned int redirected:1;
+#endif
void (*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
@@ -404,6 +413,8 @@ struct sk_buff {
*data;
unsigned int truesize;
atomic_t users;
+ struct skb_beancounter skb_bc;
+ struct ve_struct *owner_env;
};
#ifdef __KERNEL__
@@ -411,6 +422,7 @@ struct sk_buff {
* Handling routines are only of interest to the kernel
*/
#include <linux/slab.h>
+#include <bc/net.h>
#include <asm/system.h>
@@ -1422,6 +1434,9 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
*/
static inline void skb_orphan(struct sk_buff *skb)
{
+ if (skb->sk)
+ ub_skb_uncharge(skb);
+
if (skb->destructor)
skb->destructor(skb);
skb->destructor = NULL;
@@ -2008,6 +2023,26 @@ static inline void skb_init_secmark(struct sk_buff *skb)
{ }
#endif
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from)
+{
+ to->brmark = from->brmark;
+}
+
+static inline void skb_init_brmark(struct sk_buff *skb)
+{
+ skb->brmark = 0;
+}
+#else
+static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from)
+{
+}
+
+static inline void skb_init_brmark(struct sk_buff *skb)
+{
+}
+#endif
+
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
skb->queue_mapping = queue_mapping;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2da8372..c6e898d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -88,6 +88,26 @@
(unsigned long)ZERO_SIZE_PTR)
/*
+ * allocation rules: __GFP_UBC 0
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * cache (SLAB_UBC) charge charge
+ * (usual caches: mm, vma, task_struct, ...)
+ *
+ * cache (SLAB_UBC | SLAB_NO_CHARGE) charge ---
+ * (ub_kmalloc) (kmalloc)
+ *
+ * cache (no UB flags) BUG() ---
+ * (nonub caches, mempools)
+ *
+ * pages charge ---
+ * (ub_vmalloc, (vmalloc,
+ * poll, fdsets, ...) non-ub allocs)
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#define SLAB_UBC 0x10000000UL /* alloc space for ubs ... */
+#define SLAB_NO_CHARGE 0x20000000UL /* ... but don't charge */
+
+/*
* struct kmem_cache related prototypes
*/
void __init kmem_cache_init(void);
@@ -102,7 +122,20 @@ void kmem_cache_free(struct kmem_cache *, void *);
unsigned int kmem_cache_size(struct kmem_cache *);
const char *kmem_cache_name(struct kmem_cache *);
int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
+extern void show_slab_info(void);
+int kmem_cache_objuse(struct kmem_cache *cachep);
+int kmem_obj_objuse(void *obj);
+int kmem_dname_objuse(void *obj);
+unsigned long ub_cache_growth(struct kmem_cache *cachep);
+#ifdef CONFIG_BEANCOUNTERS
+void kmem_mark_nocharge(struct kmem_cache *cachep);
+struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj);
+struct user_beancounter *slab_ub(void *obj);
+#else
+static inline void kmem_mark_nocharge(struct kmem_cache *cachep) { }
+static inline struct user_beancounter *slab_ub(void *obj) { return NULL; }
+#endif
/*
* Please use this macro to create slab caches. Simply specify the
* name of the structure and maybe some flags that are listed above.
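
To ground the allocation-rules table above, a sketch of a cache created with SLAB_UBC | SLAB_NO_CHARGE: objects carry beancounter pointers, but only allocations that pass __GFP_UBC are charged. The cache name and structure are hypothetical.

struct ct_object {
	struct list_head	list;
	unsigned long		cookie;
};

static struct kmem_cache *ct_cachep;

static int __init ct_cache_init(void)
{
	ct_cachep = kmem_cache_create("ct_object", sizeof(struct ct_object),
				      0, SLAB_UBC | SLAB_NO_CHARGE, NULL);
	return ct_cachep ? 0 : -ENOMEM;
}

static struct ct_object *ct_alloc_charged(void)
{
	/* charged to the current beancounter because of __GFP_UBC */
	return kmem_cache_alloc(ct_cachep, GFP_KERNEL | __GFP_UBC);
}

static struct ct_object *ct_alloc_uncharged(void)
{
	/* no __GFP_UBC, so per the table this allocation is not charged */
	return kmem_cache_alloc(ct_cachep, GFP_KERNEL);
}
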
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 850d057..1bc1812 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -17,6 +17,26 @@
#include <linux/kmemtrace.h>
/*
+ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * STATS - 1 to collect stats for /proc/slabinfo.
+ * 0 for faster, smaller code (especially in the critical paths).
+ *
+ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+ */
+
+#ifdef CONFIG_DEBUG_SLAB
+#define SLAB_DEBUG 1
+#define SLAB_STATS 1
+#define SLAB_FORCED_DEBUG 1
+#else
+#define SLAB_DEBUG 0
+#define SLAB_STATS 0
+#define SLAB_FORCED_DEBUG 0
+#endif
+
+/*
* struct kmem_cache
*
* manages a cache.
@@ -64,6 +84,7 @@ struct kmem_cache {
unsigned long high_mark;
unsigned long grown;
unsigned long reaped;
+ unsigned long shrunk;
unsigned long errors;
unsigned long max_freeable;
unsigned long node_allocs;
@@ -83,6 +104,9 @@ struct kmem_cache {
int obj_offset;
int obj_size;
#endif /* CONFIG_DEBUG_SLAB */
+#ifdef CONFIG_BEANCOUNTERS
+ int objuse;
+#endif
/*
* We put nodelists[] at the end of kmem_cache, because we want to size
@@ -106,6 +130,7 @@ struct cache_sizes {
#endif
};
extern struct cache_sizes malloc_sizes[];
+extern int malloc_cache_num;
void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
void *__kmalloc(size_t size, gfp_t flags);
@@ -145,6 +170,8 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
#undef CACHE
return NULL;
found:
+ if (flags & __GFP_UBC)
+ i += malloc_cache_num;
#ifdef CONFIG_ZONE_DMA
if (flags & GFP_DMA)
cachep = malloc_sizes[i].cs_dmacachep;
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 5ad70a6..8f3d203 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -97,6 +97,10 @@ struct kmem_cache {
struct kobject kobj; /* For sysfs */
#endif
+#ifdef CONFIG_BEANCOUNTERS
+ atomic_t grown;
+ int objuse;
+#endif
#ifdef CONFIG_NUMA
/*
* Defragmentation by allocating from a remote node.
@@ -141,6 +145,19 @@ struct kmem_cache {
*/
extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT];
+#ifdef CONFIG_BEANCOUNTERS
+extern struct kmem_cache ub_kmalloc_caches[SLUB_PAGE_SHIFT];
+static inline struct kmem_cache *__kmalloc_cache(gfp_t f, int idx)
+{
+ return (f & __GFP_UBC) ? &ub_kmalloc_caches[idx] : &kmalloc_caches[idx];
+}
+#else
+static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int idx)
+{
+ return &kmalloc_caches[idx];
+}
+#endif
+
/*
* Sorry that the following has to be that ugly but some versions of GCC
* have trouble with constant propagation and loops.
@@ -197,14 +214,14 @@ static __always_inline int kmalloc_index(size_t size)
* This ought to end up with a global pointer to the right cache
* in kmalloc_caches.
*/
-static __always_inline struct kmem_cache *kmalloc_slab(size_t size)
+static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
int index = kmalloc_index(size);
if (index == 0)
return NULL;
- return &kmalloc_caches[index];
+ return __kmalloc_cache(flags, index);
}
#ifdef CONFIG_ZONE_DMA
@@ -247,7 +264,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
return kmalloc_large(size, flags);
if (!(flags & SLUB_DMA)) {
- struct kmem_cache *s = kmalloc_slab(size);
+ struct kmem_cache *s = kmalloc_slab(size, flags);
if (!s)
return ZERO_SIZE_PTR;
@@ -286,7 +303,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
if (__builtin_constant_p(size) &&
size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) {
- struct kmem_cache *s = kmalloc_slab(size);
+ struct kmem_cache *s = kmalloc_slab(size, flags);
if (!s)
return ZERO_SIZE_PTR;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3273a0c..87cf3d1 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -296,6 +296,16 @@ struct ucred {
#define IPX_TYPE 1
#ifdef __KERNEL__
+
+#define MAX_SOCK_ADDR 128 /* 108 for Unix domain -
+ 16 for IP, 16 for IPX,
+ 24 for IPv6,
+ about 80 for AX.25
+ must be at least one bigger than
+ the AF_UNIX size (see net/unix/af_unix.c
+ :unix_mkname()).
+ */
+
extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
int offset, int len);
@@ -311,6 +321,8 @@ extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
extern int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, int __user *ulen);
extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr);
extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
+extern int vz_security_family_check(int family);
+extern int vz_security_protocol_check(int protocol);
#endif
#endif /* not kernel and not glibc */
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 8ed9642..4cf823c 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -50,6 +50,7 @@ struct rpc_clnt {
cl_discrtry : 1,/* disconnect before retry */
cl_autobind : 1,/* use getport() */
cl_chatty : 1;/* be verbose */
+ unsigned int cl_broken : 1;/* no response for too long */
struct rpc_rtt * cl_rtt; /* RTO estimator data */
const struct rpc_timeout *cl_timeout; /* Timeout strategy */
@@ -61,6 +62,7 @@ struct rpc_clnt {
struct rpc_rtt cl_rtt_default;
struct rpc_timeout cl_timeout_default;
struct rpc_program * cl_program;
+ unsigned long cl_pr_time;
char cl_inline_name[32];
char *cl_principal; /* target to authenticate to */
};
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 6f9457a..ce08bbf 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -24,6 +24,14 @@
#define RPC_MAX_SLOT_TABLE (128U)
/*
+ * Grand abort timeout (stop the client if it occurs)
+ */
+extern int xprt_abort_timeout;
+
+#define RPC_MIN_ABORT_TIMEOUT 300
+#define RPC_MAX_ABORT_TIMEOUT INT_MAX
+
+/*
* This describes a timeout strategy
*/
struct rpc_timeout {
@@ -144,6 +152,7 @@ enum xprt_transports {
struct rpc_xprt {
struct kref kref; /* Reference count */
struct rpc_xprt_ops * ops; /* transport methods */
+ struct ve_struct * owner_env; /* VE owner of mount */
const struct rpc_timeout *timeout; /* timeout parms */
struct sockaddr_storage addr; /* server address */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4ec9001..c2ad7fd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -19,6 +19,7 @@ struct bio;
#define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK 0x7fff
#define SWAP_FLAG_PRIO_SHIFT 0
+#define SWAP_FLAG_READONLY 0x40000000 /* set if swap is read-only */
static inline int current_is_kswapd(void)
{
@@ -116,6 +117,7 @@ struct address_space;
struct sysinfo;
struct writeback_control;
struct zone;
+struct user_beancounter;
/*
* A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
@@ -146,6 +148,7 @@ enum {
SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
/* add others here before... */
+ SWP_READONLY = (1 << 5),
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
};
@@ -158,6 +161,7 @@ enum {
/*
* The in-memory structure used to track swap areas.
*/
+struct user_beancounter;
struct swap_info_struct {
unsigned long flags;
int prio; /* swap priority */
@@ -177,6 +181,9 @@ struct swap_info_struct {
unsigned int max;
unsigned int inuse_pages;
unsigned int old_block_size;
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
+ struct user_beancounter **swap_ubs;
+#endif
};
struct swap_list_t {
@@ -184,9 +191,21 @@ struct swap_list_t {
int next; /* swapfile to be used next */
};
+extern struct swap_list_t swap_list;
+extern struct swap_info_struct swap_info[MAX_SWAPFILES];
+
/* Swap 50% full? Release swapcache more aggressively.. */
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
+/* linux/mm/oom_kill.c */
+extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
+extern int register_oom_notifier(struct notifier_block *nb);
+extern int unregister_oom_notifier(struct notifier_block *nb);
+extern int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ struct mem_cgroup *mem, const char *message);
+extern struct task_struct *select_bad_process(struct user_beancounter *ub,
+ struct mem_cgroup *memcg);
+
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
extern unsigned long totalreserve_pages;
@@ -294,6 +313,7 @@ extern void show_swap_cache_info(void);
extern int add_to_swap(struct page *);
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
extern void __delete_from_swap_cache(struct page *);
+extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
extern void delete_from_swap_cache(struct page *);
extern void free_page_and_swap_cache(struct page *);
extern void free_pages_and_swap_cache(struct page **, int);
@@ -307,7 +327,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
extern long nr_swap_pages;
extern long total_swap_pages;
extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
+extern swp_entry_t get_swap_page(struct user_beancounter *);
extern swp_entry_t get_swap_page_of_type(int);
extern void swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
@@ -322,6 +342,7 @@ extern sector_t swapdev_block(int, pgoff_t);
extern struct swap_info_struct *get_swap_info_struct(unsigned);
extern int reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
+extern int swap_readonly(struct page *);
struct backing_dev_info;
/* linux/mm/thrash.c */
@@ -438,7 +459,7 @@ static inline int try_to_free_swap(struct page *page)
return 0;
}
-static inline swp_entry_t get_swap_page(void)
+static inline swp_entry_t get_swap_page(struct user_beancounter *ub)
{
swp_entry_t entry;
entry.val = 0;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 0eb6942..5ae7ab3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -1103,10 +1103,15 @@ struct ctl_table_header *__register_sysctl_paths(
struct ctl_table_header *register_sysctl_table(struct ctl_table * table);
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table);
+struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *, int);
+struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *,
+ struct ctl_table *, int);
void unregister_sysctl_table(struct ctl_table_header * table);
int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table);
+extern int ve_allow_kthreads;
+
#endif /* __KERNEL__ */
#endif /* _LINUX_SYSCTL_H */
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 9d68fed..7e522fc 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -17,8 +17,23 @@
#include <linux/list.h>
#include <asm/atomic.h>
+#ifdef CONFIG_SYSFS_DEPRECATED_DYN
+extern unsigned sysfs_deprecated;
+#else
+
+/* static deprecation */
+
+#ifdef CONFIG_SYSFS_DEPRECATED
+#define sysfs_deprecated 1
+#else
+#define sysfs_deprecated 0
+#endif
+
+#endif
+
struct kobject;
struct module;
+struct sysfs_open_dirent;
/* FIXME
* The *owner field is no longer used.
@@ -38,7 +53,7 @@ struct attribute_group {
struct attribute **attrs;
};
-
+#include <linux/fs.h>
/**
* Use these macros to make defining attributes easier. See include/linux/device.h
@@ -81,6 +96,73 @@ struct sysfs_ops {
struct sysfs_dirent;
+/* type-specific structures for sysfs_dirent->s_* union members */
+struct sysfs_elem_dir {
+ struct kobject *kobj;
+ /* children list starts here and goes through sd->s_sibling */
+ struct sysfs_dirent *children;
+};
+
+struct sysfs_elem_symlink {
+ struct sysfs_dirent *target_sd;
+};
+
+struct sysfs_elem_attr {
+ struct attribute *attr;
+ struct sysfs_open_dirent *open;
+};
+
+struct sysfs_elem_bin_attr {
+ struct bin_attribute *bin_attr;
+ struct hlist_head buffers;
+};
+
+struct sysfs_inode_attrs {
+ struct iattr ia_iattr;
+ void *ia_secdata;
+ u32 ia_secdata_len;
+};
+
+/*
+ * sysfs_dirent - the building block of the sysfs hierarchy. Each and
+ * every sysfs node is represented by a single sysfs_dirent.
+ *
+ * As long as s_count reference is held, the sysfs_dirent itself is
+ * accessible. Dereferencing s_elem or any other outer entity
+ * requires s_active reference.
+ */
+struct sysfs_dirent {
+ atomic_t s_count;
+ atomic_t s_active;
+ struct sysfs_dirent *s_parent;
+ struct sysfs_dirent *s_sibling;
+ const char *s_name;
+
+ union {
+ struct sysfs_elem_dir s_dir;
+ struct sysfs_elem_symlink s_symlink;
+ struct sysfs_elem_attr s_attr;
+ struct sysfs_elem_bin_attr s_bin_attr;
+ };
+
+ unsigned int s_flags;
+ ino_t s_ino;
+ umode_t s_mode;
+ struct sysfs_inode_attrs *s_iattr;
+};
+
+#define SD_DEACTIVATED_BIAS INT_MIN
+
+#define SYSFS_TYPE_MASK 0x00ff
+#define SYSFS_DIR 0x0001
+#define SYSFS_KOBJ_ATTR 0x0002
+#define SYSFS_KOBJ_BIN_ATTR 0x0004
+#define SYSFS_KOBJ_LINK 0x0008
+#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
+
+#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK
+#define SYSFS_FLAG_REMOVED 0x0200
+
#ifdef CONFIG_SYSFS
int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
@@ -129,6 +211,8 @@ void sysfs_put(struct sysfs_dirent *sd);
void sysfs_printk_last_file(void);
int __must_check sysfs_init(void);
+extern struct file_system_type sysfs_fs_type;
+
#else /* CONFIG_SYSFS */
static inline int sysfs_schedule_callback(struct kobject *kobj,
diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
index 4d090f9..ba40964 100644
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -5,10 +5,12 @@
#define __TASK_IO_ACCOUNTING_OPS_INCLUDED
#include <linux/sched.h>
+#include <bc/io_acct.h>
#ifdef CONFIG_TASK_IO_ACCOUNTING
static inline void task_io_account_read(size_t bytes)
{
+ ub_io_account_read(bytes);
current->ioac.read_bytes += bytes;
}
@@ -21,8 +23,14 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p)
return p->ioac.read_bytes >> 9;
}
-static inline void task_io_account_write(size_t bytes)
+static inline void task_io_account_write(struct page *page, size_t bytes,
+ int sync)
{
+ if (sync)
+ ub_io_account_write(bytes);
+ else
+ ub_io_account_dirty(page, bytes);
+
current->ioac.write_bytes += bytes;
}
@@ -37,6 +45,7 @@ static inline unsigned long task_io_get_oublock(const struct task_struct *p)
static inline void task_io_account_cancelled_write(size_t bytes)
{
+ ub_io_account_write_cancelled(bytes);
current->ioac.cancelled_write_bytes += bytes;
}
@@ -64,7 +73,8 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p)
return 0;
}
-static inline void task_io_account_write(size_t bytes)
+static inline void task_io_account_write(struct page *page, size_t bytes,
+ int sync)
{
}
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 61723a7..fe8494b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -413,6 +413,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
return (struct tcp_sock *)sk;
}
+static inline int tcp_urg_mode(const struct tcp_sock *tp)
+{
+ return tp->snd_una != tp->snd_up;
+}
+
struct tcp_timewait_sock {
struct inet_timewait_sock tw_sk;
u32 tw_rcv_nxt;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index e9c57e9..cc60313 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -313,6 +313,7 @@ struct tty_struct {
/* If the tty has a pending do_SAK, queue it here - akpm */
struct work_struct SAK_work;
struct tty_port *port;
+ struct ve_struct *owner_env;
};
/* tty magic number */
@@ -344,6 +345,7 @@ struct tty_struct {
#define TTY_HUPPED 18 /* Post driver->hangup() */
#define TTY_FLUSHING 19 /* Flushing to ldisc in progress */
#define TTY_FLUSHPENDING 20 /* Queued buffer flush pending */
+#define TTY_CHARGED 21 /* Charged as ub resource */
#define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
@@ -449,7 +451,7 @@ extern void free_tty_struct(struct tty_struct *tty);
extern void initialize_tty_struct(struct tty_struct *tty,
struct tty_driver *driver, int idx);
extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
- int first_ok);
+ struct tty_struct *i_tty, int first_ok);
extern void tty_release_dev(struct file *filp);
extern int tty_init_termios(struct tty_struct *tty);
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index b086779..327875b 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -309,8 +309,19 @@ struct tty_driver {
const struct tty_operations *ops;
struct list_head tty_drivers;
+ struct ve_struct *owner_env;
};
+#ifdef CONFIG_UNIX98_PTYS
+extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */
+extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */
+#endif
+
+#ifdef CONFIG_LEGACY_PTYS
+extern struct tty_driver *pty_driver;
+extern struct tty_driver *pty_slave_driver;
+#endif
+
extern struct list_head tty_drivers;
extern struct tty_driver *alloc_tty_driver(int lines);
@@ -319,6 +330,9 @@ extern void tty_set_operations(struct tty_driver *driver,
const struct tty_operations *op);
extern struct tty_driver *tty_find_polling_driver(char *name, int *line);
+int init_ve_tty_class(void);
+void fini_ve_tty_class(void);
+
extern void tty_driver_kref_put(struct tty_driver *driver);
static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d)
diff --git a/include/linux/types.h b/include/linux/types.h
index c42724f..0f8c88b 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -31,6 +31,11 @@ typedef __kernel_timer_t timer_t;
typedef __kernel_clockid_t clockid_t;
typedef __kernel_mqd_t mqd_t;
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
typedef _Bool bool;
typedef __kernel_uid32_t uid_t;
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 69f3997..6c74733 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -42,6 +42,7 @@ struct uts_namespace {
struct new_utsname name;
};
extern struct uts_namespace init_uts_ns;
+extern struct new_utsname virt_utsname;
#ifdef CONFIG_UTS_NS
static inline void get_uts_ns(struct uts_namespace *ns)
diff --git a/include/linux/ve.h b/include/linux/ve.h
new file mode 100644
index 0000000..e473727
--- /dev/null
+++ b/include/linux/ve.h
@@ -0,0 +1,359 @@
+/*
+ * include/linux/ve.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _LINUX_VE_H
+#define _LINUX_VE_H
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/sysctl.h>
+#include <linux/net.h>
+#include <linux/vzstat.h>
+#include <linux/kobject.h>
+#include <linux/pid.h>
+#include <linux/socket.h>
+#include <net/inet_frag.h>
+
+#ifdef VZMON_DEBUG
+# define VZTRACE(fmt,args...) \
+ printk(KERN_DEBUG fmt, ##args)
+#else
+# define VZTRACE(fmt,args...)
+#endif /* VZMON_DEBUG */
+
+struct tty_driver;
+struct task_struct;
+struct new_utsname;
+struct file_system_type;
+struct icmp_mib;
+struct ip_mib;
+struct tcp_mib;
+struct udp_mib;
+struct linux_mib;
+struct fib_info;
+struct fib_rule;
+struct veip_struct;
+struct ve_monitor;
+struct nsproxy;
+
+#if defined(CONFIG_VE) && defined(CONFIG_INET)
+struct fib_table;
+#ifdef CONFIG_VE_IPTABLES
+struct xt_table;
+struct nf_conn;
+
+#define FRAG6Q_HASHSZ 64
+
+struct ve_nf_conntrack {
+ struct hlist_head *_bysource;
+ struct nf_nat_protocol **_nf_nat_protos;
+ int _nf_nat_vmalloced;
+ struct xt_table *_nf_nat_table;
+ struct nf_conntrack_l3proto *_nf_nat_l3proto;
+ atomic_t _nf_conntrack_count;
+ int _nf_conntrack_max;
+ struct hlist_head *_nf_conntrack_hash;
+ int _nf_conntrack_checksum;
+ int _nf_conntrack_vmalloc;
+ struct hlist_head _unconfirmed;
+ struct hlist_head *_nf_ct_expect_hash;
+ unsigned int _nf_ct_expect_vmalloc;
+ unsigned int _nf_ct_expect_count;
+ unsigned int _nf_ct_expect_max;
+ struct hlist_head *_nf_ct_helper_hash;
+ unsigned int _nf_ct_helper_vmalloc;
+#ifdef CONFIG_SYSCTL
+ /* l4 stuff: */
+ unsigned long _nf_ct_icmp_timeout;
+ unsigned long _nf_ct_icmpv6_timeout;
+ unsigned int _nf_ct_udp_timeout;
+ unsigned int _nf_ct_udp_timeout_stream;
+ unsigned int _nf_ct_generic_timeout;
+ unsigned int _nf_ct_log_invalid;
+ unsigned int _nf_ct_tcp_timeout_max_retrans;
+ unsigned int _nf_ct_tcp_timeout_unacknowledged;
+ int _nf_ct_tcp_be_liberal;
+ int _nf_ct_tcp_loose;
+ int _nf_ct_tcp_max_retrans;
+ unsigned int _nf_ct_tcp_timeouts[10];
+ struct ctl_table_header *_icmp_sysctl_header;
+ unsigned int _tcp_sysctl_table_users;
+ struct ctl_table_header *_tcp_sysctl_header;
+ unsigned int _udp_sysctl_table_users;
+ struct ctl_table_header *_udp_sysctl_header;
+ struct ctl_table_header *_icmpv6_sysctl_header;
+ struct ctl_table_header *_generic_sysctl_header;
+#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
+ struct ctl_table_header *_icmp_compat_sysctl_header;
+ struct ctl_table_header *_tcp_compat_sysctl_header;
+ struct ctl_table_header *_udp_compat_sysctl_header;
+ struct ctl_table_header *_generic_compat_sysctl_header;
+#endif
+ /* l4 protocols sysctl tables: */
+ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmp;
+ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp4;
+ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmpv6;
+ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp6;
+ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp4;
+ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp6;
+ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_generic;
+ struct nf_conntrack_l4proto **_nf_ct_protos[PF_MAX];
+ /* l3 protocols sysctl tables: */
+ struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv4;
+ struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv6;
+ struct nf_conntrack_l3proto *_nf_ct_l3protos[AF_MAX];
+ /* sysctl standalone stuff: */
+ struct ctl_table_header *_nf_ct_sysctl_header;
+ ctl_table *_nf_ct_sysctl_table;
+ ctl_table *_nf_ct_netfilter_table;
+ ctl_table *_nf_ct_net_table;
+ ctl_table *_ip_ct_netfilter_table;
+ struct ctl_table_header *_ip_ct_sysctl_header;
+ int _nf_ct_log_invalid_proto_min;
+ int _nf_ct_log_invalid_proto_max;
+#endif /* CONFIG_SYSCTL */
+};
+#endif
+#endif
+
+struct ve_cpu_stats {
+ cycles_t idle_time;
+ cycles_t iowait_time;
+ cycles_t strt_idle_time;
+ cycles_t used_time;
+ seqcount_t stat_lock;
+ unsigned long nr_running;
+ unsigned long nr_unint;
+ unsigned long nr_iowait;
+ cputime64_t user;
+ cputime64_t nice;
+ cputime64_t system;
+} ____cacheline_aligned;
+
+struct ve_ipt_recent;
+struct ve_xt_hashlimit;
+struct svc_rqst;
+
+struct cgroup;
+struct css_set;
+
+struct ve_struct {
+ struct list_head ve_list;
+
+ envid_t veid;
+ struct list_head vetask_lh;
+ /* capability bounding set */
+ kernel_cap_t ve_cap_bset;
+ atomic_t pcounter;
+ /* ref counter to ve from ipc */
+ atomic_t counter;
+ unsigned int class_id;
+ struct rw_semaphore op_sem;
+ int is_running;
+ int is_locked;
+ atomic_t suspend;
+ /* see vzcalluser.h for VE_FEATURE_XXX definitions */
+ __u64 features;
+
+/* VE's root */
+ struct path root_path;
+
+ struct file_system_type *proc_fstype;
+ struct vfsmount *proc_mnt;
+ struct proc_dir_entry *proc_root;
+
+/* BSD pty's */
+#ifdef CONFIG_LEGACY_PTYS
+ struct tty_driver *pty_driver;
+ struct tty_driver *pty_slave_driver;
+#endif
+#ifdef CONFIG_UNIX98_PTYS
+ struct tty_driver *ptm_driver;
+ struct tty_driver *pts_driver;
+ struct ida *allocated_ptys;
+ struct file_system_type *devpts_fstype;
+ struct vfsmount *devpts_mnt;
+ struct dentry *devpts_root;
+ struct devpts_config *devpts_config;
+#endif
+
+ struct ve_nfs_context *nfs_context;
+
+ struct file_system_type *shmem_fstype;
+ struct vfsmount *shmem_mnt;
+#ifdef CONFIG_SYSFS
+ struct file_system_type *sysfs_fstype;
+ struct vfsmount *sysfs_mnt;
+ struct super_block *sysfs_sb;
+ struct sysfs_dirent *_sysfs_root;
+#endif
+ struct kobject *_virtual_dir;
+ struct kset *class_kset;
+ struct kset *devices_kset;
+ struct kobject *dev_kobj;
+ struct kobject *dev_char_kobj;
+ struct kobject *dev_block_kobj;
+ struct class *tty_class;
+ struct class *mem_class;
+
+#ifdef CONFIG_NET
+ struct class *net_class;
+#ifdef CONFIG_INET
+ unsigned long rt_flush_required;
+#endif
+#endif
+#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE)
+ struct veip_struct *veip;
+ struct net_device *_venet_dev;
+#endif
+
+/* per VE CPU stats*/
+ struct timespec start_timespec;
+ u64 start_jiffies; /* Deprecated */
+ cycles_t start_cycles;
+ unsigned long avenrun[3]; /* loadavg data */
+
+ cycles_t cpu_used_ve;
+ struct kstat_lat_pcpu_struct sched_lat_ve;
+
+#ifdef CONFIG_INET
+ struct venet_stat *stat;
+#ifdef CONFIG_VE_IPTABLES
+/* core/netfilter.c virtualization */
+ struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */
+ struct xt_table *_ve_ip6t_filter_pf;
+ struct xt_table *_ipt_mangle_table;
+ struct xt_table *_ip6t_mangle_table;
+ struct list_head _xt_tables[NPROTO];
+
+ __u64 ipt_mask;
+ __u64 _iptables_modules;
+ struct ve_nf_conntrack *_nf_conntrack;
+ struct ve_ipt_recent *_ipt_recent;
+ struct ve_xt_hashlimit *_xt_hashlimit;
+#endif /* CONFIG_VE_IPTABLES */
+#endif
+ wait_queue_head_t *_log_wait;
+ unsigned *_log_start;
+ unsigned *_log_end;
+ unsigned *_logged_chars;
+ char *log_buf;
+#define VE_DEFAULT_LOG_BUF_LEN 4096
+
+ struct ve_cpu_stats *cpu_stats;
+ unsigned long down_at;
+ struct list_head cleanup_list;
+#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
+ struct list_head _fuse_conn_list;
+ struct super_block *_fuse_control_sb;
+
+ struct file_system_type *fuse_fs_type;
+ struct file_system_type *fuse_ctl_fs_type;
+#endif
+ unsigned long jiffies_fixup;
+ unsigned char disable_net;
+ struct ve_monitor *monitor;
+ struct proc_dir_entry *monitor_proc;
+ unsigned long meminfo_val;
+ int _randomize_va_space;
+
+#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) \
+ || defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE)
+ unsigned int _nlmsvc_users;
+ struct task_struct* _nlmsvc_task;
+ unsigned long _nlmsvc_grace_period;
+ unsigned long _nlmsvc_timeout;
+ struct svc_rqst* _nlmsvc_rqst;
+#endif
+
+#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
+ struct file_system_type *bm_fs_type;
+ struct vfsmount *bm_mnt;
+ int bm_enabled;
+ int bm_entry_count;
+ struct list_head bm_entries;
+#endif
+
+ struct nsproxy *ve_ns;
+ struct user_namespace *user_ns;
+ struct net *ve_netns;
+ struct cgroup *ve_cgroup;
+ struct css_set *ve_css_set;
+};
+
+#define VE_MEMINFO_DEFAULT 1 /* default behaviour */
+#define VE_MEMINFO_SYSTEM 0 /* disable meminfo virtualization */
+
+int init_ve_cgroups(struct ve_struct *ve);
+void fini_ve_cgroups(struct ve_struct *ve);
+
+extern struct ve_cpu_stats static_ve_cpu_stats;
+static inline struct ve_cpu_stats *VE_CPU_STATS(struct ve_struct *ve, int cpu)
+{
+ return per_cpu_ptr(ve->cpu_stats, cpu);
+}
+
+extern int nr_ve;
+extern struct proc_dir_entry *proc_vz_dir;
+extern struct proc_dir_entry *glob_proc_vz_dir;
+
+#ifdef CONFIG_VE
+
+void do_update_load_avg_ve(void);
+void do_env_free(struct ve_struct *ptr);
+
+static inline struct ve_struct *get_ve(struct ve_struct *ptr)
+{
+ if (ptr != NULL)
+ atomic_inc(&ptr->counter);
+ return ptr;
+}
+
+static inline void put_ve(struct ve_struct *ptr)
+{
+ if (ptr && atomic_dec_and_test(&ptr->counter))
+ do_env_free(ptr);
+}
+
+static inline void pget_ve(struct ve_struct *ptr)
+{
+ atomic_inc(&ptr->pcounter);
+}
+
+void ve_cleanup_schedule(struct ve_struct *);
+static inline void pput_ve(struct ve_struct *ptr)
+{
+ if (unlikely(atomic_dec_and_test(&ptr->pcounter)))
+ ve_cleanup_schedule(ptr);
+}
+
+extern spinlock_t ve_cleanup_lock;
+extern struct list_head ve_cleanup_list;
+extern struct task_struct *ve_cleanup_thread;
+
+extern int (*do_ve_enter_hook)(struct ve_struct *ve, unsigned int flags);
+extern void (*do_env_free_hook)(struct ve_struct *ve);
+
+extern unsigned long long ve_relative_clock(struct timespec * ts);
+
+#ifdef CONFIG_FAIRSCHED
+#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask)
+#else
+#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0)
+#endif
+#else /* CONFIG_VE */
+#define ve_utsname system_utsname
+#define get_ve(ve) (NULL)
+#define put_ve(ve) do { } while (0)
+#define pget_ve(ve) do { } while (0)
+#define pput_ve(ve) do { } while (0)
+#endif /* CONFIG_VE */
+
+#endif /* _LINUX_VE_H */
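A minimal usage sketch of the reference helpers defined above, assuming a hypothetical my_object that pins a VE for its lifetime: get_ve()/put_ve() drive the lifetime counter (the final put_ve() calls do_env_free()), while pget_ve()/pput_ve() drive the process counter used for cleanup scheduling.

    /* Hypothetical caller pinning a VE for the lifetime of an object. */
    struct my_object {
            struct ve_struct *owner;
    };

    static void my_object_init(struct my_object *obj, struct ve_struct *ve)
    {
            obj->owner = get_ve(ve);        /* +1 on ve->counter */
    }

    static void my_object_fini(struct my_object *obj)
    {
            put_ve(obj->owner);             /* final put calls do_env_free() */
            obj->owner = NULL;
    }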
diff --git a/include/linux/ve_nfs.h b/include/linux/ve_nfs.h
new file mode 100644
index 0000000..8f2e8f8
--- /dev/null
+++ b/include/linux/ve_nfs.h
@@ -0,0 +1,30 @@
+/*
+ * linux/include/ve_nfs.h
+ *
+ * VE context for NFS
+ *
+ * Copyright (C) 2007 SWsoft
+ */
+
+#ifndef __VE_NFS_H__
+#define __VE_NFS_H__
+
+#ifdef CONFIG_VE
+
+#include <linux/ve.h>
+
+#define NFS_CTX_FIELD(arg) (get_exec_env()->_##arg)
+
+#else /* CONFIG_VE */
+
+#define NFS_CTX_FIELD(arg) _##arg
+
+#endif /* CONFIG_VE */
+
+#define nlmsvc_grace_period NFS_CTX_FIELD(nlmsvc_grace_period)
+#define nlmsvc_timeout NFS_CTX_FIELD(nlmsvc_timeout)
+#define nlmsvc_users NFS_CTX_FIELD(nlmsvc_users)
+#define nlmsvc_task NFS_CTX_FIELD(nlmsvc_task)
+#define nlmsvc_rqst NFS_CTX_FIELD(nlmsvc_rqst)
+
+#endif
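With CONFIG_VE the lockd names above are redirected to the per-container fields of the calling task's ve_struct; without it they fall back to the plain globals. Expansion shown for illustration only, nothing new is defined here:

    /* CONFIG_VE=y:  nlmsvc_timeout  ->  (get_exec_env()->_nlmsvc_timeout) */
    /* CONFIG_VE=n:  nlmsvc_timeout  ->  _nlmsvc_timeout                   */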
diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
new file mode 100644
index 0000000..5bb93e8
--- /dev/null
+++ b/include/linux/ve_proto.h
@@ -0,0 +1,100 @@
+/*
+ * include/linux/ve_proto.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VE_H__
+#define __VE_H__
+
+#ifdef CONFIG_VE
+
+struct ve_struct;
+
+struct seq_file;
+
+typedef void (*ve_seq_print_t)(struct seq_file *, struct ve_struct *);
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t);
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t);
+
+#ifdef CONFIG_INET
+void tcp_v4_kill_ve_sockets(struct ve_struct *envid);
+#ifdef CONFIG_VE_NETDEV
+int venet_init(void);
+#endif
+#endif
+
+#define VE_IOPRIO_MIN 0
+#define VE_IOPRIO_MAX 8
+extern int ve_set_ioprio(int veid, int ioprio);
+
+extern struct list_head ve_list_head;
+#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list)
+extern rwlock_t ve_list_lock;
+extern struct ve_struct *get_ve_by_id(envid_t);
+extern struct ve_struct *__find_ve_by_id(envid_t);
+
+struct env_create_param3;
+extern int real_env_create(envid_t veid, unsigned flags, u32 class_id,
+ struct env_create_param3 *data, int datalen);
+extern void ve_move_task(struct task_struct *, struct ve_struct *, struct cred *);
+
+int set_device_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned);
+int get_device_perms_ve(int dev_type, dev_t dev, int access_mode);
+int devperms_seq_show(struct seq_file *m, void *v);
+
+enum {
+ VE_SS_CHAIN,
+
+ VE_MAX_CHAINS
+};
+
+typedef int ve_hook_init_fn(void *data);
+typedef void ve_hook_fini_fn(void *data);
+
+struct ve_hook
+{
+ ve_hook_init_fn *init;
+ ve_hook_fini_fn *fini;
+ struct module *owner;
+
+ /* Functions are called in ascending priority */
+ int priority;
+
+ /* Private part */
+ struct list_head list;
+};
+
+enum {
+ HOOK_PRIO_DEFAULT = 0,
+
+ HOOK_PRIO_FS = HOOK_PRIO_DEFAULT,
+
+ HOOK_PRIO_NET_PRE,
+ HOOK_PRIO_NET,
+ HOOK_PRIO_NET_POST,
+
+ HOOK_PRIO_AFTERALL = INT_MAX
+};
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos);
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos);
+void ve_seq_stop(struct seq_file *m, void *v);
+
+extern int ve_hook_iterate_init(int chain, void *data);
+extern void ve_hook_iterate_fini(int chain, void *data);
+
+extern void ve_hook_register(int chain, struct ve_hook *vh);
+extern void ve_hook_unregister(struct ve_hook *vh);
+#else /* CONFIG_VE */
+#define ve_hook_register(ch, vh) do { } while (0)
+#define ve_hook_unregister(ve) do { } while (0)
+
+#define get_device_perms_ve(t, d, a) (0)
+#endif /* CONFIG_VE */
+#endif
diff --git a/include/linux/ve_task.h b/include/linux/ve_task.h
new file mode 100644
index 0000000..4b7d722
--- /dev/null
+++ b/include/linux/ve_task.h
@@ -0,0 +1,73 @@
+/*
+ * include/linux/ve_task.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VE_TASK_H__
+#define __VE_TASK_H__
+
+#include <linux/seqlock.h>
+#include <asm/timex.h>
+
+struct ve_task_info {
+/* virtualization */
+ struct ve_struct *owner_env;
+ struct ve_struct *exec_env;
+ struct ve_struct *saved_env;
+ struct list_head vetask_list;
+ struct dentry *glob_proc_dentry;
+/* statistics: scheduling latency */
+ cycles_t sleep_time;
+ cycles_t sched_time;
+ cycles_t sleep_stamp;
+ cycles_t wakeup_stamp;
+ seqcount_t wakeup_lock;
+};
+
+#define VE_TASK_INFO(task) (&(task)->ve_task_info)
+#define VE_TASK_LIST_2_TASK(lh) \
+ list_entry(lh, struct task_struct, ve_task_info.vetask_list)
+
+#ifdef CONFIG_VE
+extern struct ve_struct ve0;
+#define get_ve0() (&ve0)
+
+#define ve_save_context(t) do { \
+ t->ve_task_info.saved_env = \
+ t->ve_task_info.exec_env; \
+ t->ve_task_info.exec_env = get_ve0(); \
+ } while (0)
+#define ve_restore_context(t) do { \
+ t->ve_task_info.exec_env = \
+ t->ve_task_info.saved_env; \
+ } while (0)
+
+#define get_exec_env() (current->ve_task_info.exec_env)
+#define set_exec_env(ve) ({ \
+ struct ve_task_info *vi; \
+ struct ve_struct *old, *new; \
+ \
+ vi = &current->ve_task_info; \
+ old = vi->exec_env; \
+ new = ve; \
+ if (unlikely(new == NULL)) { \
+ printk("%s: NULL exec env (%s)\n", __func__, #ve);\
+ new = get_ve0(); \
+ } \
+ vi->exec_env = new; \
+ old; \
+ })
+#else
+#define get_ve0() (NULL)
+#define get_exec_env() (NULL)
+#define set_exec_env(new_env) (NULL)
+#define ve_save_context(t) do { } while (0)
+#define ve_restore_context(t) do { } while (0)
+#endif
+
+#endif /* __VE_TASK_H__ */
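set_exec_env() returns the previously active environment, so code acting temporarily on behalf of another container follows a save/work/restore pattern. A hypothetical sketch (do_work_in_ve() is illustrative only):

    static void do_work_in_ve(struct ve_struct *ve)
    {
            struct ve_struct *old_ve;

            old_ve = set_exec_env(ve);      /* switch current's exec context */
            /* ... act as if running inside 've' ... */
            set_exec_env(old_ve);           /* always restore the previous env */
    }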
diff --git a/include/linux/veip.h b/include/linux/veip.h
new file mode 100644
index 0000000..745f1ec
--- /dev/null
+++ b/include/linux/veip.h
@@ -0,0 +1,15 @@
+#ifndef __VE_IP_H_
+#define __VE_IP_H_
+
+struct ve_addr_struct {
+ int family;
+ __u32 key[4];
+};
+
+struct sockaddr;
+
+extern void veaddr_print(char *, int, struct ve_addr_struct *);
+extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+ struct ve_addr_struct *veaddr);
+
+#endif
diff --git a/include/linux/venet.h b/include/linux/venet.h
new file mode 100644
index 0000000..dd26f11
--- /dev/null
+++ b/include/linux/venet.h
@@ -0,0 +1,95 @@
+/*
+ * include/linux/venet.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VENET_H
+#define _VENET_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/vzcalluser.h>
+#include <linux/veip.h>
+#include <linux/netdevice.h>
+
+#define VEIP_HASH_SZ 512
+
+struct ve_struct;
+struct venet_stat;
+struct venet_stats {
+ struct net_device_stats stats;
+ struct net_device_stats *real_stats;
+};
+
+struct ip_entry_struct
+{
+ struct ve_addr_struct addr;
+ struct ve_struct *active_env;
+ struct venet_stat *stat;
+ struct veip_struct *veip;
+ struct list_head ip_hash;
+ struct list_head ve_list;
+};
+
+struct ext_entry_struct
+{
+ struct list_head list;
+ struct ve_addr_struct addr;
+};
+
+struct veip_struct
+{
+ struct list_head src_lh;
+ struct list_head dst_lh;
+ struct list_head ip_lh;
+ struct list_head list;
+ struct list_head ext_lh;
+ envid_t veid;
+};
+
+static inline struct net_device_stats *
+venet_stats(struct net_device *dev, int cpu)
+{
+ struct venet_stats *stats;
+ stats = (struct venet_stats*)dev->ml_priv;
+ return per_cpu_ptr(stats->real_stats, cpu);
+}
+
+/* veip_hash_lock should be taken for write by caller */
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip);
+/* veip_hash_lock should be taken for write by caller */
+void ip_entry_unhash(struct ip_entry_struct *entry);
+/* veip_hash_lock should be taken for read by caller */
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *);
+
+/* veip_hash_lock should be taken for read by caller */
+struct veip_struct *veip_find(envid_t veid);
+/* veip_hash_lock should be taken for write by caller */
+struct veip_struct *veip_findcreate(envid_t veid);
+/* veip_hash_lock should be taken for write by caller */
+void veip_put(struct veip_struct *veip);
+
+extern struct list_head veip_lh;
+
+int veip_start(struct ve_struct *ve);
+void veip_stop(struct ve_struct *ve);
+__exit void veip_cleanup(void);
+int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr);
+int veip_entry_del(envid_t veid, struct ve_addr_struct *addr);
+int venet_change_skb_owner(struct sk_buff *skb);
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+ struct ve_addr_struct *addr);
+
+extern struct list_head ip_entry_hash_table[];
+extern rwlock_t veip_hash_lock;
+
+#ifdef CONFIG_PROC_FS
+int veip_seq_show(struct seq_file *m, void *v);
+#endif
+
+#endif
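The helpers above spell out their locking contract in the comments: veip_hash_lock is taken for read around lookups and for write around hash changes. A hedged read-side sketch (the surrounding function and addr are hypothetical):

    struct ip_entry_struct *entry;
    struct ve_addr_struct addr;             /* filled in by the caller */

    read_lock(&veip_hash_lock);
    entry = venet_entry_lookup(&addr);
    if (entry != NULL) {
            /* ... use entry->active_env, entry->stat ... */
    }
    read_unlock(&veip_hash_lock);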
diff --git a/include/linux/veprintk.h b/include/linux/veprintk.h
new file mode 100644
index 0000000..5669d7b
--- /dev/null
+++ b/include/linux/veprintk.h
@@ -0,0 +1,38 @@
+/*
+ * include/linux/veprintk.h
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VE_PRINTK_H__
+#define __VE_PRINTK_H__
+
+#ifdef CONFIG_VE
+
+#define ve_log_wait (*(get_exec_env()->_log_wait))
+#define ve_log_start (*(get_exec_env()->_log_start))
+#define ve_log_end (*(get_exec_env()->_log_end))
+#define ve_logged_chars (*(get_exec_env()->_logged_chars))
+#define ve_log_buf (get_exec_env()->log_buf)
+#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \
+ log_buf_len : VE_DEFAULT_LOG_BUF_LEN)
+#define VE_LOG_BUF_MASK (ve_log_buf_len - 1)
+#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK])
+
+#else
+
+#define ve_log_wait log_wait
+#define ve_log_start log_start
+#define ve_log_end log_end
+#define ve_logged_chars logged_chars
+#define ve_log_buf log_buf
+#define ve_log_buf_len log_buf_len
+#define VE_LOG_BUF_MASK LOG_BUF_MASK
+#define VE_LOG_BUF(idx) LOG_BUF(idx)
+
+#endif /* CONFIG_VE */
+#endif /* __VE_PRINTK_H__ */
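VE_LOG_BUF() indexes the per-container ring buffer through a power-of-two mask, so ve_log_buf_len must stay a power of two (VE_DEFAULT_LOG_BUF_LEN is 4096). A hypothetical sketch of appending one character, mirroring the usual printk ring-buffer logic:

    static void ve_emit_log_char(char c)
    {
            VE_LOG_BUF(ve_log_end) = c;     /* wraps via VE_LOG_BUF_MASK */
            ve_log_end++;
            if (ve_log_end - ve_log_start > ve_log_buf_len)
                    ve_log_start = ve_log_end - ve_log_buf_len;
            if (ve_logged_chars < ve_log_buf_len)
                    ve_logged_chars++;
    }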
diff --git a/include/linux/veth.h b/include/linux/veth.h
index 3354c1e..caa35c1 100644
--- a/include/linux/veth.h
+++ b/include/linux/veth.h
@@ -1,3 +1,12 @@
+/*
+ * include/linux/veth.h
+ *
+ * Copyright (C) 2007 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
#ifndef __NET_VETH_H_
#define __NET_VETH_H_
@@ -9,4 +18,29 @@ enum {
#define VETH_INFO_MAX (__VETH_INFO_MAX - 1)
};
+#ifdef __KERNEL__
+struct veth_struct
+{
+ struct net_device_stats stats;
+ struct net_device *me;
+ struct net_device *pair;
+ struct list_head hwaddr_list;
+ struct net_device_stats *real_stats;
+ int allow_mac_change;
+};
+
+#define veth_from_netdev(dev) \
+ ((struct veth_struct *)(netdev_priv(dev)))
+static inline struct net_device * veth_to_netdev(struct veth_struct *veth)
+{
+ return veth->me;
+}
+#endif
+
+static inline struct net_device_stats *
+veth_stats(struct net_device *dev, int cpuid)
+{
+ return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid);
+}
+
#endif
diff --git a/include/linux/virtinfo.h b/include/linux/virtinfo.h
new file mode 100644
index 0000000..b0dad07
--- /dev/null
+++ b/include/linux/virtinfo.h
@@ -0,0 +1,100 @@
+/*
+ * include/linux/virtinfo.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __LINUX_VIRTINFO_H
+#define __LINUX_VIRTINFO_H
+
+#include <linux/kernel.h>
+#include <linux/page-flags.h>
+#include <linux/notifier.h>
+
+struct vnotifier_block
+{
+ int (*notifier_call)(struct vnotifier_block *self,
+ unsigned long, void *, int);
+ struct vnotifier_block *next;
+ int priority;
+};
+
+extern struct semaphore virtinfo_sem;
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb);
+int virtinfo_notifier_call(int type, unsigned long n, void *data);
+
+struct page_info {
+ unsigned long nr_file_dirty;
+ unsigned long nr_writeback;
+ unsigned long nr_anon_pages;
+ unsigned long nr_file_mapped;
+ unsigned long nr_slab_rec;
+ unsigned long nr_slab_unrec;
+ unsigned long nr_pagetable;
+ unsigned long nr_unstable_nfs;
+ unsigned long nr_bounce;
+ unsigned long nr_writeback_temp;
+};
+
+struct meminfo {
+ struct sysinfo si;
+ struct page_info pi;
+ unsigned long active, inactive;
+ unsigned long cache, swapcache;
+ unsigned long committed_space;
+ unsigned long allowed;
+ unsigned long vmalloc_total, vmalloc_used, vmalloc_largest;
+};
+
+#define VIRTINFO_MEMINFO 0
+#define VIRTINFO_ENOUGHMEM 1
+#define VIRTINFO_DOFORK 2
+#define VIRTINFO_DOEXIT 3
+#define VIRTINFO_DOEXECVE 4
+#define VIRTINFO_DOFORKRET 5
+#define VIRTINFO_DOFORKPOST 6
+#define VIRTINFO_EXIT 7
+#define VIRTINFO_EXITMMAP 8
+#define VIRTINFO_EXECMMAP 9
+#define VIRTINFO_OUTOFMEM 10
+#define VIRTINFO_PAGEIN 11
+#define VIRTINFO_SYSINFO 12
+#define VIRTINFO_NEWUBC 13
+#define VIRTINFO_VMSTAT 14
+
+enum virt_info_types {
+ VITYPE_GENERAL,
+ VITYPE_FAUDIT,
+ VITYPE_QUOTA,
+ VITYPE_SCP,
+
+ VIRT_TYPES
+};
+
+#ifdef CONFIG_VZ_GENCALLS
+
+static inline int virtinfo_gencall(unsigned long n, void *data)
+{
+ int r;
+
+ r = virtinfo_notifier_call(VITYPE_GENERAL, n, data);
+ if (r & NOTIFY_FAIL)
+ return -ENOBUFS;
+ if (r & NOTIFY_OK)
+ return -ERESTARTNOINTR;
+ return 0;
+}
+
+#else
+
+#define virtinfo_gencall(n, data) 0
+
+#endif
+
+#endif /* __LINUX_VIRTINFO_H */
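Consumers attach to a chain by filling a vnotifier_block and registering it for one of the virt_info_types. A hypothetical notifier that caps the RAM reported through VIRTINFO_MEMINFO (my_ram_limit_pages and the hook are illustrative, not part of the patch):

    static unsigned long my_ram_limit_pages = 32768;    /* hypothetical cap */

    static int my_meminfo_hook(struct vnotifier_block *self,
                    unsigned long event, void *arg, int old_ret)
    {
            struct meminfo *mi = arg;

            if (event == VIRTINFO_MEMINFO && mi->si.totalram > my_ram_limit_pages)
                    mi->si.totalram = my_ram_limit_pages;
            return NOTIFY_OK;
    }

    static struct vnotifier_block my_meminfo_nb = {
            .notifier_call  = my_meminfo_hook,
            .priority       = 0,
    };

    /* in module init: virtinfo_notifier_register(VITYPE_GENERAL, &my_meminfo_nb); */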
diff --git a/include/linux/virtinfoscp.h b/include/linux/virtinfoscp.h
new file mode 100644
index 0000000..5661c0d
--- /dev/null
+++ b/include/linux/virtinfoscp.h
@@ -0,0 +1,23 @@
+#ifndef __VIRTINFO_SCP_H__
+#define __VIRTINFO_SCP_H__
+
+/*
+ * Dump and restore operations are non-symmetric.
+ * With respect to finish/fail hooks, 2 dump hooks are called from
+ * different proc operations, but restore hooks are called from a single one.
+ */
+#define VIRTINFO_SCP_COLLECT 0x10
+#define VIRTINFO_SCP_DUMP 0x11
+#define VIRTINFO_SCP_DMPFIN 0x12
+#define VIRTINFO_SCP_RSTCHECK 0x13
+#define VIRTINFO_SCP_RESTORE 0x14
+#define VIRTINFO_SCP_RSTFAIL 0x15
+
+#define VIRTINFO_SCP_RSTTSK 0x20
+#define VIRTINFO_SCP_RSTMM 0x21
+
+#define VIRTINFO_SCP_TEST 0x30
+
+#define VIRTNOTIFY_CHANGE 0x100
+
+#endif /* __VIRTINFO_SCP_H__ */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3c123c3..6cf99e7 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -23,6 +23,10 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */
#define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */
#endif
+/* align size to 2^n page boundary */
+#define POWER2_PAGE_ALIGN(size) \
+ ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size))))
+
struct vm_struct {
struct vm_struct *next;
void *addr;
@@ -51,12 +55,16 @@ static inline void vmalloc_init(void)
#endif
extern void *vmalloc(unsigned long size);
+extern void *ub_vmalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node);
+extern void *ub_vmalloc_node(unsigned long size, int node);
extern void *vmalloc_exec(unsigned long size);
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
+extern void *vmalloc_best(unsigned long size);
+extern void *ub_vmalloc_best(unsigned long size);
extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot);
extern void vfree(const void *addr);
@@ -68,6 +76,7 @@ extern void vunmap(const void *addr);
extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff);
void vmalloc_sync_all(void);
+extern void vprintstat(void);
/*
* Lowlevel-APIs (not for driver use!)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 2d0f222..977a906 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -105,6 +105,7 @@ static inline void vm_events_fold_cpu(int cpu)
}
#endif
+extern unsigned long vm_events(enum vm_event_item i);
#else
/* Disable counters */
@@ -127,6 +128,7 @@ static inline void vm_events_fold_cpu(int cpu)
{
}
+static inline unsigned long vm_events(enum vm_event_item i) { return 0; }
#endif /* CONFIG_VM_EVENT_COUNTERS */
#define __count_zone_vm_events(item, zone, delta) \
diff --git a/include/linux/vzcalluser.h b/include/linux/vzcalluser.h
new file mode 100644
index 0000000..d093112
--- /dev/null
+++ b/include/linux/vzcalluser.h
@@ -0,0 +1,201 @@
+/*
+ * include/linux/vzcalluser.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _LINUX_VZCALLUSER_H
+#define _LINUX_VZCALLUSER_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <linux/vziptable_defs.h>
+
+#define KERN_VZ_PRIV_RANGE 51
+
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+/*
+ * VE management ioctls
+ */
+
+struct vzctl_old_env_create {
+ envid_t veid;
+ unsigned flags;
+#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */
+#define VE_EXCLUSIVE 2 /* Fail if exists */
+#define VE_ENTER 4 /* Enter existing VE */
+#define VE_TEST 8 /* Test if VE exists */
+#define VE_LOCK 16 /* Do not allow entering created VE */
+#define VE_SKIPLOCK 32 /* Allow entering embryo VE */
+ __u32 addr;
+};
+
+struct vzctl_mark_env_to_down {
+ envid_t veid;
+};
+
+struct vzctl_setdevperms {
+ envid_t veid;
+ unsigned type;
+#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */
+#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */
+#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */
+ unsigned dev;
+ unsigned mask;
+};
+
+struct vzctl_ve_netdev {
+ envid_t veid;
+ int op;
+#define VE_NETDEV_ADD 1
+#define VE_NETDEV_DEL 2
+ char __user *dev_name;
+};
+
+struct vzctl_ve_meminfo {
+ envid_t veid;
+ unsigned long val;
+};
+
+struct vzctl_env_create_cid {
+ envid_t veid;
+ unsigned flags;
+ __u32 class_id;
+};
+
+struct vzctl_env_create {
+ envid_t veid;
+ unsigned flags;
+ __u32 class_id;
+};
+
+struct env_create_param {
+ __u64 iptables_mask;
+};
+
+#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param)
+
+struct env_create_param2 {
+ __u64 iptables_mask;
+ __u64 feature_mask;
+ __u32 total_vcpus; /* 0 - don't care, same as in host */
+};
+
+struct env_create_param3 {
+ __u64 iptables_mask;
+ __u64 feature_mask;
+ __u32 total_vcpus;
+ __u32 pad;
+ __u64 known_features;
+};
+
+#define VE_FEATURE_SYSFS (1ULL << 0)
+#define VE_FEATURE_NFS (1ULL << 1)
+#define VE_FEATURE_DEF_PERMS (1ULL << 2)
+#define VE_FEATURE_SIT (1ULL << 3)
+#define VE_FEATURE_IPIP (1ULL << 4)
+#define VE_FEATURE_PPP (1ULL << 5)
+#define VE_FEATURE_IPGRE (1ULL << 6)
+#define VE_FEATURE_BRIDGE (1ULL << 7)
+
+#define VE_FEATURES_OLD (VE_FEATURE_SYSFS)
+#define VE_FEATURES_DEF (VE_FEATURE_SYSFS | \
+ VE_FEATURE_DEF_PERMS)
+
+typedef struct env_create_param3 env_create_param_t;
+#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(env_create_param_t)
+
+struct vzctl_env_create_data {
+ envid_t veid;
+ unsigned flags;
+ __u32 class_id;
+ env_create_param_t __user *data;
+ int datalen;
+};
+
+struct vz_load_avg {
+ int val_int;
+ int val_frac;
+};
+
+struct vz_cpu_stat {
+ unsigned long user_jif;
+ unsigned long nice_jif;
+ unsigned long system_jif;
+ unsigned long uptime_jif;
+ __u64 idle_clk;
+ __u64 strv_clk;
+ __u64 uptime_clk;
+ struct vz_load_avg avenrun[3]; /* loadavg data */
+};
+
+struct vzctl_cpustatctl {
+ envid_t veid;
+ struct vz_cpu_stat __user *cpustat;
+};
+
+#define VZCTLTYPE '.'
+#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \
+ struct vzctl_old_env_create)
+#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \
+ struct vzctl_mark_env_to_down)
+#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \
+ struct vzctl_setdevperms)
+#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \
+ struct vzctl_env_create_cid)
+#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \
+ struct vzctl_env_create)
+#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \
+ struct vzctl_cpustatctl)
+#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \
+ struct vzctl_env_create_data)
+#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \
+ struct vzctl_ve_netdev)
+#define VZCTL_VE_MEMINFO _IOW(VZCTLTYPE, 13, \
+ struct vzctl_ve_meminfo)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_ve_netdev {
+ envid_t veid;
+ int op;
+ compat_uptr_t dev_name;
+};
+
+struct compat_vzctl_ve_meminfo {
+ envid_t veid;
+ compat_ulong_t val;
+};
+
+struct compat_vzctl_env_create_data {
+ envid_t veid;
+ unsigned flags;
+ __u32 class_id;
+ compat_uptr_t data;
+ int datalen;
+};
+
+#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \
+ struct compat_vzctl_env_create_data)
+#define VZCTL_COMPAT_VE_NETDEV _IOW(VZCTLTYPE, 11, \
+ struct compat_vzctl_ve_netdev)
+#define VZCTL_COMPAT_VE_MEMINFO _IOW(VZCTLTYPE, 13, \
+ struct compat_vzctl_ve_meminfo)
+#endif
+#endif
+
+#endif
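These ioctls are driven from user space against the OpenVZ control device; /dev/vzctl is the conventional node and is an assumption here, not something this header defines. A hedged user-space sketch of creating a container with the default masks, error handling elided:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/vzcalluser.h>

    env_create_param_t param = {
            .iptables_mask  = VE_IP_DEFAULT,
            .feature_mask   = VE_FEATURES_DEF,
    };
    struct vzctl_env_create_data data = {
            .veid           = 101,          /* hypothetical container id */
            .flags          = VE_CREATE | VE_LOCK,
            .class_id       = 0,
            .data           = &param,
            .datalen        = sizeof(param),
    };
    int fd = open("/dev/vzctl", O_RDWR);
    ioctl(fd, VZCTL_ENV_CREATE_DATA, &data);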
diff --git a/include/linux/vzctl.h b/include/linux/vzctl.h
new file mode 100644
index 0000000..ad967ed
--- /dev/null
+++ b/include/linux/vzctl.h
@@ -0,0 +1,30 @@
+/*
+ * include/linux/vzctl.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _LINUX_VZCTL_H
+#define _LINUX_VZCTL_H
+
+#include <linux/list.h>
+
+struct module;
+struct inode;
+struct file;
+struct vzioctlinfo {
+ unsigned type;
+ int (*ioctl)(struct file *, unsigned int, unsigned long);
+ int (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+ struct module *owner;
+ struct list_head list;
+};
+
+extern void vzioctl_register(struct vzioctlinfo *inf);
+extern void vzioctl_unregister(struct vzioctlinfo *inf);
+
+#endif
diff --git a/include/linux/vzctl_quota.h b/include/linux/vzctl_quota.h
new file mode 100644
index 0000000..6d36cdd
--- /dev/null
+++ b/include/linux/vzctl_quota.h
@@ -0,0 +1,74 @@
+/*
+ * include/linux/vzctl_quota.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __LINUX_VZCTL_QUOTA_H__
+#define __LINUX_VZCTL_QUOTA_H__
+
+#include <linux/compat.h>
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+/*
+ * Quota management ioctl
+ */
+
+struct vz_quota_stat;
+struct vzctl_quotactl {
+ int cmd;
+ unsigned int quota_id;
+ struct vz_quota_stat __user *qstat;
+ char __user *ve_root;
+};
+
+struct vzctl_quotaugidctl {
+ int cmd; /* subcommand */
+ unsigned int quota_id; /* quota id where it applies to */
+ unsigned int ugid_index;/* for reading statistics: index of first
+ uid/gid record to read */
+ unsigned int ugid_size; /* size of ugid_buf array */
+ void *addr; /* user-level buffer */
+};
+
+#define VZDQCTLTYPE '+'
+#define VZCTL_QUOTA_DEPR_CTL _IOWR(VZDQCTLTYPE, 1, \
+ struct vzctl_quotactl)
+#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \
+ struct vzctl_quotactl)
+#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \
+ struct vzctl_quotaugidctl)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+struct compat_vzctl_quotactl {
+ int cmd;
+ unsigned int quota_id;
+ compat_uptr_t qstat;
+ compat_uptr_t ve_root;
+};
+
+struct compat_vzctl_quotaugidctl {
+ int cmd; /* subcommand */
+ unsigned int quota_id; /* quota id where it applies to */
+ unsigned int ugid_index;/* for reading statistics: index of first
+ uid/gid record to read */
+ unsigned int ugid_size; /* size of ugid_buf array */
+ compat_uptr_t addr; /* user-level buffer */
+};
+
+#define VZCTL_COMPAT_QUOTA_CTL _IOWR(VZDQCTLTYPE, 2, \
+ struct compat_vzctl_quotactl)
+#define VZCTL_COMPAT_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \
+ struct compat_vzctl_quotaugidctl)
+#endif
+#endif
+
+#endif /* __LINUX_VZCTL_QUOTA_H__ */
diff --git a/include/linux/vzctl_venet.h b/include/linux/vzctl_venet.h
new file mode 100644
index 0000000..8c02cd4
--- /dev/null
+++ b/include/linux/vzctl_venet.h
@@ -0,0 +1,53 @@
+/*
+ * include/linux/vzctl_venet.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VZCTL_VENET_H
+#define _VZCTL_VENET_H
+
+#include <linux/types.h>
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
+struct vzctl_ve_ip_map {
+ envid_t veid;
+ int op;
+#define VE_IP_ADD 1
+#define VE_IP_DEL 2
+#define VE_IP_EXT_ADD 3
+#define VE_IP_EXT_DEL 4
+ struct sockaddr *addr;
+ int addrlen;
+};
+
+#define VENETCTLTYPE '('
+
+#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \
+ struct vzctl_ve_ip_map)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+struct compat_vzctl_ve_ip_map {
+ envid_t veid;
+ int op;
+ compat_uptr_t addr;
+ int addrlen;
+};
+
+#define VENETCTL_COMPAT_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \
+ struct compat_vzctl_ve_ip_map)
+#endif
+#endif
+
+#endif
diff --git a/include/linux/vzctl_veth.h b/include/linux/vzctl_veth.h
new file mode 100644
index 0000000..1480c5b
--- /dev/null
+++ b/include/linux/vzctl_veth.h
@@ -0,0 +1,42 @@
+/*
+ * include/linux/vzctl_veth.h
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VZCTL_VETH_H
+#define _VZCTL_VETH_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
+struct vzctl_ve_hwaddr {
+ envid_t veid;
+ int op;
+#define VE_ETH_ADD 1
+#define VE_ETH_DEL 2
+#define VE_ETH_ALLOW_MAC_CHANGE 3
+#define VE_ETH_DENY_MAC_CHANGE 4
+ unsigned char dev_addr[6];
+ int addrlen;
+ char dev_name[16];
+ unsigned char dev_addr_ve[6];
+ int addrlen_ve;
+ char dev_name_ve[16];
+};
+
+#define VETHCTLTYPE '['
+
+#define VETHCTL_VE_HWADDR _IOW(VETHCTLTYPE, 3, \
+ struct vzctl_ve_hwaddr)
+
+#endif
diff --git a/include/linux/vzdq_tree.h b/include/linux/vzdq_tree.h
new file mode 100644
index 0000000..c019e09
--- /dev/null
+++ b/include/linux/vzdq_tree.h
@@ -0,0 +1,99 @@
+/*
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo disk quota tree definition
+ */
+
+#ifndef _VZDQ_TREE_H
+#define _VZDQ_TREE_H
+
+#include <linux/list.h>
+#include <asm/string.h>
+
+typedef unsigned int quotaid_t;
+#define QUOTAID_BITS 32
+#define QUOTAID_BBITS 4
+#define QUOTAID_EBITS 8
+
+#if QUOTAID_EBITS % QUOTAID_BBITS
+#error Quota bit assumption failure
+#endif
+
+#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS)
+#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1)
+#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \
+ / QUOTAID_BBITS)
+#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \
+ / QUOTAID_EBITS)
+#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS)
+
+/*
+ * Depth of keeping unused nodes (not inclusive).
+ * 0 means release all nodes including root,
+ * QUOTATREE_DEPTH means never release nodes.
+ * Current value: release all nodes strictly after QUOTATREE_EDEPTH
+ * (measured in external shift units).
+ */
+#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \
+ - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \
+ + 1)
+
+/*
+ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes.
+ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS),
+ * and each node contains 2^QUOTAID_BBITS pointers.
+ * Level 0 is a (single) tree root node.
+ *
+ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data.
+ * Nodes of lower levels contain pointers to nodes.
+ *
+ * A double pointer in the array of an i-level node, pointing to an (i+1)-level
+ * node (such as inside quotatree_find_state), is marked by level (i+1), not i.
+ * A level 0 double pointer is a pointer to the root inside the tree struct.
+ *
+ * The tree is permanent, i.e. all allocated index blocks are kept alive to
+ * preserve the block numbers in the quota file tree and keep its changes
+ * local.
+ */
+struct quotatree_node {
+ struct list_head list;
+ quotaid_t num;
+ void *blocks[QUOTATREE_BSIZE];
+};
+
+struct quotatree_level {
+ struct list_head usedlh, freelh;
+ quotaid_t freenum;
+};
+
+struct quotatree_tree {
+ struct quotatree_level levels[QUOTATREE_DEPTH];
+ struct quotatree_node *root;
+ unsigned int leaf_num;
+};
+
+struct quotatree_find_state {
+ void **block;
+ int level;
+};
+
+/* number of leafs (objects) and leaf level of the tree */
+#define QTREE_LEAFNUM(tree) ((tree)->leaf_num)
+#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1])
+
+struct quotatree_tree *quotatree_alloc(void);
+void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
+ struct quotatree_find_state *st);
+int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
+ struct quotatree_find_state *st, void *data);
+void quotatree_remove(struct quotatree_tree *tree, quotaid_t id);
+void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *));
+void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id);
+void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index);
+
+#endif /* _VZDQ_TREE_H */
+
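The quota tree is a fixed-depth radix tree over 32-bit ids: each of the QUOTATREE_DEPTH levels consumes QUOTAID_BBITS (4) bits, with QUOTATREE_BSHIFT(lvl) giving the shift for a level. A hypothetical helper showing the decomposition (quotatree_index() is illustrative only):

    /* Index of 'id' within a node at level 'lvl' (level 0 is the root). */
    static inline unsigned int quotatree_index(quotaid_t id, int lvl)
    {
            return (id >> QUOTATREE_BSHIFT(lvl)) & QUOTATREE_BMASK;
    }

    /* e.g. id 0x12345678 selects slot 0x1 at level 0 (bits 31..28)
     * and slot 0x8 at level 7 (bits 3..0). */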
diff --git a/include/linux/vzevent.h b/include/linux/vzevent.h
new file mode 100644
index 0000000..1a67297
--- /dev/null
+++ b/include/linux/vzevent.h
@@ -0,0 +1,13 @@
+#ifndef __LINUX_VZ_EVENT_H__
+#define __LINUX_VZ_EVENT_H__
+
+#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE)
+extern int vzevent_send(int msg, const char *attrs_fmt, ...);
+#else
+static inline int vzevent_send(int msg, const char *attrs_fmt, ...)
+{
+ return 0;
+}
+#endif
+
+#endif /* __LINUX_VZ_EVENT_H__ */
diff --git a/include/linux/vziptable_defs.h b/include/linux/vziptable_defs.h
new file mode 100644
index 0000000..204e9d8
--- /dev/null
+++ b/include/linux/vziptable_defs.h
@@ -0,0 +1,81 @@
+#ifndef _LINUX_VZIPTABLE_DEFS_H
+#define _LINUX_VZIPTABLE_DEFS_H
+
+#include <linux/types.h>
+#include <linux/sched.h>
+
+/*
+ * These masks represent modules
+ *
+ * Strictly speaking we use only a small subset
+ * of these bits nowadays, but we MUST RESERVE all
+ * the bits that were ever used for the sake of ABI
+ * compatibility (i.e. compatibility with the vzctl user-space utility)
+ *
+ * DON'T EVER DELETE/MODIFY THESE BITS
+ */
+#define VE_IPT_GENERATE(name, shift) name = (1U << shift)
+
+enum ve_ipt_mods {
+ VE_IPT_GENERATE(VE_IP_IPTABLES_MOD, 0),
+ VE_IPT_GENERATE(VE_IP_FILTER_MOD, 1),
+ VE_IPT_GENERATE(VE_IP_MANGLE_MOD, 2),
+ VE_IPT_GENERATE(VE_IP_MATCH_LIMIT_MOD, 3),
+ VE_IPT_GENERATE(VE_IP_MATCH_MULTIPORT_MOD, 4),
+ VE_IPT_GENERATE(VE_IP_MATCH_TOS_MOD, 5),
+ VE_IPT_GENERATE(VE_IP_TARGET_TOS_MOD, 6),
+ VE_IPT_GENERATE(VE_IP_TARGET_REJECT_MOD, 7),
+ VE_IPT_GENERATE(VE_IP_TARGET_TCPMSS_MOD, 8),
+ VE_IPT_GENERATE(VE_IP_MATCH_TCPMSS_MOD, 9),
+ VE_IPT_GENERATE(VE_IP_MATCH_TTL_MOD, 10),
+ VE_IPT_GENERATE(VE_IP_TARGET_LOG_MOD, 11),
+ VE_IPT_GENERATE(VE_IP_MATCH_LENGTH_MOD, 12),
+ VE_IPT_GENERATE(VE_IP_CONNTRACK_MOD, 14),
+ VE_IPT_GENERATE(VE_IP_CONNTRACK_FTP_MOD, 15),
+ VE_IPT_GENERATE(VE_IP_CONNTRACK_IRC_MOD, 16),
+ VE_IPT_GENERATE(VE_IP_MATCH_CONNTRACK_MOD, 17),
+ VE_IPT_GENERATE(VE_IP_MATCH_STATE_MOD, 18),
+ VE_IPT_GENERATE(VE_IP_MATCH_HELPER_MOD, 19),
+ VE_IPT_GENERATE(VE_IP_NAT_MOD, 20),
+ VE_IPT_GENERATE(VE_IP_NAT_FTP_MOD, 21),
+ VE_IPT_GENERATE(VE_IP_NAT_IRC_MOD, 22),
+ VE_IPT_GENERATE(VE_IP_TARGET_REDIRECT_MOD, 23),
+ VE_IPT_GENERATE(VE_IP_MATCH_OWNER_MOD, 24),
+ VE_IPT_GENERATE(VE_IP_MATCH_MAC_MOD, 25),
+ VE_IPT_GENERATE(VE_IP_IPTABLES6_MOD, 26),
+ VE_IPT_GENERATE(VE_IP_FILTER6_MOD, 27),
+ VE_IPT_GENERATE(VE_IP_MANGLE6_MOD, 28),
+ VE_IPT_GENERATE(VE_IP_IPTABLE_NAT_MOD, 29),
+ VE_IPT_GENERATE(VE_NF_CONNTRACK_MOD, 30),
+};
+
+/* these masks represent modules with their dependencies */
+#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD)
+#define VE_IP_FILTER (VE_IP_FILTER_MOD | VE_IP_IPTABLES)
+#define VE_IP_MANGLE (VE_IP_MANGLE_MOD | VE_IP_IPTABLES)
+#define VE_IP_IPTABLES6 (VE_IP_IPTABLES6_MOD)
+#define VE_IP_FILTER6 (VE_IP_FILTER6_MOD | VE_IP_IPTABLES6)
+#define VE_IP_MANGLE6 (VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6)
+#define VE_NF_CONNTRACK (VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD | VE_IP_CONNTRACK)
+#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT (VE_IP_NAT_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD | VE_IP_NAT | VE_IP_CONNTRACK_FTP)
+#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD | VE_IP_NAT | VE_IP_CONNTRACK_IRC)
+#define VE_IP_IPTABLE_NAT (VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK)
+
+/* safe iptables mask to be used by default */
+#define VE_IP_DEFAULT (VE_IP_IPTABLES | VE_IP_FILTER | VE_IP_MANGLE)
+
+/* allowed all */
+#define VE_IP_ALL (~0ULL)
+
+#define VE_IPT_CMP(x, y) (((x) & (y)) == (y))
+
+static inline bool mask_ipt_allow(__u64 permitted, __u64 mask)
+{
+ return VE_IPT_CMP(permitted, mask);
+}
+
+#endif /* _LINUX_VZIPTABLE_DEFS_H */
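mask_ipt_allow() succeeds only when every bit of the requested mask, i.e. the module together with all of its dependencies, is present in the permitted set. A hypothetical check (can_use_nat_ftp() is illustrative):

    static bool can_use_nat_ftp(__u64 permitted)
    {
            return mask_ipt_allow(permitted, VE_IP_NAT_FTP);
    }

    /* VE_IP_DEFAULT contains neither conntrack nor NAT bits, so this
     * returns false for a container created with the default mask. */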
diff --git a/include/linux/vzquota.h b/include/linux/vzquota.h
new file mode 100644
index 0000000..1dba5fa
--- /dev/null
+++ b/include/linux/vzquota.h
@@ -0,0 +1,380 @@
+/*
+ *
+ * Copyright (C) 2001-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * This file contains Virtuozzo disk quota implementation
+ */
+
+#ifndef _VZDQUOTA_H
+#define _VZDQUOTA_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/* vzquotactl syscall commands */
+#define VZ_DQ_CREATE 5 /* create quota master block */
+#define VZ_DQ_DESTROY 6 /* destroy qmblk */
+#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */
+#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */
+#define VZ_DQ_SETLIMIT 9 /* set new limits */
+#define VZ_DQ_GETSTAT 10 /* get usage statistic */
+#define VZ_DQ_OFF_FORCED 11 /* forced off */
+/* set of syscalls to maintain UGID quotas */
+#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */
+#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */
+#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */
+#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */
+#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */
+#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */
+#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */
+#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */
+
+/* common structure for vz and ugid quota */
+struct dq_stat {
+ /* blocks limits */
+ __u64 bhardlimit; /* absolute limit in bytes */
+ __u64 bsoftlimit; /* preferred limit in bytes */
+ time_t btime; /* time limit for excessive disk use */
+ __u64 bcurrent; /* current bytes count */
+ /* inodes limits */
+ __u32 ihardlimit; /* absolute limit on allocated inodes */
+ __u32 isoftlimit; /* preferred inode limit */
+ time_t itime; /* time limit for excessive inode use */
+ __u32 icurrent; /* current # allocated inodes */
+};
+
+/* One second resolution for grace times */
+#define CURRENT_TIME_SECONDS (get_seconds())
+
+/* Values for dq_info->flags */
+#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */
+#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */
+
+struct dq_info {
+ time_t bexpire; /* expire timeout for excessive disk use */
+ time_t iexpire; /* expire timeout for excessive inode use */
+	unsigned flags;		/* see previous defines */
+};
+
+struct vz_quota_stat {
+ struct dq_stat dq_stat;
+ struct dq_info dq_info;
+};
+
+/* UID/GID interface record - for user-kernel level exchange */
+struct vz_quota_iface {
+ unsigned int qi_id; /* UID/GID this applies to */
+ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */
+ struct dq_stat qi_stat; /* limits, options, usage stats */
+};
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+struct compat_dq_stat {
+ /* blocks limits */
+ __u64 bhardlimit; /* absolute limit in bytes */
+ __u64 bsoftlimit; /* preferred limit in bytes */
+ compat_time_t btime; /* time limit for excessive disk use */
+ __u64 bcurrent; /* current bytes count */
+ /* inodes limits */
+ __u32 ihardlimit; /* absolute limit on allocated inodes */
+ __u32 isoftlimit; /* preferred inode limit */
+ compat_time_t itime; /* time limit for excessive inode use */
+ __u32 icurrent; /* current # allocated inodes */
+};
+
+struct compat_dq_info {
+ compat_time_t bexpire; /* expire timeout for excessive disk use */
+ compat_time_t iexpire; /* expire timeout for excessive inode use */
+	unsigned flags;			/* see previous defines */
+};
+
+struct compat_vz_quota_stat {
+ struct compat_dq_stat dq_stat;
+ struct compat_dq_info dq_info;
+};
+
+struct compat_vz_quota_iface {
+ unsigned int qi_id; /* UID/GID this applies to */
+ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */
+ struct compat_dq_stat qi_stat; /* limits, options, usage stats */
+};
+
+static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs,
+ struct dq_stat *dqs)
+{
+ dqs->bhardlimit = odqs->bhardlimit;
+ dqs->bsoftlimit = odqs->bsoftlimit;
+ dqs->bcurrent = odqs->bcurrent;
+ dqs->btime = odqs->btime;
+
+ dqs->ihardlimit = odqs->ihardlimit;
+ dqs->isoftlimit = odqs->isoftlimit;
+ dqs->icurrent = odqs->icurrent;
+ dqs->itime = odqs->itime;
+}
+
+static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi,
+ struct dq_info *dqi)
+{
+ dqi->bexpire = odqi->bexpire;
+ dqi->iexpire = odqi->iexpire;
+ dqi->flags = odqi->flags;
+}
+
+static inline void dqstat2compat_dqstat(struct dq_stat *dqs,
+ struct compat_dq_stat *odqs)
+{
+ odqs->bhardlimit = dqs->bhardlimit;
+ odqs->bsoftlimit = dqs->bsoftlimit;
+ odqs->bcurrent = dqs->bcurrent;
+ odqs->btime = (compat_time_t)dqs->btime;
+
+ odqs->ihardlimit = dqs->ihardlimit;
+ odqs->isoftlimit = dqs->isoftlimit;
+ odqs->icurrent = dqs->icurrent;
+ odqs->itime = (compat_time_t)dqs->itime;
+}
+
+static inline void dqinfo2compat_dqinfo(struct dq_info *dqi,
+ struct compat_dq_info *odqi)
+{
+ odqi->bexpire = (compat_time_t)dqi->bexpire;
+ odqi->iexpire = (compat_time_t)dqi->iexpire;
+ odqi->flags = dqi->flags;
+}
+#endif
+
+/* values for flags and dq_flags */
+/* This flag is set if userspace has been unable to provide usage
+ * information about all ugids.
+ * If the flag is set, we don't allocate new UG quota blocks (their
+ * current usage is unknown) or free existing UG quota blocks (so as
+ * not to lose the information that a block is OK). */
+#define VZDQUG_FIXED_SET 0x01
+/* permit to use ugid quota */
+#define VZDQUG_ON 0x02
+#define VZDQ_USRQUOTA 0x10
+#define VZDQ_GRPQUOTA 0x20
+#define VZDQ_NOACT 0x1000 /* not actual */
+#define VZDQ_NOQUOT 0x2000 /* not under quota tree */
+
+struct vz_quota_ugid_stat {
+ unsigned int limit; /* max amount of ugid records */
+ unsigned int count; /* amount of ugid records */
+ unsigned int flags;
+};
+
+struct vz_quota_ugid_setlimit {
+ unsigned int type; /* quota type (USR/GRP) */
+ unsigned int id; /* ugid */
+ struct if_dqblk dqb; /* limits info */
+};
+
+struct vz_quota_ugid_setinfo {
+ unsigned int type; /* quota type (USR/GRP) */
+ struct if_dqinfo dqi; /* grace info */
+};
+
+#ifdef __KERNEL__
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <linux/time.h>
+#include <linux/vzquota_qlnk.h>
+#include <linux/vzdq_tree.h>
+#include <linux/semaphore.h>
+
+/* Values for dq_info flags */
+#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */
+#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */
+
+/* values for dq_state */
+#define VZDQ_STARTING 0 /* created, not turned on yet */
+#define VZDQ_WORKING 1 /* quota created, turned on */
+#define VZDQ_STOPING 2 /* created, turned on and off */
+
+/* master quota record - one per veid */
+struct vz_quota_master {
+ struct list_head dq_hash; /* next quota in hash list */
+ atomic_t dq_count; /* inode reference count */
+ unsigned int dq_flags; /* see VZDQUG_FIXED_SET */
+ unsigned int dq_state; /* see values above */
+ unsigned int dq_id; /* VEID this applies to */
+ struct dq_stat dq_stat; /* limits, grace, usage stats */
+ struct dq_info dq_info; /* grace times and flags */
+ spinlock_t dq_data_lock; /* for dq_stat */
+
+ struct mutex dq_mutex; /* mutex to protect
+ ugid tree */
+
+ struct list_head dq_ilink_list; /* list of vz_quota_ilink */
+ struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */
+ struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */
+ unsigned int dq_ugid_count; /* amount of ugid records */
+ unsigned int dq_ugid_max; /* max amount of ugid records */
+ struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */
+
+ struct path dq_root_path; /* path of fs tree */
+ struct super_block *dq_sb; /* superblock of our quota root */
+};
+
+/* UID/GID quota record - one per pair (quota_master, uid or gid) */
+struct vz_quota_ugid {
+ unsigned int qugid_id; /* UID/GID this applies to */
+ struct dq_stat qugid_stat; /* limits, options, usage stats */
+ int qugid_type; /* USRQUOTA|GRPQUOTA */
+ atomic_t qugid_count; /* reference count */
+};
+
+#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11)
+
+struct vz_quota_datast {
+ struct vz_quota_ilink qlnk;
+};
+
+#define VIRTINFO_QUOTA_GETSTAT 0
+#define VIRTINFO_QUOTA_ON 1
+#define VIRTINFO_QUOTA_OFF 2
+#define VIRTINFO_QUOTA_DISABLE 3
+
+struct virt_info_quota {
+ struct super_block *super;
+ struct dq_stat *qstat;
+};
+
+/*
+ * Interface to VZ quota core
+ */
+#define INODE_QLNK(inode) (&(inode)->i_qlnk)
+#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk)
+
+#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef)
+
+#define VZ_QUOTAO_SETE 1
+#define VZ_QUOTAO_INIT 2
+#define VZ_QUOTAO_DESTR 3
+#define VZ_QUOTAO_SWAP 4
+#define VZ_QUOTAO_INICAL 5
+#define VZ_QUOTAO_DRCAL 6
+#define VZ_QUOTAO_QSET 7
+#define VZ_QUOTAO_TRANS 8
+#define VZ_QUOTAO_ACT 9
+#define VZ_QUOTAO_DTREE 10
+#define VZ_QUOTAO_DET 11
+#define VZ_QUOTAO_ON 12
+#define VZ_QUOTAO_RE_LOCK 13
+
+#define DQUOT_CMD_ALLOC 0
+#define DQUOT_CMD_PREALLOC 1
+#define DQUOT_CMD_CHECK 12
+#define DQUOT_CMD_FORCE 13
+
+extern struct mutex vz_quota_mutex;
+
+void inode_qmblk_lock(struct super_block *sb);
+void inode_qmblk_unlock(struct super_block *sb);
+void qmblk_data_read_lock(struct vz_quota_master *qmblk);
+void qmblk_data_read_unlock(struct vz_quota_master *qmblk);
+void qmblk_data_write_lock(struct vz_quota_master *qmblk);
+void qmblk_data_write_unlock(struct vz_quota_master *qmblk);
+
+/* for quota operations */
+void vzquota_inode_init_call(struct inode *inode);
+void vzquota_inode_swap_call(struct inode *, struct inode *);
+void vzquota_inode_drop_call(struct inode *inode);
+int vzquota_inode_transfer_call(struct inode *, struct iattr *);
+struct vz_quota_master *vzquota_inode_data(struct inode *inode,
+ struct vz_quota_datast *);
+void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *);
+int vzquota_rename_check(struct inode *inode,
+ struct inode *old_dir, struct inode *new_dir);
+struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode);
+/* for second-level quota */
+struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
+/* for management operations */
+struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
+ struct vz_quota_stat *qstat);
+void vzquota_free_master(struct vz_quota_master *);
+struct vz_quota_master *vzquota_find_master(unsigned int quota_id);
+int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
+ struct vz_quota_master *qmblk, char __user *buf);
+int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk,
+ char __user *buf, int force);
+int vzquota_get_super(struct super_block *sb);
+void vzquota_put_super(struct super_block *sb);
+
+static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk)
+{
+ if (!atomic_read(&qmblk->dq_count))
+ BUG();
+ atomic_inc(&qmblk->dq_count);
+ return qmblk;
+}
+
+static inline void __qmblk_put(struct vz_quota_master *qmblk)
+{
+ atomic_dec(&qmblk->dq_count);
+}
+
+static inline void qmblk_put(struct vz_quota_master *qmblk)
+{
+ if (!atomic_dec_and_test(&qmblk->dq_count))
+ return;
+ vzquota_free_master(qmblk);
+}
+
+extern struct list_head vzquota_hash_table[];
+extern int vzquota_hash_size;
+
+/*
+ * Interface to VZ UGID quota
+ */
+extern struct quotactl_ops vz_quotactl_operations;
+extern struct dquot_operations vz_quota_operations2;
+extern struct quota_format_type vz_quota_empty_v2_format;
+
+#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? \
+ qmblk->dq_uid_tree : \
+ qmblk->dq_gid_tree)
+
+#define VZDQUG_FIND_DONT_ALLOC 1
+#define VZDQUG_FIND_FAKE 2
+struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
+ unsigned int quota_id, int type, int flags);
+struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
+ unsigned int quota_id, int type, int flags);
+struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid);
+void vzquota_put_ugid(struct vz_quota_master *qmblk,
+ struct vz_quota_ugid *qugid);
+void vzquota_kill_ugid(struct vz_quota_master *qmblk);
+int vzquota_ugid_init(void);
+void vzquota_ugid_release(void);
+int vzquota_transfer_usage(struct inode *inode, int mask,
+ struct vz_quota_ilink *qlnk);
+void vzquota_inode_off(struct inode *inode);
+
+long do_vzquotaugidctl(int cmd, unsigned int quota_id,
+ unsigned int ugid_index, unsigned int ugid_size,
+ void *addr, int compat);
+
+/*
+ * Other VZ quota parts
+ */
+extern struct dquot_operations vz_quota_operations;
+
+long do_vzquotactl(int cmd, unsigned int quota_id,
+ struct vz_quota_stat __user *qstat, const char __user *ve_root,
+ int compat);
+int vzquota_proc_init(void);
+void vzquota_proc_release(void);
+struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
+
+void vzaquota_init(void);
+void vzaquota_fini(void);
+
+#endif /* __KERNEL__ */
+
+#endif /* _VZDQUOTA_H */
diff --git a/include/linux/vzquota_qlnk.h b/include/linux/vzquota_qlnk.h
new file mode 100644
index 0000000..2788c41
--- /dev/null
+++ b/include/linux/vzquota_qlnk.h
@@ -0,0 +1,25 @@
+/*
+ * include/linux/vzquota_qlnk.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef _VZDQUOTA_QLNK_H
+#define _VZDQUOTA_QLNK_H
+
+struct vz_quota_master;
+struct vz_quota_ugid;
+
+/* inode link, used to track inodes using quota via dq_ilink_list */
+struct vz_quota_ilink {
+ struct vz_quota_master *qmblk;
+ struct vz_quota_ugid *qugid[MAXQUOTAS];
+ struct list_head list;
+ unsigned char origin[2];
+};
+
+#endif /* _VZDQUOTA_QLNK_H */
diff --git a/include/linux/vzratelimit.h b/include/linux/vzratelimit.h
new file mode 100644
index 0000000..f26baad
--- /dev/null
+++ b/include/linux/vzratelimit.h
@@ -0,0 +1,28 @@
+/*
+ * include/linux/vzratelimit.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VZ_RATELIMIT_H__
+#define __VZ_RATELIMIT_H__
+
+/*
+ * Generic ratelimiting stuff.
+ */
+
+struct vz_rate_info {
+ int burst;
+	int interval;		/* jiffies per event */
+ int bucket; /* kind of leaky bucket */
+ unsigned long last; /* last event */
+};
+
+/* Return true if rate limit permits. */
+int vz_ratelimit(struct vz_rate_info *p);
+
+#endif /* __VZ_RATELIMIT_H__ */
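Note: the field comments only hint at the algorithm, so below is a hypothetical user-space sketch of the leaky-bucket behaviour the fields suggest vz_ratelimit() implements. It is not part of the patch, and the real in-kernel implementation may differ in detail:

/*
 * Hypothetical sketch, not part of the patch; "now" stands in for jiffies.
 */
#include <stdbool.h>
#include <stdio.h>

struct rate_info {
	int burst;		/* maximum credit that may accumulate */
	int bucket;		/* current credit (leaky-bucket level) */
	int interval;		/* ticks that must pass to earn one credit */
	unsigned long last;	/* tick of the last refill */
};

static bool ratelimit(struct rate_info *p, unsigned long now)
{
	if (p->interval > 0) {
		long earned = (long)(now - p->last) / p->interval;

		if (earned > 0) {
			/* refill credit for the time that has passed */
			p->bucket += earned;
			if (p->bucket > p->burst)
				p->bucket = p->burst;
			p->last += earned * p->interval;
		}
	}
	if (p->bucket <= 0)
		return false;	/* over the limit: deny the event */
	p->bucket--;
	return true;		/* event permitted */
}

int main(void)
{
	struct rate_info ri = { .burst = 5, .bucket = 5, .interval = 10, .last = 0 };
	unsigned long now;
	int passed = 0;

	for (now = 0; now < 100; now++)
		passed += ratelimit(&ri, now);
	/* the initial burst passes, then roughly one event per interval */
	printf("events passed in 100 ticks: %d\n", passed);
	return 0;
}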
diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
new file mode 100644
index 0000000..c7dfd1f
--- /dev/null
+++ b/include/linux/vzstat.h
@@ -0,0 +1,182 @@
+/*
+ * include/linux/vzstat.h
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#ifndef __VZSTAT_H__
+#define __VZSTAT_H__
+
+struct swap_cache_info_struct {
+ unsigned long add_total;
+ unsigned long del_total;
+ unsigned long find_success;
+ unsigned long find_total;
+ unsigned long noent_race;
+ unsigned long exist_race;
+ unsigned long remove_race;
+};
+
+struct kstat_lat_snap_struct {
+ cycles_t maxlat, totlat;
+ unsigned long count;
+};
+struct kstat_lat_pcpu_snap_struct {
+ cycles_t maxlat, totlat;
+ unsigned long count;
+ seqcount_t lock;
+} ____cacheline_aligned_in_smp;
+
+struct kstat_lat_struct {
+ struct kstat_lat_snap_struct cur, last;
+ cycles_t avg[3];
+};
+struct kstat_lat_pcpu_struct {
+ struct kstat_lat_pcpu_snap_struct *cur;
+ cycles_t max_snap;
+ struct kstat_lat_snap_struct last;
+ cycles_t avg[3];
+};
+
+struct kstat_perf_snap_struct {
+ cycles_t wall_tottime, cpu_tottime;
+ cycles_t wall_maxdur, cpu_maxdur;
+ unsigned long count;
+};
+struct kstat_perf_struct {
+ struct kstat_perf_snap_struct cur, last;
+};
+
+struct kstat_zone_avg {
+ unsigned long free_pages_avg[3],
+ nr_active_avg[3],
+ nr_inactive_avg[3];
+};
+
+#define KSTAT_ALLOCSTAT_NR 5
+
+struct kernel_stat_glob {
+ unsigned long nr_unint_avg[3];
+
+ unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR];
+ struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
+ struct kstat_lat_pcpu_struct sched_lat;
+ struct kstat_lat_struct swap_in;
+
+ struct kstat_perf_struct ttfp, cache_reap,
+ refill_inact, shrink_icache, shrink_dcache;
+
+ struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */
+} ____cacheline_aligned;
+
+extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
+extern spinlock_t kstat_glb_lock;
+
+#ifdef CONFIG_VE
+#define KSTAT_PERF_ENTER(name) \
+ unsigned long flags; \
+ cycles_t start, sleep_time; \
+ \
+ start = get_cycles(); \
+ sleep_time = VE_TASK_INFO(current)->sleep_time; \
+
+#define KSTAT_PERF_LEAVE(name) \
+ spin_lock_irqsave(&kstat_glb_lock, flags); \
+ kstat_glob.name.cur.count++; \
+ start = get_cycles() - start; \
+ if (kstat_glob.name.cur.wall_maxdur < start) \
+ kstat_glob.name.cur.wall_maxdur = start;\
+ kstat_glob.name.cur.wall_tottime += start; \
+ start -= VE_TASK_INFO(current)->sleep_time - \
+ sleep_time; \
+ if (kstat_glob.name.cur.cpu_maxdur < start) \
+ kstat_glob.name.cur.cpu_maxdur = start; \
+ kstat_glob.name.cur.cpu_tottime += start; \
+ spin_unlock_irqrestore(&kstat_glb_lock, flags); \
+
+#else
+#define KSTAT_PERF_ENTER(name)
+#define KSTAT_PERF_LEAVE(name)
+#endif
+
+/*
+ * Add another statistics reading.
+ * Serialization is the caller's responsibility.
+ */
+static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p,
+ cycles_t dur)
+{
+ p->cur.count++;
+ if (p->cur.maxlat < dur)
+ p->cur.maxlat = dur;
+ p->cur.totlat += dur;
+}
+
+static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu,
+ cycles_t dur)
+{
+ struct kstat_lat_pcpu_snap_struct *cur;
+
+ cur = per_cpu_ptr(p->cur, cpu);
+ write_seqcount_begin(&cur->lock);
+ cur->count++;
+ if (cur->maxlat < dur)
+ cur->maxlat = dur;
+ cur->totlat += dur;
+ write_seqcount_end(&cur->lock);
+}
+
+/*
+ * Move the current statistics to last and reset the current maximum.
+ * Serialization is the caller's responsibility.
+ */
+static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p)
+{
+ cycles_t m;
+ memcpy(&p->last, &p->cur, sizeof(p->last));
+ p->cur.maxlat = 0;
+ m = p->last.maxlat;
+ CALC_LOAD(p->avg[0], EXP_1, m)
+ CALC_LOAD(p->avg[1], EXP_5, m)
+ CALC_LOAD(p->avg[2], EXP_15, m)
+}
+
+static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)
+{
+ unsigned i, cpu;
+ struct kstat_lat_pcpu_snap_struct snap, *cur;
+ cycles_t m;
+
+ memset(&p->last, 0, sizeof(p->last));
+ for_each_online_cpu(cpu) {
+ cur = per_cpu_ptr(p->cur, cpu);
+ do {
+ i = read_seqcount_begin(&cur->lock);
+ memcpy(&snap, cur, sizeof(snap));
+ } while (read_seqcount_retry(&cur->lock, i));
+ /*
+		 * The read above and this update of maxlat are not atomic,
+		 * but that is OK: updates happen rarely and losing
+		 * a couple of peaks is not essential. -- xemul
+ */
+ cur->maxlat = 0;
+
+ p->last.count += snap.count;
+ p->last.totlat += snap.totlat;
+ if (p->last.maxlat < snap.maxlat)
+ p->last.maxlat = snap.maxlat;
+ }
+
+ m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);
+ CALC_LOAD(p->avg[0], EXP_1, m);
+ CALC_LOAD(p->avg[1], EXP_5, m);
+ CALC_LOAD(p->avg[2], EXP_15, m);
+ /* reset max_snap to calculate it correctly next time */
+ p->max_snap = 0;
+}
+
+#endif /* __VZSTAT_H__ */
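Note: the avg[0..2] slots above are smoothed with the kernel's CALC_LOAD fixed-point exponential moving average (the same machinery used for the load average). A standalone sketch of that arithmetic, not part of the patch; the FSHIFT/FIXED_1/EXP_1 constants are assumed to match the stock kernel definitions:

/*
 * Standalone sketch, not part of the patch.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed point */

#define CALC_LOAD(load, exp, n)			\
	(load) *= (exp);			\
	(load) += (n) * (FIXED_1 - (exp));	\
	(load) >>= FSHIFT;

int main(void)
{
	unsigned long long avg = 0;	/* previously smoothed maximum latency */
	unsigned long long m = 1000;	/* this interval's maximum latency, in cycles */
	int i;

	/* feeding the same sample repeatedly makes avg converge towards m */
	for (i = 0; i < 5; i++) {
		CALC_LOAD(avg, EXP_1, m);
		printf("update %d: avg = %llu\n", i, avg);
	}
	return 0;
}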
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 5c84af8..12bd3c3 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -10,6 +10,13 @@
#ifndef _LINUX_XATTR_H
#define _LINUX_XATTR_H
+#ifdef CONFIG_VE
+extern int ve_xattr_policy;
+#define VE_XATTR_POLICY_ACCEPT 0
+#define VE_XATTR_POLICY_IGNORE 1
+#define VE_XATTR_POLICY_REJECT 2
+#endif
+
#define XATTR_CREATE 0x1 /* set value, fail if attr already exists */
#define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 0f7c378..e2a9043 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -262,5 +262,9 @@ extern int if6_proc_init(void);
extern void if6_proc_exit(void);
#endif
+int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
+ unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
+ __u32 valid_lft);
+
#endif
#endif
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1614d78..660a221 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -10,6 +10,7 @@ extern void unix_inflight(struct file *fp);
extern void unix_notinflight(struct file *fp);
extern void unix_gc(void);
extern void wait_for_unix_gc(void);
+extern void unix_destruct_fds(struct sk_buff *skb);
#define UNIX_HASH_SIZE 256
diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..d60647a 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -10,6 +10,7 @@
#include <linux/in6.h>
#include <asm/atomic.h>
+struct ve_struct;
struct flowi {
int oif;
int iif;
@@ -77,6 +78,9 @@ struct flowi {
#define fl_icmp_code uli_u.icmpt.code
#define fl_ipsec_spi uli_u.spi
#define fl_mh_type uli_u.mht.type
+#ifdef CONFIG_VE
+ struct ve_struct *owner_env;
+#endif
__u32 secid; /* used by xfrm; see secid.txt */
} __attribute__((__aligned__(BITS_PER_LONG/8)));
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 39f2dc9..da58ede 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -15,6 +15,9 @@ struct netns_frags {
struct inet_frag_queue {
struct hlist_node list;
struct netns_frags *net;
+#ifdef CONFIG_VE
+ struct ve_struct *owner_ve;
+#endif
struct list_head lru_list; /* lru list member */
spinlock_t lock;
atomic_t refcnt;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index f93ad90..02d7c27 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -82,6 +82,7 @@ struct inet_timewait_death_row {
struct inet_hashinfo *hashinfo;
int sysctl_tw_recycle;
int sysctl_max_tw_buckets;
+ int ub_managed;
};
extern void inet_twdr_hangman(unsigned long data);
@@ -138,6 +139,7 @@ struct inet_timewait_sock {
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node;
+ envid_t tw_owner_env;
};
static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 15b492a..ebaab2d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -162,6 +162,7 @@ struct fib6_table {
u32 tb6_id;
rwlock_t tb6_lock;
struct fib6_node tb6_root;
+ struct ve_struct *owner_env;
};
#define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index a120284..d0222f3 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -53,6 +53,13 @@ struct net {
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
+ int ifindex;
+
+#ifdef CONFIG_VE
+ struct completion *sysfs_completion;
+ struct ve_struct *owner_ve;
+#endif
+
/* core fib_rules */
struct list_head rules_ops;
spinlock_t rules_mod_lock;
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index 1ee717e..beb916c 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -9,7 +9,7 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
extern int nf_ct_frag6_init(void);
extern void nf_ct_frag6_cleanup(void);
-extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user);
+extern struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user);
extern void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
struct net_device *in,
struct net_device *out,
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 5cf7270..c8a94dc 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -270,6 +270,7 @@ extern struct nf_conn *
nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
+ struct user_beancounter *,
gfp_t gfp);
/* It's confirmed if it is, or has been in the hash table. */
@@ -291,6 +292,7 @@ static inline int nf_ct_is_untracked(const struct sk_buff *skb)
extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
extern unsigned int nf_conntrack_htable_size;
extern unsigned int nf_conntrack_max;
+extern int ip_conntrack_disable_ve0 /* XXX: unused */;
#define NF_CT_STAT_INC(net, count) \
(per_cpu_ptr((net)->ct.stat, raw_smp_processor_id())->count++)
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index a965280..bf3b73c 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -81,6 +81,8 @@ void nf_conntrack_expect_fini(struct net *net);
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple);
+void nf_ct_expect_insert(struct nf_conntrack_expect *exp);
+
struct nf_conntrack_expect *
nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple);
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index 8df0b7f..0c818b4 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -78,6 +78,8 @@ struct nf_conn_nat
#endif
};
+void nf_nat_hash_conntrack(struct net *net, struct nf_conn *ct);
+
/* Set up the info structure to map into this range. */
extern unsigned int nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range,
diff --git a/include/net/netlink_sock.h b/include/net/netlink_sock.h
new file mode 100644
index 0000000..ce4701a
--- /dev/null
+++ b/include/net/netlink_sock.h
@@ -0,0 +1,23 @@
+#ifndef __NET_NETLINK_SOCK_H
+#define __NET_NETLINK_SOCK_H
+
+struct netlink_sock {
+ /* struct sock has to be the first member of netlink_sock */
+ struct sock sk;
+ u32 pid;
+ u32 dst_pid;
+ u32 dst_group;
+ u32 flags;
+ u32 subscriptions;
+ u32 ngroups;
+ unsigned long *groups;
+ unsigned long state;
+ wait_queue_head_t wait;
+ struct netlink_callback *cb;
+ struct mutex *cb_mutex;
+ struct mutex cb_def_mutex;
+ void (*netlink_rcv)(struct sk_buff *skb);
+ struct module *module;
+};
+
+#endif /* __NET_NETLINK_SOCK_H */
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index dfeb2d7..2cd0e7a 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -14,6 +14,7 @@ struct netns_sysctl_ipv6 {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *table;
struct ctl_table_header *frags_hdr;
+ struct ctl_table_header *nf_frags_hdr;
#endif
int bindv6only;
int flush_delay;
@@ -32,6 +33,7 @@ struct netns_ipv6 {
struct ipv6_devconf *devconf_all;
struct ipv6_devconf *devconf_dflt;
struct netns_frags frags;
+ struct netns_frags ct_frags;
#ifdef CONFIG_NETFILTER
struct xt_table *ip6table_filter;
struct xt_table *ip6table_mangle;
diff --git a/include/net/route.h b/include/net/route.h
index 40f6346..1520d3c 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -138,6 +138,7 @@ static inline void ip_rt_put(struct rtable * rt)
#define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3)
extern const __u8 ip_tos2prio[16];
+extern int ip_rt_src_check;
static inline char rt_tos2priority(u8 tos)
{
diff --git a/include/net/sock.h b/include/net/sock.h
index 9f96394..7515fc8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -60,6 +60,8 @@
#include <net/dst.h>
#include <net/checksum.h>
+#include <bc/net.h>
+
/*
* This structure really needs to be cleaned up.
* Most of it is for TCP, and not used by any of
@@ -301,6 +303,8 @@ struct sock {
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
void (*sk_destruct)(struct sock *sk);
+ struct sock_beancounter sk_bc;
+ struct ve_struct *owner_env;
};
/*
@@ -591,6 +595,8 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
})
extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
+extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p,
+ unsigned long amount);
extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
extern int sk_stream_error(struct sock *sk, int flags, int err);
@@ -828,7 +834,8 @@ static inline int sk_has_account(struct sock *sk)
return !!sk->sk_prot->memory_allocated;
}
-static inline int sk_wmem_schedule(struct sock *sk, int size)
+static inline int sk_wmem_schedule(struct sock *sk, int size,
+ struct sk_buff *skb)
{
if (!sk_has_account(sk))
return 1;
@@ -836,12 +843,15 @@ static inline int sk_wmem_schedule(struct sock *sk, int size)
__sk_mem_schedule(sk, size, SK_MEM_SEND);
}
-static inline int sk_rmem_schedule(struct sock *sk, int size)
+static inline int sk_rmem_schedule(struct sock *sk, struct sk_buff *skb)
{
if (!sk_has_account(sk))
return 1;
- return size <= sk->sk_forward_alloc ||
- __sk_mem_schedule(sk, size, SK_MEM_RECV);
+ if (!(skb->truesize <= sk->sk_forward_alloc ||
+ __sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV)))
+ return 0;
+
+ return !ub_sockrcvbuf_charge(sk, skb);
}
static inline void sk_mem_reclaim(struct sock *sk)
@@ -965,6 +975,11 @@ extern struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
unsigned long data_len,
int noblock,
int *errcode);
+extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk,
+ unsigned long size,
+ unsigned long size2,
+ int noblock,
+ int *errcode);
extern void *sock_kmalloc(struct sock *sk, int size,
gfp_t priority);
extern void sock_kfree_s(struct sock *sk, void *mem, int size);
@@ -1327,6 +1342,7 @@ static inline void sock_poll_wait(struct file *filp,
static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
+ WARN_ON(skb->destructor);
skb_orphan(skb);
skb->sk = sk;
skb->destructor = sock_wfree;
@@ -1340,6 +1356,7 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
+ WARN_ON(skb->destructor);
skb_orphan(skb);
skb->sk = sk;
skb->destructor = sock_rfree;
@@ -1562,6 +1579,13 @@ static inline void sk_change_net(struct sock *sk, struct net *net)
sock_net_set(sk, hold_net(net));
}
+static inline void sk_change_net_get(struct sock *sk, struct net *net)
+{
+ struct net *old_net = sock_net(sk);
+ sock_net_set(sk, get_net(net));
+ put_net(old_net);
+}
+
static inline struct sock *skb_steal_sock(struct sk_buff *skb)
{
if (unlikely(skb->sk)) {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 842ac4d..4e8841c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -44,6 +44,13 @@
#include <net/dst.h>
#include <linux/seq_file.h>
+#include <bc/net.h>
+
+#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
+#define TCP_OFF(sk) (sk->sk_sndmsg_off)
+
+#define TW_WSCALE_MASK 0x0f
+#define TW_WSCALE_SPEC 0x10
extern struct inet_hashinfo tcp_hashinfo;
@@ -222,7 +229,9 @@ extern int sysctl_tcp_mem[3];
extern int sysctl_tcp_wmem[3];
extern int sysctl_tcp_rmem[3];
extern int sysctl_tcp_app_win;
+#ifndef sysctl_tcp_adv_win_scale
extern int sysctl_tcp_adv_win_scale;
+#endif
extern int sysctl_tcp_tw_reuse;
extern int sysctl_tcp_frto;
extern int sysctl_tcp_frto_response;
@@ -237,6 +246,10 @@ extern int sysctl_tcp_base_mss;
extern int sysctl_tcp_workaround_signed_windows;
extern int sysctl_tcp_slow_start_after_idle;
extern int sysctl_tcp_max_ssthresh;
+extern int sysctl_tcp_use_sg;
+extern int sysctl_tcp_max_tw_kmem_fraction;
+extern int sysctl_tcp_max_tw_buckets_ub;
+
extern atomic_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -592,7 +605,11 @@ extern u32 __tcp_select_window(struct sock *sk);
* to use only the low 32-bits of jiffies and hide the ugly
* casts with the following macro.
*/
+#ifdef CONFIG_VE
+#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup))
+#else
#define tcp_time_stamp ((__u32)(jiffies))
+#endif
/* This is what the send packet queuing engine uses to pass
* TCP per-packet control information to the transmission
diff --git a/init/Kconfig b/init/Kconfig
index eb4b337..b480300 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -279,7 +279,7 @@ config TASK_XACCT
config TASK_IO_ACCOUNTING
bool "Enable per-task storage I/O accounting (EXPERIMENTAL)"
- depends on TASK_XACCT
+ depends on TASK_XACCT && BEANCOUNTERS
help
Collect information on the number of bytes of storage I/O which this
task has caused.
@@ -457,17 +457,18 @@ config RT_GROUP_SCHED
choice
depends on GROUP_SCHED
prompt "Basis for grouping tasks"
- default USER_SCHED
+ default VZ_FAIRSCHED
config USER_SCHED
bool "user id"
+ depends on !VE
help
This option will choose userid as the basis for grouping
tasks, thus providing equal CPU bandwidth to each user.
config CGROUP_SCHED
bool "Control groups"
- depends on CGROUPS
+ depends on CGROUPS && !VE
help
This option allows you to create arbitrary task groups
using the "cgroup" pseudo filesystem and control
@@ -475,6 +476,12 @@ config CGROUP_SCHED
Refer to Documentation/cgroups/cgroups.txt for more
information on "cgroup" pseudo filesystem.
+config VZ_FAIRSCHED
+ bool "OpenVZ groups"
+ help
+	  This option adds customizable task groups with an OpenVZ-compatible
+	  syscall and procfs interface.
+
endchoice
menuconfig CGROUPS
@@ -505,7 +512,7 @@ config CGROUP_DEBUG
config CGROUP_NS
bool "Namespace cgroup subsystem"
- depends on CGROUPS
+ depends on CGROUPS && !VE
help
Provides a simple namespace cgroup subsystem to
provide hierarchical naming of sets of namespaces,
@@ -514,7 +521,7 @@ config CGROUP_NS
config CGROUP_FREEZER
bool "Freezer cgroup subsystem"
- depends on CGROUPS
+ depends on CGROUPS && !VE
help
Provides a way to freeze and unfreeze all tasks in a
cgroup.
@@ -528,7 +535,7 @@ config CGROUP_DEVICE
config CPUSETS
bool "Cpuset support"
- depends on CGROUPS
+ depends on CGROUPS && !VE
help
This option will let you create and manage CPUSETs which
allow dynamically partitioning a system into sets of CPUs and
@@ -544,13 +551,14 @@ config PROC_PID_CPUSET
config CGROUP_CPUACCT
bool "Simple CPU accounting cgroup subsystem"
- depends on CGROUPS
+ depends on CGROUPS && !VE
help
Provides a simple Resource Controller for monitoring the
total CPU consumed by the tasks in a cgroup.
config RESOURCE_COUNTERS
bool "Resource counters"
+ depends on !BEANCOUNTERS
help
This option enables controller independent resource accounting
infrastructure that works with cgroups.
@@ -638,6 +646,16 @@ config SYSFS_DEPRECATED_V2
if the original kernel, that came with your distribution, has
this option set to N.
+config SYSFS_DEPRECATED_DYN
+	bool "allow selecting the deprecated sysfs layout at boot"
+ depends on SYSFS
+ default y
+ select SYSFS_DEPRECATED_V2
+ help
+	  This option works like SYSFS_DEPRECATED_V2 but allows selecting the
+	  sysfs layout dynamically, i.e. at boot time.  To select the old
+	  (deprecated) layout, supply the old_sysfs kernel boot parameter.
+
config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
diff --git a/init/calibrate.c b/init/calibrate.c
index 6eb48e5..b07e8d6 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -9,6 +9,7 @@
#include <linux/init.h>
#include <linux/timex.h>
#include <linux/smp.h>
+#include <linux/module.h>
unsigned long lpj_fine;
unsigned long preset_lpj;
@@ -108,6 +109,60 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
#endif
+unsigned long cycles_per_jiffy, cycles_per_clock;
+
+static __devinit void calibrate_cycles(void)
+{
+ unsigned long ticks;
+ cycles_t time;
+
+ ticks = jiffies;
+ while (ticks == jiffies)
+ /* nothing */;
+ time = get_cycles();
+ ticks = jiffies;
+ while (ticks == jiffies)
+ /* nothing */;
+
+ time = get_cycles() - time;
+ cycles_per_jiffy = time;
+ if ((time >> 32) != 0) {
+ printk("CPU too fast! timings are incorrect\n");
+ cycles_per_jiffy = -1;
+ }
+}
+
+EXPORT_SYMBOL(cycles_per_jiffy);
+EXPORT_SYMBOL(cycles_per_clock);
+
+static __devinit void calc_cycles_per_jiffy(void)
+{
+#if 0
+ extern unsigned long fast_gettimeoffset_quotient;
+ unsigned long low, high;
+
+ if (fast_gettimeoffset_quotient != 0) {
+ __asm__("divl %2"
+ :"=a" (low), "=d" (high)
+ :"r" (fast_gettimeoffset_quotient),
+ "0" (0), "1" (1000000/HZ));
+
+ cycles_per_jiffy = low;
+ }
+#endif
+ if (cycles_per_jiffy == 0)
+ calibrate_cycles();
+
+ if (cycles_per_jiffy == 0) {
+ printk(KERN_WARNING "Cycles are stuck! "
+ "Some statistics will not be available.");
+ /* to prevent division by zero in cycles_to_(clocks|jiffies) */
+ cycles_per_jiffy = 1;
+ cycles_per_clock = 1;
+ } else
+ cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC);
+}
+
/*
* This is the number of bits of precision for the loops_per_jiffy. Each
* bit takes on average 1.5/HZ seconds. This (like the original) is a little
@@ -178,5 +233,6 @@ void __cpuinit calibrate_delay(void)
loops_per_jiffy/(500000/HZ),
(loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy);
+ calc_cycles_per_jiffy();
printed = true;
}
diff --git a/init/main.c b/init/main.c
index bc109c7..d06cdc8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -70,6 +70,9 @@
#include <linux/sfi.h>
#include <linux/shmem_fs.h>
#include <trace/boot.h>
+#include <linux/fairsched.h>
+
+#include <bc/beancounter.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -101,6 +104,16 @@ extern void tc_init(void);
enum system_states system_state __read_mostly;
EXPORT_SYMBOL(system_state);
+#ifdef CONFIG_VE
+extern void init_ve_system(void);
+extern void init_ve0(void);
+extern void prepare_ve0_process(struct task_struct *tsk);
+#else
+#define init_ve_system() do { } while (0)
+#define init_ve0() do { } while (0)
+#define prepare_ve0_process(tsk) do { } while (0)
+#endif
+
/*
* Boot command-line arguments
*/
@@ -516,6 +529,8 @@ asmlinkage void __init start_kernel(void)
smp_setup_processor_id();
+ prepare_ve0_process(&init_task);
+
/*
* Need to run as early as possible, to initialize the
* lockdep hash:
@@ -548,6 +563,8 @@ asmlinkage void __init start_kernel(void)
setup_command_line(command_line);
setup_nr_cpu_ids();
setup_per_cpu_areas();
+ init_ve0();
+ ub_init_early();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
build_all_zonelists();
@@ -655,6 +672,7 @@ asmlinkage void __init start_kernel(void)
cred_init();
fork_init(totalram_pages);
proc_caches_init();
+ ub_init_late();
buffer_init();
key_init();
security_init();
@@ -678,6 +696,10 @@ asmlinkage void __init start_kernel(void)
ftrace_init();
+#ifdef CONFIG_BC_RSS_ACCOUNTING
+ ub_init_pbc();
+#endif
+
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}
@@ -768,6 +790,7 @@ static void __init do_initcalls(void)
*/
static void __init do_basic_setup(void)
{
+ init_ve_system();
init_workqueues();
cpuset_init_smp();
usermodehelper_init();
@@ -869,6 +892,7 @@ static int __init kernel_init(void * unused)
start_boot_trace();
smp_init();
+ fairsched_init_late();
sched_init_smp();
do_basic_setup();
diff --git a/init/version.c b/init/version.c
index 52a8b98..ccc6262 100644
--- a/init/version.c
+++ b/init/version.c
@@ -36,6 +36,12 @@ struct uts_namespace init_uts_ns = {
};
EXPORT_SYMBOL_GPL(init_uts_ns);
+struct new_utsname virt_utsname = {
+ /* we need only this field */
+ .release = UTS_RELEASE,
+};
+EXPORT_SYMBOL(virt_utsname);
+
const char linux_banner[] =
"Linux version " UTS_RELEASE
#ifdef LINUX_COMPILE_DISTRIBUTION_OFFICIAL_BUILD
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 7d37047..9202672 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -270,19 +270,14 @@ static struct ctl_table ipc_kern_table[] = {
{}
};
-static struct ctl_table ipc_root_table[] = {
- {
- .ctl_name = CTL_KERN,
- .procname = "kernel",
- .mode = 0555,
- .child = ipc_kern_table,
- },
+static struct ctl_path ipc_path[] = {
+ { .ctl_name = CTL_KERN, .procname = "kernel", },
{}
};
static int __init ipc_sysctl_init(void)
{
- register_sysctl_table(ipc_root_table);
+ register_sysctl_glob_paths(ipc_path, ipc_kern_table, 1);
return 0;
}
diff --git a/ipc/msg.c b/ipc/msg.c
index 779f762..2d6b826 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -184,6 +184,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
int id, retval;
key_t key = params->key;
int msgflg = params->flg;
+ int msqid = params->id;
msq = ipc_rcu_alloc(sizeof(*msq));
if (!msq)
@@ -202,7 +203,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
/*
* ipc_addid() locks msq
*/
- id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
+ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, msqid);
if (id < 0) {
security_msg_queue_free(msq);
ipc_rcu_putref(msq);
@@ -324,6 +325,7 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
msg_params.key = key;
msg_params.flg = msgflg;
+ msg_params.id = -1;
return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
}
@@ -943,3 +945,55 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
msq->q_ctime);
}
#endif
+
+#ifdef CONFIG_VE
+#include <linux/module.h>
+
+int sysvipc_setup_msg(key_t key, int msqid, int msgflg)
+{
+ struct ipc_namespace *ns;
+ struct ipc_ops msg_ops;
+ struct ipc_params msg_params;
+
+ ns = current->nsproxy->ipc_ns;
+
+ msg_ops.getnew = newque;
+ msg_ops.associate = msg_security;
+ msg_ops.more_checks = NULL;
+
+ msg_params.key = key;
+ msg_params.flg = msgflg | IPC_CREAT;
+ msg_params.id = msqid;
+
+ return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params);
+}
+EXPORT_SYMBOL_GPL(sysvipc_setup_msg);
+
+int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg)
+{
+ int err = 0;
+ struct msg_queue * msq;
+ struct ipc_namespace *ns;
+ int next_id;
+ int total, in_use;
+
+ ns = current->nsproxy->ipc_ns;
+
+ down_write(&msg_ids(ns).rw_mutex);
+ in_use = msg_ids(ns).in_use;
+ for (total = 0, next_id = 0; total < in_use; next_id++) {
+ msq = idr_find(&msg_ids(ns).ipcs_idr, next_id);
+ if (msq == NULL)
+ continue;
+ ipc_lock_by_ptr(&msq->q_perm);
+ err = func(ipc_buildid(next_id, msq->q_perm.seq), msq, arg);
+ msg_unlock(msq);
+ if (err)
+ break;
+ total++;
+ }
+ up_write(&msg_ids(ns).rw_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(sysvipc_walk_msg);
+#endif
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index f095ee2..e9fc268 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -8,6 +8,7 @@
* See the file COPYING for more details.
*/
+#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/security.h>
@@ -18,6 +19,8 @@
#include "util.h"
+#include <bc/kmem.h>
+
DEFINE_SPINLOCK(mq_lock);
/*
@@ -44,52 +47,53 @@ struct msg_msgseg {
#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg))
#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg))
-struct msg_msg *load_msg(const void __user *src, int len)
+struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset,
+ void * data), int len, void * data)
{
struct msg_msg *msg;
struct msg_msgseg **pseg;
int err;
int alen;
+ int offset = 0;
alen = len;
if (alen > DATALEN_MSG)
alen = DATALEN_MSG;
- msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+ msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_UBC);
if (msg == NULL)
return ERR_PTR(-ENOMEM);
msg->next = NULL;
msg->security = NULL;
- if (copy_from_user(msg + 1, src, alen)) {
+ if (load(msg + 1, alen, offset, data)) {
err = -EFAULT;
goto out_err;
}
len -= alen;
- src = ((char __user *)src) + alen;
+ offset += alen;
pseg = &msg->next;
while (len > 0) {
struct msg_msgseg *seg;
alen = len;
if (alen > DATALEN_SEG)
alen = DATALEN_SEG;
- seg = kmalloc(sizeof(*seg) + alen,
- GFP_KERNEL);
+ seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_UBC);
if (seg == NULL) {
err = -ENOMEM;
goto out_err;
}
*pseg = seg;
seg->next = NULL;
- if (copy_from_user(seg + 1, src, alen)) {
+ if (load(seg + 1, alen, offset, data)) {
err = -EFAULT;
goto out_err;
}
pseg = &seg->next;
len -= alen;
- src = ((char __user *)src) + alen;
+ offset += alen;
}
err = security_msg_msg_alloc(msg);
@@ -102,33 +106,58 @@ out_err:
free_msg(msg);
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(sysv_msg_load);
-int store_msg(void __user *dest, struct msg_msg *msg, int len)
+static int do_load_msg(void * dst, int len, int offset, void * data)
+{
+ return copy_from_user(dst, data + offset, len);
+}
+
+struct msg_msg *load_msg(const void __user *src, int len)
+{
+ return sysv_msg_load(do_load_msg, len, (void*)src);
+}
+
+int sysv_msg_store(struct msg_msg *msg,
+ int (*store)(void * src, int len, int offset, void * data),
+ int len, void * data)
{
int alen;
+ int offset = 0;
struct msg_msgseg *seg;
-
+
alen = len;
if (alen > DATALEN_MSG)
alen = DATALEN_MSG;
- if (copy_to_user(dest, msg + 1, alen))
+ if (store(msg + 1, alen, offset, data))
return -1;
len -= alen;
- dest = ((char __user *)dest) + alen;
+ offset += alen;
seg = msg->next;
while (len > 0) {
alen = len;
if (alen > DATALEN_SEG)
alen = DATALEN_SEG;
- if (copy_to_user(dest, seg + 1, alen))
+ if (store(seg + 1, alen, offset, data))
return -1;
len -= alen;
- dest = ((char __user *)dest) + alen;
+ offset += alen;
seg = seg->next;
}
return 0;
}
+EXPORT_SYMBOL_GPL(sysv_msg_store);
+
+static int do_store_msg(void * src, int len, int offset, void * data)
+{
+ return copy_to_user(data + offset, src, len);
+}
+
+int store_msg(void __user *dest, struct msg_msg *msg, int len)
+{
+ return sysv_msg_store(msg, do_store_msg, len, dest);
+}
void free_msg(struct msg_msg *msg)
{
diff --git a/ipc/sem.c b/ipc/sem.c
index 2f2a479..5cd8dc7 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -87,6 +87,8 @@
#include <asm/uaccess.h>
#include "util.h"
+#include <bc/kmem.h>
+
#define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS])
#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm)
@@ -241,6 +243,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
key_t key = params->key;
int nsems = params->u.nsems;
int semflg = params->flg;
+ int semid = params->id;
if (!nsems)
return -EINVAL;
@@ -264,7 +267,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
return retval;
}
- id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);
+ id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, semid);
if (id < 0) {
security_sem_free(sma);
ipc_rcu_putref(sma);
@@ -327,6 +330,7 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
sem_params.key = key;
sem_params.flg = semflg;
sem_params.u.nsems = nsems;
+ sem_params.id = -1;
return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
}
@@ -949,7 +953,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp)
undo_list = current->sysvsem.undo_list;
if (!undo_list) {
- undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+ undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_UBC);
if (undo_list == NULL)
return -ENOMEM;
spin_lock_init(&undo_list->lock);
@@ -1014,7 +1018,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
sem_getref_and_unlock(sma);
/* step 2: allocate new undo structure */
- new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
+ GFP_KERNEL_UBC);
if (!new) {
sem_putref(sma);
return ERR_PTR(-ENOMEM);
@@ -1076,7 +1081,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
if (nsops > ns->sc_semopm)
return -E2BIG;
if(nsops > SEMOPM_FAST) {
- sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
+ sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_UBC);
if(sops==NULL)
return -ENOMEM;
}
@@ -1379,3 +1384,57 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
sma->sem_ctime);
}
#endif
+
+#ifdef CONFIG_VE
+#include <linux/module.h>
+
+int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg)
+{
+ struct ipc_namespace *ns;
+ struct ipc_ops sem_ops;
+ struct ipc_params sem_params;
+
+ ns = current->nsproxy->ipc_ns;
+
+ sem_ops.getnew = newary;
+ sem_ops.associate = sem_security;
+ sem_ops.more_checks = sem_more_checks;
+
+ sem_params.key = key;
+ sem_params.flg = semflg | IPC_CREAT;
+ sem_params.u.nsems = size;
+ sem_params.id = semid;
+
+ return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+}
+EXPORT_SYMBOL_GPL(sysvipc_setup_sem);
+
+int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg)
+{
+ int err = 0;
+ struct sem_array *sma;
+ struct ipc_namespace *ns;
+ int next_id;
+ int total, in_use;
+
+ ns = current->nsproxy->ipc_ns;
+
+ down_write(&sem_ids(ns).rw_mutex);
+ in_use = sem_ids(ns).in_use;
+ for (total = 0, next_id = 0; total < in_use; next_id++) {
+ sma = idr_find(&sem_ids(ns).ipcs_idr, next_id);
+ if (sma == NULL)
+ continue;
+ ipc_lock_by_ptr(&sma->sem_perm);
+ err = func(ipc_buildid(next_id, sma->sem_perm.seq), sma, arg);
+ sem_unlock(sma);
+ if (err)
+ break;
+ total++;
+ }
+ up_write(&sem_ids(ns).rw_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(sysvipc_walk_sem);
+EXPORT_SYMBOL_GPL(exit_sem);
+#endif
diff --git a/ipc/shm.c b/ipc/shm.c
index e9b039f..8a7214e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -40,27 +40,17 @@
#include <linux/mount.h>
#include <linux/ipc_namespace.h>
#include <linux/ima.h>
+#include <linux/shmem_fs.h>
#include <asm/uaccess.h>
-#include "util.h"
-
-struct shm_file_data {
- int id;
- struct ipc_namespace *ns;
- struct file *file;
- const struct vm_operations_struct *vm_ops;
-};
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
-#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data))
+#include "util.h"
-static const struct file_operations shm_file_operations;
static const struct vm_operations_struct shm_vm_ops;
-#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS])
-
-#define shm_unlock(shp) \
- ipc_unlock(&(shp)->shm_perm)
static int newseg(struct ipc_namespace *, struct ipc_params *);
static void shm_open(struct vm_area_struct *vma);
@@ -113,20 +103,6 @@ void __init shm_init (void)
IPC_SHM_IDS, sysvipc_shm_proc_show);
}
-/*
- * shm_lock_(check_) routines are called in the paths where the rw_mutex
- * is not necessarily held.
- */
-static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
-{
- struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
-
- if (IS_ERR(ipcp))
- return (struct shmid_kernel *)ipcp;
-
- return container_of(ipcp, struct shmid_kernel, shm_perm);
-}
-
static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns,
int id)
{
@@ -295,7 +271,7 @@ static unsigned long shm_get_unmapped_area(struct file *file,
pgoff, flags);
}
-static const struct file_operations shm_file_operations = {
+const struct file_operations shm_file_operations = {
.mmap = shm_mmap,
.fsync = shm_fsync,
.release = shm_release,
@@ -307,6 +283,7 @@ static const struct file_operations shm_file_operations_huge = {
.release = shm_release,
.get_unmapped_area = shm_get_unmapped_area,
};
+EXPORT_SYMBOL_GPL(shm_file_operations);
int is_file_shm_hugepages(struct file *file)
{
@@ -336,11 +313,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
key_t key = params->key;
int shmflg = params->flg;
size_t size = params->u.size;
+ int shmid = params->id;
int error;
struct shmid_kernel *shp;
int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
struct file * file;
- char name[13];
+ char name[64];
int id;
int acctflag = 0;
@@ -365,7 +343,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
return error;
}
- sprintf (name, "SYSV%08x", key);
+ snprintf (name, sizeof(name), "VE%d-SYSV%08x", VEID(get_exec_env()), key);
if (shmflg & SHM_HUGETLB) {
/* hugetlb_file_setup applies strict accounting */
if (shmflg & SHM_NORESERVE)
@@ -386,7 +364,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if (IS_ERR(file))
goto no_file;
- id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
+ id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, shmid);
if (id < 0) {
error = id;
goto no_id;
@@ -461,6 +439,7 @@ SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
shm_params.key = key;
shm_params.flg = shmflg;
shm_params.u.size = size;
+ shm_params.id = -1;
return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}
@@ -1099,3 +1078,67 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
shp->shm_ctim);
}
#endif
+
+#ifdef CONFIG_VE
+#include <linux/module.h>
+
+struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg)
+{
+ struct ipc_namespace *ns;
+ struct ipc_ops shm_ops;
+ struct ipc_params shm_params;
+ struct shmid_kernel *shp;
+ struct file *file;
+ int rv;
+
+ ns = current->nsproxy->ipc_ns;
+
+ shm_ops.getnew = newseg;
+ shm_ops.associate = shm_security;
+ shm_ops.more_checks = shm_more_checks;
+
+ shm_params.key = key;
+ shm_params.flg = shmflg | IPC_CREAT;
+ shm_params.u.size = size;
+ shm_params.id = shmid;
+
+ rv = ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
+ if (rv < 0)
+ return ERR_PTR(rv);
+ shp = shm_lock(ns, rv);
+ BUG_ON(IS_ERR(shp));
+ file = shp->shm_file;
+ get_file(file);
+ shm_unlock(shp);
+ return file;
+}
+EXPORT_SYMBOL_GPL(sysvipc_setup_shm);
+
+int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg)
+{
+ int err = 0;
+ struct shmid_kernel* shp;
+ struct ipc_namespace *ns;
+ int next_id;
+ int total, in_use;
+
+ ns = current->nsproxy->ipc_ns;
+
+ down_write(&shm_ids(ns).rw_mutex);
+ in_use = shm_ids(ns).in_use;
+ for (total = 0, next_id = 0; total < in_use; next_id++) {
+ shp = idr_find(&shm_ids(ns).ipcs_idr, next_id);
+ if (shp == NULL)
+ continue;
+ ipc_lock_by_ptr(&shp->shm_perm);
+ err = func(shp, arg);
+ shm_unlock(shp);
+ if (err)
+ break;
+ total++;
+ }
+ up_write(&shm_ids(ns).rw_mutex);
+ return err;
+}
+EXPORT_SYMBOL_GPL(sysvipc_walk_shm);
+#endif
diff --git a/ipc/util.c b/ipc/util.c
index 79ce84e..4979374 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -38,6 +38,8 @@
#include <asm/unistd.h>
+#include <bc/kmem.h>
+
#include "util.h"
struct ipc_proc_iface {
@@ -238,6 +240,7 @@ int ipc_get_maxid(struct ipc_ids *ids)
* @ids: IPC identifier set
* @new: new IPC permission set
* @size: limit for the number of used ids
+ * @reqid: if >= 0, allocate exactly this id; if -1, any free id will do.
*
* Add an entry 'new' to the IPC ids idr. The permissions object is
* initialised and the first free entry is set up and the id assigned
@@ -247,7 +250,7 @@ int ipc_get_maxid(struct ipc_ids *ids)
* Called with ipc_ids.rw_mutex held as a writer.
*/
-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
+int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid)
{
uid_t euid;
gid_t egid;
@@ -264,7 +267,16 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
rcu_read_lock();
spin_lock(&new->lock);
- err = idr_get_new(&ids->ipcs_idr, new, &id);
+ if (reqid >= 0) {
+ id = reqid % SEQ_MULTIPLIER;
+ err = idr_get_new_above(&ids->ipcs_idr, new, id, &id);
+ if (!err && id != (reqid % SEQ_MULTIPLIER)) {
+ idr_remove(&ids->ipcs_idr, id);
+ err = -EEXIST;
+ }
+ } else
+ err = idr_get_new(&ids->ipcs_idr, new, &id);
+
if (err) {
spin_unlock(&new->lock);
rcu_read_unlock();
@@ -277,9 +289,13 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
new->cuid = new->uid = euid;
new->gid = new->cgid = egid;
- new->seq = ids->seq++;
- if(ids->seq > ids->seq_max)
- ids->seq = 0;
+ if (reqid >= 0) {
+ new->seq = reqid/SEQ_MULTIPLIER;
+ } else {
+ new->seq = ids->seq++;
+ if(ids->seq > ids->seq_max)
+ ids->seq = 0;
+ }
new->id = ipc_buildid(id, new->seq);
return id;
@@ -443,9 +459,9 @@ void* ipc_alloc(int size)
{
void* out;
if(size > PAGE_SIZE)
- out = vmalloc(size);
+ out = ub_vmalloc(size);
else
- out = kmalloc(size, GFP_KERNEL);
+ out = kmalloc(size, GFP_KERNEL_UBC);
return out;
}
@@ -528,14 +544,14 @@ void* ipc_rcu_alloc(int size)
* workqueue if necessary (for vmalloc).
*/
if (rcu_use_vmalloc(size)) {
- out = vmalloc(HDRLEN_VMALLOC + size);
+ out = ub_vmalloc(HDRLEN_VMALLOC + size);
if (out) {
out += HDRLEN_VMALLOC;
container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1;
container_of(out, struct ipc_rcu_hdr, data)->refcount = 1;
}
} else {
- out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL);
+ out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL_UBC);
if (out) {
out += HDRLEN_KMALLOC;
container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0;
@@ -714,6 +730,7 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id)
return out;
}
+EXPORT_SYMBOL_GPL(ipc_lock);
struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id)
{
@@ -800,7 +817,7 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd,
euid = current_euid();
if (euid == ipcp->cuid ||
- euid == ipcp->uid || capable(CAP_SYS_ADMIN))
+ euid == ipcp->uid || capable(CAP_VE_SYS_ADMIN))
return ipcp;
err = -EPERM;
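Note: with the reqid extension above, a restored SysV object gets its old id back because ipc_buildid() recombines the same quotient and remainder that ipc_addid() split off. A standalone arithmetic sketch, not part of the patch, assuming the stock SEQ_MULTIPLIER of IPCMNI (32768):

/*
 * Standalone sketch, not part of the patch; ipc_buildid() below mirrors
 * the in-kernel helper of the same name.
 */
#include <assert.h>
#include <stdio.h>

#define SEQ_MULTIPLIER	32768

static int ipc_buildid(int id, int seq)
{
	return SEQ_MULTIPLIER * seq + id;
}

int main(void)
{
	int reqid = 98307;			/* e.g. an id saved at checkpoint time */
	int idx = reqid % SEQ_MULTIPLIER;	/* idr slot requested: 3 */
	int seq = reqid / SEQ_MULTIPLIER;	/* sequence counter restored: 3 */

	/* restore reuses both parts, so the object gets its old id back */
	assert(ipc_buildid(idx, seq) == reqid);
	printf("idx=%d seq=%d rebuilt=%d\n", idx, seq, ipc_buildid(idx, seq));
	return 0;
}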
diff --git a/ipc/util.h b/ipc/util.h
index 764b51a..15e02c8 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -58,6 +58,7 @@ struct ipc_params {
size_t size; /* for shared memories */
int nsems; /* for semaphores */
} u; /* holds the getnew() specific param */
+ int id;
};
/*
@@ -87,14 +88,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
#define ipc_init_proc_interface(path, header, ids, show) do {} while (0)
#endif
-#define IPC_SEM_IDS 0
-#define IPC_MSG_IDS 1
-#define IPC_SHM_IDS 2
-
#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
/* must be called with ids->rw_mutex acquired for writing */
-int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
+int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int);
/* must be called with ids->rw_mutex acquired for reading */
int ipc_get_maxid(struct ipc_ids *);
@@ -121,7 +118,6 @@ void* ipc_rcu_alloc(int size);
void ipc_rcu_getref(void *ptr);
void ipc_rcu_putref(void *ptr);
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
@@ -163,12 +159,6 @@ static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
spin_lock(&perm->lock);
}
-static inline void ipc_unlock(struct kern_ipc_perm *perm)
-{
- spin_unlock(&perm->lock);
- rcu_read_unlock();
-}
-
struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id);
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
struct ipc_ops *ops, struct ipc_params *params);
diff --git a/kernel/Kconfig.openvz b/kernel/Kconfig.openvz
new file mode 100644
index 0000000..2216a4c
--- /dev/null
+++ b/kernel/Kconfig.openvz
@@ -0,0 +1,92 @@
+# Copyright (C) 2005 SWsoft
+# All rights reserved.
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+menu "OpenVZ"
+
+config VE
+ bool "Virtual Environment support"
+ default y
+ select NAMESPACES
+ select PID_NS
+ select IPC_NS
+ select UTS_NS
+ select NET_NS
+ select USER_NS
+ select CGROUPS
+ select CGROUP_DEVICE
+ select GROUP_SCHED
+ select FAIR_GROUP_SCHED
+ help
+	  This option adds support for virtual Linux environments running on the
+	  original box, with a fully supported virtual network driver, tty
+	  subsystem and configurable access to hardware and other resources.
+
+config VE_CALLS
+ tristate "VE calls interface"
+ depends on VE
+ select VZ_DEV
+ default m
+ help
+	  This option controls how to build the vzmon code containing VE calls.
+	  By default it is built as the vzmon.o module.
+
+config VZ_GENCALLS
+ bool
+ default y
+
+config VE_NETDEV
+ tristate "VE network device"
+ depends on VE_CALLS && NET
+ select VZ_DEV
+ default m
+ help
+	  This option controls whether to build the venet device. This is a
+	  common interface for networking in a VE.
+
+config VE_ETHDEV
+ tristate "Virtual ethernet device"
+ depends on VE_CALLS && NET
+ select VZ_DEV
+ default m
+ help
+	  This option controls whether to build the virtual ethernet device.
+
+config VZ_DEV
+ tristate "VE device"
+ default m
+ help
+	  This option adds support for the vzdev device, which is used by
+	  user-space applications to control Virtual Environments.
+
+config VE_IPTABLES
+ bool "VE netfiltering"
+ depends on VE && VE_NETDEV && INET && NETFILTER
+ default y
+ help
+ This option controls whether to build VE netfiltering code.
+
+config VZ_WDOG
+ tristate "VE watchdog module"
+ depends on VE_CALLS
+ default m
+ help
+	  This option controls building of the vzwdog module, which periodically
+	  dumps a lot of useful system info on the console.
+
+config VZ_CHECKPOINT
+ tristate "Checkpointing & restoring Virtual Environments"
+ depends on X86 || IA64
+ depends on VE_CALLS
+ select PM
+ select PM_SLEEP
+ select TUN
+ select VE_ETHDEV
+ select VE_NETDEV
+ default m
+ help
+	  This option adds two modules, "cpt" and "rst", which allow saving
+	  a running Virtual Environment and restoring it on another host
+	  (live migration) or on the same host (checkpointing).
+
+endmenu
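For reference only (not part of the patch): a hypothetical .config fragment showing the values this menu produces when every option above is left at its default, assuming INET and NETFILTER are already enabled; the real values are driven by the "default" and "select" lines above.

	CONFIG_VE=y
	CONFIG_VE_CALLS=m
	CONFIG_VZ_GENCALLS=y
	CONFIG_VE_NETDEV=m
	CONFIG_VE_ETHDEV=m
	CONFIG_VZ_DEV=m
	CONFIG_VE_IPTABLES=y
	CONFIG_VZ_WDOG=m
	CONFIG_VZ_CHECKPOINT=m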
diff --git a/kernel/Makefile b/kernel/Makefile
index d7c13d2..59704fe 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -28,6 +28,10 @@ obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
+obj-$(CONFIG_BEANCOUNTERS) += bc/
+obj-y += ve/
+obj-$(CONFIG_VZ_CHECKPOINT) += cpt/
+
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
@@ -57,7 +61,11 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
+ifeq ($(CONFIG_VE),n)
obj-$(CONFIG_CGROUPS) += cgroup.o
+else
+obj-$(CONFIG_CGROUPS) += cgroup_lite.o
+endif
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -88,6 +96,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_VZ_FAIRSCHED) += fairsched.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed23..fc40f1b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -662,6 +662,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
char *ctx = NULL;
u32 len;
+ if (!ve_is_super(skb->owner_env))
+ return -ECONNREFUSED;
+
err = audit_netlink_ok(skb, msg_type);
if (err)
return err;
diff --git a/kernel/bc/Kconfig b/kernel/bc/Kconfig
new file mode 100644
index 0000000..962574c
--- /dev/null
+++ b/kernel/bc/Kconfig
@@ -0,0 +1,103 @@
+#
+# User resources part (UBC)
+#
+# Copyright (C) 2005 SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+menu "User resources"
+
+config BEANCOUNTERS
+ bool "Enable user resource accounting"
+ default y
+ help
+	  This patch provides accounting and allows configuring
+	  limits for users' consumption of exhaustible system resources.
+	  The most important resource controlled by this patch is unswappable
+	  memory (either mlock'ed or used by internal kernel structures and
+	  buffers). The main goal of this patch is to protect processes
+	  from running short of important resources because of accidental
+	  misbehavior of processes or malicious activity aiming to ``kill''
+	  the system. It is worth mentioning that resource limits configured
+	  by setrlimit(2) do not give an acceptable level of protection
+	  because they cover only a small fraction of resources and work on a
+	  per-process basis. Per-process accounting doesn't prevent malicious
+	  users from spawning a lot of resource-consuming processes.
+
+config BC_RSS_ACCOUNTING
+ bool "Account physical memory usage"
+ default y
+ depends on BEANCOUNTERS
+ help
+	  This allows estimating per-beancounter physical memory usage.
+	  The implemented algorithm accounts shared pages of memory as well,
+	  dividing them by the number of beancounters which use the page.
+
+config BC_IO_ACCOUNTING
+ bool "Account disk IO"
+ default y
+ depends on BC_RSS_ACCOUNTING
+ help
+	  When on, this option allows seeing disk IO activity caused by
+	  tasks from each UB.
+
+config BC_SWAP_ACCOUNTING
+ bool "Account swap usage"
+ default y
+ depends on BEANCOUNTERS
+ help
+ This allows accounting of swap usage.
+
+config BC_PROC
+ bool "Report resource usage in /proc"
+ default y
+ depends on BEANCOUNTERS
+ help
+ Allows a system administrator to inspect resource accounts and limits.
+
+config BC_DEBUG
+ bool "User resources debug features"
+ default n
+ depends on BEANCOUNTERS
+ help
+	  Enables debug features for user resource accounting.
+
+config BC_DEBUG_IO
+ bool "Debug IO accounting"
+ default y
+ depends on BC_DEBUG && BC_IO_ACCOUNTING
+ help
+	  Debugging for IO accounting.
+
+config BC_DEBUG_KMEM
+ bool "Debug kmemsize with cache counters"
+ default n
+ depends on BC_DEBUG
+ help
+	  Adds a /proc/user_beancounters_debug entry to get statistics
+	  about cache usage of each beancounter.
+
+config BC_KEEP_UNUSED
+ bool "Keep unused beancounter alive"
+ default y
+ depends on BC_DEBUG
+ help
+	  If on, unused beancounters are kept in the hash so their maxheld
+	  values can still be inspected.
+
+config BC_DEBUG_ITEMS
+ bool "Account resources in items rather than in bytes"
+ default y
+ depends on BC_DEBUG
+ help
+	  When true, some of the resources (e.g. kmemsize) are accounted
+	  in items instead of bytes.
+
+config BC_UNLIMITED
+ bool "Use unlimited ubc settings"
+ default y
+ depends on BC_DEBUG
+ help
+	  When on, all limits and barriers are set to their maximum values.
+endmenu
diff --git a/kernel/bc/Makefile b/kernel/bc/Makefile
new file mode 100644
index 0000000..95ee497
--- /dev/null
+++ b/kernel/bc/Makefile
@@ -0,0 +1,15 @@
+#
+# User resources part (UBC)
+#
+# Copyright (C) 2005 SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+obj-y := sys.o beancounter.o dcache.o kmem.o misc.o \
+ vm_pages.o statd.o oom_kill.o
+
+obj-$(CONFIG_NET) += net.o
+obj-$(CONFIG_BC_RSS_ACCOUNTING) += rss_pages.o
+obj-$(CONFIG_BC_PROC) += proc.o
+obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o
diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
new file mode 100644
index 0000000..fdf3bb8
--- /dev/null
+++ b/kernel/bc/beancounter.c
@@ -0,0 +1,715 @@
+/*
+ * linux/kernel/bc/beancounter.c
+ *
+ * Copyright (C) 1998 Alan Cox
+ * 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg>
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * TODO:
+ * - more intelligent limit check in mremap(): currently the new size is
+ * charged and _then_ old size is uncharged
+ * (almost done: !move_vma case is completely done,
+ * move_vma in its current implementation requires too many conditions to
+ * do things right, because it may be not only expansion, but shrinking
+ * also, plus do_munmap will require an additional parameter...)
+ * - problem: bad pmd page handling
+ * - consider /proc redesign
+ * - TCP/UDP ports
+ * + consider whether __charge_beancounter_locked should be inline
+ *
+ * Changes:
+ * 1999/08/17 Marcelo Tosatti <marcelo@conectiva.com.br>
+ * - Set "barrier" and "limit" parts of limits atomically.
+ * 1999/10/06 Marcelo Tosatti <marcelo@conectiva.com.br>
+ * - setublimit system call.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+
+#include <bc/beancounter.h>
+#include <bc/hash.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+
+static struct kmem_cache *ub_cachep;
+static struct user_beancounter default_beancounter;
+struct user_beancounter ub0;
+EXPORT_SYMBOL_GPL(ub0);
+
+const char *ub_rnames[] = {
+ "kmemsize", /* 0 */
+ "lockedpages",
+ "privvmpages",
+ "shmpages",
+ "dummy",
+ "numproc", /* 5 */
+ "physpages",
+ "vmguarpages",
+ "oomguarpages",
+ "numtcpsock",
+ "numflock", /* 10 */
+ "numpty",
+ "numsiginfo",
+ "tcpsndbuf",
+ "tcprcvbuf",
+ "othersockbuf", /* 15 */
+ "dgramrcvbuf",
+ "numothersock",
+ "dcachesize",
+ "numfile",
+ "dummy", /* 20 */
+ "dummy",
+ "dummy",
+ "numiptent",
+ "swappages",
+ "unused_privvmpages", /* UB_RESOURCES */
+ "tmpfs_respages",
+ "held_pages",
+};
+
+static void init_beancounter_struct(struct user_beancounter *ub);
+static void init_beancounter_store(struct user_beancounter *ub);
+static void init_beancounter_nolimits(struct user_beancounter *ub);
+
+int print_ub_uid(struct user_beancounter *ub, char *buf, int size)
+{
+ if (ub->parent != NULL)
+ return snprintf(buf, size, "%u.%u",
+ ub->parent->ub_uid, ub->ub_uid);
+ else
+ return snprintf(buf, size, "%u", ub->ub_uid);
+}
+EXPORT_SYMBOL(print_ub_uid);
+
+#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
+#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
+struct hlist_head ub_hash[UB_HASH_SIZE];
+DEFINE_SPINLOCK(ub_hash_lock);
+LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */
+EXPORT_SYMBOL(ub_hash);
+EXPORT_SYMBOL(ub_hash_lock);
+EXPORT_SYMBOL(ub_list_head);
+
+/*
+ * Per user resource beancounting. Resources are tied to their luid.
+ * The resource structure itself is tagged both to the process and
+ * the charging resources (a socket doesn't want to have to search for
+ * things at irq time for example). Reference counters keep things in
+ * hand.
+ *
+ * The case where a user creates a resource, kills all of his processes and
+ * then starts new ones is correctly handled this way. The refcounters
+ * mean the old entry is still around with the resource tied to it.
+ */
+
+static struct user_beancounter *alloc_ub(uid_t uid, struct user_beancounter *p)
+{
+ struct user_beancounter *new_ub;
+
+	new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep,
+			GFP_KERNEL);
+	if (new_ub == NULL)
+		return NULL;
+
+	ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub);
+
+ if (p == NULL) {
+ memcpy(new_ub, &default_beancounter, sizeof(*new_ub));
+ init_beancounter_struct(new_ub);
+ } else {
+ memset(new_ub, 0, sizeof(*new_ub));
+ init_beancounter_struct(new_ub);
+ init_beancounter_nolimits(new_ub);
+ init_beancounter_store(new_ub);
+ }
+
+ if (percpu_counter_init(&new_ub->ub_orphan_count, 0))
+ goto fail_pcpu;
+
+ new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct);
+ if (new_ub->ub_percpu == NULL)
+ goto fail_free;
+
+ new_ub->ub_uid = uid;
+ new_ub->parent = get_beancounter(p);
+ return new_ub;
+
+fail_free:
+ percpu_counter_destroy(&new_ub->ub_orphan_count);
+fail_pcpu:
+ kmem_cache_free(ub_cachep, new_ub);
+ return NULL;
+}
+
+static inline void __free_ub(struct user_beancounter *ub)
+{
+ free_percpu(ub->ub_percpu);
+ kmem_cache_free(ub_cachep, ub);
+}
+
+static inline void free_ub(struct user_beancounter *ub)
+{
+ percpu_counter_destroy(&ub->ub_orphan_count);
+ __free_ub(ub);
+}
+
+static inline struct user_beancounter *bc_lookup_hash(struct hlist_head *hash,
+ uid_t uid, struct user_beancounter *parent)
+{
+ struct user_beancounter *ub;
+ struct hlist_node *ptr;
+
+ hlist_for_each_entry (ub, ptr, hash, ub_hash)
+ if (ub->ub_uid == uid && ub->parent == parent)
+ return get_beancounter(ub);
+
+ return NULL;
+}
+
+int ub_count;
+
+/* next two must be called under ub_hash_lock */
+static inline void ub_count_inc(struct user_beancounter *ub)
+{
+ if (ub->parent)
+ ub->parent->ub_childs++;
+ else
+ ub_count++;
+}
+
+static inline void ub_count_dec(struct user_beancounter *ub)
+{
+ if (ub->parent)
+ ub->parent->ub_childs--;
+ else
+ ub_count--;
+}
+
+struct user_beancounter *get_beancounter_byuid(uid_t uid, int create)
+{
+ struct user_beancounter *new_ub, *ub;
+ unsigned long flags;
+ struct hlist_head *hash;
+
+ hash = &ub_hash[ub_hash_fun(uid)];
+ new_ub = NULL;
+retry:
+ spin_lock_irqsave(&ub_hash_lock, flags);
+ ub = bc_lookup_hash(hash, uid, NULL);
+ if (ub != NULL) {
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+ if (new_ub != NULL)
+ free_ub(new_ub);
+ return ub;
+ }
+
+ if (!create) {
+ /* no ub found */
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+ return NULL;
+ }
+
+ if (new_ub != NULL) {
+ list_add_rcu(&new_ub->ub_list, &ub_list_head);
+ hlist_add_head(&new_ub->ub_hash, hash);
+ ub_count_inc(new_ub);
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+ return new_ub;
+ }
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+ new_ub = alloc_ub(uid, NULL);
+ if (new_ub == NULL)
+ return NULL;
+
+ goto retry;
+
+}
+EXPORT_SYMBOL(get_beancounter_byuid);
+
+struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p,
+ int id, int create)
+{
+ struct user_beancounter *new_ub, *ub;
+ unsigned long flags;
+ struct hlist_head *hash;
+
+ hash = &ub_hash[ub_subhash_fun(p, id)];
+ new_ub = NULL;
+retry:
+ spin_lock_irqsave(&ub_hash_lock, flags);
+ ub = bc_lookup_hash(hash, id, p);
+ if (ub != NULL) {
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+ if (new_ub != NULL) {
+ put_beancounter(new_ub->parent);
+ free_ub(new_ub);
+ }
+ return ub;
+ }
+
+ if (!create) {
+ /* no ub found */
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+ return NULL;
+ }
+
+ if (new_ub != NULL) {
+ list_add_rcu(&new_ub->ub_list, &ub_list_head);
+ hlist_add_head(&new_ub->ub_hash, hash);
+ ub_count_inc(new_ub);
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+ return new_ub;
+ }
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+ new_ub = alloc_ub(id, p);
+ if (new_ub == NULL)
+ return NULL;
+
+ goto retry;
+}
+EXPORT_SYMBOL(get_subbeancounter_byid);
+
+static void put_warn(struct user_beancounter *ub)
+{
+ char id[64];
+
+ print_ub_uid(ub, id, sizeof(id));
+ printk(KERN_ERR "UB: Bad refcount (%d) on put of %s (%p)\n",
+ atomic_read(&ub->ub_refcount), id, ub);
+}
+
+#ifdef CONFIG_BC_KEEP_UNUSED
+#define release_beancounter(ub) do { } while (0)
+#else
+static int verify_res(struct user_beancounter *ub, int resource,
+ unsigned long held)
+{
+ char id[64];
+
+ if (likely(held == 0))
+ return 1;
+
+ print_ub_uid(ub, id, sizeof(id));
+	printk(KERN_WARNING "Ub %s holds %lu in %s on put\n",
+ id, held, ub_rnames[resource]);
+ return 0;
+}
+
+static inline void bc_verify_held(struct user_beancounter *ub)
+{
+ int i, clean;
+
+ clean = 1;
+ for (i = 0; i < UB_RESOURCES; i++)
+ clean &= verify_res(ub, i, ub->ub_parms[i].held);
+
+ clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages);
+ clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages);
+ clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages);
+
+ ub_debug_trace(!clean, 5, 60*HZ);
+}
+
+static void bc_free_rcu(struct rcu_head *rcu)
+{
+ struct user_beancounter *ub;
+
+ ub = container_of(rcu, struct user_beancounter, rcu);
+ __free_ub(ub);
+}
+
+static void delayed_release_beancounter(struct work_struct *w)
+{
+ struct user_beancounter *ub, *parent;
+ unsigned long flags;
+
+ ub = container_of(w, struct user_beancounter, cleanup.work);
+again:
+ local_irq_save(flags);
+ if (!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock)) {
+ /* raced with get_beancounter_byuid */
+ local_irq_restore(flags);
+ return;
+ }
+
+ hlist_del(&ub->ub_hash);
+ ub_count_dec(ub);
+ list_del_rcu(&ub->ub_list);
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+ bc_verify_held(ub);
+ ub_free_counters(ub);
+ percpu_counter_destroy(&ub->ub_orphan_count);
+
+ parent = ub->parent;
+
+ call_rcu(&ub->rcu, bc_free_rcu);
+ if (parent) {
+ ub = parent;
+ goto again;
+ }
+}
+
+static inline void release_beancounter(struct user_beancounter *ub)
+{
+ struct execute_work *ew;
+
+ ew = &ub->cleanup;
+ INIT_WORK(&ew->work, delayed_release_beancounter);
+ schedule_work(&ew->work);
+}
+#endif
+
+void __put_beancounter(struct user_beancounter *ub)
+{
+ unsigned long flags;
+
+	/* equivalent to atomic_dec_and_lock_irqsave() */
+ local_irq_save(flags);
+ if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
+ if (unlikely(atomic_read(&ub->ub_refcount) < 0))
+ put_warn(ub);
+ local_irq_restore(flags);
+ return;
+ }
+
+ if (unlikely(ub == get_ub0())) {
+ printk(KERN_ERR "Trying to put ub0\n");
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+ return;
+ }
+
+ /* prevent get_beancounter_byuid + put_beancounter() reentrance */
+ atomic_inc(&ub->ub_refcount);
+ spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+ release_beancounter(ub);
+}
+EXPORT_SYMBOL(__put_beancounter);
+
+void put_beancounter_safe(struct user_beancounter *ub)
+{
+ synchronize_rcu();
+ __put_beancounter(ub);
+}
+EXPORT_SYMBOL(put_beancounter_safe);
+
+/*
+ * Generic resource charging stuff
+ */
+
+int __charge_beancounter_locked(struct user_beancounter *ub,
+ int resource, unsigned long val, enum ub_severity strict)
+{
+ ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n",
+ val, resource, ub, ub->ub_parms[resource].held);
+ /*
+ * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition
+	 * at a time is possible, so an overflow is impossible.
+ */
+ ub->ub_parms[resource].held += val;
+
+ switch (strict) {
+ case UB_HARD:
+ if (ub->ub_parms[resource].held >
+ ub->ub_parms[resource].barrier)
+ break;
+ case UB_SOFT:
+ if (ub->ub_parms[resource].held >
+ ub->ub_parms[resource].limit)
+ break;
+ case UB_FORCE:
+ ub_adjust_maxheld(ub, resource);
+ return 0;
+ default:
+ BUG();
+ }
+
+ if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl))
+ printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n",
+ ub_rnames[resource], ub->ub_uid);
+ ub->ub_parms[resource].failcnt++;
+ ub->ub_parms[resource].held -= val;
+ return -ENOMEM;
+}
+
+int charge_beancounter(struct user_beancounter *ub,
+ int resource, unsigned long val, enum ub_severity strict)
+{
+ int retval;
+ struct user_beancounter *p, *q;
+ unsigned long flags;
+
+ retval = -EINVAL;
+ if (val > UB_MAXVALUE)
+ goto out;
+
+ local_irq_save(flags);
+ for (p = ub; p != NULL; p = p->parent) {
+ spin_lock(&p->ub_lock);
+ retval = __charge_beancounter_locked(p, resource, val, strict);
+ spin_unlock(&p->ub_lock);
+ if (retval)
+ goto unroll;
+ }
+out_restore:
+ local_irq_restore(flags);
+out:
+ return retval;
+
+unroll:
+ for (q = ub; q != p; q = q->parent) {
+ spin_lock(&q->ub_lock);
+ __uncharge_beancounter_locked(q, resource, val);
+ spin_unlock(&q->ub_lock);
+ }
+ goto out_restore;
+}
+
+EXPORT_SYMBOL(charge_beancounter);
+
+void __charge_beancounter_notop(struct user_beancounter *ub,
+ int resource, unsigned long val)
+{
+ struct user_beancounter *p;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for (p = ub; p->parent != NULL; p = p->parent) {
+ spin_lock(&p->ub_lock);
+ __charge_beancounter_locked(p, resource, val, UB_FORCE);
+ spin_unlock(&p->ub_lock);
+ }
+ local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__charge_beancounter_notop);
+
+void uncharge_warn(struct user_beancounter *ub, int resource,
+ unsigned long val, unsigned long held)
+{
+ char id[64];
+
+ print_ub_uid(ub, id, sizeof(id));
+ printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n",
+ val, held, ub_rnames[resource], id);
+ ub_debug_trace(1, 10, 10*HZ);
+}
+
+void __uncharge_beancounter_locked(struct user_beancounter *ub,
+ int resource, unsigned long val)
+{
+ ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n",
+ val, resource, ub, ub->ub_parms[resource].held);
+ if (ub->ub_parms[resource].held < val) {
+ uncharge_warn(ub, resource,
+ val, ub->ub_parms[resource].held);
+ val = ub->ub_parms[resource].held;
+ }
+ ub->ub_parms[resource].held -= val;
+}
+
+void uncharge_beancounter(struct user_beancounter *ub,
+ int resource, unsigned long val)
+{
+ unsigned long flags;
+ struct user_beancounter *p;
+
+ for (p = ub; p != NULL; p = p->parent) {
+ spin_lock_irqsave(&p->ub_lock, flags);
+ __uncharge_beancounter_locked(p, resource, val);
+ spin_unlock_irqrestore(&p->ub_lock, flags);
+ }
+}
+
+EXPORT_SYMBOL(uncharge_beancounter);
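For illustration only (not part of the patch): charge_beancounter() above walks from the given beancounter up through its parents and, on failure, unrolls only the levels it already charged; uncharge_beancounter() walks the same chain unconditionally. A hypothetical caller would pair them like this (the example_* function names are invented for the sketch):

	/* Illustrative only: charging one flock against the current context. */
	static int example_take_flock(void)
	{
		struct user_beancounter *ub = get_exec_ub();

		if (charge_beancounter(ub, UB_NUMFLOCK, 1, UB_HARD))
			return -ENOMEM;		/* some level hit its barrier */

		/* ... actually take the file lock here ... */
		return 0;
	}

	/* Illustrative only: the matching uncharge when the lock is dropped. */
	static void example_drop_flock(void)
	{
		uncharge_beancounter(get_exec_ub(), UB_NUMFLOCK, 1);
	}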
+
+void __uncharge_beancounter_notop(struct user_beancounter *ub,
+ int resource, unsigned long val)
+{
+ struct user_beancounter *p;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for (p = ub; p->parent != NULL; p = p->parent) {
+ spin_lock(&p->ub_lock);
+ __uncharge_beancounter_locked(p, resource, val);
+ spin_unlock(&p->ub_lock);
+ }
+ local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(__uncharge_beancounter_notop);
+
+
+/*
+ * Rate limiting stuff.
+ */
+int ub_ratelimit(struct ub_rate_info *p)
+{
+ unsigned long cjif, djif;
+ unsigned long flags;
+ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED;
+ long new_bucket;
+
+ spin_lock_irqsave(&ratelimit_lock, flags);
+ cjif = jiffies;
+ djif = cjif - p->last;
+ if (djif < p->interval) {
+ if (p->bucket >= p->burst) {
+ spin_unlock_irqrestore(&ratelimit_lock, flags);
+ return 0;
+ }
+ p->bucket++;
+ } else {
+ new_bucket = p->bucket - (djif / (unsigned)p->interval);
+ if (new_bucket < 0)
+ new_bucket = 0;
+ p->bucket = new_bucket + 1;
+ }
+ p->last = cjif;
+ spin_unlock_irqrestore(&ratelimit_lock, flags);
+ return 1;
+}
+EXPORT_SYMBOL(ub_ratelimit);
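For illustration only (not part of the patch): ub_ratelimit() is a small bucket limiter that lets at most "burst" messages through per "interval" jiffies, draining the bucket as time passes; it returns 1 when the caller may print and 0 when the message should be suppressed. A hypothetical caller, assuming the ub_rate_info fields shown above (the example_* names are invented):

	/* Illustrative only: throttling a per-beancounter warning. */
	static struct ub_rate_info example_rl = {
		.burst    = 4,		/* at most 4 messages ...        */
		.interval = 300 * HZ,	/* ... per 300-second window     */
	};

	static void example_warn(struct user_beancounter *ub, int resource)
	{
		if (!ub_ratelimit(&example_rl))
			return;		/* suppressed to avoid flooding the log */

		printk(KERN_INFO "UB %u short on %s\n",
				ub->ub_uid, ub_rnames[resource]);
	}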
+
+
+/*
+ * Initialization
+ *
+ * struct user_beancounter contains
+ * - limits and other configuration settings,
+ * with a copy stored for accounting purposes,
+ * - structural fields: lists, spinlocks and so on.
+ *
+ * Before these parts are initialized, the structure should be memset
+ * to 0 or copied from a known clean structure. That takes care of a lot
+ * of fields not initialized explicitly.
+ */
+
+static void init_beancounter_struct(struct user_beancounter *ub)
+{
+ ub->ub_magic = UB_MAGIC;
+ ub->ub_cookie = get_random_int();
+ atomic_set(&ub->ub_refcount, 1);
+ spin_lock_init(&ub->ub_lock);
+ INIT_LIST_HEAD(&ub->ub_tcp_sk_list);
+ INIT_LIST_HEAD(&ub->ub_other_sk_list);
+#ifdef CONFIG_BC_DEBUG_KMEM
+ INIT_LIST_HEAD(&ub->ub_cclist);
+#endif
+}
+
+static void init_beancounter_store(struct user_beancounter *ub)
+{
+ int k;
+
+ for (k = 0; k < UB_RESOURCES; k++) {
+ memcpy(&ub->ub_store[k], &ub->ub_parms[k],
+ sizeof(struct ubparm));
+ }
+}
+
+static void init_beancounter_nolimits(struct user_beancounter *ub)
+{
+ int k;
+
+ for (k = 0; k < UB_RESOURCES; k++) {
+ ub->ub_parms[k].limit = UB_MAXVALUE;
+		/* FIXME: is this right for physpages and guarantees? */
+ ub->ub_parms[k].barrier = UB_MAXVALUE;
+ }
+
+ /* FIXME: set unlimited rate? */
+ ub->ub_limit_rl.burst = 4;
+ ub->ub_limit_rl.interval = 300*HZ;
+}
+
+static void init_beancounter_syslimits(struct user_beancounter *ub)
+{
+ unsigned long mp;
+ extern int max_threads;
+ int k;
+
+ mp = num_physpages;
+ ub->ub_parms[UB_KMEMSIZE].limit =
+ mp > (192*1024*1024 >> PAGE_SHIFT) ?
+ 32*1024*1024 : (mp << PAGE_SHIFT) / 6;
+ ub->ub_parms[UB_LOCKEDPAGES].limit = 8;
+ ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE;
+ ub->ub_parms[UB_SHMPAGES].limit = 64;
+ ub->ub_parms[UB_NUMPROC].limit = max_threads / 2;
+ ub->ub_parms[UB_NUMTCPSOCK].limit = 1024;
+ ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */
+ ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */
+ ub->ub_parms[UB_NUMOTHERSOCK].limit = 256;
+ ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */
+ ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */
+ ub->ub_parms[UB_NUMFLOCK].limit = 1024;
+ ub->ub_parms[UB_NUMPTY].limit = 16;
+ ub->ub_parms[UB_NUMSIGINFO].limit = 1024;
+ ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024;
+ ub->ub_parms[UB_NUMFILE].limit = 1024;
+ ub->ub_parms[UB_SWAPPAGES].limit = UB_MAXVALUE;
+
+ for (k = 0; k < UB_RESOURCES; k++)
+ ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
+
+ ub->ub_limit_rl.burst = 4;
+ ub->ub_limit_rl.interval = 300*HZ;
+}
+
+static DEFINE_PER_CPU(struct ub_percpu_struct, ub0_percpu);
+
+void __init ub_init_early(void)
+{
+ struct user_beancounter *ub;
+
+ init_cache_counters();
+ ub = get_ub0();
+ memset(ub, 0, sizeof(*ub));
+ ub->ub_uid = 0;
+ init_beancounter_nolimits(ub);
+ init_beancounter_store(ub);
+ init_beancounter_struct(ub);
+ ub->ub_percpu = &per_cpu__ub0_percpu;
+
+ memset(&current->task_bc, 0, sizeof(struct task_beancounter));
+ (void)set_exec_ub(ub);
+ current->task_bc.task_ub = get_beancounter(ub);
+ __charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE);
+ current->task_bc.fork_sub = get_beancounter(ub);
+ ub_init_task_bc(&current->task_bc);
+ init_mm.mm_ub = get_beancounter(ub);
+
+ hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]);
+ list_add(&ub->ub_list, &ub_list_head);
+ ub_count_inc(ub);
+}
+
+void __init ub_init_late(void)
+{
+ ub_cachep = kmem_cache_create("user_beancounters",
+ sizeof(struct user_beancounter),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+ memset(&default_beancounter, 0, sizeof(default_beancounter));
+#ifdef CONFIG_BC_UNLIMITED
+ init_beancounter_nolimits(&default_beancounter);
+#else
+ init_beancounter_syslimits(&default_beancounter);
+#endif
+ init_beancounter_store(&default_beancounter);
+ init_beancounter_struct(&default_beancounter);
+}
diff --git a/kernel/bc/dcache.c b/kernel/bc/dcache.c
new file mode 100644
index 0000000..58ace1c
--- /dev/null
+++ b/kernel/bc/dcache.c
@@ -0,0 +1,399 @@
+/*
+ * kernel/bc/dcache.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/dcache.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/stop_machine.h>
+#include <linux/cpumask.h>
+#include <linux/nmi.h>
+#include <linux/rwsem.h>
+#include <linux/rcupdate.h>
+#include <linux/highmem.h>
+#include <asm/bitops.h>
+
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+#include <bc/dcache.h>
+#include <bc/dcache_op.h>
+
+/*
+ * Locking
+ * traverse dcache_lock d_lock
+ * ub_dentry_charge + - +
+ * ub_dentry_uncharge + + -
+ * ub_dentry_charge_nofail + + -
+ *
+ * d_inuse changes are atomic, with special handling of "not in use" <->
+ * "in use" (-1 <-> 0) transitions. We have two sources of non-atomicity
+ * here: (1) in many operations we need to change d_inuse of both dentry and
+ * its parent, and (2) on state transitions we need to adjust the account.
+ *
+ * Regarding (1): we do not have (and do not want) a single lock covering all
+ * operations, so in general it's impossible to get a consistent view of
+ * a tree with respect to d_inuse counters (except by swsuspend). It also
+ * means if a dentry with d_inuse of 0 gets one new in-use child and loses
+ * one, its d_inuse counter will follow either the 0 -> 1 -> 0 path or the
+ * 0 -> -1 -> 0 path, and we can't say which.
+ * Note that path -1 -> 0 -> -1 can't turn into -1 -> -2 -> -1, since
+ * uncharge can be done only after return from charge (with d_genocide being
+ * the only apparent exception).
+ * Regarding (2): there is a similar uncertainty with the dcache account.
+ * If the account is equal to the limit, one more dentry starts being used
+ * and another is put, then either the account will hit the limit (and an
+ * error will be returned), or the decrement will happen before the increment.
+ *
+ * These races do not really matter.
+ * The only things we want are:
+ * - if a system is suspended with no in-use dentries, all d_inuse counters
+ * should be correct (-1);
+ * - d_inuse counters should always be >= -1.
+ * This holds if ->parent references are accessed and maintained properly.
+ * In subtle moments (like d_move) dentries exchanging their parents should
+ * both be in-use. At d_genocide time, lookups and charges are assumed to be
+ * impossible.
+ */
+
+/*
+ * Hierarchical accounting
+ * UB argument must NOT be NULL
+ */
+
+static int do_charge_dcache(struct user_beancounter *ub, unsigned long size,
+ enum ub_severity sv)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv))
+ goto out_mem;
+ if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv))
+ goto out_dcache;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return 0;
+
+out_dcache:
+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
+out_mem:
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return -ENOMEM;
+}
+
+static void do_uncharge_dcache(struct user_beancounter *ub,
+ unsigned long size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
+ __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+static int charge_dcache(struct user_beancounter *ub, unsigned long size,
+ enum ub_severity sv)
+{
+ struct user_beancounter *p, *q;
+
+ for (p = ub; p != NULL; p = p->parent) {
+ if (do_charge_dcache(p, size, sv))
+ goto unroll;
+ }
+ return 0;
+
+unroll:
+ for (q = ub; q != p; q = q->parent)
+ do_uncharge_dcache(q, size);
+ return -ENOMEM;
+}
+
+void uncharge_dcache(struct user_beancounter *ub, unsigned long size)
+{
+ for (; ub != NULL; ub = ub->parent)
+ do_uncharge_dcache(ub, size);
+}
+
+/*
+ * Simple helpers to maintain the account and the d_ub field.
+ */
+
+static inline int d_charge(struct dentry_beancounter *d_bc)
+{
+ struct user_beancounter *ub;
+
+ ub = get_beancounter(get_exec_ub());
+ if (charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) {
+ put_beancounter(ub);
+ return -1;
+ }
+ d_bc->d_ub = ub;
+ return 0;
+}
+
+static inline void d_forced_charge(struct dentry_beancounter *d_bc)
+{
+ struct user_beancounter *ub;
+
+ ub = get_beancounter(get_exec_ub());
+ charge_dcache(ub, d_bc->d_ubsize, UB_FORCE);
+ d_bc->d_ub = ub;
+}
+
+/*
+ * Minor helpers
+ */
+
+extern struct kmem_cache *dentry_cache;
+extern struct kmem_cache *inode_cachep;
+static struct rw_semaphore ub_dentry_alloc_sem;
+
+static inline unsigned long d_charge_size(struct dentry *dentry)
+{
+ /* dentry's d_name is already set to appropriate value (see d_alloc) */
+ return kmem_cache_objuse(inode_cachep) + kmem_cache_objuse(dentry_cache) +
+ (dname_external(dentry) ?
+ kmem_dname_objuse((void *)dentry->d_name.name) : 0);
+}
+
+/*
+ * Entry points from dcache.c
+ */
+
+/*
+ * Set initial d_inuse on d_alloc.
+ * Called with no locks, preemption disabled.
+ */
+int __ub_dentry_alloc(struct dentry *dentry)
+{
+ struct dentry_beancounter *d_bc;
+
+ d_bc = &dentry->dentry_bc;
+ d_bc->d_ub = get_beancounter(get_exec_ub());
+ atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in dcache.h */
+ d_bc->d_ubsize = d_charge_size(dentry);
+
+ if (charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD))
+ goto failure;
+ return 0;
+
+failure:
+ put_beancounter(d_bc->d_ub);
+ d_bc->d_ub = NULL;
+ return -ENOMEM;
+}
+void __ub_dentry_alloc_start(void)
+{
+ down_read(&ub_dentry_alloc_sem);
+ current->task_bc.dentry_alloc = 1;
+}
+
+void __ub_dentry_alloc_end(void)
+{
+ current->task_bc.dentry_alloc = 0;
+ up_read(&ub_dentry_alloc_sem);
+}
+
+/*
+ * It is assumed that parent is already in use, so traverse upwards is
+ * limited to one ancestor only.
+ * Called under d_lock and rcu_read_lock.
+ */
+int __ub_dentry_charge(struct dentry *dentry)
+{
+ struct dentry_beancounter *d_bc;
+ struct dentry *parent;
+ int ret;
+
+ if (ub_dget_testone(dentry)) {
+ d_bc = &dentry->dentry_bc;
+ /* state transition -1 => 0 */
+ if (d_charge(d_bc))
+ goto failure;
+
+ if (dentry != dentry->d_parent) {
+ parent = dentry->d_parent;
+ if (ub_dget_testone(parent))
+ BUG();
+ }
+ }
+ return 0;
+
+failure:
+ /*
+ * Here we would like to fail the lookup.
+ * It is not easy: if d_lookup fails, callers expect that a dentry
+ * with the given name doesn't exist, and create a new one.
+ * So, first we forcedly charge for this dentry.
+ * Then try to remove it from cache safely. If it turns out to be
+ * possible, we can return error.
+ */
+ d_forced_charge(d_bc);
+
+ if (dentry != dentry->d_parent) {
+ parent = dentry->d_parent;
+ if (ub_dget_testone(parent))
+ BUG();
+ }
+
+ ret = 0;
+ if (spin_trylock(&dcache_lock)) {
+ if (!list_empty(&dentry->d_subdirs)) {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dcache_lock);
+ rcu_read_unlock();
+ shrink_dcache_parent(dentry);
+ rcu_read_lock();
+ spin_lock(&dcache_lock);
+ spin_lock(&dentry->d_lock);
+ }
+ if (atomic_read(&dentry->d_count) == 1) {
+ __d_drop(dentry);
+ ret = -1;
+ }
+ spin_unlock(&dcache_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Go up in the tree decreasing d_inuse.
+ * Called under dcache_lock.
+ */
+void __ub_dentry_uncharge(struct dentry *dentry)
+{
+ struct dentry *parent;
+ struct user_beancounter *ub;
+ unsigned long size;
+
+	/* go up until the state doesn't change or the root is reached */
+ size = dentry->dentry_bc.d_ubsize;
+ ub = dentry->dentry_bc.d_ub;
+ while (ub_dput_testzero(dentry)) {
+ /* state transition 0 => -1 */
+ uncharge_dcache(ub, size);
+ put_beancounter(ub);
+
+ parent = dentry->d_parent;
+ if (dentry == parent)
+ break;
+
+ dentry = parent;
+ size = dentry->dentry_bc.d_ubsize;
+ ub = dentry->dentry_bc.d_ub;
+ }
+}
+
+/*
+ * Forced charge for __dget_locked, where API doesn't allow to return error.
+ * Called under dcache_lock.
+ */
+void __ub_dentry_charge_nofail(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ while (ub_dget_testone(dentry)) {
+ /* state transition -1 => 0 */
+ d_forced_charge(&dentry->dentry_bc);
+
+ parent = dentry->d_parent;
+ if (dentry == parent)
+ break;
+ dentry = parent;
+ }
+}
+
+/*
+ * Adaptive accounting
+ */
+
+int ub_dentry_on = 1;
+int ub_dentry_alloc_barrier;
+EXPORT_SYMBOL(ub_dentry_on);
+
+static unsigned long checklowat = 0;
+static unsigned long checkhiwat = ULONG_MAX;
+
+static int sysctl_ub_dentry_chk = 10;
+#define sysctl_ub_lowat sysctl_ub_watermark[0]
+#define sysctl_ub_hiwat sysctl_ub_watermark[1]
+static DECLARE_RWSEM(ub_dentry_alloc_sem);
+/* 1024th of lowmem size */
+static unsigned int sysctl_ub_watermark[2] = {0, 100};
+
+static void ub_dentry_set_limits(unsigned long pages, unsigned long cap)
+{
+ down_write(&ub_dentry_alloc_sem);
+ preempt_disable();
+ checklowat = (pages >> 10) * sysctl_ub_lowat;
+ checkhiwat = (pages >> 10) * sysctl_ub_hiwat;
+ if (checkhiwat > cap) {
+ checkhiwat = cap;
+ checklowat = cap / sysctl_ub_hiwat * sysctl_ub_lowat;
+ }
+ preempt_enable();
+ up_write(&ub_dentry_alloc_sem);
+}
+
+static int ub_dentry_proc_handler(ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int r;
+
+ r = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ if (!r && write)
+ ub_dentry_set_limits(totalram_pages - totalhigh_pages,
+ ULONG_MAX);
+ return r;
+}
+
+static ctl_table ub_dentry_sysctl_table[] = {
+ {
+ .procname = "dentry_check",
+ .data = &sysctl_ub_dentry_chk,
+ .maxlen = sizeof(sysctl_ub_dentry_chk),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "dentry_watermark",
+ .data = &sysctl_ub_lowat,
+ .maxlen = sizeof(sysctl_ub_lowat) * 2,
+ .mode = 0644,
+ .proc_handler = ub_dentry_proc_handler,
+ },
+ { .ctl_name = 0 }
+};
+static ctl_table ub_dentry_sysctl_root[] = {
+ {
+ .procname = "ubc",
+ .mode = 0555,
+ .child = ub_dentry_sysctl_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static int __init ub_dentry_init(void)
+{
+ /*
+ * Initial watermarks are limited, to limit walk time.
+ * 384MB translates into 0.8 sec on PIII 866MHz.
+ */
+ ub_dentry_set_limits(totalram_pages - totalhigh_pages,
+ 384 * 1024 * 1024 / PAGE_SIZE);
+ if (register_sysctl_table(ub_dentry_sysctl_root) == NULL)
+ return -ENOMEM;
+ return 0;
+}
+__initcall(ub_dentry_init);
diff --git a/kernel/bc/io_acct.c b/kernel/bc/io_acct.c
new file mode 100644
index 0000000..428220f
--- /dev/null
+++ b/kernel/bc/io_acct.c
@@ -0,0 +1,501 @@
+/*
+ * kernel/bc/io_acct.c
+ *
+ * Copyright (C) 2006 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/mempool.h>
+#include <linux/proc_fs.h>
+#include <linux/virtinfo.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/rss_pages.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+
+static struct mempool_s *pb_pool;
+
+#define PB_MIN_IO (1024)
+
+static inline struct page_beancounter *io_pb_alloc(void)
+{
+ return mempool_alloc(pb_pool, GFP_ATOMIC);
+}
+
+static inline void io_pb_free(struct page_beancounter *pb)
+{
+ mempool_free(pb, pb_pool);
+}
+
+struct page_beancounter **page_pblist(struct page *page)
+{
+ struct page_beancounter **pb, *iopb;
+
+ pb = &page_pbc(page);
+ iopb = iopb_to_pb(*pb);
+
+ return iopb == NULL ? pb : &iopb->page_pb_list;
+}
+
+/*
+ * We save the context page was set dirty to use it later
+ * when the real write starts. If the page is mapped then
+ * IO pb is stores like this:
+ *
+ * Before saving:
+ *
+ * +- page -------+
+ * | ... |
+ * | page_pb +---+
+ * +--------------+ | +-----+ +-----+ +-----+
+ * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+
+ * +-----+ +-----+ +-----+ |
+ * ^ |
+ * +---------------------------------+
+ *
+ * After saving:
+ *
+ * +- page -------+ +- io pb ------+
+ * | ... | | ... |
+ * | page_pb +----> | page_pb_list +-+
+ * +--------------+ +--------------+ |
+ * |
+ * +-------------------+
+ * |
+ * | +-----+ +-----+ +-----+
+ * +-> | pb1 | -> | pb2 | - ... -> | pbN | -+
+ * +-----+ +-----+ +-----+ |
+ * ^ |
+ * +---------------------------------+
+ *
+ * And the page_pblist(...) function returns pointer to the place that
+ * points to this pbX ring.
+ */
+
+#ifdef CONFIG_BC_DEBUG_IO
+static LIST_HEAD(pb_io_list);
+static unsigned long anon_pages, not_released;
+
+static inline void io_debug_save(struct page_beancounter *pb,
+ struct page_beancounter *mpb)
+{
+ pb->io_debug = (mpb == NULL);
+ list_add(&pb->io_list, &pb_io_list);
+}
+
+static inline void io_debug_release(struct page_beancounter *pb)
+{
+ list_del(&pb->io_list);
+}
+
+void ub_io_release_debug(struct page *page)
+{
+ struct page_beancounter *pb;
+ static int once = 0;
+
+ pb = page_pbc(page);
+ if (likely(iopb_to_pb(pb) == NULL))
+ return;
+
+ if (!once) {
+		printk("BUG: Page has an IO bc but is not expected to\n");
+ dump_stack();
+ once = 1;
+ }
+
+ spin_lock(&pb_lock);
+ not_released++;
+ pb = iopb_to_pb(pb);
+ page_pbc(page) = NULL;
+ io_debug_release(pb);
+ pb->ub->io_pb_held--;
+ spin_unlock(&pb_lock);
+
+ put_beancounter(pb->ub);
+ io_pb_free(pb);
+}
+
+static inline int io_debug_precheck_save(struct page *page)
+{
+ if (unlikely(PageAnon(page))) {
+ anon_pages++;
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline int io_debug_precheck_release(struct page *page)
+{
+ return 0;
+}
+#else
+#define io_debug_save(pb, mpb) do { } while (0)
+#define io_debug_release(pb) do { } while (0)
+#define io_debug_precheck_save(page) (0)
+#define io_debug_precheck_release(p) (0)
+#endif
+
+static inline void set_page_io(struct page *page, struct page_beancounter *pb,
+ struct page_beancounter *mapped_pb)
+{
+ unsigned long val;
+
+ val = (unsigned long)pb | PAGE_IO_MARK;
+ pb->page = page;
+
+ page_pbc(page) = (struct page_beancounter *)val;
+ io_debug_save(pb, mapped_pb);
+ pb->ub->io_pb_held++;
+}
+
+static inline void put_page_io(struct page *page, struct page_beancounter *pb)
+{
+ pb->ub->io_pb_held--;
+ io_debug_release(pb);
+ page_pbc(page) = pb->page_pb_list;
+}
+
+void ub_io_save_context(struct page *page, size_t bytes_dirtied)
+{
+ struct user_beancounter *ub;
+ struct page_beancounter *pb, *mapped_pb, *io_pb;
+
+ if (unlikely(in_interrupt())) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ /*
+ * FIXME - this can happen from atomic context and
+	 * it's probably not that good to lose some requests
+ */
+
+ pb = io_pb_alloc();
+ io_pb = NULL;
+
+ spin_lock(&pb_lock);
+ if (io_debug_precheck_save(page))
+ goto out_unlock;
+
+ mapped_pb = page_pbc(page);
+ io_pb = iopb_to_pb(mapped_pb);
+ if (io_pb != NULL) {
+ /*
+ * this page has an IO - release it and force a new one
+ * We could also race with page cleaning - see below
+ */
+ mapped_pb = io_pb->page_pb_list;
+ put_page_io(page, io_pb);
+ }
+
+ /*
+ * If the page is mapped we must save the context
+ * it maps to. If the page isn't mapped we use current
+ * context as this is a regular write.
+ */
+
+ if (mapped_pb != NULL)
+ ub = top_beancounter(mapped_pb->ub);
+ else
+ ub = get_io_ub();
+
+ if (!PageDirty(page)) {
+ /*
+ * race with clear_page_dirty(_for_io) - account
+ * writes for ub_io_release_context()
+ */
+ if (io_pb != NULL)
+ io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE;
+ if (pb != NULL)
+ io_pb_free(pb);
+ goto out_unlock;
+ }
+
+ if (pb == NULL) {
+ ub->bytes_dirty_missed += bytes_dirtied;
+ goto out_unlock;
+ }
+
+ /*
+ * the page may become clean here, but the context will be seen
+ * in ub_io_release_context()
+ */
+
+ pb->ub = get_beancounter(ub);
+ pb->page_pb_list = mapped_pb;
+ ub->bytes_dirtied += bytes_dirtied;
+
+ set_page_io(page, pb, mapped_pb);
+
+out_unlock:
+ spin_unlock(&pb_lock);
+
+ if (io_pb != NULL) {
+ put_beancounter(io_pb->ub);
+ io_pb_free(io_pb);
+ }
+}
+
+void ub_io_release_context(struct page *page, size_t wrote)
+{
+ struct page_beancounter *pb;
+
+ if (io_debug_precheck_release(page))
+ return;
+
+ if (unlikely(in_interrupt())) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ spin_lock(&pb_lock);
+ pb = iopb_to_pb(page_pbc(page));
+ if (unlikely(pb == NULL))
+ /*
+ * this may happen if we failed to allocate
+ * context in ub_io_save_context or raced with it
+ */
+ goto out_unlock;
+
+ if (wrote)
+ pb->ub->bytes_wrote += wrote;
+
+ put_page_io(page, pb);
+out_unlock:
+ spin_unlock(&pb_lock);
+
+ if (pb != NULL) {
+ put_beancounter(pb->ub);
+ io_pb_free(pb);
+ }
+}
+
+void __init ub_init_io(struct kmem_cache *pb_cachep)
+{
+ pb_pool = mempool_create_slab_pool(PB_MIN_IO, pb_cachep);
+ if (pb_pool == NULL)
+ panic("Can't create pb_pool");
+}
+
+#ifdef CONFIG_PROC_FS
+#define in_flight(var) (var > var##_done ? var - var##_done : 0)
+
+static int bc_ioacct_show(struct seq_file *f, void *v)
+{
+ int i;
+ unsigned long long read, write, cancel;
+ unsigned long sync, sync_done;
+ unsigned long fsync, fsync_done;
+ unsigned long fdsync, fdsync_done;
+ unsigned long frsync, frsync_done;
+ unsigned long reads, writes;
+ unsigned long long rchar, wchar;
+ struct user_beancounter *ub;
+
+ ub = seq_beancounter(f);
+
+ read = write = cancel = 0;
+ sync = sync_done = fsync = fsync_done =
+ fdsync = fdsync_done = frsync = frsync_done = 0;
+ reads = writes = 0;
+ rchar = wchar = 0;
+ for_each_online_cpu(i) {
+ struct ub_percpu_struct *ub_percpu;
+ ub_percpu = per_cpu_ptr(ub->ub_percpu, i);
+
+ read += ub_percpu->bytes_read;
+ write += ub_percpu->bytes_wrote;
+ cancel += ub_percpu->bytes_cancelled;
+
+ sync += ub_percpu->sync;
+ fsync += ub_percpu->fsync;
+ fdsync += ub_percpu->fdsync;
+ frsync += ub_percpu->frsync;
+ sync_done += ub_percpu->sync_done;
+ fsync_done += ub_percpu->fsync_done;
+ fdsync_done += ub_percpu->fdsync_done;
+ frsync_done += ub_percpu->frsync_done;
+
+ reads += ub_percpu->read;
+ writes += ub_percpu->write;
+ rchar += ub_percpu->rchar;
+ wchar += ub_percpu->wchar;
+ }
+
+ seq_printf(f, bc_proc_llu_fmt, "read", read);
+ seq_printf(f, bc_proc_llu_fmt, "write", ub->bytes_wrote + write);
+ seq_printf(f, bc_proc_llu_fmt, "dirty", ub->bytes_dirtied);
+ seq_printf(f, bc_proc_llu_fmt, "cancel", cancel);
+ seq_printf(f, bc_proc_llu_fmt, "missed", ub->bytes_dirty_missed);
+
+ seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync);
+ seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync);
+ seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync);
+ seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync);
+
+ seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync));
+ seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync));
+	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fdsync));
+ seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync));
+
+ seq_printf(f, bc_proc_lu_lfmt, "vfs_reads", reads);
+ seq_printf(f, bc_proc_llu_fmt, "vfs_read_chars", rchar);
+ seq_printf(f, bc_proc_lu_lfmt, "vfs_writes", writes);
+ seq_printf(f, bc_proc_llu_fmt, "vfs_write_chars", wchar);
+
+ seq_printf(f, bc_proc_lu_lfmt, "io_pbs", ub->io_pb_held);
+ return 0;
+}
+
+static struct bc_proc_entry bc_ioacct_entry = {
+ .name = "ioacct",
+ .u.show = bc_ioacct_show,
+};
+
+#ifdef CONFIG_BC_DEBUG_IO
+#define PTR_SIZE (int)(sizeof(void *) * 2)
+#define INT_SIZE (int)(sizeof(int) * 2)
+
+static int bc_io_show(struct seq_file *f, void *v)
+{
+ struct list_head *lh;
+ struct page_beancounter *pb;
+ struct page *pg;
+
+ lh = (struct list_head *)v;
+ if (lh == &pb_io_list) {
+ seq_printf(f, "Races: anon %lu missed %lu\n",
+ anon_pages, not_released);
+
+ seq_printf(f, "%-*s %-1s %-*s %-4s %*s %*s "
+ "%-*s %-*s %-1s %-*s %-*s\n",
+ PTR_SIZE, "pb", "",
+ PTR_SIZE, "page", "flg",
+ INT_SIZE, "cnt", INT_SIZE, "mcnt",
+ PTR_SIZE, "pb_list",
+ PTR_SIZE, "page_pb", "",
+ PTR_SIZE, "mapping",
+ INT_SIZE, "ub");
+ return 0;
+ }
+
+ pb = list_entry(lh, struct page_beancounter, io_list);
+ pg = pb->page;
+ seq_printf(f, "%p %c %p %c%c%c%c %*d %*d %p %p %c %p %d\n",
+ pb, pb->io_debug ? 'e' : 'm', pg,
+ PageDirty(pg) ? 'D' : 'd',
+ PageAnon(pg) ? 'A' : 'a',
+ PageWriteback(pg) ? 'W' : 'w',
+ PageLocked(pg) ? 'L' : 'l',
+ INT_SIZE, page_count(pg),
+ INT_SIZE, page_mapcount(pg),
+ pb->page_pb_list, page_pbc(pg),
+ iopb_to_pb(page_pbc(pg)) == pb ? ' ' : '!',
+ pg->mapping, pb->ub->ub_uid);
+ return 0;
+}
+
+static void *bc_io_start(struct seq_file *f, loff_t *ppos)
+{
+ spin_lock(&pb_lock);
+ return seq_list_start_head(&pb_io_list, *ppos);
+}
+
+static void *bc_io_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+ return seq_list_next(v, &pb_io_list, ppos);
+}
+
+static void bc_io_stop(struct seq_file *f, void *v)
+{
+ spin_unlock(&pb_lock);
+}
+
+static struct seq_operations bc_io_seq_ops = {
+ .start = bc_io_start,
+ .next = bc_io_next,
+ .stop = bc_io_stop,
+ .show = bc_io_show,
+};
+
+static int bc_io_open(struct inode *inode, struct file *filp)
+{
+ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+ return -EACCES;
+
+ return seq_open(filp, &bc_io_seq_ops);
+}
+static struct file_operations bc_io_debug_ops = {
+ .open = bc_io_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct bc_proc_entry bc_ioacct_debug_entry = {
+ .name = "ioacct_debug",
+ .u.fops = &bc_io_debug_ops,
+};
+#endif
+
+static int bc_ioacct_notify(struct vnotifier_block *self,
+ unsigned long event, void *arg, int old_ret)
+{
+ struct user_beancounter *ub;
+ unsigned long *vm_events;
+ unsigned long long bin, bout;
+ int i;
+
+ if (event != VIRTINFO_VMSTAT)
+ return old_ret;
+
+ ub = top_beancounter(get_exec_ub());
+ if (ub == get_ub0())
+ return old_ret;
+
+ /* Think over: do we need to account here bytes_dirty_missed? */
+ bout = ub->bytes_wrote;
+ bin = 0;
+ for_each_online_cpu(i) {
+ bout += per_cpu_ptr(ub->ub_percpu, i)->bytes_wrote;
+ bin += per_cpu_ptr(ub->ub_percpu, i)->bytes_read;
+ }
+
+ /* convert to Kbytes */
+ bout >>= 10;
+ bin >>= 10;
+
+ vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS;
+ vm_events[PGPGOUT] = (unsigned long)bout;
+ vm_events[PGPGIN] = (unsigned long)bin;
+ return NOTIFY_OK;
+}
+
+static struct vnotifier_block bc_ioacct_nb = {
+ .notifier_call = bc_ioacct_notify,
+};
+
+static int __init bc_ioacct_init(void)
+{
+#ifdef CONFIG_BC_DEBUG_IO
+ bc_register_proc_root_entry(&bc_ioacct_debug_entry);
+#endif
+ bc_register_proc_entry(&bc_ioacct_entry);
+
+ virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb);
+ return 0;
+}
+
+late_initcall(bc_ioacct_init);
+#endif
diff --git a/kernel/bc/kmem.c b/kernel/bc/kmem.c
new file mode 100644
index 0000000..7068e57
--- /dev/null
+++ b/kernel/bc/kmem.c
@@ -0,0 +1,405 @@
+/*
+ * kernel/bc/kmem.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+#include <bc/rss_pages.h>
+#include <bc/hash.h>
+#include <bc/proc.h>
+
+/*
+ * Initialization
+ */
+
+/*
+ * Slab accounting
+ */
+
+#ifdef CONFIG_BC_DEBUG_KMEM
+
+#define CC_HASH_SIZE 1024
+static struct ub_cache_counter *cc_hash[CC_HASH_SIZE];
+spinlock_t cc_lock;
+
+static void __free_cache_counters(struct user_beancounter *ub,
+ struct kmem_cache *cachep)
+{
+ struct ub_cache_counter *cc, **pprev, *del;
+ int i;
+ unsigned long flags;
+
+ del = NULL;
+ spin_lock_irqsave(&cc_lock, flags);
+ for (i = 0; i < CC_HASH_SIZE; i++) {
+ pprev = &cc_hash[i];
+ cc = cc_hash[i];
+ while (cc != NULL) {
+ if (cc->ub != ub && cc->cachep != cachep) {
+ pprev = &cc->next;
+ cc = cc->next;
+ continue;
+ }
+
+ list_del(&cc->ulist);
+ *pprev = cc->next;
+ cc->next = del;
+ del = cc;
+ cc = *pprev;
+ }
+ }
+ spin_unlock_irqrestore(&cc_lock, flags);
+
+ while (del != NULL) {
+ cc = del->next;
+ kfree(del);
+ del = cc;
+ }
+}
+
+void ub_free_counters(struct user_beancounter *ub)
+{
+ __free_cache_counters(ub, NULL);
+}
+
+void ub_kmemcache_free(struct kmem_cache *cachep)
+{
+ __free_cache_counters(NULL, cachep);
+}
+
+void __init init_cache_counters(void)
+{
+ memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0]));
+ spin_lock_init(&cc_lock);
+}
+
+#define cc_hash_fun(ub, cachep) ( \
+ (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \
+ ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \
+ ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \
+ ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \
+ ) & (CC_HASH_SIZE - 1))
+
+static int change_slab_charged(struct user_beancounter *ub,
+ struct kmem_cache *cachep, long val)
+{
+ struct ub_cache_counter *cc, *new_cnt, **pprev;
+ unsigned long flags;
+
+ new_cnt = NULL;
+again:
+ spin_lock_irqsave(&cc_lock, flags);
+ cc = cc_hash[cc_hash_fun(ub, cachep)];
+ while (cc) {
+ if (cc->ub == ub && cc->cachep == cachep)
+ goto found;
+ cc = cc->next;
+ }
+
+ if (new_cnt != NULL)
+ goto insert;
+
+ spin_unlock_irqrestore(&cc_lock, flags);
+
+ new_cnt = kmalloc(sizeof(*new_cnt), GFP_ATOMIC);
+ if (new_cnt == NULL)
+ return -ENOMEM;
+
+ new_cnt->counter = 0;
+ new_cnt->ub = ub;
+ new_cnt->cachep = cachep;
+ goto again;
+
+insert:
+ pprev = &cc_hash[cc_hash_fun(ub, cachep)];
+ new_cnt->next = *pprev;
+ *pprev = new_cnt;
+ list_add(&new_cnt->ulist, &ub->ub_cclist);
+ cc = new_cnt;
+ new_cnt = NULL;
+
+found:
+ cc->counter += val;
+ spin_unlock_irqrestore(&cc_lock, flags);
+ if (new_cnt)
+ kfree(new_cnt);
+ return 0;
+}
+
+static inline int inc_slab_charged(struct user_beancounter *ub,
+ struct kmem_cache *cachep)
+{
+ return change_slab_charged(ub, cachep, 1);
+}
+
+static inline void dec_slab_charged(struct user_beancounter *ub,
+ struct kmem_cache *cachep)
+{
+ if (change_slab_charged(ub, cachep, -1) < 0)
+ BUG();
+}
+
+#include <linux/vmalloc.h>
+
+#define inc_pages_charged(ub, order) ub_percpu_add(ub, \
+ pages_charged, 1 << order)
+#define dec_pages_charged(ub, order) ub_percpu_sub(ub, \
+ pages_charged, 1 << order)
+
+#ifdef CONFIG_PROC_FS
+static int bc_kmem_debug_show(struct seq_file *f, void *v)
+{
+ struct user_beancounter *ub;
+ struct ub_cache_counter *cc;
+ long pages, vmpages;
+ int i;
+
+ ub = seq_beancounter(f);
+
+ pages = vmpages = 0;
+ for_each_online_cpu(i) {
+ pages += per_cpu_ptr(ub->ub_percpu, i)->pages_charged;
+ vmpages += per_cpu_ptr(ub->ub_percpu, i)->vmalloc_charged;
+ }
+ if (pages < 0)
+ pages = 0;
+ if (vmpages < 0)
+ vmpages = 0;
+
+ seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE);
+ seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE);
+ seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", ub->ub_pbcs,
+ sizeof(struct page_beancounter));
+
+ spin_lock_irq(&cc_lock);
+ list_for_each_entry (cc, &ub->ub_cclist, ulist) {
+ struct kmem_cache *cachep;
+
+ cachep = cc->cachep;
+ seq_printf(f, bc_proc_lu_lu_fmt,
+ kmem_cache_name(cachep),
+ cc->counter,
+ kmem_cache_objuse(cachep));
+ }
+ spin_unlock_irq(&cc_lock);
+ return 0;
+}
+
+static struct bc_proc_entry bc_kmem_debug_entry = {
+ .name = "kmem_debug",
+ .u.show = bc_kmem_debug_show,
+};
+
+static int __init bc_kmem_debug_init(void)
+{
+ bc_register_proc_entry(&bc_kmem_debug_entry);
+ return 0;
+}
+
+late_initcall(bc_kmem_debug_init);
+#endif
+
+#else
+#define inc_slab_charged(ub, cache) (0)
+#define dec_slab_charged(ub, cache) do { } while (0)
+#define inc_pages_charged(ub, cache) do { } while (0)
+#define dec_pages_charged(ub, cache) do { } while (0)
+#endif
+
+#define UB_KMEM_QUANT (PAGE_SIZE * 4)
+
+/* called with IRQ disabled */
+int ub_kmemsize_charge(struct user_beancounter *ub,
+ unsigned long size,
+ enum ub_severity strict)
+{
+ struct task_beancounter *tbc;
+
+ tbc = &current->task_bc;
+ if (ub != tbc->task_ub || size > UB_KMEM_QUANT)
+ goto just_charge;
+ if (tbc->kmem_precharged >= size) {
+ tbc->kmem_precharged -= size;
+ return 0;
+ }
+
+ if (charge_beancounter(ub, UB_KMEMSIZE, UB_KMEM_QUANT, UB_HARD) == 0) {
+ tbc->kmem_precharged += UB_KMEM_QUANT - size;
+ return 0;
+ }
+
+just_charge:
+ return charge_beancounter(ub, UB_KMEMSIZE, size, strict);
+}
+
+/* called with IRQ disabled */
+void ub_kmemsize_uncharge(struct user_beancounter *ub,
+ unsigned long size)
+{
+ struct task_beancounter *tbc;
+
+ if (size > UB_MAXVALUE) {
+ printk("ub_kmemsize_uncharge: size %lu\n", size);
+ dump_stack();
+ }
+
+ tbc = &current->task_bc;
+ if (ub != tbc->task_ub)
+ goto just_uncharge;
+
+ tbc->kmem_precharged += size;
+ if (tbc->kmem_precharged < UB_KMEM_QUANT * 2)
+ return;
+ size = tbc->kmem_precharged - UB_KMEM_QUANT;
+ tbc->kmem_precharged -= size;
+
+just_uncharge:
+ uncharge_beancounter(ub, UB_KMEMSIZE, size);
+}
+
+/* called with IRQ disabled */
+int ub_slab_charge(struct kmem_cache *cachep, void *objp, gfp_t flags)
+{
+ unsigned int size;
+ struct user_beancounter *ub;
+
+ ub = get_beancounter(get_exec_ub());
+ if (ub == NULL)
+ return 0;
+
+ size = CHARGE_SIZE(kmem_cache_objuse(cachep));
+ if (ub_kmemsize_charge(ub, size,
+ (flags & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD)))
+ goto out_err;
+
+ if (inc_slab_charged(ub, cachep) < 0) {
+ ub_kmemsize_uncharge(ub, size);
+ goto out_err;
+ }
+ *ub_slab_ptr(cachep, objp) = ub;
+ return 0;
+
+out_err:
+ put_beancounter(ub);
+ return -ENOMEM;
+}
+
+/* called with IRQ disabled */
+void ub_slab_uncharge(struct kmem_cache *cachep, void *objp)
+{
+ unsigned int size;
+ struct user_beancounter **ub_ref;
+
+ ub_ref = ub_slab_ptr(cachep, objp);
+ if (*ub_ref == NULL)
+ return;
+
+ dec_slab_charged(*ub_ref, cachep);
+ size = CHARGE_SIZE(kmem_cache_objuse(cachep));
+ ub_kmemsize_uncharge(*ub_ref, size);
+ put_beancounter(*ub_ref);
+ *ub_ref = NULL;
+}
+
+/*
+ * Pages accounting
+ */
+
+int ub_page_charge(struct page *page, int order, gfp_t mask)
+{
+ struct user_beancounter *ub;
+ unsigned long flags;
+
+ ub = NULL;
+ if (!(mask & __GFP_UBC))
+ goto out;
+
+ ub = get_beancounter(get_exec_ub());
+ if (ub == NULL)
+ goto out;
+
+ local_irq_save(flags);
+ if (ub_kmemsize_charge(ub, CHARGE_ORDER(order),
+ (mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD)))
+ goto err;
+
+ inc_pages_charged(ub, order);
+ local_irq_restore(flags);
+out:
+ BUG_ON(page_ub(page) != NULL);
+ page_ub(page) = ub;
+ return 0;
+
+err:
+ local_irq_restore(flags);
+ BUG_ON(page_ub(page) != NULL);
+ put_beancounter(ub);
+ return -ENOMEM;
+}
+
+void ub_page_uncharge(struct page *page, int order)
+{
+ struct user_beancounter *ub;
+ unsigned long flags;
+
+ ub = page_ub(page);
+ if (ub == NULL)
+ return;
+
+ BUG_ON(ub->ub_magic != UB_MAGIC);
+ dec_pages_charged(ub, order);
+ local_irq_save(flags);
+ ub_kmemsize_uncharge(ub, CHARGE_ORDER(order));
+ local_irq_restore(flags);
+ put_beancounter(ub);
+ page_ub(page) = NULL;
+}
+
+/*
+ * takes init_mm.page_table_lock
+ * some outer lock protecting pages from the vmalloc'ed area must be held
+ */
+struct user_beancounter *vmalloc_ub(void *obj)
+{
+ struct page *pg;
+
+ pg = vmalloc_to_page(obj);
+ if (pg == NULL)
+ return NULL;
+
+ return page_ub(pg);
+}
+
+EXPORT_SYMBOL(vmalloc_ub);
+
+struct user_beancounter *mem_ub(void *obj)
+{
+ struct user_beancounter *ub;
+
+ if ((unsigned long)obj >= VMALLOC_START &&
+ (unsigned long)obj < VMALLOC_END)
+ ub = vmalloc_ub(obj);
+ else
+ ub = slab_ub(obj);
+
+ return ub;
+}
+
+EXPORT_SYMBOL(mem_ub);
diff --git a/kernel/bc/misc.c b/kernel/bc/misc.c
new file mode 100644
index 0000000..15e7aa4
--- /dev/null
+++ b/kernel/bc/misc.c
@@ -0,0 +1,460 @@
+/*
+ * kernel/bc/misc.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+#include <bc/proc.h>
+
+#define UB_FILE_MINQUANT 3
+#define UB_FILE_MAXQUANT 10
+#define UB_FILE_INIQUANT 4
+
+static unsigned long ub_file_precharge(struct task_beancounter *task_bc,
+ struct user_beancounter *ub, unsigned long *kmemsize);
+
+static inline unsigned long ub_file_kmemsize(unsigned long nr)
+{
+ return CHARGE_SIZE(kmem_cache_objuse(filp_cachep)) * nr;
+}
+
+/*
+ * Task stuff
+ */
+
+static void init_task_sub(struct task_struct *parent,
+ struct task_struct *tsk,
+ struct task_beancounter *old_bc)
+{
+ struct task_beancounter *new_bc;
+ struct user_beancounter *sub;
+
+ new_bc = &tsk->task_bc;
+ sub = old_bc->fork_sub;
+ new_bc->fork_sub = get_beancounter(sub);
+ new_bc->task_fnode = NULL;
+ new_bc->task_freserv = old_bc->task_freserv;
+ old_bc->task_freserv = NULL;
+ memset(&new_bc->task_data, 0, sizeof(new_bc->task_data));
+ new_bc->pgfault_handle = 0;
+ new_bc->pgfault_allot = 0;
+}
+
+void ub_init_task_bc(struct task_beancounter *tbc)
+{
+ tbc->file_precharged = 0;
+ tbc->file_quant = UB_FILE_INIQUANT;
+ tbc->file_count = 0;
+
+ tbc->kmem_precharged = 0;
+ tbc->dentry_alloc = 0;
+}
+
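+/*
+ * At fork time the new task is charged for one NUMPROC and, while the
+ * parent beancounter's lock is held, gets an initial batch of NUMFILE and
+ * the matching KMEMSIZE precharged via ub_file_precharge(); on NUMPROC
+ * failure both beancounter references taken above are dropped.
+ */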
+int ub_task_charge(struct task_struct *parent, struct task_struct *task)
+{
+ struct task_beancounter *old_bc;
+ struct task_beancounter *new_bc;
+ struct user_beancounter *ub, *pub;
+ unsigned long file_nr, kmemsize;
+ unsigned long flags;
+
+ old_bc = &parent->task_bc;
+ ub = old_bc->fork_sub;
+ new_bc = &task->task_bc;
+ new_bc->task_ub = get_beancounter(ub);
+ new_bc->exec_ub = get_beancounter(ub);
+
+ pub = top_beancounter(ub);
+ spin_lock_irqsave(&pub->ub_lock, flags);
+ if (unlikely(__charge_beancounter_locked(pub, UB_NUMPROC,
+ 1, UB_HARD) < 0))
+ goto out_numproc;
+
+ ub_init_task_bc(new_bc);
+ file_nr = ub_file_precharge(new_bc, pub, &kmemsize);
+ spin_unlock_irqrestore(&pub->ub_lock, flags);
+
+ charge_beancounter_notop(ub, UB_NUMPROC, 1);
+ if (likely(file_nr)) {
+ charge_beancounter_notop(ub, UB_NUMFILE, file_nr);
+ charge_beancounter_notop(ub, UB_KMEMSIZE, kmemsize);
+ }
+
+ init_task_sub(parent, task, old_bc);
+ return 0;
+
+out_numproc:
+ spin_unlock_irqrestore(&pub->ub_lock, flags);
+ __put_beancounter_batch(ub, 2);
+ return -ENOMEM;
+}
+
+extern atomic_t dbgpre;
+
+void ub_task_uncharge(struct task_struct *task)
+{
+ struct task_beancounter *task_bc;
+ struct user_beancounter *pub;
+ unsigned long file_nr, file_kmemsize;
+ unsigned long flags;
+
+ task_bc = &task->task_bc;
+ pub = top_beancounter(task_bc->task_ub);
+ spin_lock_irqsave(&pub->ub_lock, flags);
+ __uncharge_beancounter_locked(pub, UB_NUMPROC, 1);
+ file_nr = task_bc->file_precharged;
+ if (likely(file_nr))
+ __uncharge_beancounter_locked(pub,
+ UB_NUMFILE, file_nr);
+
+ /* see comment in ub_file_charge */
+ task_bc->file_precharged = 0;
+ file_kmemsize = ub_file_kmemsize(file_nr);
+ if (likely(file_kmemsize))
+ __uncharge_beancounter_locked(pub,
+ UB_KMEMSIZE, file_kmemsize);
+ spin_unlock_irqrestore(&pub->ub_lock, flags);
+
+ uncharge_beancounter_notop(task_bc->task_ub, UB_NUMPROC, 1);
+ if (likely(file_nr)) {
+ uncharge_beancounter_notop(task_bc->task_ub,
+ UB_NUMFILE, file_nr);
+ __put_beancounter_batch(task_bc->task_ub, file_nr);
+ }
+ if (likely(file_kmemsize))
+ uncharge_beancounter_notop(task_bc->task_ub,
+ UB_KMEMSIZE, file_kmemsize);
+}
+
+void ub_task_put(struct task_struct *task)
+{
+ struct task_beancounter *task_bc;
+ struct user_beancounter *pub;
+ unsigned long kmemsize, flags;
+
+ task_bc = &task->task_bc;
+
+ pub = top_beancounter(task_bc->task_ub);
+ spin_lock_irqsave(&pub->ub_lock, flags);
+ kmemsize = task_bc->kmem_precharged;
+ task_bc->kmem_precharged = 0;
+ if (likely(kmemsize))
+ __uncharge_beancounter_locked(pub, UB_KMEMSIZE, kmemsize);
+ spin_unlock_irqrestore(&pub->ub_lock, flags);
+ if (likely(kmemsize))
+ uncharge_beancounter_notop(task_bc->task_ub, UB_KMEMSIZE, kmemsize);
+
+ put_beancounter(task_bc->exec_ub);
+ put_beancounter(task_bc->task_ub);
+ put_beancounter(task_bc->fork_sub);
+ /* can't be freed elsewhere, failures possible in the middle of fork */
+ if (task_bc->task_freserv != NULL)
+ kfree(task_bc->task_freserv);
+
+ task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc;
+ task_bc->task_ub = (struct user_beancounter *)0xdead100c;
+ BUG_ON(task_bc->kmem_precharged != 0);
+}
+
+/*
+ * Files and file locks.
+ */
+/*
+ * For NUMFILE, we do not take a lock and call the charge function
+ * for every file. We try to charge in batches, keeping a local reserve on
+ * the task. For experimental purposes, the batch size is adaptive and depends
+ * on the numfile barrier, the number of processes, and the history of
+ * successes and failures of batch charges.
+ *
+ * The per-task fields have the following meaning:
+ *	file_precharged	number of files charged to the beancounter in advance,
+ *	file_quant	logarithm of the batch size,
+ *	file_count	counter of charge successes, to reduce batch size
+ *			fluctuations.
+ */
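+/*
+ * For illustration: a task starts with file_quant = UB_FILE_INIQUANT (4),
+ * so a successful batch charge reserves 1 << 4 = 16 files at once; repeated
+ * successes grow the batch up to 1 << UB_FILE_MAXQUANT (1024) files, while a
+ * failed batch charge shrinks it back towards 1 << UB_FILE_MINQUANT (8).
+ */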
+static unsigned long ub_file_precharge(struct task_beancounter *task_bc,
+ struct user_beancounter *ub, unsigned long *kmemsize)
+{
+ unsigned long n, kmem;
+
+ n = 1UL << task_bc->file_quant;
+ if (ub->ub_parms[UB_NUMPROC].held >
+ (ub->ub_parms[UB_NUMFILE].barrier >>
+ task_bc->file_quant))
+ goto nopre;
+ if (unlikely(__charge_beancounter_locked(ub, UB_NUMFILE, n, UB_HARD)))
+ goto nopre;
+ kmem = ub_file_kmemsize(n);
+ if (unlikely(__charge_beancounter_locked(ub, UB_KMEMSIZE,
+ kmem, UB_HARD)))
+ goto nopre_kmem;
+
+ task_bc->file_precharged += n;
+ get_beancounter_batch(task_bc->task_ub, n);
+ task_bc->file_count++;
+ if (task_bc->file_quant < UB_FILE_MAXQUANT &&
+ task_bc->file_count >= task_bc->file_quant) {
+ task_bc->file_quant++;
+ task_bc->file_count = 0;
+ }
+ *kmemsize = kmem;
+ return n;
+
+nopre_kmem:
+ __uncharge_beancounter_locked(ub, UB_NUMFILE, n);
+nopre:
+ if (task_bc->file_quant > UB_FILE_MINQUANT)
+ task_bc->file_quant--;
+ task_bc->file_count = 0;
+ return 0;
+}
+
+int ub_file_charge(struct file *f)
+{
+ struct user_beancounter *ub, *pub;
+ struct task_beancounter *task_bc;
+ unsigned long file_nr, kmem;
+ unsigned long flags;
+ int err;
+
+ task_bc = &current->task_bc;
+ ub = get_exec_ub();
+ if (unlikely(ub != task_bc->task_ub))
+ goto just_charge;
+
+ if (likely(task_bc->file_precharged > 0)) {
+ /*
+		 * files are put via RCU in 2.6.16, so an IRQ can
+		 * happen during this decrement and the
+		 * ub_files_uncharge() called from it would mess up
+		 * file_precharged
+		 *
+		 * ub_task_uncharge() is also called via RCU, so no
+		 * protection is needed there
+ *
+ * Xemul
+ */
+
+ local_irq_save(flags);
+ task_bc->file_precharged--;
+ local_irq_restore(flags);
+
+ f->f_ub = ub;
+ return 0;
+ }
+
+ pub = top_beancounter(ub);
+ spin_lock_irqsave(&pub->ub_lock, flags);
+ file_nr = ub_file_precharge(task_bc, pub, &kmem);
+ if (unlikely(!file_nr))
+ goto last_try;
+ spin_unlock(&pub->ub_lock);
+ task_bc->file_precharged--;
+ local_irq_restore(flags);
+
+ charge_beancounter_notop(ub, UB_NUMFILE, file_nr);
+ charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
+ f->f_ub = ub;
+ return 0;
+
+just_charge:
+ pub = top_beancounter(ub);
+ spin_lock_irqsave(&pub->ub_lock, flags);
+last_try:
+ kmem = ub_file_kmemsize(1);
+ err = __charge_beancounter_locked(pub, UB_NUMFILE, 1, UB_HARD);
+ if (likely(!err)) {
+ err = __charge_beancounter_locked(pub, UB_KMEMSIZE,
+ kmem, UB_HARD);
+ if (unlikely(err))
+ __uncharge_beancounter_locked(pub, UB_NUMFILE, 1);
+ }
+ spin_unlock_irqrestore(&pub->ub_lock, flags);
+ if (likely(!err)) {
+ charge_beancounter_notop(ub, UB_NUMFILE, 1);
+ charge_beancounter_notop(ub, UB_KMEMSIZE, kmem);
+ f->f_ub = get_beancounter(ub);
+ }
+ return err;
+}
+
+static inline int task_precharge_farnr(struct task_beancounter *task_bc)
+{
+ return (task_bc->file_precharged < (1UL << task_bc->file_quant));
+}
+
+void ub_file_uncharge(struct file *f)
+{
+ struct user_beancounter *ub, *pub;
+ struct task_beancounter *task_bc;
+ int nr;
+
+ ub = f->f_ub;
+ task_bc = &current->task_bc;
+ if (likely(ub == task_bc->task_ub)) {
+ task_bc->file_precharged++;
+ pub = top_beancounter(ub);
+ if (task_precharge_farnr(task_bc) &&
+ ub_barrier_farsz(pub, UB_KMEMSIZE))
+ return;
+ nr = task_bc->file_precharged
+ - (1UL << (task_bc->file_quant - 1));
+ if (nr > 0) {
+ task_bc->file_precharged -= nr;
+ __put_beancounter_batch(ub, nr);
+ uncharge_beancounter(ub, UB_NUMFILE, nr);
+ uncharge_beancounter(ub, UB_KMEMSIZE,
+ ub_file_kmemsize(nr));
+ }
+ } else {
+ uncharge_beancounter(ub, UB_NUMFILE, 1);
+ uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1));
+ put_beancounter(ub);
+ }
+}
+
+int ub_flock_charge(struct file_lock *fl, int hard)
+{
+ struct user_beancounter *ub;
+ int err;
+
+ /* No need to get_beancounter here since it's already got in slab */
+ ub = slab_ub(fl);
+ if (ub == NULL)
+ return 0;
+
+ err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT);
+ if (!err)
+ fl->fl_charged = 1;
+ return err;
+}
+
+void ub_flock_uncharge(struct file_lock *fl)
+{
+ struct user_beancounter *ub;
+
+ /* Ub will be put in slab */
+ ub = slab_ub(fl);
+ if (ub == NULL || !fl->fl_charged)
+ return;
+
+ uncharge_beancounter(ub, UB_NUMFLOCK, 1);
+ fl->fl_charged = 0;
+}
+
+/*
+ * Signal handling
+ */
+
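+/*
+ * Siginfo charges are propagated up the whole beancounter parent chain:
+ * ub_siginfo_charge() walks from the given ub to the top-level parent,
+ * charging each level and unrolling the already-charged levels on failure;
+ * ub_siginfo_uncharge() walks the same chain to uncharge every level.
+ */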
+static int do_ub_siginfo_charge(struct user_beancounter *ub,
+ unsigned long size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD))
+ goto out_kmem;
+
+ if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD))
+ goto out_num;
+
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return 0;
+
+out_num:
+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
+out_kmem:
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return -ENOMEM;
+}
+
+static void do_ub_siginfo_uncharge(struct user_beancounter *ub,
+ unsigned long size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
+ __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub)
+{
+ unsigned long size;
+ struct user_beancounter *p, *q;
+
+ size = CHARGE_SIZE(kmem_obj_objuse(sq));
+ for (p = ub; p != NULL; p = p->parent) {
+ if (do_ub_siginfo_charge(p, size))
+ goto unroll;
+ }
+
+ sq->sig_ub = get_beancounter(ub);
+ return 0;
+
+unroll:
+ for (q = ub; q != p; q = q->parent)
+ do_ub_siginfo_uncharge(q, size);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(ub_siginfo_charge);
+
+void ub_siginfo_uncharge(struct sigqueue *sq)
+{
+ unsigned long size;
+ struct user_beancounter *ub, *p;
+
+ p = ub = sq->sig_ub;
+ sq->sig_ub = NULL;
+ size = CHARGE_SIZE(kmem_obj_objuse(sq));
+ for (; ub != NULL; ub = ub->parent)
+ do_ub_siginfo_uncharge(ub, size);
+ put_beancounter(p);
+}
+
+/*
+ * PTYs
+ */
+
+int ub_pty_charge(struct tty_struct *tty)
+{
+ struct user_beancounter *ub;
+ int retval;
+
+ ub = slab_ub(tty);
+ retval = 0;
+ if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+ !test_bit(TTY_CHARGED, &tty->flags)) {
+ retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD);
+ if (!retval)
+ set_bit(TTY_CHARGED, &tty->flags);
+ }
+ return retval;
+}
+
+void ub_pty_uncharge(struct tty_struct *tty)
+{
+ struct user_beancounter *ub;
+
+ ub = slab_ub(tty);
+ if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+ test_bit(TTY_CHARGED, &tty->flags)) {
+ uncharge_beancounter(ub, UB_NUMPTY, 1);
+ clear_bit(TTY_CHARGED, &tty->flags);
+ }
+}
diff --git a/kernel/bc/net.c b/kernel/bc/net.c
new file mode 100644
index 0000000..2e450f7
--- /dev/null
+++ b/kernel/bc/net.c
@@ -0,0 +1,1153 @@
+/*
+ * linux/kernel/bc/net.c
+ *
+ * Copyright (C) 1998-2004 Andrey V. Savochkin <saw@saw.sw.com.sg>
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ * TODO:
+ * - sizeof(struct inode) charge
+ * = tcp_mem_schedule() feedback based on ub limits
+ * + measures so that one socket won't exhaust all send buffers,
+ * see bug in bugzilla
+ * = sk->socket check for NULL in snd_wakeups
+ * (tcp_write_space checks for NULL itself)
+ * + in tcp_close(), orphaned socket abortion should be based on ubc
+ * resources (same in tcp_out_of_resources)
+ * Beancounter should also have separate orphaned socket counter...
+ * + for rcv, in-order segment should be accepted
+ * if only barrier is exceeded
+ * = tcp_rmem_schedule() feedback based on ub limits
+ * - repair forward_alloc mechanism for receive buffers
+ *   Its idea is that some buffer space is pre-charged so that the receive fast
+ * path doesn't need to take spinlocks and do other heavy stuff
+ * + tcp_prune_queue actions based on ub limits
+ * + window adjustments depending on available buffers for receive
+ * - window adjustments depending on available buffers for send
+ * + race around usewreserv
+ * + avoid allocating new page for each tiny-gram, see letter from ANK
+ * + rename ub_sock_lock
+ * + sk->sleep wait queue probably can be used for all wakeups, and
+ * sk->ub_wait is unnecessary
+ * + for UNIX sockets, the current algorithm will lead to
+ * UB_UNIX_MINBUF-sized messages only for non-blocking case
+ * - charge for af_packet sockets
+ * + all datagram sockets should be charged to NUMUNIXSOCK
+ * - we do not charge for skb copies and clones staying in device queues
+ * + live-lock if number of sockets is big and buffer limits are small
+ * [diff-ubc-dbllim3]
+ * - check that multiple readers/writers on the same socket won't cause fatal
+ * consequences
+ * - check allocation/charge orders
+ * + There is potential problem with callback_lock. In *snd_wakeup we take
+ * beancounter first, in sock_def_error_report - callback_lock first.
+ * then beancounter. This is not a problem if callback_lock taken
+ * readonly, but anyway...
+ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator
+ * General kernel problems:
+ * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC
+ * notification won't get signals
+ * - datagram_poll looks racy
+ *
+ */
+
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/err.h>
+#include <linux/socket.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <bc/beancounter.h>
+#include <bc/net.h>
+#include <bc/debug.h>
+
+/* for some reason it is not used currently */
+#define UB_SOCK_MAINTAIN_WMEMPRESSURE 0
+
+
+/* Skb truesize definition. Bad place. Den */
+
+static inline int skb_chargesize_head(struct sk_buff *skb)
+{
+ return skb_charge_size(skb_end_pointer(skb) - skb->head +
+ sizeof(struct skb_shared_info));
+}
+
+int skb_charge_fullsize(struct sk_buff *skb)
+{
+ int chargesize;
+ struct sk_buff *skbfrag;
+
+ chargesize = skb_chargesize_head(skb) +
+ PAGE_SIZE * skb_shinfo(skb)->nr_frags;
+ if (likely(skb_shinfo(skb)->frag_list == NULL))
+ return chargesize;
+ for (skbfrag = skb_shinfo(skb)->frag_list;
+ skbfrag != NULL;
+ skbfrag = skbfrag->next) {
+ chargesize += skb_charge_fullsize(skbfrag);
+ }
+ return chargesize;
+}
+EXPORT_SYMBOL(skb_charge_fullsize);
+
+static int ub_sock_makewreserv_locked(struct sock *sk,
+ int bufid, unsigned long size);
+
+int __ub_too_many_orphans(struct sock *sk, int count)
+{
+ struct user_beancounter *ub;
+
+ if (sock_has_ubc(sk)) {
+ ub = top_beancounter(sock_bc(sk)->ub);
+ if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Queueing
+ */
+
+static void ub_sock_snd_wakeup(struct user_beancounter *ub)
+{
+ struct list_head *p;
+ struct sock *sk;
+ struct sock_beancounter *skbc;
+ struct socket *sock;
+ unsigned long added;
+
+ while (!list_empty(&ub->ub_other_sk_list)) {
+ p = ub->ub_other_sk_list.next;
+ skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
+ sk = skbc_sock(skbc);
+
+ added = 0;
+ sock = sk->sk_socket;
+ if (sock == NULL) {
+ /* sk being destroyed */
+ list_del_init(&skbc->ub_sock_list);
+ continue;
+ }
+
+ ub_debug(UBD_NET_SLEEP,
+ "Checking queue, waiting %lu, reserv %lu\n",
+ skbc->ub_waitspc, skbc->poll_reserv);
+ added = -skbc->poll_reserv;
+ if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF,
+ skbc->ub_waitspc))
+ break;
+ added += skbc->poll_reserv;
+
+ list_del_init(&skbc->ub_sock_list);
+
+ /*
+ * See comments in ub_tcp_snd_wakeup.
+ * Locking note: both unix_write_space and
+ * sock_def_write_space take callback_lock themselves.
+ * We take it here just to be on the safe side and to
+ * act the same way as ub_tcp_snd_wakeup does.
+ */
+ sock_hold(sk);
+ read_lock(&sk->sk_callback_lock);
+ spin_unlock(&ub->ub_lock);
+
+ sk->sk_write_space(sk);
+ read_unlock(&sk->sk_callback_lock);
+
+ if (skbc->ub != ub && added)
+ charge_beancounter_notop(skbc->ub,
+ UB_OTHERSOCKBUF, added);
+ sock_put(sk);
+
+ spin_lock(&ub->ub_lock);
+ }
+}
+
+static void ub_tcp_snd_wakeup(struct user_beancounter *ub)
+{
+ struct list_head *p;
+ struct sock *sk;
+ struct sock_beancounter *skbc;
+ struct socket *sock;
+ unsigned long added;
+
+ while (!list_empty(&ub->ub_tcp_sk_list)) {
+ p = ub->ub_tcp_sk_list.next;
+ skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
+ sk = skbc_sock(skbc);
+
+ added = 0;
+ sock = sk->sk_socket;
+ if (sock == NULL) {
+ /* sk being destroyed */
+ list_del_init(&skbc->ub_sock_list);
+ continue;
+ }
+
+ ub_debug(UBD_NET_SLEEP,
+ "Checking queue, waiting %lu, reserv %lu\n",
+ skbc->ub_waitspc, skbc->poll_reserv);
+ added = -skbc->poll_reserv;
+ if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF,
+ skbc->ub_waitspc))
+ break;
+ added += skbc->poll_reserv;
+
+ list_del_init(&skbc->ub_sock_list);
+
+ /*
+ * Send async notifications and wake up.
+ * Locking note: we get callback_lock here because
+ * tcp_write_space is over-optimistic about calling context
+ * (socket lock is presumed). So we get the lock here although
+ * it belongs to the callback.
+ */
+ sock_hold(sk);
+ read_lock(&sk->sk_callback_lock);
+ spin_unlock(&ub->ub_lock);
+
+ sk->sk_write_space(sk);
+ read_unlock(&sk->sk_callback_lock);
+
+ if (skbc->ub != ub && added)
+ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added);
+ sock_put(sk);
+
+ spin_lock(&ub->ub_lock);
+ }
+}
+
+int ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size)
+{
+ unsigned long flags;
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+ unsigned long added_reserv;
+
+ if (!sock_has_ubc(sk))
+ return 0;
+
+ skbc = sock_bc(sk);
+ ub = top_beancounter(skbc->ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size);
+ added_reserv = -skbc->poll_reserv;
+ if (!ub_sock_makewreserv_locked(sk, res, size)) {
+ /*
+ * It looks a bit hackish, but it is compatible with both
+ * wait_for_xx_ubspace and poll.
+ * This __set_current_state is equivalent to a wakeup event
+ * right after spin_unlock_irqrestore.
+ */
+ __set_current_state(TASK_RUNNING);
+ added_reserv += skbc->poll_reserv;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ if (added_reserv)
+ charge_beancounter_notop(skbc->ub, res, added_reserv);
+ return 0;
+ }
+
+ ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n");
+ skbc->ub_waitspc = size;
+ if (!list_empty(&skbc->ub_sock_list)) {
+ ub_debug(UBD_NET_SOCKET,
+ "re-adding socket to beancounter %p.\n", ub);
+ goto out;
+ }
+
+ switch (res) {
+ case UB_TCPSNDBUF:
+ list_add_tail(&skbc->ub_sock_list,
+ &ub->ub_tcp_sk_list);
+ break;
+ case UB_OTHERSOCKBUF:
+ list_add_tail(&skbc->ub_sock_list,
+ &ub->ub_other_sk_list);
+ break;
+ default:
+ BUG();
+ }
+out:
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return -ENOMEM;
+}
+
+EXPORT_SYMBOL(ub_sock_snd_queue_add);
+
+long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(sk->sk_sleep, &wait);
+ for (;;) {
+ if (signal_pending(current))
+ break;
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size))
+ break;
+
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ break;
+ if (sk->sk_err)
+ break;
+ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size);
+ timeo = schedule_timeout(timeo);
+ }
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk->sk_sleep, &wait);
+ return timeo;
+}
+
+void ub_sock_sndqueuedel(struct sock *sk)
+{
+ struct user_beancounter *ub;
+ struct sock_beancounter *skbc;
+ unsigned long flags;
+
+ if (!sock_has_ubc(sk))
+ return;
+ skbc = sock_bc(sk);
+
+ /* race with write_space callback of other socket */
+ ub = top_beancounter(skbc->ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ list_del_init(&skbc->ub_sock_list);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+/*
+ * Helpers
+ */
+
+static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk,
+ unsigned long size, int resource)
+{
+ WARN_ON_ONCE(skb_bc(skb)->ub != NULL);
+
+ skb_bc(skb)->ub = sock_bc(sk)->ub;
+ skb_bc(skb)->charged = size;
+ skb_bc(skb)->resource = resource;
+}
+
+void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk,
+ unsigned long size, int resource)
+{
+ if (!sock_has_ubc(sk))
+ return;
+
+ if (sock_bc(sk)->ub == NULL)
+ BUG();
+
+ __ub_skb_set_charge(skb, sk, size, resource);
+
+	/* Ugly. Ugly. Skb in sk write queue can live without ref to sk */
+ if (skb->sk == NULL)
+ skb->sk = sk;
+}
+
+EXPORT_SYMBOL(ub_skb_set_charge);
+
+static inline void ub_skb_set_uncharge(struct sk_buff *skb)
+{
+ skb_bc(skb)->ub = NULL;
+ skb_bc(skb)->charged = 0;
+ skb_bc(skb)->resource = 0;
+}
+
+static void ub_update_rmem_thres(struct sock_beancounter *skub)
+{
+ struct user_beancounter *ub;
+
+ if (skub && skub->ub) {
+ ub = top_beancounter(skub->ub);
+ ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier /
+ (ub->ub_parms[UB_NUMTCPSOCK].held + 1);
+ }
+}
+
+static inline void ub_sock_wcharge_dec(struct sock *sk,
+ unsigned long chargesize)
+{
+ /* The check sk->sk_family != PF_NETLINK is made as the skb is
+ * queued to the kernel end of socket while changed to the user one.
+ * Den */
+ if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) {
+ if (sock_bc(sk)->ub_wcharged > chargesize)
+ sock_bc(sk)->ub_wcharged -= chargesize;
+ else
+ sock_bc(sk)->ub_wcharged = 0;
+ }
+}
+
+/*
+ * Charge socket number
+ */
+
+static inline void sk_alloc_beancounter(struct sock *sk)
+{
+ struct sock_beancounter *skbc;
+
+ skbc = sock_bc(sk);
+ memset(skbc, 0, sizeof(struct sock_beancounter));
+}
+
+static inline void sk_free_beancounter(struct sock *sk)
+{
+}
+
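+/*
+ * __sock_charge() charges one socket to the corresponding NUM* resource and,
+ * for TCP sockets, additionally precharges a send-side reserve of four
+ * MTU-sized segments (poll_reserv) and a receive-side reserve of four
+ * SK_MEM_QUANTUM chunks (forw_space), provided the respective buffer
+ * resources are far enough from their barriers.
+ */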
+static int __sock_charge(struct sock *sk, int res)
+{
+ struct sock_beancounter *skbc;
+ struct user_beancounter *cub, *ub;
+ unsigned long added_reserv, added_forw;
+ unsigned long flags;
+
+ cub = get_exec_ub();
+ if (unlikely(cub == NULL))
+ return 0;
+
+ sk_alloc_beancounter(sk);
+ skbc = sock_bc(sk);
+ INIT_LIST_HEAD(&skbc->ub_sock_list);
+
+ ub = top_beancounter(cub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0))
+ goto out_limit;
+
+ added_reserv = 0;
+ added_forw = 0;
+ if (res == UB_NUMTCPSOCK) {
+ added_reserv = skb_charge_size(MAX_TCP_HEADER +
+ 1500 - sizeof(struct iphdr) -
+ sizeof(struct tcphdr));
+ added_reserv *= 4;
+ ub->ub_parms[UB_TCPSNDBUF].held += added_reserv;
+ if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) {
+ ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv;
+ added_reserv = 0;
+ }
+ skbc->poll_reserv = added_reserv;
+
+ added_forw = SK_MEM_QUANTUM * 4;
+ ub->ub_parms[UB_TCPRCVBUF].held += added_forw;
+ if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) {
+ ub->ub_parms[UB_TCPRCVBUF].held -= added_forw;
+ added_forw = 0;
+ }
+ skbc->forw_space = added_forw;
+ }
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ charge_beancounter_notop(cub, res, 1);
+ if (added_reserv)
+ charge_beancounter_notop(cub, UB_TCPSNDBUF, added_reserv);
+ if (added_forw)
+ charge_beancounter_notop(cub, UB_TCPRCVBUF, added_forw);
+
+ skbc->ub = get_beancounter(cub);
+ return 0;
+
+out_limit:
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ sk_free_beancounter(sk);
+ return -ENOMEM;
+}
+
+int ub_tcp_sock_charge(struct sock *sk)
+{
+ int ret;
+
+ ret = __sock_charge(sk, UB_NUMTCPSOCK);
+ ub_update_rmem_thres(sock_bc(sk));
+
+ return ret;
+}
+
+int ub_other_sock_charge(struct sock *sk)
+{
+ return __sock_charge(sk, UB_NUMOTHERSOCK);
+}
+
+EXPORT_SYMBOL(ub_other_sock_charge);
+
+int ub_sock_charge(struct sock *sk, int family, int type)
+{
+ return (IS_TCP_SOCK(family, type) ?
+ ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk));
+}
+
+EXPORT_SYMBOL(ub_sock_charge);
+
+/*
+ * Uncharge socket number
+ */
+
+void ub_sock_uncharge(struct sock *sk)
+{
+ int is_tcp_sock;
+ unsigned long flags;
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+ unsigned long reserv, forw;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return;
+
+ is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type);
+ skbc = sock_bc(sk);
+ ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk);
+
+ ub = top_beancounter(skbc->ub);
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ if (!list_empty(&skbc->ub_sock_list)) {
+ ub_debug(UBD_NET_SOCKET,
+ "ub_sock_uncharge: removing from ub(%p) queue.\n",
+ skbc);
+ list_del_init(&skbc->ub_sock_list);
+ }
+
+ reserv = skbc->poll_reserv;
+ forw = skbc->forw_space;
+ __uncharge_beancounter_locked(ub,
+ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
+ reserv);
+ if (forw)
+ __uncharge_beancounter_locked(ub,
+ (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF),
+ forw);
+ __uncharge_beancounter_locked(ub,
+ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
+
+ ub_sock_wcharge_dec(sk, reserv);
+ if (unlikely(skbc->ub_wcharged))
+ printk(KERN_WARNING
+ "ub_sock_uncharge: wch=%lu for ub %p (%d).\n",
+ skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid);
+ skbc->poll_reserv = 0;
+ skbc->forw_space = 0;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ uncharge_beancounter_notop(skbc->ub,
+ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
+ reserv);
+ if (forw)
+ uncharge_beancounter_notop(skbc->ub,
+ (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF),
+ forw);
+ uncharge_beancounter_notop(skbc->ub,
+ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
+
+ put_beancounter(skbc->ub);
+ sk_free_beancounter(sk);
+}
+
+/*
+ * Special case for netlink_dump - (un)charges precalculated size
+ */
+
+int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)
+{
+ int ret;
+ unsigned long chargesize;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return 0;
+
+ chargesize = skb_charge_fullsize(skb);
+ ret = charge_beancounter(sock_bc(sk)->ub,
+ UB_OTHERSOCKBUF, chargesize, UB_HARD);
+ if (ret < 0)
+ return ret;
+ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
+ return ret;
+}
+
+/*
+ * Poll reserve accounting
+ *
+ * This is the core of socket buffer management (along with the queueing/wakeup
+ * functions). The rest of the buffer accounting either calls these functions
+ * or repeats parts of their logic for some simpler cases.
+ */
+
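+/*
+ * The reserve lifecycle, roughly: ub_sock_make_wreserv() grows poll_reserv
+ * up to the requested size (charging the difference to the beancounter),
+ * ub_sock_get_wreserv() consumes part of the reserve for an skb, and
+ * ub_sock_ret_wreserv() gives excess reserve back, waking up queued sockets
+ * when buffer space is released.
+ */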
+static int ub_sock_makewreserv_locked(struct sock *sk,
+ int bufid, unsigned long size)
+{
+ unsigned long wcharge_added;
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+
+ skbc = sock_bc(sk);
+ if (skbc->poll_reserv >= size) /* no work to be done */
+ goto out;
+
+ ub = top_beancounter(skbc->ub);
+ ub->ub_parms[bufid].held += size - skbc->poll_reserv;
+
+ wcharge_added = 0;
+ /*
+ * Logic:
+ * 1) when used memory hits barrier, we set wmem_pressure;
+ * wmem_pressure is reset under barrier/2;
+ * between barrier/2 and barrier we limit per-socket buffer growth;
+ * 2) each socket is guaranteed to get (limit-barrier)/maxsockets
+ * calculated on the base of memory eaten after the barrier is hit
+ */
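+	/*
+	 * E.g. (hypothetical numbers): with a TCPSNDBUF barrier of 1 MB, a
+	 * limit of 2 MB and a NUMTCPSOCK limit of 100 sockets, the check
+	 * below allows each socket to accumulate roughly
+	 * (2 MB - 1 MB) / 100 = 10 KB of ub_wcharged once the barrier has
+	 * been hit.
+	 */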
+ skbc = sock_bc(sk);
+#if UB_SOCK_MAINTAIN_WMEMPRESSURE
+ if (!ub_hfbarrier_hit(ub, bufid)) {
+ if (ub->ub_wmem_pressure)
+ ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 "
+ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
+ sk, size, skbc->poll_reserv,
+ ub->ub_parms[bufid].held,
+ skbc->ub_wcharged, sk->sk_sndbuf);
+ ub->ub_wmem_pressure = 0;
+ }
+#endif
+ if (ub_barrier_hit(ub, bufid)) {
+#if UB_SOCK_MAINTAIN_WMEMPRESSURE
+ if (!ub->ub_wmem_pressure)
+ ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 "
+ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
+ sk, size, skbc->poll_reserv,
+ ub->ub_parms[bufid].held,
+ skbc->ub_wcharged, sk->sk_sndbuf);
+ ub->ub_wmem_pressure = 1;
+#endif
+ if (sk->sk_family == PF_NETLINK)
+ goto unroll;
+ wcharge_added = size - skbc->poll_reserv;
+ skbc->ub_wcharged += wcharge_added;
+ if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit +
+ ub->ub_parms[bufid].barrier >
+ ub->ub_parms[bufid].limit)
+ goto unroll_wch;
+ }
+ if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit)
+ goto unroll;
+
+ ub_adjust_maxheld(ub, bufid);
+ skbc->poll_reserv = size;
+out:
+ return 0;
+
+unroll_wch:
+ skbc->ub_wcharged -= wcharge_added;
+unroll:
+ ub_debug(UBD_NET_SEND,
+ "makewres: deny "
+ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
+ sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held,
+ skbc->ub_wcharged, sk->sk_sndbuf);
+ ub->ub_parms[bufid].failcnt++;
+ ub->ub_parms[bufid].held -= size - skbc->poll_reserv;
+
+ if (sk->sk_socket != NULL) {
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ return -ENOMEM;
+}
+
+int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size)
+{
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+ unsigned long flags;
+ unsigned long added_reserv;
+ int err;
+
+ skbc = sock_bc(sk);
+
+ /*
+	 * This function guarantees a sufficient reserve upon return
+	 * only if sk has a single user. We can check poll_reserv without
+ * serialization and avoid locking if the reserve already exists.
+ */
+ if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size))
+ return 0;
+
+ ub = top_beancounter(skbc->ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ added_reserv = -skbc->poll_reserv;
+ err = ub_sock_makewreserv_locked(sk, bufid, size);
+ added_reserv += skbc->poll_reserv;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ if (added_reserv)
+ charge_beancounter_notop(skbc->ub, bufid, added_reserv);
+
+ return err;
+}
+
+EXPORT_SYMBOL(ub_sock_make_wreserv);
+
+int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size)
+{
+ struct sock_beancounter *skbc;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return 0;
+
+ /* optimize for the case if socket has sufficient reserve */
+ ub_sock_make_wreserv(sk, bufid, size);
+ skbc = sock_bc(sk);
+ if (likely(skbc->poll_reserv >= size)) {
+ skbc->poll_reserv -= size;
+ return 0;
+ }
+ return -ENOMEM;
+}
+
+EXPORT_SYMBOL(ub_sock_get_wreserv);
+
+static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid,
+ unsigned long size, unsigned long ressize)
+{
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+ unsigned long extra;
+ unsigned long flags;
+
+ skbc = sock_bc(sk);
+ ub = top_beancounter(skbc->ub);
+
+ extra = 0;
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ skbc->poll_reserv += size;
+ if (skbc->poll_reserv > ressize) {
+ extra = skbc->poll_reserv - ressize;
+ ub_sock_wcharge_dec(sk, extra);
+ skbc->poll_reserv = ressize;
+
+ __uncharge_beancounter_locked(ub, bufid, extra);
+ if (bufid == UB_TCPSNDBUF)
+ ub_tcp_snd_wakeup(ub);
+ else
+ ub_sock_snd_wakeup(ub);
+ }
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ if (extra)
+ uncharge_beancounter_notop(skbc->ub, bufid, extra);
+}
+
+void ub_sock_ret_wreserv(struct sock *sk, int bufid,
+ unsigned long size, unsigned long ressize)
+{
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return;
+
+ skbc = sock_bc(sk);
+ ub = top_beancounter(skbc->ub);
+ /* check if the reserve can be kept */
+ if (ub_barrier_farsz(ub, bufid)) {
+ skbc->poll_reserv += size;
+ return;
+ }
+ ub_sock_do_ret_wreserv(sk, bufid, size, ressize);
+}
+
+/*
+ * UB_DGRAMRCVBUF
+ */
+
+static int ub_dgramrcvbuf_charge(struct sock *sk, struct sk_buff *skb)
+{
+ unsigned long chargesize;
+
+ chargesize = skb_charge_fullsize(skb);
+ if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF,
+ chargesize, UB_HARD))
+ return -ENOMEM;
+
+ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF);
+ return 0;
+}
+
+int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)
+{
+ if (unlikely(!sock_has_ubc(sk)))
+ return 0;
+
+ if (IS_TCP_SOCK(sk->sk_family, sk->sk_type))
+ return ub_tcprcvbuf_charge(sk, skb);
+ else
+ return ub_dgramrcvbuf_charge(sk, skb);
+}
+
+EXPORT_SYMBOL(ub_sockrcvbuf_charge);
+
+static void ub_sockrcvbuf_uncharge(struct sk_buff *skb)
+{
+ uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF,
+ skb_bc(skb)->charged);
+ ub_skb_set_uncharge(skb);
+}
+
+/*
+ * UB_TCPRCVBUF
+ */
+
+int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb,
+ enum ub_severity strict)
+{
+ int retval;
+ unsigned long flags;
+ struct user_beancounter *ub;
+ struct sock_beancounter *skbc;
+ unsigned long chargesize;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return 0;
+ skbc = sock_bc(sk);
+
+ chargesize = skb_charge_fullsize(skb);
+ if (likely(skbc->forw_space >= chargesize)) {
+ skbc->forw_space -= chargesize;
+ __ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF);
+ return 0;
+ }
+
+ /*
+ * Memory pressure reactions:
+ * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND)
+ * 2) set UB_RMEM_SHRINK and tcp_clamp_window()
+ * tcp_collapse_queues() if rmem_alloc > rcvbuf
+ * 3) drop OFO, tcp_purge_ofo()
+ * 4) drop all.
+ * Currently, we do #2 and #3 at once (which means that current
+ * collapsing of OFO queue in tcp_collapse_queues() is a waste of time,
+ * for example...)
+ * On memory pressure we jump from #0 to #3, and when the pressure
+ * subsides, to #1.
+ */
+ retval = 0;
+ ub = top_beancounter(sock_bc(sk)->ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ub->ub_parms[UB_TCPRCVBUF].held += chargesize;
+ if (ub->ub_parms[UB_TCPRCVBUF].held >
+ ub->ub_parms[UB_TCPRCVBUF].barrier &&
+ strict != UB_FORCE)
+ goto excess;
+ ub_adjust_maxheld(ub, UB_TCPRCVBUF);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+out:
+ if (retval == 0) {
+ charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF,
+ chargesize);
+ ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF);
+ }
+ return retval;
+
+excess:
+ ub->ub_rmem_pressure = UB_RMEM_SHRINK;
+ if (strict == UB_HARD)
+ retval = -ENOMEM;
+ if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit)
+ retval = -ENOMEM;
+ /*
+ * We try to leave numsock*maxadvmss as a reserve for sockets not
+ * queueing any data yet (if the difference between the barrier and the
+ * limit is enough for this reserve).
+ */
+ if (ub->ub_parms[UB_TCPRCVBUF].held +
+ ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss
+ > ub->ub_parms[UB_TCPRCVBUF].limit &&
+ atomic_read(&sk->sk_rmem_alloc))
+ retval = -ENOMEM;
+ if (retval) {
+ ub->ub_parms[UB_TCPRCVBUF].held -= chargesize;
+ ub->ub_parms[UB_TCPRCVBUF].failcnt++;
+ }
+ ub_adjust_maxheld(ub, UB_TCPRCVBUF);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ goto out;
+}
+EXPORT_SYMBOL(ub_sock_tcp_chargerecv);
+
+static void ub_tcprcvbuf_uncharge(struct sk_buff *skb)
+{
+ unsigned long flags;
+ unsigned long held, bar;
+ int prev_pres;
+ struct user_beancounter *ub;
+
+ ub = top_beancounter(skb_bc(skb)->ub);
+ if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) {
+ sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged;
+ ub_skb_set_uncharge(skb);
+ return;
+ }
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) {
+ printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n",
+ skb_bc(skb)->charged,
+ ub, ub->ub_parms[UB_TCPRCVBUF].held);
+ /* ass-saving bung */
+ skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held;
+ }
+ ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged;
+ held = ub->ub_parms[UB_TCPRCVBUF].held;
+ bar = ub->ub_parms[UB_TCPRCVBUF].barrier;
+ prev_pres = ub->ub_rmem_pressure;
+ if (held <= bar - (bar >> 2))
+ ub->ub_rmem_pressure = UB_RMEM_EXPAND;
+ else if (held <= bar)
+ ub->ub_rmem_pressure = UB_RMEM_KEEP;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF,
+ skb_bc(skb)->charged);
+ ub_skb_set_uncharge(skb);
+}
+
+
+/*
+ * UB_OTHERSOCKBUF and UB_TCPSNDBUF
+ */
+
+static void ub_socksndbuf_uncharge(struct sk_buff *skb)
+{
+ unsigned long flags;
+ struct user_beancounter *ub, *cub;
+ unsigned long chargesize;
+
+ cub = skb_bc(skb)->ub;
+ ub = top_beancounter(cub);
+ chargesize = skb_bc(skb)->charged;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize);
+ if (skb->sk != NULL && sock_has_ubc(skb->sk))
+ ub_sock_wcharge_dec(skb->sk, chargesize);
+ ub_sock_snd_wakeup(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, chargesize);
+ ub_skb_set_uncharge(skb);
+}
+
+/* expected to be called under socket lock */
+static void ub_tcpsndbuf_uncharge(struct sk_buff *skb)
+{
+ /*
+ * ub_sock_ret_wreserv call is abused here, we just want to uncharge
+ * skb size. However, to reduce duplication of the code doing
+ * ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup we call
+ * a function that already does all of this. 2006/04/27 SAW
+ */
+ ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged,
+ sock_bc(skb->sk)->poll_reserv);
+ ub_skb_set_uncharge(skb);
+}
+
+void ub_skb_uncharge(struct sk_buff *skb)
+{
+ switch (skb_bc(skb)->resource) {
+ case UB_TCPSNDBUF:
+ ub_tcpsndbuf_uncharge(skb);
+ break;
+ case UB_TCPRCVBUF:
+ ub_tcprcvbuf_uncharge(skb);
+ break;
+ case UB_DGRAMRCVBUF:
+ ub_sockrcvbuf_uncharge(skb);
+ break;
+ case UB_OTHERSOCKBUF:
+ ub_socksndbuf_uncharge(skb);
+ break;
+ }
+}
+
+EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */
+
+/*
+ * Other sock reserve management
+ */
+
+int ub_sock_getwres_other(struct sock *sk, unsigned long size)
+{
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+ unsigned long flags;
+ unsigned long added_reserv;
+ int err;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return 0;
+
+ /*
+ * Nothing except beancounter lock protects skbc->poll_reserv.
+ * So, take the lock and do the job.
+ * Dances with added_reserv repeat ub_sock_make_wreserv.
+ */
+ skbc = sock_bc(sk);
+ ub = top_beancounter(skbc->ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ added_reserv = -skbc->poll_reserv;
+ err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size);
+ added_reserv += skbc->poll_reserv;
+ if (!err)
+ skbc->poll_reserv -= size;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ if (added_reserv)
+ charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added_reserv);
+
+ return err;
+}
+EXPORT_SYMBOL(ub_sock_getwres_other);
+
+void ub_sock_retwres_other(struct sock *sk,
+ unsigned long size, unsigned long ressize)
+{
+ if (unlikely(!sock_has_ubc(sk)))
+ return;
+
+ ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize);
+}
+
+/*
+ * TCP send buffers accounting. Paged part
+ */
+
+int ub_sock_tcp_chargepage(struct sock *sk)
+{
+ struct sock_beancounter *skbc;
+ unsigned long extra;
+ int err;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return 0;
+
+ skbc = sock_bc(sk);
+ ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE);
+ if (likely(skbc->poll_reserv >= PAGE_SIZE)) {
+ skbc->poll_reserv -= PAGE_SIZE;
+ return 0;
+ }
+
+ /*
+ * Ok, full page is not available.
+ * However, this function must succeed if poll previously indicated
+	 * that a write is possible. It is better to make a forced charge here
+	 * than to reserve a whole page in poll.
+ */
+ err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE);
+ if (unlikely(err < 0))
+ goto out;
+ if (skbc->poll_reserv < PAGE_SIZE) {
+ extra = PAGE_SIZE - skbc->poll_reserv;
+ err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra,
+ UB_FORCE);
+ if (err < 0)
+ goto out;
+ skbc->poll_reserv += extra;
+ }
+ skbc->poll_reserv -= PAGE_SIZE;
+ return 0;
+
+out:
+ return err;
+}
+
+void ub_sock_tcp_detachpage(struct sock *sk)
+{
+ struct sk_buff *skb;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return;
+
+	/* The page has just been detached from the socket. The last skb in
+	   the queue with a paged part holds a reference to it */
+ skb = skb_peek_tail(&sk->sk_write_queue);
+ if (skb == NULL) {
+		/* If the queue is empty, all data has been sent and the page
+		   is about to be freed */
+ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE,
+ sock_bc(sk)->poll_reserv);
+ } else {
+		/* The last skb is a good approximation for the last skb with
+		   a paged part */
+ skb_bc(skb)->charged += PAGE_SIZE;
+ }
+}
+
+/*
+ * TCPSNDBUF charge functions below are called in the following cases:
+ * - sending of SYN, SYN-ACK, FIN; the latter charge is forced for
+ *   technical reasons in the TCP code;
+ * - fragmentation of TCP packets.
+ * These functions are allowed but not required to use poll_reserv.
+ * Originally, these functions didn't do that, since it didn't make
+ * any sense. Now that poll_reserv serves as a general reserve,
+ * they use it.
+ */
+int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb,
+ enum ub_severity strict)
+{
+ int ret;
+ unsigned long chargesize;
+ struct sock_beancounter *skbc;
+ struct user_beancounter *ub;
+ unsigned long flags;
+
+ if (unlikely(!sock_has_ubc(sk)))
+ return 0;
+
+ skbc = sock_bc(sk);
+ chargesize = skb_charge_fullsize(skb);
+ if (likely(skbc->poll_reserv >= chargesize)) {
+ skbc->poll_reserv -= chargesize;
+ __ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+ /* XXX hack, see ub_skb_set_charge */
+ skb->sk = sk;
+ return 0;
+ }
+
+ ub = top_beancounter(skbc->ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF,
+ chargesize, strict);
+ /*
+ * Note: this check is not equivalent of the corresponding check
+ * in makewreserv. It's similar in spirit, but an equivalent check
+ * would be too long and complicated here.
+ */
+ if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF))
+ skbc->ub_wcharged += chargesize;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ if (likely(!ret)) {
+ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, chargesize);
+ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ub_sock_tcp_chargesend);
+
+/*
+ * Initialization
+ */
+
+int __init skbc_cache_init(void)
+{
+ return 0;
+}
diff --git a/kernel/bc/oom_kill.c b/kernel/bc/oom_kill.c
new file mode 100644
index 0000000..c79e826
--- /dev/null
+++ b/kernel/bc/oom_kill.c
@@ -0,0 +1,200 @@
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/cpuset.h>
+#include <linux/module.h>
+
+#include <bc/beancounter.h>
+#include <bc/oom_kill.h>
+#include <bc/hash.h>
+
+#define UB_OOM_TIMEOUT (5 * HZ)
+
+int oom_generation;
+int oom_kill_counter;
+static DEFINE_SPINLOCK(oom_lock);
+static DECLARE_WAIT_QUEUE_HEAD(oom_wq);
+
+static inline int ub_oom_completed(struct task_struct *tsk)
+{
+ if (test_tsk_thread_flag(tsk, TIF_MEMDIE))
+ /* we were oom killed - just die */
+ return 1;
+ if (tsk->task_bc.oom_generation != oom_generation)
+		/* some task was successfully killed */
+ return 1;
+ return 0;
+}
+
+static void ub_clear_oom(void)
+{
+ struct user_beancounter *ub;
+
+ rcu_read_lock();
+ for_each_beancounter(ub)
+ ub->ub_oom_noproc = 0;
+ rcu_read_unlock();
+}
+
+/* Called with cpuset_lock held */
+int ub_oom_lock(void)
+{
+ int timeout;
+ DEFINE_WAIT(oom_w);
+ struct task_struct *tsk;
+
+ tsk = current;
+
+ spin_lock(&oom_lock);
+ if (!oom_kill_counter)
+ goto out_do_oom;
+
+ timeout = UB_OOM_TIMEOUT;
+ while (1) {
+ if (ub_oom_completed(tsk)) {
+ spin_unlock(&oom_lock);
+ return -EINVAL;
+ }
+
+ if (timeout == 0)
+ break;
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&oom_wq, &oom_w);
+ spin_unlock(&oom_lock);
+ cpuset_unlock();
+
+ timeout = schedule_timeout(timeout);
+
+ cpuset_lock();
+ spin_lock(&oom_lock);
+ remove_wait_queue(&oom_wq, &oom_w);
+ }
+
+out_do_oom:
+ ub_clear_oom();
+ return 0;
+}
+
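+/*
+ * The overdraft used for OOM victim selection is the sum of OOMGUARPAGES
+ * held and all kernel-memory and socket-buffer usage (converted to pages),
+ * minus the OOMGUARPAGES barrier: the beancounter furthest above its
+ * guarantee is selected as the OOM victim scope.
+ */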
+static inline long ub_current_overdraft(struct user_beancounter *ub)
+{
+ return ub->ub_parms[UB_OOMGUARPAGES].held +
+ ((ub->ub_parms[UB_KMEMSIZE].held
+ + ub->ub_parms[UB_TCPSNDBUF].held
+ + ub->ub_parms[UB_TCPRCVBUF].held
+ + ub->ub_parms[UB_OTHERSOCKBUF].held
+ + ub->ub_parms[UB_DGRAMRCVBUF].held)
+ >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier;
+}
+
+int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk)
+{
+ struct user_beancounter *mm_ub;
+
+ if (ub == NULL)
+ return 0;
+
+ task_lock(tsk);
+ if (tsk->mm == NULL)
+ mm_ub = NULL;
+ else
+ mm_ub = tsk->mm->mm_ub;
+
+ while (mm_ub != NULL && mm_ub != ub)
+ mm_ub = mm_ub->parent;
+ task_unlock(tsk);
+
+ return mm_ub != ub;
+}
+
+struct user_beancounter *ub_oom_select_worst(void)
+{
+ struct user_beancounter *ub, *walkp;
+ long ub_maxover;
+
+ ub_maxover = 0;
+ ub = NULL;
+
+ rcu_read_lock();
+ for_each_beancounter (walkp) {
+ long ub_overdraft;
+
+ if (walkp->parent != NULL)
+ continue;
+ if (walkp->ub_oom_noproc)
+ continue;
+
+ ub_overdraft = ub_current_overdraft(walkp);
+ if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) {
+ put_beancounter(ub);
+ ub = walkp;
+ ub_maxover = ub_overdraft;
+ }
+ }
+
+ if (ub)
+ ub->ub_oom_noproc = 1;
+ rcu_read_unlock();
+
+ return ub;
+}
+
+void ub_oom_mm_killed(struct user_beancounter *ub)
+{
+ static struct ub_rate_info ri = { 5, 60*HZ };
+
+ /* increment is serialized with oom_lock */
+ ub->ub_parms[UB_OOMGUARPAGES].failcnt++;
+
+ if (ub_ratelimit(&ri))
+ show_mem();
+}
+
+void ub_oom_unlock(void)
+{
+ spin_unlock(&oom_lock);
+}
+
+void ub_oom_task_dead(struct task_struct *tsk)
+{
+ spin_lock(&oom_lock);
+ oom_kill_counter = 0;
+ oom_generation++;
+
+ printk("OOM killed process %s (pid=%d, ve=%d) exited, "
+ "free=%lu gen=%d.\n",
+ tsk->comm, tsk->pid, VEID(tsk->ve_task_info.owner_env),
+ nr_free_pages(), oom_generation);
+ /* if there is time to sleep in ub_oom_lock -> sleep will continue */
+ wake_up_all(&oom_wq);
+ spin_unlock(&oom_lock);
+}
+
+void ub_out_of_memory(struct user_beancounter *scope)
+{
+ struct user_beancounter *ub;
+ struct task_struct *p;
+
+ cpuset_lock();
+ spin_lock(&oom_lock);
+ ub_clear_oom();
+ ub = get_beancounter(scope);
+
+ read_lock(&tasklist_lock);
+retry:
+ p = select_bad_process(ub, NULL);
+ if (p == NULL || PTR_ERR(p) == -1UL)
+ goto unlock;
+
+ if (oom_kill_process(p, (gfp_t)-1, -1, NULL, "UB Out of memory"))
+ goto retry;
+
+ put_beancounter(ub);
+
+unlock:
+ read_unlock(&tasklist_lock);
+ spin_unlock(&oom_lock);
+ cpuset_unlock();
+}
+EXPORT_SYMBOL(ub_out_of_memory);
diff --git a/kernel/bc/proc.c b/kernel/bc/proc.c
new file mode 100644
index 0000000..dd96e38
--- /dev/null
+++ b/kernel/bc/proc.c
@@ -0,0 +1,703 @@
+/*
+ * kernel/bc/proc.c
+ *
+ * Copyright (C) 2006 OpenVZ. SWsoft Inc.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <bc/beancounter.h>
+#include <bc/hash.h>
+#include <bc/rss_pages.h>
+#include <bc/proc.h>
+
+/* Generic output formats */
+#if BITS_PER_LONG == 32
+const char *bc_proc_lu_fmt = "\t%-20s %10lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n";
+#else
+const char *bc_proc_lu_fmt = "\t%-20s %21lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n";
+#endif
+
+#if BITS_PER_LONG == 32
+static const char *head_fmt = "%10s %-12s %10s %10s %10s %10s %10s\n";
+static const char *res_fmt = "%10s %-12s %10lu %10lu %10lu %10lu %10lu\n";
+#else
+static const char *head_fmt = "%10s %-12s %20s %20s %20s %20s %20s\n";
+static const char *res_fmt = "%10s %-12s %20lu %20lu %20lu %20lu %20lu\n";
+#endif
+
+static void ub_show_res(struct seq_file *f, struct user_beancounter *ub,
+ int r, int show_uid)
+{
+ int len;
+ char ub_uid[64];
+
+ if (show_uid && r == 0) {
+ len = print_ub_uid(ub, ub_uid, sizeof(ub_uid) - 2);
+ ub_uid[len] = ':';
+ ub_uid[len + 1] = '\0';
+ } else
+ strcpy(ub_uid, "");
+
+ seq_printf(f, res_fmt, ub_uid, ub_rnames[r],
+ ub->ub_parms[r].held,
+ ub->ub_parms[r].maxheld,
+ ub->ub_parms[r].barrier,
+ ub->ub_parms[r].limit,
+ ub->ub_parms[r].failcnt);
+}
+
+static void __show_resources(struct seq_file *f, struct user_beancounter *ub,
+ int show_uid)
+{
+ int i;
+
+ for (i = 0; i < UB_RESOURCES_COMPAT; i++)
+ if (strcmp(ub_rnames[i], "dummy") != 0)
+ ub_show_res(f, ub, i, show_uid);
+
+ for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++)
+ ub_show_res(f, ub, i, show_uid);
+}
+
+static int bc_resources_show(struct seq_file *f, void *v)
+{
+ __show_resources(f, seq_beancounter(f), 0);
+ return 0;
+}
+
+static struct bc_proc_entry bc_resources_entry = {
+ .name = "resources",
+ .u.show = bc_resources_show,
+};
+
+#ifdef CONFIG_UBC_DEBUG
+static int bc_debug_show(struct seq_file *f, void *v)
+{
+ struct user_beancounter *ub;
+ char buf[64];
+
+ ub = seq_beancounter(f);
+ print_ub_uid(ub, buf, sizeof(buf));
+ seq_printf(f, "uid: %s\n", buf);
+ seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount));
+
+ seq_printf(f, "bc: %p\n", ub);
+ seq_printf(f, "par: %p\n", ub->parent);
+ seq_printf(f, "priv: %p\n", ub->private_data);
+ return 0;
+}
+
+static struct bc_proc_entry bc_debug_entry = {
+ .name = "debug",
+ .u.show = bc_debug_show,
+};
+#endif
+
+static int ub_show(struct seq_file *f, void *v)
+{
+ int i;
+
+ for (i = 0; i < UB_RESOURCES_COMPAT; i++)
+ ub_show_res(f, (struct user_beancounter *)v, i, 1);
+ return 0;
+}
+
+static int res_show(struct seq_file *f, void *v)
+{
+ __show_resources(f, (struct user_beancounter *)v, 1);
+ return 0;
+}
+
+static int ub_accessible(struct user_beancounter *exec,
+ struct user_beancounter *target)
+{
+ struct user_beancounter *p, *q;
+
+ p = top_beancounter(exec);
+ q = top_beancounter(target);
+
+ return (p == get_ub0() || p == q);
+}
+
+static void ub_show_header(struct seq_file *f)
+{
+ seq_printf(f, "Version: 2.5\n");
+ seq_printf(f, head_fmt, "uid", "resource",
+ "held", "maxheld", "barrier", "limit", "failcnt");
+}
+
+static void *ub_start(struct seq_file *f, loff_t *ppos)
+{
+ struct user_beancounter *ub;
+ struct user_beancounter *exec_ub;
+ unsigned long pos;
+
+ pos = *ppos;
+ if (pos == 0)
+ ub_show_header(f);
+
+ exec_ub = get_exec_ub();
+
+ rcu_read_lock();
+ for_each_beancounter(ub) {
+ if (ub->parent != NULL)
+ continue;
+ if (!ub_accessible(exec_ub, ub))
+ continue;
+ if (pos-- == 0)
+ return ub;
+ }
+ return NULL;
+}
+
+static void *ub_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+ struct user_beancounter *ub;
+ struct list_head *entry;
+ struct user_beancounter *exec_ub;
+
+ exec_ub = get_exec_ub();
+ ub = (struct user_beancounter *)v;
+
+ entry = &ub->ub_list;
+
+ list_for_each_continue_rcu(entry, &ub_list_head) {
+ ub = list_entry(entry, struct user_beancounter, ub_list);
+ if (ub->parent != NULL)
+ continue;
+ if (!ub_accessible(exec_ub, ub))
+ continue;
+
+ (*ppos)++;
+ return ub;
+ }
+ return NULL;
+}
+
+static void ub_stop(struct seq_file *f, void *v)
+{
+ rcu_read_unlock();
+}
+
+static struct seq_operations ub_seq_ops = {
+ .start = ub_start,
+ .next = ub_next,
+ .stop = ub_stop,
+ .show = ub_show,
+};
+
+static int ub_open(struct inode *inode, struct file *filp)
+{
+ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+ return -EACCES;
+
+ return seq_open(filp, &ub_seq_ops);
+}
+
+static struct file_operations ub_file_operations = {
+ .open = ub_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct seq_operations res_seq_ops = {
+ .start = ub_start,
+ .next = ub_next,
+ .stop = ub_stop,
+ .show = res_show,
+};
+
+static int res_open(struct inode *inode, struct file *filp)
+{
+ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+ return -EACCES;
+
+ return seq_open(filp, &res_seq_ops);
+}
+
+static struct file_operations resources_operations = {
+ .open = res_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct bc_proc_entry bc_all_resources_entry = {
+ .name = "resources",
+ .u.fops = &resources_operations,
+};
+
+/*
+ * Generic showing stuff
+ */
+
+static int cookies, num_entries;
+static struct bc_proc_entry *bc_entries __read_mostly;
+static struct bc_proc_entry *bc_root_entries __read_mostly;
+static DEFINE_SPINLOCK(bc_entries_lock);
+static struct proc_dir_entry *bc_proc_root;
+
+void bc_register_proc_entry(struct bc_proc_entry *e)
+{
+ spin_lock(&bc_entries_lock);
+ e->cookie = ++cookies;
+ e->next = bc_entries;
+ bc_entries = e;
+ num_entries++;
+ spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_entry);
+
+void bc_register_proc_root_entry(struct bc_proc_entry *e)
+{
+ spin_lock(&bc_entries_lock);
+ e->cookie = ++cookies;
+ e->next = bc_root_entries;
+ bc_root_entries = e;
+ bc_proc_root->nlink++;
+ spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_root_entry);
+
+/*
+ * small helpers
+ */
+
+static inline unsigned long bc_make_ino(struct user_beancounter *ub)
+{
+ unsigned long ret;
+
+ ret = 0xbc000000;
+ if (ub->parent)
+ ret |= ((ub->parent->ub_uid + 1) << 4);
+ ret |= (ub->ub_uid + 1);
+ return ret;
+}
+
+static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de)
+{
+ return 0xbe000000 + de->cookie;
+}
+
+static int bc_d_delete(struct dentry *d)
+{
+ return 1;
+}
+
+static void bc_d_release(struct dentry *d)
+{
+ put_beancounter((struct user_beancounter *)d->d_fsdata);
+}
+
+static struct inode_operations bc_entry_iops;
+static struct file_operations bc_entry_fops;
+static struct dentry_operations bc_dentry_ops = {
+ .d_delete = bc_d_delete,
+ .d_release = bc_d_release,
+};
+
+/*
+ * common directory operations' helpers
+ */
+
+static int bc_readdir(struct file *file, filldir_t filler, void *data,
+ struct user_beancounter *parent)
+{
+ int err = 0;
+ loff_t pos, filled;
+ struct user_beancounter *ub, *prev;
+ struct bc_proc_entry *pde;
+
+ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+ return -EPERM;
+
+ pos = file->f_pos;
+ if (pos == 0) {
+ err = (*filler)(data, ".", 1, pos,
+ file->f_dentry->d_inode->i_ino, DT_DIR);
+ if (err < 0) {
+ err = 0;
+ goto out;
+ }
+ pos++;
+ }
+
+ if (pos == 1) {
+ err = (*filler)(data, "..", 2, pos,
+ parent_ino(file->f_dentry), DT_DIR);
+ if (err < 0) {
+ err = 0;
+ goto out;
+ }
+ pos++;
+ }
+
+ filled = 2;
+ for (pde = (parent == NULL ? bc_root_entries : bc_entries);
+ pde != NULL; pde = pde->next) {
+ if (filled++ < pos)
+ continue;
+
+ err = (*filler)(data, pde->name, strlen(pde->name), pos,
+ bc_make_file_ino(pde), DT_REG);
+ if (err < 0) {
+ err = 0;
+ goto out;
+ }
+ pos++;
+ }
+
+ rcu_read_lock();
+ prev = NULL;
+ ub = list_entry(&ub_list_head, struct user_beancounter, ub_list);
+ while (1) {
+ int len;
+ unsigned long ino;
+ char buf[64];
+
+ ub = list_entry(rcu_dereference(ub->ub_list.next),
+ struct user_beancounter, ub_list);
+ if (&ub->ub_list == &ub_list_head)
+ break;
+
+ if (ub->parent != parent)
+ continue;
+
+ if (filled++ < pos)
+ continue;
+
+ if (!get_beancounter_rcu(ub))
+ continue;
+
+ rcu_read_unlock();
+ put_beancounter(prev);
+
+ len = print_ub_uid(ub, buf, sizeof(buf));
+ ino = bc_make_ino(ub);
+
+ err = (*filler)(data, buf, len, pos, ino, DT_DIR);
+ if (err < 0) {
+ err = 0;
+ put_beancounter(ub);
+ goto out;
+ }
+
+ rcu_read_lock();
+ prev = ub;
+ pos++;
+ }
+ rcu_read_unlock();
+ put_beancounter(prev);
+out:
+ file->f_pos = pos;
+ return err;
+}
+
+static int bc_looktest(struct inode *ino, void *data)
+{
+ return ino->i_op == &bc_entry_iops && ino->i_private == data;
+}
+
+static int bc_lookset(struct inode *ino, void *data)
+{
+ struct user_beancounter *ub;
+
+ ub = (struct user_beancounter *)data;
+ ino->i_private = data;
+ ino->i_ino = bc_make_ino(ub);
+ ino->i_fop = &bc_entry_fops;
+ ino->i_op = &bc_entry_iops;
+ ino->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
+ /* subbeancounters are not included, but who cares? */
+ ino->i_nlink = num_entries + 2;
+ ino->i_gid = 0;
+ ino->i_uid = 0;
+ return 0;
+}
+
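+/*
+ * bc_lookup() consumes the caller's reference on the beancounter: on
+ * success it is stashed in dentry->d_fsdata and dropped later by
+ * bc_d_release(), on failure it is dropped right here.
+ */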
+static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *ino;
+
+ ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub);
+ if (ino == NULL)
+ goto out_put;
+
+ unlock_new_inode(ino);
+ dentry->d_op = &bc_dentry_ops;
+ dentry->d_fsdata = ub;
+ d_add(dentry, ino);
+ return NULL;
+
+out_put:
+ put_beancounter(ub);
+ return ERR_PTR(-ENOENT);
+}
+
+/*
+ * files (bc_proc_entry) manipulations
+ */
+
+static struct dentry *bc_lookup_file(struct inode *dir,
+ struct dentry *dentry, struct bc_proc_entry *root,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *))
+{
+ struct bc_proc_entry *pde;
+ struct inode *ino;
+
+ for (pde = root; pde != NULL; pde = pde->next)
+ if (strcmp(pde->name, dentry->d_name.name) == 0)
+ break;
+
+ if (pde == NULL)
+ return ERR_PTR(-ESRCH);
+
+ ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde);
+ if (ino == NULL)
+ return ERR_PTR(-ENOENT);
+
+ unlock_new_inode(ino);
+ dentry->d_op = &bc_dentry_ops;
+ d_add(dentry, ino);
+ return NULL;
+}
+
+static int bc_file_open(struct inode *ino, struct file *filp)
+{
+ struct bc_proc_entry *de;
+ struct user_beancounter *ub;
+
+ de = (struct bc_proc_entry *)ino->i_private;
+ ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata;
+ BUG_ON(ub->ub_magic != UB_MAGIC);
+
+ /*
+	 * ub can't disappear: we hold d_parent, and it holds the beancounter
+ */
+ return single_open(filp, de->u.show, ub);
+}
+
+static struct file_operations bc_file_ops = {
+ .open = bc_file_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int bc_looktest_entry(struct inode *ino, void *data)
+{
+ return ino->i_fop == &bc_file_ops && ino->i_private == data;
+}
+
+static int bc_lookset_entry(struct inode *ino, void *data)
+{
+ struct bc_proc_entry *de;
+
+ de = (struct bc_proc_entry *)data;
+ ino->i_private = data;
+ ino->i_ino = bc_make_file_ino(de);
+	ino->i_fop = &bc_file_ops;
+ ino->i_mode = S_IFREG | S_IRUSR;
+ ino->i_nlink = 1;
+ ino->i_gid = 0;
+ ino->i_uid = 0;
+ return 0;
+}
+
+static inline struct dentry *bc_lookup_files(struct inode *dir,
+ struct dentry *de)
+{
+ return bc_lookup_file(dir, de, bc_entries,
+ bc_looktest_entry, bc_lookset_entry);
+}
+
+static int bc_looktest_root_entry(struct inode *ino, void *data)
+{
+ struct bc_proc_entry *de;
+
+ de = (struct bc_proc_entry *)data;
+ return ino->i_fop == de->u.fops && ino->i_private == data;
+}
+
+static int bc_lookset_root_entry(struct inode *ino, void *data)
+{
+ struct bc_proc_entry *de;
+
+ de = (struct bc_proc_entry *)data;
+ ino->i_private = data;
+ ino->i_ino = bc_make_file_ino(de);
+ ino->i_fop = de->u.fops;
+ ino->i_mode = S_IFREG | S_IRUSR;
+ ino->i_nlink = 1;
+ ino->i_gid = 0;
+ ino->i_uid = 0;
+ return 0;
+}
+
+static inline struct dentry *bc_lookup_root_files(struct inode *dir,
+ struct dentry *de)
+{
+ return bc_lookup_file(dir, de, bc_root_entries,
+ bc_looktest_root_entry, bc_lookset_root_entry);
+}
+
+/*
+ * /proc/bc/.../<id> directory operations
+ */
+
+static int bc_entry_readdir(struct file *file, void *data, filldir_t filler)
+{
+ return bc_readdir(file, filler, data,
+ (struct user_beancounter *)file->f_dentry->d_fsdata);
+}
+
+static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ int id;
+ char *end;
+ struct user_beancounter *par, *ub;
+ struct dentry *de;
+
+ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+ return ERR_PTR(-EPERM);
+
+ de = bc_lookup_files(dir, dentry);
+ if (de != ERR_PTR(-ESRCH))
+ return de;
+
+ id = simple_strtol(dentry->d_name.name, &end, 10);
+ if (*end != '.')
+ return ERR_PTR(-ENOENT);
+
+ par = (struct user_beancounter *)dir->i_private;
+ if (par->ub_uid != id)
+ return ERR_PTR(-ENOENT);
+
+ id = simple_strtol(end + 1, &end, 10);
+ if (*end != '\0')
+ return ERR_PTR(-ENOENT);
+
+ ub = get_subbeancounter_byid(par, id, 0);
+ if (ub == NULL)
+ return ERR_PTR(-ENOENT);
+
+ return bc_lookup(ub, dir, dentry);
+}
+
+static int bc_entry_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct user_beancounter *ub;
+
+ generic_fillattr(dentry->d_inode, stat);
+ ub = (struct user_beancounter *)dentry->d_fsdata;
+ stat->nlink = ub->ub_childs + 2;
+ return 0;
+}
+
+static struct file_operations bc_entry_fops = {
+ .read = generic_read_dir,
+ .readdir = bc_entry_readdir,
+};
+
+static struct inode_operations bc_entry_iops = {
+ .lookup = bc_entry_lookup,
+ .getattr = bc_entry_getattr,
+};
+
+/*
+ * /proc/bc directory operations
+ */
+
+static int bc_root_readdir(struct file *file, void *data, filldir_t filler)
+{
+ return bc_readdir(file, filler, data, NULL);
+}
+
+static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ int id;
+ char *end;
+ struct user_beancounter *ub;
+ struct dentry *de;
+
+ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+ return ERR_PTR(-EPERM);
+
+ de = bc_lookup_root_files(dir, dentry);
+ if (de != ERR_PTR(-ESRCH))
+ return de;
+
+ id = simple_strtol(dentry->d_name.name, &end, 10);
+ if (*end != '\0')
+ return ERR_PTR(-ENOENT);
+
+ ub = get_beancounter_byuid(id, 0);
+ if (ub == NULL)
+ return ERR_PTR(-ENOENT);
+
+ return bc_lookup(ub, dir, dentry);
+}
+
+static int bc_root_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ generic_fillattr(dentry->d_inode, stat);
+ stat->nlink = ub_count + 2;
+ return 0;
+}
+
+static struct file_operations bc_root_fops = {
+ .read = generic_read_dir,
+ .readdir = bc_root_readdir,
+};
+
+static struct inode_operations bc_root_iops = {
+ .lookup = bc_root_lookup,
+ .getattr = bc_root_getattr,
+};
+
+static int __init ub_init_proc(void)
+{
+ struct proc_dir_entry *entry;
+
+ bc_proc_root = create_proc_entry("bc",
+ S_IFDIR | S_IRUSR | S_IXUSR, NULL);
+ if (bc_proc_root == NULL)
+ panic("Can't create /proc/bc entry");
+
+ bc_proc_root->proc_fops = &bc_root_fops;
+ bc_proc_root->proc_iops = &bc_root_iops;
+
+ bc_register_proc_entry(&bc_resources_entry);
+#ifdef CONFIG_UBC_DEBUG
+ bc_register_proc_entry(&bc_debug_entry);
+#endif
+ bc_register_proc_root_entry(&bc_all_resources_entry);
+
+ entry = proc_create("user_beancounters",
+ S_IRUSR, &glob_proc_root, &ub_file_operations);
+ return 0;
+}
+
+core_initcall(ub_init_proc);
diff --git a/kernel/bc/rss_pages.c b/kernel/bc/rss_pages.c
new file mode 100644
index 0000000..2f64be5
--- /dev/null
+++ b/kernel/bc/rss_pages.c
@@ -0,0 +1,454 @@
+/*
+ * kernel/bc/rss_pages.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+
+#include <bc/beancounter.h>
+#include <bc/hash.h>
+#include <bc/vmpages.h>
+#include <bc/rss_pages.h>
+#include <bc/io_acct.h>
+
+static struct kmem_cache *pb_cachep;
+spinlock_t pb_lock = SPIN_LOCK_UNLOCKED;
+static struct page_beancounter **pb_hash_table;
+static unsigned int pb_hash_mask;
+
+/*
+ * Auxiliary stuff
+ */
+
+static inline struct page_beancounter *next_page_pb(struct page_beancounter *p)
+{
+ return list_entry(p->page_list.next, struct page_beancounter,
+ page_list);
+}
+
+static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p)
+{
+ return list_entry(p->page_list.prev, struct page_beancounter,
+ page_list);
+}
+
+/*
+ * Held pages manipulation
+ */
+static inline void set_held_pages(struct user_beancounter *bc)
+{
+ /* all three depend on ub_held_pages */
+ __ub_update_physpages(bc);
+ __ub_update_oomguarpages(bc);
+ __ub_update_privvm(bc);
+}
+
+static inline void do_dec_held_pages(struct user_beancounter *ub, int value)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ub->ub_held_pages -= value;
+ set_held_pages(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+static void dec_held_pages(struct user_beancounter *ub, int value)
+{
+ for (; ub != NULL; ub = ub->parent)
+ do_dec_held_pages(ub, value);
+}
+
+static inline void do_inc_held_pages(struct user_beancounter *ub, int value)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ub->ub_held_pages += value;
+ set_held_pages(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+static void inc_held_pages(struct user_beancounter *ub, int value)
+{
+ for (; ub != NULL; ub = ub->parent)
+ do_inc_held_pages(ub, value);
+}
+
+/*
+ * The ++ and -- below are protected by pb_lock
+ */
+
+static inline void inc_pbc_count(struct user_beancounter *ub)
+{
+ for (; ub != NULL; ub = ub->parent)
+ ub->ub_pbcs++;
+}
+
+static inline void dec_pbc_count(struct user_beancounter *ub)
+{
+ for (; ub != NULL; ub = ub->parent)
+ ub->ub_pbcs--;
+}
+
+/*
+ * Alloc - free
+ */
+
+inline int pb_alloc(struct page_beancounter **pbc)
+{
+ *pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL);
+ if (*pbc != NULL) {
+ (*pbc)->next_hash = NULL;
+ (*pbc)->pb_magic = PB_MAGIC;
+ }
+ return (*pbc == NULL);
+}
+
+inline void pb_free(struct page_beancounter **pb)
+{
+ if (*pb != NULL) {
+ kmem_cache_free(pb_cachep, *pb);
+ *pb = NULL;
+ }
+}
+
+void pb_free_list(struct page_beancounter **p_pb)
+{
+ struct page_beancounter *list, *pb;
+
+ list = *p_pb;
+ if (list == PBC_COPY_SAME)
+ return;
+
+ while (list) {
+ pb = list;
+ list = list->next_hash;
+ pb_free(&pb);
+ }
+ *p_pb = NULL;
+}
+
+/*
+ * head -> <new objs> -> <old objs> -> ...
+ */
+static int __alloc_list(struct page_beancounter **head, int num)
+{
+ struct page_beancounter *pb;
+
+ while (num > 0) {
+ if (pb_alloc(&pb))
+ return -1;
+ pb->next_hash = *head;
+ *head = pb;
+ num--;
+ }
+
+ return num;
+}
+
+/*
+ * Ensure that the list contains at least num elements.
+ * p_pb points to an initialized list, may be of the zero length.
+ *
+ * mm->page_table_lock should be held
+ */
+int pb_alloc_list(struct page_beancounter **p_pb, int num)
+{
+ struct page_beancounter *list;
+
+ for (list = *p_pb; list != NULL && num; list = list->next_hash, num--);
+ if (!num)
+ return 0;
+
+ /*
+ * *p_pb(after) *p_pb (before)
+ * \ \
+ * <new objs> -...-> <old objs> -> ...
+ */
+ if (__alloc_list(p_pb, num) < 0)
+ goto nomem;
+ return 0;
+
+nomem:
+ pb_free_list(p_pb);
+ return -ENOMEM;
+}
+
+/*
+ * Allocates a page_beancounter for each
+ * user_beancounter in a hash
+ */
+int pb_alloc_all(struct page_beancounter **pbs)
+{
+ int need_alloc;
+ struct user_beancounter *ub;
+
+ need_alloc = 0;
+ rcu_read_lock();
+ for_each_beancounter(ub)
+ need_alloc++;
+ rcu_read_unlock();
+
+ if (!__alloc_list(pbs, need_alloc))
+ return 0;
+
+ pb_free_list(pbs);
+ return -ENOMEM;
+}
+
+/*
+ * Hash routines
+ */
+
+static inline int pb_hash(struct user_beancounter *ub, struct page *page)
+{
+ return (page_to_pfn(page) ^ ub->ub_cookie) & pb_hash_mask;
+}
+
+/* pb_lock should be held */
+static inline void insert_pb(struct page_beancounter *p, struct page *page,
+ struct user_beancounter *ub, int hash)
+{
+ p->page = page;
+ p->ub = get_beancounter(ub);
+ p->next_hash = pb_hash_table[hash];
+ pb_hash_table[hash] = p;
+ inc_pbc_count(ub);
+}
+
+/*
+ * Heart
+ */
+
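+/*
+ * Shared page accounting: a page mapped by several beancounters is
+ * charged to each of them as UB_PAGE_WEIGHT >> shift.  When a new
+ * reference is added, the current list head has its shift bumped (its
+ * share halved) and the newcomer takes the same halved share, so the
+ * total charged for one page stays close to UB_PAGE_WEIGHT no matter
+ * how many beancounters map it.
+ */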
+static int __pb_dup_ref(struct page *page, struct user_beancounter *bc,
+ int hash)
+{
+ struct page_beancounter *p;
+
+ for (p = pb_hash_table[hash];
+ p != NULL && (p->page != page || p->ub != bc);
+ p = p->next_hash);
+ if (p == NULL)
+ return -1;
+
+ PB_COUNT_INC(p->refcount);
+ return 0;
+}
+
+static void __pb_add_ref(struct page *page, struct user_beancounter *bc,
+ struct page_beancounter **ppb, int hash)
+{
+ struct page_beancounter *head, *p, **hp;
+ int shift;
+
+ p = *ppb;
+ *ppb = p->next_hash;
+
+ insert_pb(p, page, bc, hash);
+ hp = page_pblist(page);
+ head = *hp;
+
+ if (head != NULL) {
+ /*
+ * Move the first element to the end of the list.
+ * List head (pb_head) is set to the next entry.
+ * Note that this code works even if head is the only element
+ * on the list (because it's cyclic).
+ */
+ BUG_ON(head->pb_magic != PB_MAGIC);
+ *hp = next_page_pb(head);
+ PB_SHIFT_INC(head->refcount);
+ shift = PB_SHIFT_GET(head->refcount);
+ /*
+ * Update user beancounter, the share of head has been changed.
+ * Note that the shift counter is taken after increment.
+ */
+ dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift);
+ /* add the new page beancounter to the end of the list */
+ head = *hp;
+ list_add_tail(&p->page_list, &head->page_list);
+ } else {
+ *hp = p;
+ shift = 0;
+ INIT_LIST_HEAD(&p->page_list);
+ }
+
+ p->refcount = PB_REFCOUNT_MAKE(shift, 1);
+ /* update user beancounter for the new page beancounter */
+ inc_held_pages(bc, UB_PAGE_WEIGHT >> shift);
+}
+
+void pb_add_ref(struct page *page, struct mm_struct *mm,
+ struct page_beancounter **p_pb)
+{
+ int hash;
+ struct user_beancounter *bc;
+
+ bc = mm->mm_ub;
+ if (bc == NULL)
+ return;
+
+ if (!PageAnon(page) && is_shmem_mapping(page->mapping))
+ return;
+
+ hash = pb_hash(bc, page);
+
+ spin_lock(&pb_lock);
+ if (__pb_dup_ref(page, bc, hash))
+ __pb_add_ref(page, bc, p_pb, hash);
+ spin_unlock(&pb_lock);
+}
+
+void pb_dup_ref(struct page *page, struct mm_struct *mm,
+ struct page_beancounter **p_pb)
+{
+ int hash;
+ struct user_beancounter *bc;
+
+ bc = mm->mm_ub;
+ if (bc == NULL)
+ return;
+
+ if (!PageAnon(page) && is_shmem_mapping(page->mapping))
+ return;
+
+ hash = pb_hash(bc, page);
+
+ spin_lock(&pb_lock);
+ if (*page_pblist(page) == NULL)
+ /*
+ * pages like ZERO_PAGE must not be accounted in pbc
+ * so on fork we just skip them
+ */
+ goto out_unlock;
+
+ if (unlikely(*p_pb != PBC_COPY_SAME))
+ __pb_add_ref(page, bc, p_pb, hash);
+ else if (unlikely(__pb_dup_ref(page, bc, hash)))
+ WARN_ON(1);
+out_unlock:
+ spin_unlock(&pb_lock);
+}
+
+void pb_remove_ref(struct page *page, struct mm_struct *mm)
+{
+ int hash;
+ struct user_beancounter *bc;
+ struct page_beancounter *p, **q, *f;
+ int shift, shiftt;
+
+ bc = mm->mm_ub;
+ if (bc == NULL)
+ return;
+
+ if (!PageAnon(page) && is_shmem_mapping(page->mapping))
+ return;
+
+ hash = pb_hash(bc, page);
+
+ spin_lock(&pb_lock);
+ for (q = pb_hash_table + hash, p = *q;
+ p != NULL && (p->page != page || p->ub != bc);
+ q = &p->next_hash, p = *q);
+ if (p == NULL)
+ goto out_unlock;
+
+ PB_COUNT_DEC(p->refcount);
+ if (PB_COUNT_GET(p->refcount))
+ /*
+ * More references from the same user beancounter exist.
+ * Nothing needs to be done.
+ */
+ goto out_unlock;
+
+ /* remove from the hash list */
+ f = p;
+ *q = p->next_hash;
+
+ shift = PB_SHIFT_GET(p->refcount);
+
+ dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift);
+
+ q = page_pblist(page);
+ if (*q == p) {
+ if (list_empty(&p->page_list)) {
+ *q = NULL;
+ goto out_free;
+ }
+
+ *q = next_page_pb(p);
+ }
+ list_del(&p->page_list);
+
+ /* Now balance the list. Move the tail and adjust its shift counter. */
+ p = prev_page_pb(*q);
+ shiftt = PB_SHIFT_GET(p->refcount);
+ *q = p;
+ PB_SHIFT_DEC(p->refcount);
+
+ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt);
+
+ /*
+ * If the shift counter of the moved beancounter is different from the
+ * removed one's, repeat the procedure for one more tail beancounter
+ */
+ if (shiftt > shift) {
+ p = prev_page_pb(*q);
+ *q = p;
+ PB_SHIFT_DEC(p->refcount);
+ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt);
+ }
+out_free:
+ dec_pbc_count(f->ub);
+ spin_unlock(&pb_lock);
+
+ put_beancounter(f->ub);
+ pb_free(&f);
+ return;
+
+out_unlock:
+ spin_unlock(&pb_lock);
+}
+
+struct user_beancounter *pb_grab_page_ub(struct page *page)
+{
+ struct page_beancounter *pb;
+ struct user_beancounter *ub;
+
+ spin_lock(&pb_lock);
+ pb = *page_pblist(page);
+ ub = (pb == NULL ? ERR_PTR(-EINVAL) :
+ get_beancounter(pb->ub));
+ spin_unlock(&pb_lock);
+ return ub;
+}
+
+void __init ub_init_pbc(void)
+{
+ unsigned long hash_size;
+
+ pb_cachep = kmem_cache_create("page_beancounter",
+ sizeof(struct page_beancounter), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
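+	/*
+	 * Size the hash at roughly a quarter of the physical pages, rounded
+	 * up to a power of two; pb_hash_mask becomes that power of two
+	 * minus one.
+	 */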
+ hash_size = num_physpages >> 2;
+ for (pb_hash_mask = 1;
+ (hash_size & pb_hash_mask) != hash_size;
+ pb_hash_mask = (pb_hash_mask << 1) + 1);
+ hash_size = pb_hash_mask + 1;
+ printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size);
+ pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *));
+ memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *));
+
+ ub_init_io(pb_cachep);
+}
diff --git a/kernel/bc/statd.c b/kernel/bc/statd.c
new file mode 100644
index 0000000..bf6354b
--- /dev/null
+++ b/kernel/bc/statd.c
@@ -0,0 +1,453 @@
+/*
+ * kernel/bc/statd.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+
+#include <bc/beancounter.h>
+#include <bc/hash.h>
+#include <bc/statd.h>
+
+static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED;
+static LIST_HEAD(ubs_notify_list);
+static long ubs_min_interval;
+static ubstattime_t ubs_start_time, ubs_end_time;
+static struct timer_list ubs_timer;
+
+static int ubstat_get_list(void __user *buf, long size)
+{
+ int retval;
+ struct user_beancounter *ub, *ubp;
+ long *page, *ptr, *end;
+ int len;
+
+ page = (long *)__get_free_page(GFP_KERNEL);
+ if (page == NULL)
+ return -ENOMEM;
+
+ retval = 0;
+ ubp = NULL;
+ ptr = page;
+ end = page + PAGE_SIZE / sizeof(*ptr);
+
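+	/*
+	 * Collect top-level beancounter uids one page at a time: fill the
+	 * scratch page under ub_hash_lock, then pin the current beancounter
+	 * and drop the lock while the page is copied to user space, so the
+	 * walk can be resumed afterwards.
+	 */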
+ spin_lock_irq(&ub_hash_lock);
+ for_each_beancounter(ub) {
+ if (ub->parent != NULL)
+ continue;
+ *ptr++ = ub->ub_uid;
+ if (ptr != end)
+ continue;
+
+ get_beancounter(ub);
+ spin_unlock_irq(&ub_hash_lock);
+
+ put_beancounter(ubp);
+ ubp = ub;
+
+ len = min_t(long, (ptr - page) * sizeof(*ptr), size);
+ if (copy_to_user(buf, page, len)) {
+ retval = -EFAULT;
+ goto out_put;
+ }
+ retval += len;
+ if (len < PAGE_SIZE)
+ goto out_put;
+ buf += len;
+ size -= len;
+
+ ptr = page;
+ end = page + PAGE_SIZE / sizeof(*ptr);
+
+ spin_lock_irq(&ub_hash_lock);
+ }
+ spin_unlock_irq(&ub_hash_lock);
+
+ size = min_t(long, (ptr - page) * sizeof(*ptr), size);
+ if (size > 0 && copy_to_user(buf, page, size)) {
+ retval = -EFAULT;
+ goto out_put;
+ }
+ retval += size;
+
+out_put:
+ put_beancounter(ubp);
+ free_page((unsigned long)page);
+ return retval;
+}
+
+static int ubstat_gettime(void __user *buf, long size)
+{
+ ubgettime_t data;
+ int retval;
+
+ spin_lock(&ubs_notify_lock);
+ data.start_time = ubs_start_time;
+ data.end_time = ubs_end_time;
+ data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ;
+ spin_unlock(&ubs_notify_lock);
+
+ retval = min_t(long, sizeof(data), size);
+ if (copy_to_user(buf, &data, retval))
+ retval = -EFAULT;
+ return retval;
+}
+
+static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf)
+{
+ struct {
+ ubstattime_t start_time;
+ ubstattime_t end_time;
+ ubstatparm_t param[1];
+ } *data;
+
+ data = kbuf;
+ data->start_time = ubs_start_time;
+ data->end_time = ubs_end_time;
+
+ data->param[0].maxheld = ub->ub_store[res].maxheld;
+ data->param[0].failcnt = ub->ub_store[res].failcnt;
+
+ return sizeof(*data);
+}
+
+static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
+{
+ int wrote;
+ struct {
+ ubstattime_t start_time;
+ ubstattime_t end_time;
+ ubstatparm_t param[UB_RESOURCES];
+ } *data;
+ int resource;
+
+ data = kbuf;
+ data->start_time = ubs_start_time;
+ data->end_time = ubs_end_time;
+ wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+ for (resource = 0; resource < UB_RESOURCES; resource++) {
+ if (size < wrote + sizeof(data->param[resource]))
+ break;
+ data->param[resource].maxheld = ub->ub_store[resource].maxheld;
+ data->param[resource].failcnt = ub->ub_store[resource].failcnt;
+ wrote += sizeof(data->param[resource]);
+ }
+
+ return wrote;
+}
+
+static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
+ int size)
+{
+ int wrote;
+ struct {
+ ubstattime_t start_time;
+ ubstattime_t end_time;
+ ubstatparmf_t param[UB_RESOURCES];
+ } *data;
+ int resource;
+
+ data = kbuf;
+ data->start_time = ubs_start_time;
+ data->end_time = ubs_end_time;
+ wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+ for (resource = 0; resource < UB_RESOURCES; resource++) {
+ if (size < wrote + sizeof(data->param[resource]))
+ break;
+ /* The beginning of ubstatparmf_t matches struct ubparm. */
+ memcpy(&data->param[resource], &ub->ub_store[resource],
+ sizeof(ub->ub_store[resource]));
+ data->param[resource].__unused1 = 0;
+ data->param[resource].__unused2 = 0;
+ wrote += sizeof(data->param[resource]);
+ }
+ return wrote;
+}
+
+static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
+ void __user *buf, long size)
+{
+ void *kbuf;
+ int retval;
+
+ kbuf = (void *)__get_free_page(GFP_KERNEL);
+ if (kbuf == NULL)
+ return -ENOMEM;
+
+ spin_lock(&ubs_notify_lock);
+ switch (UBSTAT_CMD(cmd)) {
+ case UBSTAT_READ_ONE:
+ retval = -EINVAL;
+ if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
+ break;
+ retval = ubstat_do_read_one(ub,
+ UBSTAT_PARMID(cmd), kbuf);
+ break;
+ case UBSTAT_READ_ALL:
+ retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
+ break;
+ case UBSTAT_READ_FULL:
+ retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
+ break;
+ default:
+ retval = -EINVAL;
+ }
+ spin_unlock(&ubs_notify_lock);
+
+ if (retval > 0) {
+ retval = min_t(long, retval, size);
+ if (copy_to_user(buf, kbuf, retval))
+ retval = -EFAULT;
+ }
+
+ free_page((unsigned long)kbuf);
+ return retval;
+}
+
+static int ubstat_handle_notifrq(ubnotifrq_t *req)
+{
+ int retval;
+ struct ub_stat_notify *new_notify;
+ struct list_head *entry;
+ struct task_struct *tsk_to_free;
+
+	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
+ if (new_notify == NULL)
+ return -ENOMEM;
+
+ tsk_to_free = NULL;
+ INIT_LIST_HEAD(&new_notify->list);
+
+ spin_lock(&ubs_notify_lock);
+ list_for_each(entry, &ubs_notify_list) {
+ struct ub_stat_notify *notify;
+
+ notify = list_entry(entry, struct ub_stat_notify, list);
+ if (notify->task == current) {
+ kfree(new_notify);
+ new_notify = notify;
+ break;
+ }
+ }
+
+ retval = -EINVAL;
+ if (req->maxinterval < 1)
+ goto out_unlock;
+ if (req->maxinterval > TIME_MAX_SEC)
+ req->maxinterval = TIME_MAX_SEC;
+ if (req->maxinterval < ubs_min_interval) {
+ unsigned long dif;
+
+ ubs_min_interval = req->maxinterval;
+ dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
+ if (dif > req->maxinterval)
+ mod_timer(&ubs_timer,
+ ubs_timer.expires -
+ (dif - req->maxinterval) * HZ);
+ }
+
+ if (entry != &ubs_notify_list) {
+ list_del(&new_notify->list);
+ tsk_to_free = new_notify->task;
+ }
+ if (req->signum) {
+ new_notify->task = current;
+ get_task_struct(new_notify->task);
+ new_notify->signum = req->signum;
+ list_add(&new_notify->list, &ubs_notify_list);
+ } else
+ kfree(new_notify);
+ retval = 0;
+out_unlock:
+ spin_unlock(&ubs_notify_lock);
+ if (tsk_to_free != NULL)
+ put_task_struct(tsk_to_free);
+ return retval;
+}
+
+/*
+ * former sys_ubstat
+ */
+long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
+ void __user *buf, long size)
+{
+ int retval;
+ struct user_beancounter *ub;
+
+ if (func == UBSTAT_UBPARMNUM)
+ return UB_RESOURCES;
+ if (func == UBSTAT_UBLIST)
+ return ubstat_get_list(buf, size);
+ if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)))
+ return -EPERM;
+
+ if (func == UBSTAT_GETTIME) {
+ retval = ubstat_gettime(buf, size);
+ goto notify;
+ }
+
+ ub = get_exec_ub();
+ if (ub != NULL && ub->ub_uid == arg1)
+ get_beancounter(ub);
+ else /* FIXME must be if (ve_is_super) */
+ ub = get_beancounter_byuid(arg1, 0);
+
+ if (ub == NULL)
+ return -ESRCH;
+
+ retval = ubstat_get_stat(ub, func, buf, size);
+ put_beancounter(ub);
+notify:
+ /* Handle request for notification */
+ if (retval >= 0) {
+ ubnotifrq_t notifrq;
+ int err;
+
+ err = -EFAULT;
+ if (!copy_from_user(&notifrq, (void __user *)arg2,
+ sizeof(notifrq)))
+ err = ubstat_handle_notifrq(&notifrq);
+ if (err)
+ retval = err;
+ }
+
+ return retval;
+}
+
+static void ubstat_save_onestat(struct user_beancounter *ub)
+{
+ int resource;
+
+ /* called with local irq disabled */
+ spin_lock(&ub->ub_lock);
+ for (resource = 0; resource < UB_RESOURCES; resource++) {
+ memcpy(&ub->ub_store[resource], &ub->ub_parms[resource],
+ sizeof(struct ubparm));
+ ub->ub_parms[resource].minheld =
+ ub->ub_parms[resource].maxheld =
+ ub->ub_parms[resource].held;
+ }
+ spin_unlock(&ub->ub_lock);
+}
+
+static void ubstat_save_statistics(void)
+{
+ unsigned long flags;
+ struct user_beancounter *ub;
+
+ local_irq_save(flags);
+ for_each_beancounter (ub)
+ ubstat_save_onestat(ub);
+ local_irq_restore(flags);
+}
+
+static void ubstatd_timeout(unsigned long __data)
+{
+ struct task_struct *p;
+
+ p = (struct task_struct *) __data;
+ wake_up_process(p);
+}
+
+/*
+ * Safe wrapper for send_sig. It prevents a race with release_task
+ * for sighand.
+ * Should be called under tasklist_lock.
+ */
+static void task_send_sig(struct ub_stat_notify *notify)
+{
+ if (likely(notify->task->sighand != NULL))
+ send_sig(notify->signum, notify->task, 1);
+}
+
+static inline void do_notifies(void)
+{
+ LIST_HEAD(notif_free_list);
+ struct ub_stat_notify *notify;
+ struct ub_stat_notify *tmp;
+
+ spin_lock(&ubs_notify_lock);
+ ubs_start_time = ubs_end_time;
+ /*
+ * the expression below relies on time being unsigned long and
+ * arithmetic promotion rules
+ */
+ ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
+ mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
+ ubs_min_interval = TIME_MAX_SEC;
+ /* save statistics accumulated for the interval */
+ ubstat_save_statistics();
+ /* send signals */
+ read_lock(&tasklist_lock);
+ while (!list_empty(&ubs_notify_list)) {
+ notify = list_entry(ubs_notify_list.next,
+ struct ub_stat_notify, list);
+ task_send_sig(notify);
+ list_del(&notify->list);
+ list_add(&notify->list, &notif_free_list);
+ }
+ read_unlock(&tasklist_lock);
+ spin_unlock(&ubs_notify_lock);
+
+ list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
+ put_task_struct(notify->task);
+ kfree(notify);
+ }
+}
+
+/*
+ * Kernel thread
+ */
+static int ubstatd(void *unused)
+{
+ /* daemonize call will take care of signals */
+ daemonize("ubstatd");
+
+ ubs_timer.data = (unsigned long)current;
+ ubs_timer.function = ubstatd_timeout;
+ add_timer(&ubs_timer);
+
+ while (1) {
+ set_task_state(current, TASK_INTERRUPTIBLE);
+ if (time_after(ubs_timer.expires, jiffies)) {
+ schedule();
+ try_to_freeze();
+ continue;
+ }
+
+ __set_task_state(current, TASK_RUNNING);
+ do_notifies();
+ }
+ return 0;
+}
+
+static int __init ubstatd_init(void)
+{
+ init_timer(&ubs_timer);
+ ubs_timer.expires = TIME_MAX_JIF;
+ ubs_min_interval = TIME_MAX_SEC;
+ ubs_start_time = ubs_end_time = 0;
+
+ kernel_thread(ubstatd, NULL, 0);
+ return 0;
+}
+
+module_init(ubstatd_init);
diff --git a/kernel/bc/sys.c b/kernel/bc/sys.c
new file mode 100644
index 0000000..8fb942e
--- /dev/null
+++ b/kernel/bc/sys.c
@@ -0,0 +1,184 @@
+/*
+ * kernel/bc/sys.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/virtinfo.h>
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+#include <bc/beancounter.h>
+
+/*
+ * The (rather boring) getluid syscall
+ */
+SYSCALL_DEFINE0(getluid)
+{
+ struct user_beancounter *ub;
+
+ ub = get_exec_ub();
+ if (ub == NULL)
+ return -EINVAL;
+
+ return ub->ub_uid;
+}
+
+/*
+ * The setluid syscall
+ */
+SYSCALL_DEFINE1(setluid, uid_t, uid)
+{
+ struct user_beancounter *ub;
+ struct task_beancounter *task_bc;
+ int error;
+
+ task_bc = &current->task_bc;
+
+ /* You may not disown a setluid */
+ error = -EINVAL;
+ if (uid == (uid_t)-1)
+ goto out;
+
+ /* You may only set an ub as root */
+ error = -EPERM;
+ if (!capable(CAP_SETUID))
+ goto out;
+ /*
+ * The ub once set is irrevocable to all
+ * unless it's set from ve0.
+ */
+ if (!ve_is_super(get_exec_env()))
+ goto out;
+
+ /* Ok - set up a beancounter entry for this user */
+ error = -ENOBUFS;
+ ub = get_beancounter_byuid(uid, 1);
+ if (ub == NULL)
+ goto out;
+
+ ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) "
+ "for %.20s pid %d\n",
+ ub, atomic_read(&ub->ub_refcount),
+ current->comm, current->pid);
+ /* install bc */
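+	/*
+	 * A VIRTINFO_NEWUBC listener may veto the switch (NOTIFY_FAIL).
+	 * If it returns NOTIFY_OK, fork_sub is left untouched; otherwise
+	 * both exec_ub and fork_sub are moved to the new beancounter.
+	 */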
+ error = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_NEWUBC, ub);
+ if (!(error & NOTIFY_FAIL)) {
+ put_beancounter(task_bc->exec_ub);
+ task_bc->exec_ub = ub;
+ if (!(error & NOTIFY_OK)) {
+ put_beancounter(task_bc->fork_sub);
+ task_bc->fork_sub = get_beancounter(ub);
+ }
+ error = 0;
+ } else {
+ put_beancounter(ub);
+ error = -ENOBUFS;
+ }
+out:
+ return error;
+}
+
+long do_setublimit(uid_t uid, unsigned long resource,
+ unsigned long *new_limits)
+{
+ int error;
+ unsigned long flags;
+ struct user_beancounter *ub;
+
+ error = -EPERM;
+	if (!capable(CAP_SYS_RESOURCE))
+ goto out;
+
+ if (!ve_is_super(get_exec_env()))
+ goto out;
+
+ error = -EINVAL;
+ if (resource >= UB_RESOURCES)
+ goto out;
+
+ error = -EINVAL;
+ if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
+ goto out;
+
+ error = -ENOENT;
+ ub = get_beancounter_byuid(uid, 0);
+ if (ub == NULL) {
+ ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid);
+ goto out;
+ }
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ub->ub_parms[resource].barrier = new_limits[0];
+ ub->ub_parms[resource].limit = new_limits[1];
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+ put_beancounter(ub);
+
+ error = 0;
+out:
+ return error;
+}
+
+/*
+ * The setublimit syscall
+ */
+SYSCALL_DEFINE3(setublimit, uid_t, uid, unsigned long, resource,
+		unsigned long __user *, limits)
+{
+ unsigned long new_limits[2];
+
+ if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
+ return -EFAULT;
+
+ return do_setublimit(uid, resource, new_limits);
+}
+
+extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
+ void __user *buf, long size);
+
+SYSCALL_DEFINE5(ubstat, int, func, unsigned long, arg1, unsigned long, arg2,
+		void __user *, buf, long, size)
+{
+ if (!ve_is_super(get_exec_env()))
+ return -EPERM;
+
+ return do_ubstat(func, arg1, arg2, buf, size);
+}
+
+#ifdef CONFIG_COMPAT
+#define UB_MAXVALUE_COMPAT ((1UL << (sizeof(compat_long_t) * 8 - 1)) - 1)
+
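+/*
+ * A 32-bit caller passes its own LONG_MAX to mean "unlimited"; widen
+ * that to the native UB_MAXVALUE before applying the limits.
+ */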
+asmlinkage long compat_sys_setublimit(uid_t uid,
+ compat_long_t resource,
+ compat_long_t __user *limits)
+{
+ compat_long_t u_new_limits[2];
+ unsigned long new_limits[2];
+
+ if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits)))
+ return -EFAULT;
+
+ new_limits[0] = u_new_limits[0];
+ new_limits[1] = u_new_limits[1];
+
+ if (u_new_limits[0] == UB_MAXVALUE_COMPAT)
+ new_limits[0] = UB_MAXVALUE;
+ if (u_new_limits[1] == UB_MAXVALUE_COMPAT)
+ new_limits[1] = UB_MAXVALUE;
+
+ return do_setublimit(uid, resource, new_limits);
+}
+
+asmlinkage long compat_sys_ubstat(int func, unsigned int arg1,
+ unsigned int arg2, compat_uptr_t *buf, long size)
+{
+ return sys_ubstat(func, arg1, arg2, buf, size);
+}
+#endif
diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c
new file mode 100644
index 0000000..9b4ef0e
--- /dev/null
+++ b/kernel/bc/vm_pages.c
@@ -0,0 +1,546 @@
+/*
+ * kernel/bc/vm_pages.c
+ *
+ * Copyright (C) 2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/virtinfo.h>
+#include <linux/module.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+
+static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, unsigned long end,
+ unsigned long *ret)
+{
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ do {
+ if (!pte_none(*pte) && pte_present(*pte))
+ (*ret)++;
+ } while (pte++, addr += PAGE_SIZE, (addr != end));
+ pte_unmap_unlock(pte - 1, ptl);
+
+ return addr;
+}
+
+static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr, unsigned long end,
+ unsigned long *ret)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+ next = pages_in_pte_range(vma, pmd, addr, next, ret);
+ } while (pmd++, addr = next, (addr != end));
+
+ return addr;
+}
+
+static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end,
+ unsigned long *ret)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ next = pages_in_pmd_range(vma, pud, addr, next, ret);
+ } while (pud++, addr = next, (addr != end));
+
+ return addr;
+}
+
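+/*
+ * Count the present ptes in [addr, end) of a vma by walking the
+ * pgd/pud/pmd/pte levels with the helpers above.
+ */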
+unsigned long pages_in_vma_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long ret;
+
+ ret = 0;
+ BUG_ON(addr >= end);
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ next = pages_in_pud_range(vma, pgd, addr, next, &ret);
+ } while (pgd++, addr = next, (addr != end));
+ return ret;
+}
+
+void __ub_update_physpages(struct user_beancounter *ub)
+{
+ ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages
+ + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT);
+ ub_adjust_maxheld(ub, UB_PHYSPAGES);
+}
+
+void __ub_update_oomguarpages(struct user_beancounter *ub)
+{
+ ub->ub_parms[UB_OOMGUARPAGES].held =
+ ub->ub_parms[UB_PHYSPAGES].held +
+ ub->ub_parms[UB_SWAPPAGES].held;
+ ub_adjust_maxheld(ub, UB_OOMGUARPAGES);
+}
+
+void __ub_update_privvm(struct user_beancounter *ub)
+{
+ ub->ub_parms[UB_PRIVVMPAGES].held =
+ (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT)
+ + ub->ub_unused_privvmpages
+ + ub->ub_parms[UB_SHMPAGES].held;
+ ub_adjust_maxheld(ub, UB_PRIVVMPAGES);
+}
+
+static inline int __charge_privvm_locked(struct user_beancounter *ub,
+ unsigned long s, enum ub_severity strict)
+{
+ if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0)
+ return -ENOMEM;
+
+ ub->ub_unused_privvmpages += s;
+ return 0;
+}
+
+static void __unused_privvm_dec_locked(struct user_beancounter *ub,
+ long size)
+{
+ /* catch possible overflow */
+ if (ub->ub_unused_privvmpages < size) {
+ uncharge_warn(ub, UB_UNUSEDPRIVVM,
+ size, ub->ub_unused_privvmpages);
+ size = ub->ub_unused_privvmpages;
+ }
+ ub->ub_unused_privvmpages -= size;
+ __ub_update_privvm(ub);
+}
+
+void __ub_unused_privvm_dec(struct mm_struct *mm, long size)
+{
+ unsigned long flags;
+ struct user_beancounter *ub;
+
+ ub = mm->mm_ub;
+ if (ub == NULL)
+ return;
+
+ ub = top_beancounter(ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __unused_privvm_dec_locked(ub, size);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_unused_privvm_sub(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long count)
+{
+ if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file))
+ __ub_unused_privvm_dec(mm, count);
+}
+
+void ub_unused_privvm_add(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long size)
+{
+ unsigned long flags;
+ struct user_beancounter *ub;
+
+ ub = mm->mm_ub;
+ if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file))
+ return;
+
+ ub = top_beancounter(ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ub->ub_unused_privvmpages += size;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
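+/*
+ * Called when a mapping's protection is about to change.  Granting
+ * write access to a private-style mapping makes its pages chargeable
+ * as privvmpages (charged here), while dropping write access moves
+ * them back to the shared side; the return value tells the caller
+ * which transition applies, or that the charge failed.
+ */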
+int ub_protected_charge(struct mm_struct *mm, unsigned long size,
+ unsigned long newflags, struct vm_area_struct *vma)
+{
+ unsigned long flags;
+ struct file *file;
+ struct user_beancounter *ub;
+
+ ub = mm->mm_ub;
+ if (ub == NULL)
+ return PRIVVM_NO_CHARGE;
+
+ flags = vma->vm_flags;
+ if (!((newflags ^ flags) & VM_WRITE))
+ return PRIVVM_NO_CHARGE;
+
+ file = vma->vm_file;
+ if (!VM_UB_PRIVATE(newflags | VM_WRITE, file))
+ return PRIVVM_NO_CHARGE;
+
+ if (flags & VM_WRITE)
+ return PRIVVM_TO_SHARED;
+
+ ub = top_beancounter(ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ if (__charge_privvm_locked(ub, size, UB_SOFT) < 0)
+ goto err;
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return PRIVVM_TO_PRIVATE;
+
+err:
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return PRIVVM_ERROR;
+}
+
+int ub_memory_charge(struct mm_struct *mm, unsigned long size,
+ unsigned vm_flags, struct file *vm_file, int sv)
+{
+ struct user_beancounter *ub, *ubl;
+ unsigned long flags;
+
+ ub = mm->mm_ub;
+ if (ub == NULL)
+ return 0;
+
+ size >>= PAGE_SHIFT;
+ if (size > UB_MAXVALUE)
+ return -EINVAL;
+
+ BUG_ON(sv != UB_SOFT && sv != UB_HARD);
+
+ if (vm_flags & VM_LOCKED) {
+ if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv))
+ goto out_err;
+ }
+ if (VM_UB_PRIVATE(vm_flags, vm_file)) {
+ ubl = top_beancounter(ub);
+ spin_lock_irqsave(&ubl->ub_lock, flags);
+ if (__charge_privvm_locked(ubl, size, sv))
+ goto out_private;
+ spin_unlock_irqrestore(&ubl->ub_lock, flags);
+ }
+ return 0;
+
+out_private:
+ spin_unlock_irqrestore(&ubl->ub_lock, flags);
+ if (vm_flags & VM_LOCKED)
+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+out_err:
+ return -ENOMEM;
+}
+
+void ub_memory_uncharge(struct mm_struct *mm, unsigned long size,
+ unsigned vm_flags, struct file *vm_file)
+{
+ struct user_beancounter *ub;
+ unsigned long flags;
+
+ ub = mm->mm_ub;
+ if (ub == NULL)
+ return;
+
+ size >>= PAGE_SHIFT;
+
+ if (vm_flags & VM_LOCKED)
+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+ if (VM_UB_PRIVATE(vm_flags, vm_file)) {
+ ub = top_beancounter(ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __unused_privvm_dec_locked(ub, size);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ }
+}
+
+int ub_locked_charge(struct mm_struct *mm, unsigned long size)
+{
+ struct user_beancounter *ub;
+
+ ub = mm->mm_ub;
+ if (ub == NULL)
+ return 0;
+
+ return charge_beancounter(ub, UB_LOCKEDPAGES,
+ size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_locked_uncharge(struct mm_struct *mm, unsigned long size)
+{
+ struct user_beancounter *ub;
+
+ ub = mm->mm_ub;
+ if (ub == NULL)
+ return;
+
+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size)
+{
+ struct user_beancounter *ub;
+
+ ub = shi->shmi_ub;
+ if (ub == NULL)
+ return 0;
+
+ return charge_beancounter(ub, UB_LOCKEDPAGES,
+ size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
+{
+ struct user_beancounter *ub;
+
+ ub = shi->shmi_ub;
+ if (ub == NULL)
+ return;
+
+ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+
+static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ub->ub_tmpfs_respages++;
+ __ub_update_physpages(ub);
+ __ub_update_oomguarpages(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_tmpfs_respages_inc(struct shmem_inode_info *shi)
+{
+ struct user_beancounter *ub;
+
+ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent)
+ do_ub_tmpfs_respages_inc(ub);
+}
+
+static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub,
+ unsigned long size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ /* catch possible overflow */
+ if (ub->ub_tmpfs_respages < size) {
+ uncharge_warn(ub, UB_TMPFSPAGES,
+ size, ub->ub_tmpfs_respages);
+ size = ub->ub_tmpfs_respages;
+ }
+ ub->ub_tmpfs_respages -= size;
+	/* update the values that depend on tmpfs_respages */
+ __ub_update_physpages(ub);
+ __ub_update_oomguarpages(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
+ unsigned long size)
+{
+ struct user_beancounter *ub;
+
+ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent)
+ do_ub_tmpfs_respages_sub(ub, size);
+}
+
+int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size)
+{
+ int ret;
+ unsigned long flags;
+ struct user_beancounter *ub;
+
+ ub = shi->shmi_ub;
+ if (ub == NULL)
+ return 0;
+
+ ub = top_beancounter(ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD);
+ if (ret == 0)
+ __ub_update_privvm(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+ return ret;
+}
+
+void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size)
+{
+ unsigned long flags;
+ struct user_beancounter *ub;
+
+ ub = shi->shmi_ub;
+ if (ub == NULL)
+ return;
+
+ ub = top_beancounter(ub);
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __uncharge_beancounter_locked(ub, UB_SHMPAGES, size);
+ __ub_update_privvm(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+#ifdef CONFIG_BC_SWAP_ACCOUNTING
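+/*
+ * Swap accounting keeps one beancounter pointer per swap slot in
+ * si->swap_ubs; charges are propagated up the parent chain so that
+ * sub-beancounters also account into their top-level one.
+ */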
+static inline void do_ub_swapentry_inc(struct user_beancounter *ub)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __charge_beancounter_locked(ub, UB_SWAPPAGES, 1, UB_FORCE);
+ __ub_update_oomguarpages(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num,
+ struct user_beancounter *ub)
+{
+ si->swap_ubs[num] = get_beancounter(ub);
+ for (; ub != NULL; ub = ub->parent)
+ do_ub_swapentry_inc(ub);
+}
+EXPORT_SYMBOL(ub_swapentry_inc);
+
+static inline void do_ub_swapentry_dec(struct user_beancounter *ub)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ub->ub_lock, flags);
+ __uncharge_beancounter_locked(ub, UB_SWAPPAGES, 1);
+ __ub_update_oomguarpages(ub);
+ spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
+
+void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num)
+{
+ struct user_beancounter *ub, *ubp;
+
+ ub = si->swap_ubs[num];
+ si->swap_ubs[num] = NULL;
+ for (ubp = ub; ubp != NULL; ubp = ubp->parent)
+ do_ub_swapentry_dec(ubp);
+ put_beancounter(ub);
+}
+EXPORT_SYMBOL(ub_swapentry_dec);
+
+int ub_swap_init(struct swap_info_struct *si, pgoff_t num)
+{
+ struct user_beancounter **ubs;
+
+ ubs = vmalloc(num * sizeof(struct user_beancounter *));
+ if (ubs == NULL)
+ return -ENOMEM;
+
+ memset(ubs, 0, num * sizeof(struct user_beancounter *));
+ si->swap_ubs = ubs;
+ return 0;
+}
+
+void ub_swap_fini(struct swap_info_struct *si)
+{
+ if (si->swap_ubs) {
+ vfree(si->swap_ubs);
+ si->swap_ubs = NULL;
+ }
+}
+#endif
+
+static int vmguar_enough_memory(struct vnotifier_block *self,
+ unsigned long event, void *arg, int old_ret)
+{
+ struct user_beancounter *ub;
+
+ if (event != VIRTINFO_ENOUGHMEM)
+ return old_ret;
+ /*
+	 * If it's a kernel thread, we don't care about it.
+	 * Added so that aufsd can run smoothly over ramfs.
+ */
+ if (!current->mm)
+ return NOTIFY_DONE;
+
+ ub = top_beancounter(current->mm->mm_ub);
+ if (ub->ub_parms[UB_PRIVVMPAGES].held >
+ ub->ub_parms[UB_VMGUARPAGES].barrier)
+ return old_ret;
+
+ return NOTIFY_OK;
+}
+
+static struct vnotifier_block vmguar_notifier_block = {
+ .notifier_call = vmguar_enough_memory
+};
+
+static int __init init_vmguar_notifier(void)
+{
+ virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block);
+ return 0;
+}
+
+static void __exit fini_vmguar_notifier(void)
+{
+ virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block);
+}
+
+module_init(init_vmguar_notifier);
+module_exit(fini_vmguar_notifier);
+
+#ifdef CONFIG_PROC_FS
+static int bc_vmaux_show(struct seq_file *f, void *v)
+{
+ struct user_beancounter *ub;
+ unsigned long swap, unmap;
+ int i;
+
+ ub = seq_beancounter(f);
+
+ swap = unmap = 0;
+ for_each_online_cpu(i) {
+ swap += per_cpu_ptr(ub->ub_percpu, i)->swapin;
+ unmap += per_cpu_ptr(ub->ub_percpu, i)->unmap;
+ }
+
+ seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_UNUSEDPRIVVM],
+ ub->ub_unused_privvmpages);
+ seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES],
+ ub->ub_tmpfs_respages);
+ seq_printf(f, bc_proc_lu_fmt, "rss", ub->ub_pbcs);
+
+ seq_printf(f, bc_proc_lu_fmt, "swapin", swap);
+ seq_printf(f, bc_proc_lu_fmt, "unmap", unmap);
+ return 0;
+}
+static struct bc_proc_entry bc_vmaux_entry = {
+ .name = "vmaux",
+ .u.show = bc_vmaux_show,
+};
+
+static int __init bc_vmaux_init(void)
+{
+ bc_register_proc_entry(&bc_vmaux_entry);
+ return 0;
+}
+
+late_initcall(bc_vmaux_init);
+#endif
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1fbcc74..7d06272 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2127,7 +2127,7 @@ static void cgroup_enable_task_cg_lists(void)
struct task_struct *p, *g;
write_lock(&css_set_lock);
use_task_css_set_links = 1;
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
task_lock(p);
/*
* We should check if the process is exiting, otherwise
@@ -2137,7 +2137,7 @@ static void cgroup_enable_task_cg_lists(void)
if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
list_add(&p->cg_list, &p->cgroups->tasks);
task_unlock(p);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
write_unlock(&css_set_lock);
}
diff --git a/kernel/cgroup_lite.c b/kernel/cgroup_lite.c
new file mode 100644
index 0000000..d299cf6
--- /dev/null
+++ b/kernel/cgroup_lite.c
@@ -0,0 +1,342 @@
+/*
+ * lite cgroups engine
+ */
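+/*
+ * A minimal stand-in for the full cgroup core: one root cgroup plus a
+ * single child cgroup per VE, one css per compiled-in subsystem, and
+ * no cgroupfs mount.  Lookups by id go through a global idr.
+ */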
+
+#include <linux/cgroup.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/ve.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+
+#define SUBSYS(_x) &_x ## _subsys,
+
+static struct cgroup_subsys *subsys[] = {
+#include <linux/cgroup_subsys.h>
+};
+
+static struct css_set init_css_set;
+static struct cgroup init_cgroup;
+static struct cftype *subsys_cftypes[CGROUP_SUBSYS_COUNT];
+
+static struct idr cgroup_idr;
+static DEFINE_SPINLOCK(cgroup_idr_lock);
+
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+ return css->cgroup->cgroup_lite_id;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+ return (css->cgroup == &init_cgroup) ? 0 : 1;
+}
+
+int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+{
+ snprintf(buf, buflen, "/%d", cgrp->cgroup_lite_id);
+ return 0;
+}
+
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+ struct cgroup *g;
+
+ BUG_ON(!ss->use_id);
+ g = idr_find(&cgroup_idr, id);
+ if (!g)
+ return NULL;
+ return g->subsys[ss->subsys_id];
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+}
+
+static int init_cgroup_id(struct cgroup *g)
+{
+ int err, id;
+
+ if (unlikely(!idr_pre_get(&cgroup_idr, GFP_KERNEL)))
+ return -ENOMEM;
+
+ spin_lock(&cgroup_idr_lock);
+ err = idr_get_new_above(&cgroup_idr, g, 1, &id);
+ spin_unlock(&cgroup_idr_lock);
+
+ if (err)
+ return err;
+
+ if (id > USHORT_MAX) {
+ spin_lock(&cgroup_idr_lock);
+ idr_remove(&cgroup_idr, id);
+ spin_unlock(&cgroup_idr_lock);
+ return -ENOSPC;
+ }
+
+ g->cgroup_lite_id = id;
+
+ return 0;
+}
+
+static void fini_cgroup_id(struct cgroup *g)
+{
+ spin_lock(&cgroup_idr_lock);
+ idr_remove(&cgroup_idr, g->cgroup_lite_id);
+ spin_unlock(&cgroup_idr_lock);
+}
+
+void __css_put(struct cgroup_subsys_state *css)
+{
+ atomic_dec(&css->refcnt);
+}
+
+static int init_css_set_subsystems(struct cgroup *g, struct css_set *set)
+{
+ int i;
+ struct cgroup_subsys_state *ss;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *cs = subsys[i];
+
+ ss = cs->create(cs, g);
+ if (IS_ERR(ss))
+ goto destroy;
+
+ g->subsys[i] = ss;
+ set->subsys[i] = ss;
+ atomic_set(&ss->refcnt, 1);
+ ss->cgroup = g;
+ }
+ return 0;
+
+destroy:
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *cs = subsys[i];
+
+ if (g->subsys[i])
+ cs->destroy(cs, g);
+ }
+ return PTR_ERR(ss);
+}
+
+int init_ve_cgroups(struct ve_struct *ve)
+{
+ int err = -ENOMEM;
+ struct cgroup *g;
+ struct css_set *cs;
+
+ g = kzalloc(sizeof(struct cgroup), GFP_KERNEL);
+ if (g == NULL)
+ goto err_galloc;
+
+ cs = kzalloc(sizeof(struct css_set), GFP_KERNEL);
+ if (cs == NULL)
+ goto err_calloc;
+
+ err = init_cgroup_id(g);
+ if (err)
+ goto err_id;
+
+ g->parent = &init_cgroup;
+ err = init_css_set_subsystems(g, cs);
+ if (err)
+ goto err_subsys;
+
+ g->parent = &init_cgroup;
+ ve->ve_cgroup = g;
+ ve->ve_css_set = cs;
+ return 0;
+
+err_subsys:
+ fini_cgroup_id(g);
+err_id:
+ kfree(cs);
+err_calloc:
+ kfree(g);
+err_galloc:
+ return err;
+}
+EXPORT_SYMBOL(init_ve_cgroups);
+
+void fini_ve_cgroups(struct ve_struct *ve)
+{
+ int i;
+ struct cgroup *g = ve->ve_cgroup;
+ struct css_set *css = ve->ve_css_set;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *cs = subsys[i];
+ struct cgroup_subsys_state *ss = css->subsys[i];
+
+ BUG_ON(ss != g->subsys[i]);
+
+ if (cs->pre_destroy)
+ cs->pre_destroy(cs, g);
+
+ if (atomic_read(&ss->refcnt) != 1)
+ printk(KERN_ERR "CG: leaking %d/%s subsys\n",
+ ve->veid, subsys[i]->name);
+ else
+ cs->destroy(cs, g);
+ }
+
+ fini_cgroup_id(g);
+ kfree(g);
+ kfree(css);
+ ve->ve_cgroup = NULL;
+ ve->ve_css_set = NULL;
+}
+EXPORT_SYMBOL(fini_ve_cgroups);
+
+/*
+ * task lifecycle
+ */
+
+void cgroup_fork(struct task_struct *child)
+{
+ child->cgroups = current->cgroups;
+}
+
+void cgroup_fork_callbacks(struct task_struct *child)
+{
+}
+
+void cgroup_post_fork(struct task_struct *child)
+{
+}
+
+void cgroup_exit(struct task_struct *tsk, int dummy)
+{
+ tsk->cgroups = &init_css_set;
+}
+
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+ return -ENODATA;
+}
+
+int cgroup_set_task_css(struct task_struct *tsk, struct css_set *css)
+{
+ int i, err;
+ struct cgroup_subsys *cs;
+ struct css_set *old_css;
+
+ old_css = tsk->cgroups;
+
+ if (old_css == css)
+ return 0;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ cs = subsys[i];
+ if (!cs->can_attach)
+ continue;
+ err = cs->can_attach(cs, css->subsys[i]->cgroup, tsk, false);
+ if (err)
+ return err;
+ }
+
+ tsk->cgroups = css;
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ cs = subsys[i];
+ if (!cs->attach)
+ continue;
+ cs->attach(cs, css->subsys[i]->cgroup,
+ old_css->subsys[i]->cgroup, tsk, false);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(cgroup_set_task_css);
+
+/*
+ * proc interface
+ */
+
+static int proc_cgroup_show(struct seq_file *m, void *v)
+{
+ struct task_struct *tsk;
+
+ tsk = pid_task((struct pid *)m->private, PIDTYPE_PID);
+ seq_printf(m, "%p\n", tsk->cgroups);
+ return 0;
+}
+
+static int cgroup_open(struct inode *inode, struct file *file)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return single_open(file, proc_cgroup_show, PROC_I(inode)->pid);
+}
+
+const struct file_operations proc_cgroup_operations = {
+ .open = cgroup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * misc cgroup API stubs
+ */
+
+int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+ const struct cftype cft[], int count)
+{
+ int idx = subsys->subsys_id;
+ static DEFINE_SPINLOCK(add_files_lock);
+
+ if (unlikely(subsys_cftypes[idx] == NULL)) {
+ spin_lock(&add_files_lock);
+ if (subsys_cftypes[idx] == NULL)
+ subsys_cftypes[idx] = (struct cftype *)cft;
+ spin_unlock(&add_files_lock);
+ }
+
+ BUG_ON(subsys_cftypes[idx] != cft);
+ return 0;
+}
+
+void cgroup_lock(void)
+{
+}
+
+void cgroup_unlock(void)
+{
+}
+
+bool cgroup_lock_live_group(struct cgroup *cg)
+{
+ return 1;
+}
+
+
+int cgroup_is_removed(const struct cgroup *cgrp)
+{
+ return 0;
+}
+
+int __init cgroup_init_early(void)
+{
+ int i;
+
+ init_task.cgroups = &init_css_set;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+ BUG_ON(subsys[i]->early_init);
+
+ return 0;
+}
+
+int __init cgroup_init(void)
+{
+ get_ve0()->ve_cgroup = &init_cgroup;
+ get_ve0()->ve_css_set = &init_css_set;
+ idr_init(&cgroup_idr);
+ if (init_cgroup_id(&init_cgroup))
+ panic("CG: Can't init initial cgroup id\n");
+ if (init_css_set_subsystems(&init_cgroup, &init_css_set) != 0)
+ panic("CG: Can't init initial set\n");
+ return 0;
+}
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f..c13b053 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
#include <linux/security.h>
#include <linux/timex.h>
#include <linux/migrate.h>
+#include <linux/module.h>
#include <linux/posix-timers.h>
#include <linux/times.h>
#include <linux/ptrace.h>
@@ -100,7 +101,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
-static long compat_nanosleep_restart(struct restart_block *restart)
+long compat_nanosleep_restart(struct restart_block *restart)
{
struct compat_timespec __user *rmtp;
struct timespec rmt;
@@ -122,6 +123,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
return ret;
}
+EXPORT_SYMBOL_GPL(compat_nanosleep_restart);
asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
struct compat_timespec __user *rmtp)
diff --git a/kernel/cpt/Makefile b/kernel/cpt/Makefile
new file mode 100644
index 0000000..d97cc31
--- /dev/null
+++ b/kernel/cpt/Makefile
@@ -0,0 +1,53 @@
+#
+#
+# kernel/cpt/Makefile
+#
+# Copyright (C) 2000-2005 SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o
+
+vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \
+ cpt_mm.o cpt_files.o cpt_kernel.o \
+ cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \
+ cpt_conntrack.o cpt_epoll.o
+
+vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \
+ rst_mm.o rst_files.o \
+ rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \
+ rst_conntrack.o rst_epoll.o
+
+ifeq ($(CONFIG_BEANCOUNTERS), y)
+vzcpt-objs += cpt_ubc.o
+vzrst-objs += rst_ubc.o
+endif
+
+ifeq ($(CONFIG_INOTIFY_USER), y)
+vzcpt-objs += cpt_inotify.o
+vzrst-objs += rst_inotify.o
+endif
+
+vzrst-objs += cpt_exports.o
+
+ifeq ($(CONFIG_VZ_CHECKPOINT), m)
+vzrst-objs += cpt_obj.o cpt_kernel.o
+endif
+
+ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y)
+vzcpt-objs += cpt_iterative.o
+vzrst-objs += rst_iterative.o
+endif
+
+ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y)
+vzcpt-objs += cpt_pagein.o
+vzrst-objs += rst_pagein.o
+endif
+
+ifeq ($(CONFIG_X86_64), y)
+vzcpt-objs += cpt_x8664.o
+ifeq ($(CONFIG_VZ_CHECKPOINT), m)
+vzrst-objs += cpt_x8664.o
+endif
+endif
diff --git a/kernel/cpt/cpt_conntrack.c b/kernel/cpt/cpt_conntrack.c
new file mode 100644
index 0000000..19dcf32
--- /dev/null
+++ b/kernel/cpt/cpt_conntrack.c
@@ -0,0 +1,365 @@
+/*
+ *
+ * kernel/cpt/cpt_conntrack.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/unistd.h>
+#include <linux/ve.h>
+#include <linux/vzcalluser.h>
+#include <linux/cpt_image.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+
+#if defined(CONFIG_VE_IPTABLES) && \
+ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+
+/* How does it work?
+ *
+ * Network is disabled, so new conntrack entries will not appear.
+ * However, some of them can disappear because of timeouts.
+ *
+ * So, we take read_lock, collect all required information atomically,
+ * essentially, creating parallel "refcount" structures holding pointers.
+ * We delete conntrack timers as well, so the structures cannot disappear
+ * after releasing the lock. Then, with the lock released, we can dump everything
+ * safely, and on exit we restore the timers to their original values.
+ *
+ * Note, this approach is not going to work in VE0.
+ */
+
+struct ct_holder
+{
+ struct ct_holder *next;
+ struct ip_conntrack_tuple_hash *cth;
+ int index;
+};
+
+static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple)
+{
+ v->cpt_dst = tuple->dst.ip;
+ v->cpt_dstport = tuple->dst.u.all;
+ v->cpt_protonum = tuple->dst.protonum;
+ v->cpt_dir = tuple->dst.dir;
+
+ v->cpt_src = tuple->src.ip;
+ v->cpt_srcport = tuple->src.u.all;
+}
+
+static int dump_one_expect(struct cpt_ip_connexpect_image *v,
+ struct ip_conntrack_expect *exp,
+ int sibling, cpt_context_t *ctx)
+{
+ int err = 0;
+
+ v->cpt_next = sizeof(*v);
+ v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_VOID;
+
+ encode_tuple(&v->cpt_tuple, &exp->tuple);
+ encode_tuple(&v->cpt_mask, &exp->mask);
+ v->cpt_sibling_conntrack = sibling;
+ v->cpt_flags = exp->flags;
+ v->cpt_seq = exp->id;
+ v->cpt_dir = 0;
+ v->cpt_manip_proto = 0;
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+ v->cpt_manip_proto = exp->saved_proto.all;
+ v->cpt_dir = exp->dir;
+#endif
+ v->cpt_timeout = 0;
+ if (exp->master->helper->timeout)
+ v->cpt_timeout = exp->timeout.expires - jiffies;
+ return err;
+}
+
+/* NOTE. We use one page to dump the list of expectations. In theory this
+ * may not be enough; in practice there is only one expectation per
+ * conntrack record. Moreover, since _ALL_ expectations are kept in one
+ * global list, which is searched for each incoming/outgoing packet, the
+ * system would already be severely degraded if even one conntrack had
+ * that many expectations. In short, this is not going to be fixed here.
+ */
+
+static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list,
+ cpt_context_t *ctx)
+{
+ int err = 0;
+ unsigned long pg;
+ struct cpt_ip_connexpect_image *v;
+ struct ip_conntrack_expect *exp;
+
+ if (ct->expecting == 0)
+ return err;
+ if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE)
+ return -ENOBUFS;
+
+ pg = __get_free_page(GFP_KERNEL);
+ if (!pg)
+ return -ENOMEM;
+ v = (struct cpt_ip_connexpect_image *)pg;
+
+ read_lock_bh(&ip_conntrack_lock);
+ list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) {
+ int sibling;
+
+ if (exp->master != ct)
+ continue;
+
+ if (ct->helper == NULL) {
+ eprintk_ctx("conntrack: no helper and non-trivial expectation\n");
+ err = -EINVAL;
+ break;
+ }
+
+ sibling = 0;
+#if 0
+ /* That's all? No need to calculate sibling? */
+ if (exp->sibling) {
+ struct ct_holder *c;
+ for (c = list; c; c = c->next) {
+ if (tuplehash_to_ctrack(c->cth) == exp->sibling) {
+ sibling = c->index;
+ break;
+ }
+ }
+ /* NOTE: exp->sibling may not be "confirmed" yet and, hence,
+ * absent from the hash table. We should just ignore such a sibling:
+ * the connection is going to be retried, since the packet
+ * apparently was lost somewhere.
+ */
+ if (sibling == 0)
+ dprintk_ctx("sibling conntrack is not found\n");
+ }
+#endif
+
+ /* If the expectation still does not have exp->sibling
+ * and timer is not running, it is about to die on another
+ * cpu. Skip it. */
+ if (!sibling &&
+ ct->helper->timeout &&
+ !timer_pending(&exp->timeout)) {
+ dprintk_ctx("conntrack: expectation: no timer\n");
+ continue;
+ }
+
+ err = dump_one_expect(v, exp, sibling, ctx);
+ if (err)
+ break;
+
+ v++;
+ }
+ read_unlock_bh(&ip_conntrack_lock);
+
+ if (err == 0 && (unsigned long)v != pg)
+ ctx->write((void*)pg, (unsigned long)v - pg, ctx);
+
+ free_page(pg);
+ return err;
+}
+
+static int dump_one_ct(struct ct_holder *c, struct ct_holder *list,
+ cpt_context_t *ctx)
+{
+ struct ip_conntrack_tuple_hash *h = c->cth;
+ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
+ struct cpt_ip_conntrack_image v;
+ int err = 0;
+
+ if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) {
+ eprintk_ctx("conntrack module ct->proto version mismatch\n");
+ return -EINVAL;
+ }
+
+ cpt_open_object(NULL, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_CONNTRACK;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_ARRAY;
+
+ read_lock_bh(&ip_conntrack_lock);
+ v.cpt_status = ct->status;
+ v.cpt_timeout = ct->timeout.expires - jiffies;
+ v.cpt_ct_helper = (ct->helper != NULL);
+ v.cpt_index = c->index;
+ v.cpt_id = ct->id;
+ v.cpt_mark = 0;
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+ v.cpt_mark = ct->mark;
+#endif
+ encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple);
+ encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple);
+ memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data));
+ memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data));
+
+ v.cpt_masq_index = 0;
+ v.cpt_initialized = 0;
+ v.cpt_num_manips = 0;
+ v.cpt_nat_helper = 0;
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+ v.cpt_masq_index = ct->nat.masq_index;
+#endif
+ /* "help" data is used by pptp, difficult to support */
+ v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos;
+ v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before;
+ v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after;
+ v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos;
+ v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before;
+ v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after;
+#endif
+ read_unlock_bh(&ip_conntrack_lock);
+
+ ctx->write(&v, sizeof(v), ctx);
+
+ err = dump_expect_list(ct, list, ctx);
+
+ cpt_close_object(ctx);
+ return err;
+}
+
+int cpt_dump_ip_conntrack(cpt_context_t * ctx)
+{
+ struct ct_holder *ct_list = NULL;
+ struct ct_holder *c, **cp;
+ int err = 0;
+ int index = 0;
+ int idx;
+
+ if (get_exec_env()->_ip_conntrack == NULL)
+ return 0;
+
+ for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) {
+ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
+ if (c == NULL) {
+ err = -ENOMEM;
+ goto done;
+ }
+ memset(c, 0, sizeof(struct ct_holder));
+ c->next = ct_list;
+ ct_list = c;
+ }
+
+ c = ct_list;
+
+ read_lock_bh(&ip_conntrack_lock);
+ for (idx = 0; idx < ip_conntrack_htable_size; idx++) {
+ struct ip_conntrack_tuple_hash *h;
+ list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) {
+ /* Skip reply tuples, they are covered by original
+ * direction. */
+ if (DIRECTION(h))
+ continue;
+
+ /* Oops, we do not have enough holders...
+ * This should be impossible. */
+ if (unlikely(c == NULL)) {
+ read_unlock_bh(&ip_conntrack_lock);
+ eprintk_ctx("unexpected conntrack appeared\n");
+ err = -ENOMEM;
+ goto done;
+ }
+
+ /* If the timer is not running, it means that its handler
+ * has just been scheduled on another cpu.
+ * We should skip this conntrack; it is about to be
+ * destroyed. */
+ if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) {
+ dprintk_ctx("conntrack: no timer\n");
+ continue;
+ }
+
+ /* Timer is deleted. refcnt is _not_ decreased.
+ * We are going to restore the timer on exit
+ * from this function. */
+ c->cth = h;
+ c->index = ++index;
+ c = c->next;
+ }
+ }
+ read_unlock_bh(&ip_conntrack_lock);
+
+ /* No conntracks? Good. */
+ if (index == 0)
+ goto done;
+
+ /* Comb the list a little. */
+ cp = &ct_list;
+ while ((c = *cp) != NULL) {
+ /* Discard unused entries; they can appear if some
+ * entries timed out after we preallocated the list.
+ */
+ if (c->cth == NULL) {
+ *cp = c->next;
+ kfree(c);
+ continue;
+ }
+
+ /* Move conntracks attached to expectations to the beginning
+ * of the list. */
+ if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) {
+ *cp = c->next;
+ c->next = ct_list;
+ ct_list = c;
+ dprintk_ctx("conntrack: %d moved in list\n", c->index);
+ continue;
+ }
+ cp = &c->next;
+ }
+
+ cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK);
+
+ for (c = ct_list; c; c = c->next) {
+ err = dump_one_ct(c, ct_list, ctx);
+ if (err)
+ goto done;
+ }
+
+ cpt_close_section(ctx);
+
+done:
+ while ((c = ct_list) != NULL) {
+ ct_list = c->next;
+ if (c->cth) {
+ /* Restore timer. refcnt is preserved. */
+ add_timer(&tuplehash_to_ctrack(c->cth)->timeout);
+ }
+ kfree(c);
+ }
+ return err;
+}
+
+#endif
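
A minimal userspace sketch (editorial illustration only, not part of the patch) of the
"disarm timers under the lock, dump unlocked, re-arm on exit" approach described in the
header comment of cpt_conntrack.c above. The names and the pthread-based locking are
hypothetical stand-ins for ip_conntrack_lock and the per-entry conntrack timers:

#include <pthread.h>
#include <stdio.h>

struct entry {
	int id;
	int timer_armed;	/* while armed, a reaper may free the entry */
	struct entry *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Disarm every timer under the lock so no entry can disappear, dump the
 * table without holding the lock, then re-arm the timers on the way out. */
static void dump_table(struct entry *head)
{
	struct entry *e;

	pthread_mutex_lock(&table_lock);
	for (e = head; e; e = e->next)
		e->timer_armed = 0;		/* plays the role of del_timer() */
	pthread_mutex_unlock(&table_lock);

	for (e = head; e; e = e->next)		/* safe: entries are pinned now */
		printf("dumping entry %d\n", e->id);

	pthread_mutex_lock(&table_lock);
	for (e = head; e; e = e->next)
		e->timer_armed = 1;		/* plays the role of add_timer() */
	pthread_mutex_unlock(&table_lock);
}

int main(void)
{
	struct entry b = { 2, 1, NULL };
	struct entry a = { 1, 1, &b };

	dump_table(&a);
	return 0;
}
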
diff --git a/kernel/cpt/cpt_context.c b/kernel/cpt/cpt_context.c
new file mode 100644
index 0000000..f095a73
--- /dev/null
+++ b/kernel/cpt/cpt_context.c
@@ -0,0 +1,285 @@
+/*
+ *
+ * kernel/cpt/cpt_context.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+
+static void file_write(const void *addr, size_t count, struct cpt_context *ctx)
+{
+ mm_segment_t oldfs;
+ ssize_t err = -EBADF;
+ struct file *file = ctx->file;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (file)
+ err = file->f_op->write(file, addr, count, &file->f_pos);
+ set_fs(oldfs);
+ if (err != count && !ctx->write_error)
+ ctx->write_error = err < 0 ? err : -EIO;
+}
+
+static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
+{
+ mm_segment_t oldfs;
+ ssize_t err = -EBADF;
+ struct file *file = ctx->file;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (file)
+ err = file->f_op->write(file, addr, count, &pos);
+ set_fs(oldfs);
+ if (err != count && !ctx->write_error)
+ ctx->write_error = err < 0 ? err : -EIO;
+}
+
+static void file_align(struct cpt_context *ctx)
+{
+ struct file *file = ctx->file;
+
+ if (file)
+ file->f_pos = CPT_ALIGN(file->f_pos);
+}
+
+static void cpt_push(loff_t *p, struct cpt_context *ctx)
+{
+ cpt_push_object(p, ctx);
+ cpt_open_object(NULL, ctx);
+}
+
+static void cpt_pop(loff_t *p, struct cpt_context *ctx)
+{
+ cpt_close_object(ctx);
+ cpt_pop_object(p, ctx);
+}
+
+static loff_t lookup_cpt_object_pos(int type, void *p, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ obj = lookup_cpt_object(type, p, ctx);
+ return obj->o_pos;
+}
+
+struct cpt_ops cpt_ops = {
+ .write = file_write,
+ .push_object = cpt_push,
+ .pop_object = cpt_pop,
+ .lookup_object = lookup_cpt_object_pos,
+};
+
+void cpt_context_init(struct cpt_context *ctx)
+{
+ int i;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ init_MUTEX(&ctx->main_sem);
+ ctx->refcount = 1;
+
+ ctx->current_section = -1;
+ ctx->current_object = -1;
+ ctx->pagesize = PAGE_SIZE;
+ ctx->write = file_write;
+ ctx->pwrite = file_pwrite;
+ ctx->align = file_align;
+ for (i=0; i < CPT_SECT_MAX; i++)
+ ctx->sections[i] = CPT_NULL;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ init_completion(&ctx->pgin_notify);
+#endif
+ cpt_object_init(ctx);
+}
+
+int cpt_open_dumpfile(struct cpt_context *ctx)
+{
+ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
+ if (ctx->tmpbuf == NULL)
+ return -ENOMEM;
+ __cpt_release_buf(ctx);
+ return 0;
+}
+
+int cpt_close_dumpfile(struct cpt_context *ctx)
+{
+ if (ctx->file) {
+ fput(ctx->file);
+ ctx->file = NULL;
+ }
+ if (ctx->tmpbuf) {
+ free_page((unsigned long)ctx->tmpbuf);
+ ctx->tmpbuf = NULL;
+ }
+ if (ctx->write_error)
+ eprintk_ctx("error while writing dump file: %d\n", ctx->write_error);
+ return ctx->write_error;
+}
+
+int cpt_major_hdr_out(struct cpt_context *ctx)
+{
+ struct cpt_major_hdr hdr;
+
+ if (ctx->file == NULL)
+ return 0;
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.cpt_signature[0] = CPT_SIGNATURE0;
+ hdr.cpt_signature[1] = CPT_SIGNATURE1;
+ hdr.cpt_signature[2] = CPT_SIGNATURE2;
+ hdr.cpt_signature[3] = CPT_SIGNATURE3;
+ hdr.cpt_hdrlen = sizeof(hdr);
+ hdr.cpt_image_version = CPT_CURRENT_VERSION;
+#ifdef CONFIG_X86_64
+ hdr.cpt_os_arch = CPT_OS_ARCH_EMT64;
+#elif defined(CONFIG_X86_32)
+ hdr.cpt_os_arch = CPT_OS_ARCH_I386;
+#elif defined(CONFIG_IA64)
+ hdr.cpt_os_arch = CPT_OS_ARCH_IA64;
+#else
+#error Arch is not supported
+#endif
+ hdr.cpt_ve_features = (__u32)ctx->features;
+ hdr.cpt_ve_features2 = (__u32)(ctx->features>>32);
+ hdr.cpt_pagesize = (__u16)PAGE_SIZE;
+ hdr.cpt_hz = HZ;
+ hdr.cpt_start_jiffies64 = ctx->virt_jiffies64;
+ hdr.cpt_start_sec = ctx->start_time.tv_sec;
+ hdr.cpt_start_nsec = ctx->start_time.tv_nsec;
+ hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags;
+ hdr.cpt_kernel_config[0] = ctx->kernel_config_flags;
+ hdr.cpt_iptables_mask = ctx->iptables_mask;
+
+ ctx->write(&hdr, sizeof(hdr), ctx);
+ return 0;
+}
+
+int cpt_close_section(struct cpt_context *ctx)
+{
+ if (ctx->file && ctx->current_section >= 0) {
+ __u64 next = ctx->file->f_pos - ctx->current_section;
+ ctx->pwrite(&next, 8, ctx, ctx->current_section);
+ ctx->current_section = -1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cpt_close_section);
+
+int cpt_open_section(struct cpt_context *ctx, __u32 type)
+{
+ struct cpt_section_hdr hdr;
+
+ if (ctx->file == NULL)
+ return 0;
+
+ cpt_close_section(ctx);
+
+ ctx->current_section = ctx->file->f_pos;
+ ctx->sections[type] = ctx->current_section;
+
+ hdr.cpt_next = 0;
+ hdr.cpt_section = type;
+ hdr.cpt_hdrlen = sizeof(hdr);
+ hdr.cpt_align = 0;
+ ctx->write(&hdr, sizeof(hdr), ctx);
+
+ return 0;
+}
+EXPORT_SYMBOL(cpt_open_section);
+
+
+int cpt_close_object(struct cpt_context *ctx)
+{
+ if (ctx->file && ctx->current_object >= 0) {
+ __u64 next = ctx->file->f_pos - ctx->current_object;
+ ctx->pwrite(&next, 8, ctx, ctx->current_object);
+ ctx->current_object = -1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cpt_close_object);
+
+int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ if (ctx->file == NULL)
+ return 0;
+
+ cpt_close_object(ctx);
+
+ ctx->current_object = ctx->file->f_pos;
+ if (obj)
+ cpt_obj_setpos(obj, ctx->current_object, ctx);
+
+ return 0;
+}
+EXPORT_SYMBOL(cpt_open_object);
+
+int cpt_push_object(loff_t *saved, struct cpt_context *ctx)
+{
+ if (ctx->file) {
+ *saved = ctx->current_object;
+ ctx->current_object = ctx->file->f_pos;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(cpt_push_object);
+
+int cpt_pop_object(loff_t *saved, struct cpt_context *ctx)
+{
+ ctx->current_object = *saved;
+ return 0;
+}
+EXPORT_SYMBOL(cpt_pop_object);
+
+int cpt_dump_tail(struct cpt_context *ctx)
+{
+ struct cpt_major_tail hdr;
+ int i;
+
+ if (ctx->file == NULL)
+ return 0;
+
+ cpt_open_section(ctx, CPT_SECT_TRAILER);
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.cpt_next = sizeof(hdr);
+ hdr.cpt_object = CPT_OBJ_TRAILER;
+ hdr.cpt_hdrlen = sizeof(hdr);
+ hdr.cpt_content = CPT_CONTENT_VOID;
+ hdr.cpt_lazypages = 0;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ hdr.cpt_lazypages = ctx->lazypages;
+#endif
+ hdr.cpt_64bit = ctx->tasks64;
+ hdr.cpt_signature[0] = CPT_SIGNATURE0;
+ hdr.cpt_signature[1] = CPT_SIGNATURE1;
+ hdr.cpt_signature[2] = CPT_SIGNATURE2;
+ hdr.cpt_signature[3] = CPT_SIGNATURE3;
+ hdr.cpt_nsect = CPT_SECT_MAX_INDEX;
+ for (i = 0; i < CPT_SECT_MAX_INDEX; i++)
+ hdr.cpt_sections[i] = ctx->sections[i];
+
+ ctx->write(&hdr, sizeof(hdr), ctx);
+ cpt_close_section(ctx);
+ return 0;
+}
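
A minimal userspace sketch (editorial illustration only, not part of the patch) of the
"write a placeholder, backpatch it on close" scheme used by cpt_open_section() and
cpt_close_section() above: the header is written with a zero cpt_next field, and once the
section end is known the real offset is written back in place. The names and the
stdio-based I/O are hypothetical stand-ins for the in-kernel file_write()/file_pwrite():

#include <stdint.h>
#include <stdio.h>

static long section_start;

static void open_section(FILE *f, uint32_t type)
{
	uint64_t next = 0;			/* placeholder, fixed up on close */

	section_start = ftell(f);
	fwrite(&next, sizeof(next), 1, f);
	fwrite(&type, sizeof(type), 1, f);
}

static void close_section(FILE *f)
{
	long end = ftell(f);
	uint64_t next = end - section_start;	/* distance to the next section */

	fseek(f, section_start, SEEK_SET);
	fwrite(&next, sizeof(next), 1, f);	/* backpatch the placeholder */
	fseek(f, end, SEEK_SET);
}

int main(void)
{
	FILE *f = fopen("image.bin", "w+b");

	if (!f)
		return 1;
	open_section(f, 1);
	fputs("section payload", f);
	close_section(f);
	fclose(f);
	return 0;
}
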
diff --git a/kernel/cpt/cpt_context.h b/kernel/cpt/cpt_context.h
new file mode 100644
index 0000000..9eb851a
--- /dev/null
+++ b/kernel/cpt/cpt_context.h
@@ -0,0 +1,225 @@
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+#include <bc/beancounter.h>
+
+#define CPT_CTX_ERROR -1
+#define CPT_CTX_IDLE 0
+#define CPT_CTX_SUSPENDING 1
+#define CPT_CTX_SUSPENDED 2
+#define CPT_CTX_DUMPING 3
+#define CPT_CTX_UNDUMPING 4
+#define CPT_CTX_UNDUMPED 5
+
+#define CPT_TID(tsk) task_pid_nr(tsk), task_pid_vnr(tsk), (tsk)->comm
+#define CPT_FID "%d,%d(%s)"
+
+
+typedef struct cpt_context
+{
+ struct list_head ctx_list;
+ int refcount;
+ int ctx_state;
+ int objcount;
+ int sticky;
+ struct semaphore main_sem;
+
+ struct file *errorfile;
+ struct file *statusfile;
+ struct file *lockfile;
+
+ int errno;
+ char *error_msg;
+ loff_t err_offset;
+
+ struct file *file;
+ char *tmpbuf;
+ int pagesize;
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+ int iter_done;
+ void *iter_dir;
+ struct user_beancounter *iter_ub;
+#endif
+ loff_t current_section;
+ loff_t current_object;
+
+ loff_t sections[CPT_SECT_MAX];
+
+ __u32 errormask;
+ __u32 write_error;
+
+ struct list_head object_array[CPT_OBJ_MAX];
+
+ void (*write)(const void *addr, size_t count, struct cpt_context *ctx);
+ void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
+ ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx);
+ ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
+ void (*align)(struct cpt_context *ctx);
+ int ve_id;
+ int contextid;
+ struct timespec cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst
+ * corresponding to start_time */
+ __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when
+ * VE did not migrate. */
+ struct timespec start_time;
+ struct timespec delta_time;
+ __s64 delta_nsec;
+ int image_version;
+ __u16 image_arch;
+ __u64 iptables_mask;
+ __u64 features;
+
+#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9)
+#define CPT_ANONVMA_HSIZE (1<<CPT_ANONVMA_HBITS)
+ struct hlist_head *anonvmas;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ struct file *pagein_file_in;
+ struct file *pagein_file_out;
+ int lazy_vm;
+ int lazypages;
+ int lazytype;
+ struct task_struct *pgin_task;
+ unsigned long last_pagein;
+ struct pagein_desc **pgin_dir;
+ struct pgin_device *pagein_dev;
+ struct completion pgin_notify;
+ struct completion *pgind_completion;
+ struct swap_info_struct *pgin_swp;
+#endif
+ int tasks64;
+ __u32 src_cpu_flags;
+ __u32 dst_cpu_flags;
+ __u32 kernel_config_flags;
+
+ __u32 last_vpid;
+
+ struct filejob *filejob_queue;
+
+ int slm_count;
+
+ char *vdso;
+
+#ifdef CONFIG_BEANCOUNTERS
+ /* Store ubc limits and barriers here during undumping,
+ and restore them before resuming */
+ struct ubparm saved_ubc[UB_RESOURCES];
+#endif
+
+ int tcp_cb_convert;
+#define CPT_TCP_CB_CONV 1
+#define CPT_TCP_CB_NOT_CONV 2
+
+#define CPT_MAX_LINKDIRS 1
+ struct file *linkdirs[CPT_MAX_LINKDIRS];
+ int linkdirs_num;
+ unsigned int linkcnt; /* for creating hardlinked files */
+ int hardlinked_on;
+} cpt_context_t;
+
+typedef struct {
+ int pid;
+ cpt_context_t *ctx;
+ struct completion done;
+} pagein_info_t;
+
+int pagein_info_printf(char *buf, cpt_context_t *ctx);
+
+int cpt_open_dumpfile(struct cpt_context *);
+int cpt_close_dumpfile(struct cpt_context *);
+int rst_open_dumpfile(struct cpt_context *);
+void rst_close_dumpfile(struct cpt_context *);
+void cpt_context_init(struct cpt_context *);
+void rst_context_init(struct cpt_context *);
+void cpt_context_destroy(struct cpt_context *);
+
+void rst_report_error(int err, cpt_context_t *ctx);
+
+
+int cpt_major_hdr_out(struct cpt_context *ctx);
+int cpt_dump_tail(struct cpt_context *ctx);
+int cpt_close_section(struct cpt_context *ctx);
+int cpt_open_section(struct cpt_context *ctx, __u32 type);
+int cpt_close_object(struct cpt_context *ctx);
+int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx);
+int cpt_push_object(loff_t *saved, struct cpt_context *ctx);
+int cpt_pop_object(loff_t *saved, struct cpt_context *ctx);
+
+int rst_get_section(int type, struct cpt_context * ctx, loff_t *, loff_t *);
+__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx);
+__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx);
+void rst_put_name(__u8 *name, struct cpt_context *ctx);
+int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx);
+void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx);
+
+pid_t vpid_to_pid(pid_t);
+
+#define rst_get_object(type, pos, tmp, ctx) \
+ _rst_get_object((type), (pos), (tmp), sizeof(*(tmp)), (ctx))
+
+extern int debug_level;
+
+#define cpt_printk(lvl, fmt, args...) do { \
+ if (lvl <= debug_level) \
+ printk(fmt, ##args); \
+ } while (0)
+
+#define dprintk(a...) cpt_printk(3, "CPT DBG: " a)
+#define dprintk_ctx(f, arg...) dprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
+
+#define wprintk(a...) cpt_printk(2, "CPT WRN: " a)
+#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
+
+#define eprintk(a...) cpt_printk(1, "CPT ERR: " a)
+#define eprintk_ctx(f, arg...) \
+do { \
+ eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \
+ if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \
+ ctx->err_offset += snprintf((char*)(ctx->error_msg + \
+ ctx->err_offset), \
+ PAGE_SIZE - ctx->err_offset, \
+ "Error: " f, ##arg); \
+} while(0)
+
+#define CPT_TMPBUF_FREE 0x789adf12
+#define CPT_TMPBUF_BUSY 0xabcd9876
+
+static inline void *cpt_get_buf(cpt_context_t *ctx)
+{
+ void *buf = ctx->tmpbuf;
+
+ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE);
+ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY;
+ return buf;
+}
+
+static inline void __cpt_release_buf(cpt_context_t *ctx)
+{
+ void *buf = ctx->tmpbuf;
+
+ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
+}
+
+static inline void cpt_release_buf(cpt_context_t *ctx)
+{
+ void *buf = ctx->tmpbuf;
+
+ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY);
+ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
+}
+
+static inline void cpt_flush_error(cpt_context_t *ctx)
+{
+ mm_segment_t oldfs;
+
+ if (ctx->errorfile && ctx->error_msg && ctx->err_offset) {
+ if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) {
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ctx->errorfile->f_op->write(ctx->errorfile,
+ ctx->error_msg, ctx->err_offset,
+ &ctx->errorfile->f_pos);
+ set_fs(oldfs);
+ }
+ ctx->error_msg[0] = 0;
+ ctx->err_offset = 0;
+ }
+}
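
A minimal userspace sketch (editorial illustration only, not part of the patch) of the
guard-word idea behind cpt_get_buf()/cpt_release_buf() above: the last four bytes of the
one-page scratch buffer hold a FREE/BUSY magic so that nested use or a double release
trips immediately. The names below are hypothetical:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define BUF_SIZE	4096
#define GUARD_FREE	0x789adf12u
#define GUARD_BUSY	0xabcd9876u

static uint32_t *guard(void *buf)
{
	return (uint32_t *)((char *)buf + BUF_SIZE - 4);
}

static void *get_buf(void *buf)
{
	assert(*guard(buf) == GUARD_FREE);	/* catches nested use */
	*guard(buf) = GUARD_BUSY;
	return buf;
}

static void release_buf(void *buf)
{
	assert(*guard(buf) == GUARD_BUSY);	/* catches a double release */
	*guard(buf) = GUARD_FREE;
}

int main(void)
{
	void *buf = malloc(BUF_SIZE);

	if (!buf)
		return 1;
	*guard(buf) = GUARD_FREE;
	get_buf(buf);
	release_buf(buf);
	free(buf);
	return 0;
}
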
diff --git a/kernel/cpt/cpt_dump.c b/kernel/cpt/cpt_dump.c
new file mode 100644
index 0000000..08ae5e6
--- /dev/null
+++ b/kernel/cpt/cpt_dump.c
@@ -0,0 +1,1271 @@
+/*
+ *
+ * kernel/cpt/cpt_dump.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/ptrace.h>
+#include <linux/smp_lock.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/virtinfo.h>
+#include <linux/virtinfoscp.h>
+#include <bc/task.h>
+#include <linux/cpt_image.h>
+#include <linux/nsproxy.h>
+#include <linux/mnt_namespace.h>
+#include <linux/netdevice.h>
+#include <linux/dcache.h>
+#include <linux/if_tun.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/netdevice.h>
+#include <linux/mount.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_dump.h"
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_process.h"
+#include "cpt_net.h"
+#include "cpt_socket.h"
+#include "cpt_ubc.h"
+#include "cpt_kernel.h"
+
+
+static int vps_child_level(struct task_struct *root, struct task_struct *c)
+{
+ int level = 0;
+ int veid = VE_TASK_INFO(c)->owner_env->veid;
+
+ while (VE_TASK_INFO(c)->owner_env->veid == veid) {
+ if (c->pid != c->tgid)
+ c = c->group_leader;
+ if (c == root)
+ return level;
+
+ c = c->parent;
+ level++;
+ }
+ return -1;
+}
+
+static inline int freezable(struct task_struct * p)
+{
+ if (p->exit_state)
+ return 0;
+
+ switch (p->state) {
+ case EXIT_ZOMBIE:
+ case EXIT_DEAD:
+ case TASK_STOPPED:
+#if TASK_TRACED != TASK_STOPPED
+ case TASK_TRACED:
+#endif
+ return 0;
+ default:
+ return 1;
+ }
+}
+
+static void wake_ve(cpt_context_t *ctx)
+{
+ struct task_struct *p, *g;
+
+ do_each_thread_ve(g, p) {
+ spin_lock_irq(&p->sighand->siglock);
+ if (p->flags & PF_FROZEN) {
+ p->flags &= ~PF_FROZEN;
+ wake_up_process(p);
+ }
+ spin_unlock_irq(&p->sighand->siglock);
+ } while_each_thread_ve(g, p);
+}
+
+/*
+ * A comment is necessary about PF_FREEZE, PF_FROZEN and TIF_FREEZE...
+ *
+ * SWSUSP uses the PF_FREEZE flag in tsk->flags, raising it in the context
+ * of another process. Apparently, this is unacceptable on SMP.
+ * Take freeze_processes() in kernel/power/process.c as an example.
+ * Unserialized modification of tsk->flags easily
+ * (believe it or not, it happens with a probability of almost 100% :-))
+ * creates a situation where setting PF_FREEZE in freeze_processes(),
+ * which quickly spins raising PF_FREEZE for all the processes,
+ * _clears_ the PF_FROZEN just set in refrigerator(), so suspend deadlocks.
+ *
+ * So, to keep things clean, we require that these flags be modified
+ * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE
+ * is just a kind of signal.
+ *
+ * That is not enough: we are still not allowed to change tsk->flags
+ * in the context of another process, since we could corrupt other flags while
+ * the process running on another cpu modifies them. So we use TIF_FREEZE in
+ * the thread flags, which can be changed atomically.
+ *
+ * PF_FROZEN is also changed in the context of another process, but this
+ * happens only when the process is already in refrigerator(), which does not
+ * modify tsk->flags.
+ */
+
+static int check_process_external(struct task_struct *p)
+{
+ if (pid_alive(p)) {
+ if (p->pids[PIDTYPE_PID].pid->level == 0)
+ return PIDTYPE_PID;
+ if (p->pids[PIDTYPE_PGID].pid->level == 0)
+ return PIDTYPE_PGID;
+ if (p->pids[PIDTYPE_SID].pid->level == 0)
+ return PIDTYPE_SID;
+ }
+
+ return PIDTYPE_MAX;
+}
+
+enum
+{
+ OBSTACLE_NOGO = -1,
+ OBSTACLE_TIMEOUT = -2,
+ OBSTACLE_TRYAGAIN = -3,
+};
+
+#define SUSPEND_TIMEOUT (10UL*HZ)
+
+static int vps_stop_tasks(struct cpt_context *ctx)
+{
+ unsigned long start_time = jiffies;
+ unsigned long target, timeout;
+ struct task_struct *p, *g;
+ int todo;
+ int round = 0;
+
+ do_gettimespec(&ctx->start_time);
+ do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time);
+ ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup;
+
+ read_lock(&tasklist_lock);
+
+ atomic_inc(&get_exec_env()->suspend);
+ timeout = HZ/5;
+ target = jiffies + timeout;
+
+ for(;;) {
+ struct task_struct *root;
+ todo = 0;
+
+ root = find_task_by_vpid(1);
+ if (!root) {
+ read_unlock(&tasklist_lock);
+ eprintk_ctx("cannot find ve init\n");
+ atomic_dec(&get_exec_env()->suspend);
+ return -ESRCH;
+ }
+
+ do_each_thread_ve(g, p) {
+ if (vps_child_level(root, p) >= 0) {
+ switch (check_process_external(p)) {
+ case PIDTYPE_PID:
+ eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
+ task_pid_vnr(p), p->pid, p->comm);
+ todo = OBSTACLE_NOGO;
+ goto out;
+ case PIDTYPE_PGID:
+ eprintk_ctx("external process group %d/%d(%s) inside CT "
+ "(e.g. vzctl enter or vzctl exec).\n",
+ task_pgrp_vnr(p), p->pid, p->comm);
+ todo = OBSTACLE_NOGO;
+ goto out;
+ case PIDTYPE_SID:
+ eprintk_ctx("external process session %d/%d(%s) inside CT "
+ "(e.g. vzctl enter or vzctl exec).\n",
+ task_session_vnr(p), p->pid, p->comm);
+ todo = OBSTACLE_NOGO;
+ goto out;
+ }
+ if (p->vfork_done) {
+ /* A task between vfork()...exec()
+ * cannot be frozen, because the parent
+ * waits in an uninterruptible state.
+ * So we do nothing and wait for
+ * exec(), unless:
+ */
+ if (p->state == TASK_STOPPED ||
+ p->state == TASK_TRACED) {
+ eprintk_ctx("task " CPT_FID " is stopped while vfork(). "
+ "Checkpointing is impossible.\n",
+ CPT_TID(p));
+ todo = OBSTACLE_NOGO;
+ /* This is fatal: the _user_ stopped
+ * the vfork()ing task, so we
+ * cannot suspend now.
+ */
+ } else {
+ todo = OBSTACLE_TRYAGAIN;
+ }
+ goto out;
+ }
+ if (p->signal->group_exit_task &&
+ p->signal->notify_count) {
+ /* exec() waits for threads' death */
+ wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p));
+ todo = OBSTACLE_TRYAGAIN;
+ goto out;
+ }
+ if (p->state == TASK_TRACED
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
+ && !p->stopped_state
+#endif
+ ) {
+ int ptrace_id = p->pn_state;
+ /* Debugger waits for signal. */
+ switch (ptrace_id) {
+ case PN_STOP_TF:
+ case PN_STOP_TF_RT:
+ case PN_STOP_ENTRY:
+ case PN_STOP_FORK:
+ case PN_STOP_VFORK:
+ case PN_STOP_SIGNAL:
+ case PN_STOP_EXIT:
+ case PN_STOP_LEAVE:
+ break;
+ default:
+ eprintk_ctx("task " CPT_FID " is stopped by debugger while %d.\n", CPT_TID(p), ptrace_id);
+ todo = OBSTACLE_NOGO;
+ goto out;
+ }
+ }
+#ifdef CONFIG_UTRACE
+ if (check_utrace(p, root, ctx)) {
+ eprintk_ctx("task " CPT_FID " is utraced. Checkpointing is impossible.\n", CPT_TID(p));
+ todo = OBSTACLE_NOGO;
+ goto out;
+ }
+#endif
+ if (p->flags & PF_NOFREEZE) {
+ eprintk_ctx("task " CPT_FID " is unfreezable. Checkpointing is impossible.\n", CPT_TID(p));
+ todo = OBSTACLE_NOGO;
+ goto out;
+ }
+
+ if (!freezable(p))
+ continue;
+
+ spin_lock_irq(&p->sighand->siglock);
+ if (!(p->flags & PF_FROZEN)) {
+ set_tsk_thread_flag(p, TIF_FREEZE);
+ signal_wake_up(p, 0);
+ }
+ spin_unlock_irq(&p->sighand->siglock);
+
+ if (p->flags & PF_FROZEN) {
+ if (p->state != TASK_UNINTERRUPTIBLE)
+ printk("Holy Crap 1 %ld " CPT_FID "\n", p->state, CPT_TID(p));
+ continue;
+ }
+
+ if (round == 10)
+ wprintk_ctx(CPT_FID " is running\n", CPT_TID(p));
+
+ todo++;
+ } else {
+ if (p != current) {
+ eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n",
+ task_pid_vnr(p), task_pid_nr(p), p->comm);
+ todo = OBSTACLE_NOGO;
+ goto out;
+ }
+ }
+ } while_each_thread_ve(g, p);
+
+ if (todo > 0) {
+ /* No visible obstacles, but the VE did not freeze
+ * within the timeout. Interrupt the suspend on the
+ * major timeout or on a signal; on a minor timeout
+ * we wake the VE and restart the suspend.
+ */
+ if (time_after(jiffies, start_time + SUSPEND_TIMEOUT)
+ || signal_pending(current))
+ todo = OBSTACLE_TIMEOUT;
+ else if (time_after(jiffies, target))
+ todo = OBSTACLE_TRYAGAIN;
+ }
+
+out:
+ if (todo < 0) {
+ atomic_dec(&get_exec_env()->suspend);
+
+ wake_ve(ctx);
+
+#if 0
+ /* This is a sign of a printk() failure, which is not
+ * ours. So, no prefixes. */
+ printk(">\n");
+#endif
+ }
+
+ read_unlock(&tasklist_lock);
+
+ if (!todo) {
+ atomic_dec(&get_exec_env()->suspend);
+ return 0;
+ }
+
+ switch (todo) {
+ case OBSTACLE_NOGO:
+ eprintk_ctx("suspend is impossible now.\n");
+ return -EAGAIN;
+
+ case OBSTACLE_TIMEOUT:
+ eprintk_ctx("interrupted or timed out.\n");
+ return -EINTR;
+
+ case OBSTACLE_TRYAGAIN:
+ if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) ||
+ signal_pending(current)) {
+ wprintk_ctx("suspend timed out\n");
+ return -EAGAIN;
+ }
+
+ wprintk_ctx("minor suspend timeout (%lu) expired, "
+ "trying again\n", timeout);
+
+ /* Try again. VE is awake, give it some time to run. */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ);
+
+ /* After a short wait, restart the suspend
+ * with a longer timeout */
+ atomic_inc(&get_exec_env()->suspend);
+ timeout = min(timeout<<1, SUSPEND_TIMEOUT);
+ target = jiffies + timeout;
+ break;
+
+ default:
+ if (round > 0) {
+ /* The VE is partially frozen; give processes
+ * a chance to enter refrigerator(). */
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ/20);
+ } else {
+ yield();
+ }
+ }
+
+ read_lock(&tasklist_lock);
+ round++;
+ }
+}
+
+static int cpt_unlock_ve(struct cpt_context *ctx)
+{
+ struct ve_struct *env;
+
+ env = get_ve_by_id(ctx->ve_id);
+ if (!env)
+ return -ESRCH;
+ down_write(&env->op_sem);
+ env->is_locked = 0;
+ up_write(&env->op_sem);
+ put_ve(env);
+ return 0;
+}
+
+int cpt_resume(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx);
+
+ cpt_unlock_sockets(ctx);
+
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ if (ctx->pgin_task) {
+ wait_for_completion(&ctx->pgin_notify);
+ put_task_struct(ctx->pgin_task);
+ ctx->pgin_task = NULL;
+ }
+#endif
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->flags & PF_FROZEN) {
+ tsk->flags &= ~PF_FROZEN;
+ wake_up_process(tsk);
+ } else if (freezable(tsk)) {
+ eprintk_ctx("strange, %s not frozen\n", tsk->comm );
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ put_task_struct(tsk);
+ }
+
+ cpt_resume_network(ctx);
+
+ cpt_unlock_ve(ctx);
+
+ cpt_finish_ubc(ctx);
+ cpt_object_destroy(ctx);
+ return 0;
+}
+
+int cpt_kill(struct cpt_context *ctx)
+{
+ int err = 0;
+ struct ve_struct *env;
+ cpt_object_t *obj;
+ struct task_struct *root_task = NULL;
+ long delay;
+ struct cred *c;
+
+ if (!ctx->ve_id)
+ return -EINVAL;
+
+ env = get_ve_by_id(ctx->ve_id);
+ if (!env)
+ return -ESRCH;
+
+ c = prepare_creds();
+ if (c == NULL) {
+ put_ve(env);
+ return -ENOMEM;
+ }
+
+ /* from here cpt_kill succeeds */
+ virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx);
+
+ if (current->ve_task_info.owner_env == env) {
+ wprintk_ctx("attempt to kill ve from inside, escaping...\n");
+ ve_move_task(current, get_ve0(), c);
+ } else
+ abort_creds(c);
+
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ if (ctx->pgin_task) {
+ wait_for_completion(&ctx->pgin_notify);
+ put_task_struct(ctx->pgin_task);
+ ctx->pgin_task = NULL;
+ }
+#endif
+
+ cpt_kill_sockets(ctx);
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+
+ if (tsk->exit_state) {
+ put_task_struct(tsk);
+ continue;
+ }
+
+ if (task_pid_vnr(tsk) == 1) {
+ root_task = tsk;
+ continue;
+ }
+
+ tsk->robust_list = NULL;
+#ifdef CONFIG_COMPAT
+ tsk->compat_robust_list = NULL;
+#endif
+ tsk->clear_child_tid = NULL;
+
+ if (tsk->ptrace) {
+ write_lock_irq(&tasklist_lock);
+ tsk->ptrace = 0;
+ if (!list_empty(&tsk->ptrace_entry)) {
+ list_del_init(&tsk->ptrace_entry);
+ /*
+ * This code used to be here:
+ * remove_parent(tsk);
+ * tsk->parent = tsk->parent;
+ * add_parent(tsk);
+ */
+ }
+ write_unlock_irq(&tasklist_lock);
+ }
+
+ send_sig(SIGKILL, tsk, 1);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ sigfillset(&tsk->blocked);
+ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
+ set_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ if (tsk->flags & PF_FROZEN)
+ tsk->flags &= ~PF_FROZEN;
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ }
+
+ yield();
+
+ if (root_task != NULL) {
+ send_sig(SIGKILL, root_task, 1);
+
+ spin_lock_irq(&root_task->sighand->siglock);
+ sigfillset(&root_task->blocked);
+ sigdelsetmask(&root_task->blocked, sigmask(SIGKILL));
+ set_tsk_thread_flag(root_task, TIF_SIGPENDING);
+ clear_tsk_thread_flag(root_task, TIF_FREEZE);
+ if (root_task->flags & PF_FROZEN)
+ root_task->flags &= ~PF_FROZEN;
+ spin_unlock_irq(&root_task->sighand->siglock);
+
+ wake_up_process(root_task);
+ put_task_struct(root_task);
+ }
+
+ cpt_finish_ubc(ctx);
+ cpt_object_destroy(ctx);
+
+ delay = 1;
+ while (atomic_read(&env->counter) != 1) {
+ if (signal_pending(current))
+ break;
+ current->state = TASK_INTERRUPTIBLE;
+ delay = (delay < HZ) ? (delay << 1) : HZ;
+ schedule_timeout(delay);
+ }
+ put_ve(env);
+
+ return err;
+}
+
+#ifdef CONFIG_BEANCOUNTERS
+static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx)
+{
+ struct task_beancounter *tbc;
+
+ tbc = &(t->task_bc);
+ cpt_add_ubc(tbc->exec_ub, ctx);
+ cpt_add_ubc(tbc->task_ub, ctx);
+ cpt_add_ubc(tbc->fork_sub, ctx);
+}
+#else
+static inline void collect_task_ubc(struct task_struct *t,
+ struct cpt_context *ctx)
+{ return; }
+#endif
+
+static cpt_object_t * remember_task(struct task_struct * child,
+ cpt_object_t * head, cpt_context_t * ctx)
+{
+ cpt_object_t *cobj;
+
+ if (freezable(child) && !(child->flags&PF_FROZEN)) {
+ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child));
+ put_task_struct(child);
+ return NULL;
+ }
+
+ if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG();
+ if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
+ put_task_struct(child);
+ return NULL;
+ }
+ cobj->o_count = 1;
+ cpt_obj_setobj(cobj, child, ctx);
+ insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx);
+ collect_task_ubc(child, ctx);
+ return cobj;
+}
+
+static int vps_collect_tasks(struct cpt_context *ctx)
+{
+ int err = -ESRCH;
+ cpt_object_t *obj;
+ struct task_struct *root;
+ read_lock(&tasklist_lock);
+ root = find_task_by_vpid(1);
+ if (root)
+ get_task_struct(root);
+ read_unlock(&tasklist_lock);
+
+ if (!root) {
+ err = -ESRCH;
+ eprintk_ctx("vps_collect_tasks: cannot find root\n");
+ goto out;
+ }
+
+ if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
+ put_task_struct(root);
+ return -ENOMEM;
+ }
+ obj->o_count = 1;
+ cpt_obj_setobj(obj, root, ctx);
+ intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
+ collect_task_ubc(root, ctx);
+
+ /* Collect process subtree recursively */
+ for_each_object(obj, CPT_OBJ_TASK) {
+ cpt_object_t *head = obj;
+ struct task_struct *tsk = obj->o_obj;
+ struct task_struct *child;
+
+ if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) {
+ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk));
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (tsk->state == TASK_RUNNING)
+ printk("Holy Crap 2 %ld " CPT_FID "\n", tsk->state, CPT_TID(tsk));
+
+ wait_task_inactive(tsk, 0);
+
+ err = check_task_state(tsk, ctx);
+ if (err)
+ goto out;
+
+ if (tsk->pid == tsk->tgid) {
+ child = tsk;
+ for (;;) {
+ read_lock(&tasklist_lock);
+ child = next_thread(child);
+ if (child != tsk)
+ get_task_struct(child);
+ read_unlock(&tasklist_lock);
+
+ if (child == tsk)
+ break;
+
+ if (child->parent != tsk->parent) {
+ put_task_struct(child);
+ eprintk_ctx("illegal thread structure, kernel bug\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if ((head = remember_task(child, head, ctx)) == NULL) {
+ eprintk_ctx("task obj allocation failure\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ /* About locking. The VE is frozen, but the lists of children
+ * may still change, at least for init: when an entered task
+ * reparents to init and when a reparented task exits. As long
+ * as we take care of this case, we can still unlock while
+ * scanning the task lists.
+ */
+ read_lock(&tasklist_lock);
+ list_for_each_entry(child, &tsk->children, sibling) {
+ if (child->parent != tsk)
+ continue;
+ if (child->pid != child->tgid)
+ continue;
+ get_task_struct(child);
+ read_unlock(&tasklist_lock);
+
+ if ((head = remember_task(child, head, ctx)) == NULL) {
+ eprintk_ctx("task obj allocation failure\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ read_lock(&tasklist_lock);
+ }
+
+ list_for_each_entry(child, &tsk->ptraced, ptrace_entry) {
+ if (child->parent != tsk)
+ continue;
+ if (child->pid != child->tgid)
+ continue;
+ get_task_struct(child);
+ read_unlock(&tasklist_lock);
+
+ if ((head = remember_task(child, head, ctx)) == NULL) {
+ eprintk_ctx("task obj allocation failure\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ read_lock(&tasklist_lock);
+ }
+ read_unlock(&tasklist_lock);
+ }
+
+ return 0;
+
+out:
+ while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) {
+ struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next;
+ cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
+ struct task_struct *tsk;
+
+ list_del(head);
+ tsk = obj->o_obj;
+ put_task_struct(tsk);
+ free_cpt_object(obj, ctx);
+ }
+ return err;
+}
+
+static int cpt_collect(struct cpt_context *ctx)
+{
+ int err;
+
+ if ((err = cpt_collect_mm(ctx)) != 0)
+ return err;
+
+ if ((err = cpt_collect_sysv(ctx)) != 0)
+ return err;
+
+ if ((err = cpt_collect_files(ctx)) != 0)
+ return err;
+
+ if ((err = cpt_collect_fs(ctx)) != 0)
+ return err;
+
+ if ((err = cpt_collect_namespace(ctx)) != 0)
+ return err;
+
+ if ((err = cpt_collect_signals(ctx)) != 0)
+ return err;
+
+ if (virtinfo_notifier_call(VITYPE_SCP,
+ VIRTINFO_SCP_COLLECT, ctx) & NOTIFY_FAIL)
+ return -ECHRNG;
+
+ return 0;
+}
+
+static int cpt_dump_veinfo(cpt_context_t *ctx)
+{
+ struct cpt_veinfo_image *i = cpt_get_buf(ctx);
+ struct ve_struct *ve;
+ struct timespec delta;
+ struct ipc_namespace *ns;
+
+ cpt_open_section(ctx, CPT_SECT_VEINFO);
+ cpt_open_object(NULL, ctx);
+
+ memset(i, 0, sizeof(*i));
+
+ i->cpt_next = CPT_NULL;
+ i->cpt_object = CPT_OBJ_VEINFO;
+ i->cpt_hdrlen = sizeof(*i);
+ i->cpt_content = CPT_CONTENT_VOID;
+
+ ve = get_exec_env();
+ ns = ve->ve_ns->ipc_ns;
+
+ if (ns->shm_ctlall > 0xFFFFFFFFU)
+ i->shm_ctl_all = 0xFFFFFFFFU;
+ if (ns->shm_ctlmax > 0xFFFFFFFFU)
+ i->shm_ctl_max = 0xFFFFFFFFU;
+ i->shm_ctl_mni = ns->shm_ctlmni;
+
+ i->msg_ctl_max = ns->msg_ctlmax;
+ i->msg_ctl_mni = ns->msg_ctlmni;
+ i->msg_ctl_mnb = ns->msg_ctlmnb;
+
+ BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr));
+ i->sem_ctl_arr[0] = ns->sem_ctls[0];
+ i->sem_ctl_arr[1] = ns->sem_ctls[1];
+ i->sem_ctl_arr[2] = ns->sem_ctls[2];
+ i->sem_ctl_arr[3] = ns->sem_ctls[3];
+
+ do_posix_clock_monotonic_gettime(&delta);
+ _set_normalized_timespec(&delta,
+ delta.tv_sec - ve->start_timespec.tv_sec,
+ delta.tv_nsec - ve->start_timespec.tv_nsec);
+ i->start_timespec_delta = cpt_timespec_export(&delta);
+ i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies;
+
+ i->last_pid = ve->ve_ns->pid_ns->last_pid;
+ i->rnd_va_space = ve->_randomize_va_space + 1;
+
+ ctx->write(i, sizeof(*i), ctx);
+ cpt_release_buf(ctx);
+ cpt_close_object(ctx);
+ cpt_close_section(ctx);
+ return 0;
+}
+
+static int cpt_dump_utsname(cpt_context_t *ctx)
+{
+ int len;
+ struct cpt_object_hdr o;
+ struct ve_struct *ve;
+ struct uts_namespace *ns;
+
+ cpt_open_section(ctx, CPT_SECT_UTSNAME);
+
+ ve = get_exec_env();
+ ns = ve->ve_ns->uts_ns;
+
+ cpt_open_object(NULL, ctx);
+ len = strlen(ns->name.nodename);
+ o.cpt_next = CPT_NULL;
+ o.cpt_object = CPT_OBJ_NAME;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_NAME;
+
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->write(ns->name.nodename, len+1, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+
+ cpt_open_object(NULL, ctx);
+ len = strlen(ns->name.domainname);
+ o.cpt_next = CPT_NULL;
+ o.cpt_object = CPT_OBJ_NAME;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_NAME;
+
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->write(ns->name.domainname, len+1, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+
+ cpt_close_section(ctx);
+ return 0;
+}
+
+#ifndef CONFIG_IA64
+static int cpt_dump_vsyscall(cpt_context_t *ctx)
+{
+ struct cpt_page_block *pgb = cpt_get_buf(ctx);
+
+ cpt_open_section(ctx, CPT_SECT_VSYSCALL);
+ cpt_open_object(NULL, ctx);
+
+ pgb->cpt_next = CPT_NULL;
+ pgb->cpt_object = CPT_OBJ_VSYSCALL;
+ pgb->cpt_hdrlen = sizeof(*pgb);
+ pgb->cpt_content = CPT_CONTENT_DATA;
+ pgb->cpt_start = cpt_ptr_export(vsyscall_addr);
+ pgb->cpt_end = pgb->cpt_start + PAGE_SIZE;
+
+ ctx->write(pgb, sizeof(*pgb), ctx);
+ cpt_release_buf(ctx);
+
+ ctx->write(vsyscall_addr, PAGE_SIZE, ctx);
+
+ cpt_close_object(ctx);
+ cpt_close_section(ctx);
+ return 0;
+}
+#endif
+
+int cpt_dump(struct cpt_context *ctx)
+{
+ struct ve_struct *oldenv, *env;
+ struct nsproxy *old_ns;
+ int err, err2 = 0;
+
+ if (!ctx->ve_id)
+ return -EINVAL;
+
+ env = get_ve_by_id(ctx->ve_id);
+ if (!env)
+ return -ESRCH;
+
+ down_read(&env->op_sem);
+ err = -ESRCH;
+ if (!env->is_running)
+ goto out_noenv;
+ if (!env->is_locked)
+ goto out_noenv;
+ err = -EINVAL;
+ if (env->ve_ns->pid_ns->flags & PID_NS_HIDDEN) {
+ printk(KERN_WARNING "CT: checkpointing not supported yet"
+ " for hidden pid namespaces.\n");
+ goto out_noenv;
+ }
+
+ oldenv = set_exec_env(env);
+ old_ns = current->nsproxy;
+ current->nsproxy = env->ve_ns;
+
+ /* Phase 2: real checkpointing */
+ err = cpt_open_dumpfile(ctx);
+ if (err)
+ goto out;
+
+ cpt_major_hdr_out(ctx);
+
+ if (!err)
+ err = cpt_dump_veinfo(ctx);
+ if (!err)
+ err = cpt_dump_ubc(ctx);
+ if (!err)
+ err = cpt_dump_files(ctx);
+ if (!err)
+ err = cpt_dump_files_struct(ctx);
+ if (!err)
+ err = cpt_dump_fs_struct(ctx);
+ /* netdevices should be dumped after the open files,
+ as we need to restore the netdevice binding to the /dev/net/tun file */
+ if (!err)
+ err = cpt_dump_ifinfo(ctx);
+ if (!err)
+ err = cpt_dump_namespace(ctx);
+ if (!err)
+ err = cpt_dump_sighand(ctx);
+ if (!err)
+ err = cpt_dump_vm(ctx);
+ if (!err)
+ err = cpt_dump_sysvsem(ctx);
+ if (!err)
+ err = cpt_dump_sysvmsg(ctx);
+ if (!err)
+ err = cpt_dump_tasks(ctx);
+ if (!err)
+ err = cpt_dump_orphaned_sockets(ctx);
+#if defined(CONFIG_VE_IPTABLES) && \
+ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
+ if (!err)
+ err = cpt_dump_ip_conntrack(ctx);
+#endif
+ if (!err) {
+ if (virtinfo_notifier_call(VITYPE_SCP,
+ VIRTINFO_SCP_DUMP, ctx) & NOTIFY_FAIL)
+ err = -ECHRNG;
+ }
+ if (!err)
+ err = cpt_dump_utsname(ctx);
+
+#ifndef CONFIG_IA64
+ if (!err)
+ err = cpt_dump_vsyscall(ctx);
+#endif
+
+ if (!err)
+ err = cpt_dump_tail(ctx);
+
+ err2 = cpt_close_dumpfile(ctx);
+
+out:
+ current->nsproxy = old_ns;
+ set_exec_env(oldenv);
+out_noenv:
+ up_read(&env->op_sem);
+ put_ve(env);
+ return err ? : err2;
+}
+
+int cpt_vps_suspend(struct cpt_context *ctx)
+{
+ struct ve_struct *oldenv, *env;
+ struct nsproxy *old_ns;
+ int err = 0;
+
+ ctx->kernel_config_flags = test_kernel_config();
+ cpt_object_init(ctx);
+
+ if (!ctx->ve_id) {
+ env = get_exec_env();
+ if (env == get_ve0())
+ return -EINVAL;
+ wprintk("undefined ve_id\n");
+ ctx->ve_id = env->veid;
+ get_ve(env);
+ } else {
+ env = get_ve_by_id(ctx->ve_id);
+ if (!env)
+ return -ESRCH;
+ }
+
+#ifdef CONFIG_VE_IPTABLES
+ ctx->iptables_mask = env->_iptables_modules;
+#endif
+ ctx->features = env->features;
+
+ down_write(&env->op_sem);
+ err = -ESRCH;
+ if (!env->is_running)
+ goto out_noenv;
+
+ err = -EBUSY;
+ if (env->is_locked)
+ goto out_noenv;
+ env->is_locked = 1;
+ downgrade_write(&env->op_sem);
+
+ oldenv = set_exec_env(env);
+ old_ns = current->nsproxy;
+ current->nsproxy = env->ve_ns;
+
+ /* Phase 0: find and stop all the tasks */
+ if ((err = vps_stop_tasks(ctx)) != 0)
+ goto out;
+
+ if ((err = cpt_suspend_network(ctx)) != 0)
+ goto out_wake;
+
+ /* At this point all the state is frozen. We do not need to lock
+ * the state, since it can change only while the tasks are running.
+ */
+
+ /* Phase 1: collect task tree */
+ if ((err = vps_collect_tasks(ctx)) != 0)
+ goto out_wake;
+
+ /* Phase 1': collect all the resources */
+ if ((err = cpt_collect(ctx)) != 0)
+ goto out;
+
+out:
+ current->nsproxy = old_ns;
+ set_exec_env(oldenv);
+ up_read(&env->op_sem);
+ put_ve(env);
+ return err;
+
+out_noenv:
+ up_write(&env->op_sem);
+ put_ve(env);
+ return err;
+
+out_wake:
+ read_lock(&tasklist_lock);
+ wake_ve(ctx);
+ read_unlock(&tasklist_lock);
+ goto out;
+}
+
+static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps)
+{
+ struct net *net = get_exec_env()->ve_netns;
+ struct net_device *dev;
+
+ read_lock(&dev_base_lock);
+ for_each_netdev(net, dev) {
+ if (dev->netdev_ops->ndo_cpt == NULL) {
+ eprintk_ctx("unsupported netdevice %s\n", dev->name);
+ *caps |= (1<<CPT_UNSUPPORTED_NETDEV);
+ }
+ }
+ read_unlock(&dev_base_lock);
+}
+
+static void check_one_process(struct cpt_context *ctx, __u32 *caps,
+ unsigned int flags, struct ve_struct *env,
+ struct task_struct *root, struct task_struct *p)
+{
+ struct mnt_namespace *ns;
+
+ if (tsk_used_math(p)) {
+ *caps |= flags & ((1<<CPT_CPU_X86_FXSR) |
+ (1<<CPT_CPU_X86_SSE) |
+ (1<<CPT_CPU_X86_SSE2) |
+ (1<<CPT_CPU_X86_MMX) |
+ (1<<CPT_CPU_X86_3DNOW) |
+ (1<<CPT_CPU_X86_3DNOW2));
+ }
+ /* This is not 100% true. A VE could migrate with a vdso using int80.
+ * In that case we do not need the SEP/SYSCALL32 caps. That is not easy
+ * to test, so we do not. */
+#ifdef CONFIG_X86_64
+ if (!(task_thread_info(p)->flags & _TIF_IA32))
+ *caps |= flags & ((1<<CPT_CPU_X86_EMT64)|(1<<CPT_CPU_X86_SYSCALL));
+ else if (p->mm && p->mm->context.vdso) {
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ *caps |= flags & (1<<CPT_CPU_X86_SEP);
+ else
+ *caps |= flags & (1<<CPT_CPU_X86_SYSCALL32);
+ }
+#elif defined(CONFIG_X86_32)
+ if (p->mm && p->mm->context.vdso)
+ *caps |= flags & (1<<CPT_CPU_X86_SEP);
+#endif
+#ifdef CONFIG_IA64
+ if (!IS_IA32_PROCESS(task_pt_regs(p)))
+ *caps |= (1<<CPT_CPU_X86_IA64);
+#endif
+ if (vps_child_level(root, p) >= 0) {
+ switch (check_process_external(p)) {
+ case PIDTYPE_PID:
+ eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm);
+ *caps |= (1<<CPT_EXTERNAL_PROCESS);
+ break;
+ case PIDTYPE_PGID:
+ eprintk_ctx("external process group %d/%d(%s) inside CT "
+ "(e.g. vzctl enter or vzctl exec).\n",
+ task_pgrp_vnr(p), p->pid, p->comm);
+ *caps |= (1<<CPT_EXTERNAL_PROCESS);
+ break;
+ case PIDTYPE_SID:
+ eprintk_ctx("external process session %d/%d(%s) inside CT "
+ "(e.g. vzctl enter or vzctl exec).\n",
+ task_session_vnr(p), p->pid, p->comm);
+ *caps |= (1<<CPT_EXTERNAL_PROCESS);
+ }
+ } else {
+ eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm);
+ *caps |= (1<<CPT_EXTERNAL_PROCESS);
+ }
+ task_lock(p);
+ ns = NULL;
+ if (p->nsproxy) {
+ ns = p->nsproxy->mnt_ns;
+ if (ns)
+ get_mnt_ns(ns);
+ }
+ task_unlock(p);
+ if (ns) {
+ if (ns != current->nsproxy->mnt_ns) {
+ eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm);
+ *caps |= (1<<CPT_NAMESPACES);
+ }
+ put_mnt_ns(ns);
+ }
+ if (p->policy != SCHED_NORMAL) {
+ eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm);
+ *caps |= (1<<CPT_SCHEDULER_POLICY);
+ }
+#ifdef CONFIG_UTRACE
+ if (check_utrace(p, root, ctx)) {
+ eprintk_ctx("task %d/%d(%s) is ptraced from host system\n", p->pid, virt_pid(p), p->comm);
+ *caps |= (1<<CPT_PTRACED_FROM_VE0);
+ }
+#endif
+ if (cpt_check_unsupported(p, ctx)) {
+ *caps |= (1<<CPT_UNSUPPORTED_MISC);
+ }
+}
+
+static void check_unsupported_mounts(struct cpt_context *ctx, __u32 *caps,
+ struct ve_struct *env, struct mnt_namespace *n, char *path_buf)
+{
+ struct list_head *p;
+ char *path;
+
+ down_read(&namespace_sem);
+ list_for_each(p, &n->list) {
+ struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list);
+ struct path p;
+
+ p.dentry = mnt->mnt_root;
+ p.mnt = mnt;
+ spin_lock(&dcache_lock);
+ path = __d_path(&p, &env->root_path,
+ path_buf, PAGE_SIZE);
+ spin_unlock(&dcache_lock);
+ if (IS_ERR(path))
+ continue;
+
+ if (check_one_vfsmount(mnt)) {
+ eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name);
+ *caps |= (1<<CPT_UNSUPPORTED_FSTYPE);
+ }
+ }
+ up_read(&namespace_sem);
+}
+
+int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps)
+{
+ struct task_struct *p;
+ struct task_struct *root;
+ struct ve_struct *env;
+ struct ve_struct *old_env;
+ struct nsproxy *old_ns;
+ struct mnt_namespace *n;
+ int err;
+ unsigned int flags = test_cpu_caps_and_features();
+
+ if (!ctx->ve_id)
+ return -EINVAL;
+
+ env = get_ve_by_id(ctx->ve_id);
+ if (env == NULL)
+ return -ESRCH;
+
+ down_read(&env->op_sem);
+ err = -ESRCH;
+ if (!env->is_running) {
+ eprintk_ctx("CT is not running\n");
+ goto out_noenv;
+ }
+
+ err = -EBUSY;
+ if (env->is_locked) {
+ eprintk_ctx("CT is locked\n");
+ goto out_noenv;
+ }
+
+ *caps = flags & (1<<CPT_CPU_X86_CMOV);
+
+ if (flags & (1 << CPT_SLM_DMPRST)) {
+ eprintk_ctx("SLM is enabled, but slm_dmprst module is not loaded\n");
+ *caps |= (1 << CPT_SLM_DMPRST);
+ }
+
+ old_env = set_exec_env(env);
+ old_ns = current->nsproxy;
+ current->nsproxy = env->ve_ns;
+
+ check_unsupported_netdevices(ctx, caps);
+
+ read_lock(&tasklist_lock);
+ root = find_task_by_vpid(1);
+ if (!root) {
+ read_unlock(&tasklist_lock);
+ eprintk_ctx("cannot find ve init\n");
+ err = -ESRCH;
+ goto out;
+ }
+ get_task_struct(root);
+ for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p))
+ check_one_process(ctx, caps, flags, env, root, p);
+ read_unlock(&tasklist_lock);
+
+ task_lock(root);
+ n = NULL;
+ if (root->nsproxy) {
+ n = root->nsproxy->mnt_ns;
+ if (n)
+ get_mnt_ns(n);
+ }
+ task_unlock(root);
+ if (n) {
+ char *path_buf;
+
+ path_buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!path_buf) {
+ put_mnt_ns(n);
+ err = -ENOMEM;
+ goto out_root;
+ }
+
+ check_unsupported_mounts(ctx, caps, env, n, path_buf);
+
+ free_page((unsigned long) path_buf);
+ put_mnt_ns(n);
+ }
+
+ err = 0;
+
+out_root:
+ put_task_struct(root);
+out:
+ current->nsproxy = old_ns;
+ set_exec_env(old_env);
+out_noenv:
+ up_read(&env->op_sem);
+ put_ve(env);
+
+ return err;
+}
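
A minimal userspace sketch (editorial illustration only, not part of the patch) of the
retry schedule used by vps_stop_tasks() above: the minor timeout starts at HZ/5 and is
doubled on every retry, capped by the major SUSPEND_TIMEOUT of ten seconds. The tick
constant below is a hypothetical stand-in for HZ:

#include <stdio.h>

#define TICK_HZ		1000UL
#define SUSPEND_TIMEOUT	(10UL * TICK_HZ)

int main(void)
{
	unsigned long timeout = TICK_HZ / 5;	/* first minor timeout */
	unsigned long elapsed = 0;

	/* every retry doubles the minor timeout, capped at the major one */
	while (elapsed < SUSPEND_TIMEOUT) {
		printf("retry after %lu ticks (total %lu)\n", timeout, elapsed);
		elapsed += timeout;
		timeout <<= 1;
		if (timeout > SUSPEND_TIMEOUT)
			timeout = SUSPEND_TIMEOUT;
	}
	return 0;
}
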
diff --git a/kernel/cpt/cpt_dump.h b/kernel/cpt/cpt_dump.h
new file mode 100644
index 0000000..71f6d94
--- /dev/null
+++ b/kernel/cpt/cpt_dump.h
@@ -0,0 +1,16 @@
+int cpt_dump(struct cpt_context *cpt);
+int rst_undump(struct cpt_context *cpt);
+int cpt_suspend(struct cpt_context *cpt);
+int cpt_resume(struct cpt_context *cpt);
+int cpt_kill(struct cpt_context *cpt);
+int rst_clean(struct cpt_context *cpt);
+int rst_resume(struct cpt_context *cpt);
+int rst_kill(struct cpt_context *cpt);
+
+int cpt_freeze_one(pid_t pid, int freeze);
+int cpt_vps_suspend(struct cpt_context *ctx);
+int vps_rst_undump(struct cpt_context *ctx);
+
+int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps);
+
+int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx);
diff --git a/kernel/cpt/cpt_epoll.c b/kernel/cpt/cpt_epoll.c
new file mode 100644
index 0000000..81d2b98
--- /dev/null
+++ b/kernel/cpt/cpt_epoll.c
@@ -0,0 +1,113 @@
+/*
+ *
+ * kernel/cpt/cpt_epoll.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/eventpoll.h>
+#include <linux/cpt_image.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx)
+{
+ int err = 0;
+ struct file *file = obj->o_obj;
+ struct eventpoll *ep;
+ struct rb_node *rbp;
+ struct cpt_epoll_image ei;
+
+ if (file->f_op != &eventpoll_fops) {
+ eprintk_ctx("bad epoll file\n");
+ return -EINVAL;
+ }
+
+ ep = file->private_data;
+
+ /* eventpoll.c does not protect against opens via /proc/N/fd, silly.
+ * The opener will get an invalid file with uninitialized private_data.
+ */
+ if (unlikely(ep == NULL)) {
+ eprintk_ctx("bad epoll device\n");
+ return -EINVAL;
+ }
+
+ cpt_open_object(NULL, ctx);
+
+ ei.cpt_next = CPT_NULL;
+ ei.cpt_object = CPT_OBJ_EPOLL;
+ ei.cpt_hdrlen = sizeof(ei);
+ ei.cpt_content = CPT_CONTENT_ARRAY;
+ ei.cpt_file = obj->o_pos;
+
+ ctx->write(&ei, sizeof(ei), ctx);
+
+ mutex_lock(&epmutex);
+ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+ loff_t saved_obj;
+ cpt_object_t *tobj;
+ struct cpt_epoll_file_image efi;
+ struct epitem *epi;
+ epi = rb_entry(rbp, struct epitem, rbn);
+ tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx);
+ if (tobj == NULL) {
+ eprintk_ctx("epoll device refers to an external file\n");
+ err = -EBUSY;
+ break;
+ }
+ cpt_push_object(&saved_obj, ctx);
+ cpt_open_object(NULL, ctx);
+
+ efi.cpt_next = CPT_NULL;
+ efi.cpt_object = CPT_OBJ_EPOLL_FILE;
+ efi.cpt_hdrlen = sizeof(efi);
+ efi.cpt_content = CPT_CONTENT_VOID;
+ efi.cpt_file = tobj->o_pos;
+ efi.cpt_fd = epi->ffd.fd;
+ efi.cpt_events = epi->event.events;
+ efi.cpt_data = epi->event.data;
+ efi.cpt_revents = 0;
+ efi.cpt_ready = 0;
+ if (!list_empty(&epi->rdllink))
+ efi.cpt_ready = 1;
+
+ ctx->write(&efi, sizeof(efi), ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+ mutex_unlock(&epmutex);
+
+ cpt_close_object(ctx);
+
+ return err;
+}
+
diff --git a/kernel/cpt/cpt_exports.c b/kernel/cpt/cpt_exports.c
new file mode 100644
index 0000000..f492331
--- /dev/null
+++ b/kernel/cpt/cpt_exports.c
@@ -0,0 +1,13 @@
+#include <linux/module.h>
+#include <asm/signal.h>
+
+#include "cpt_obj.h"
+
+EXPORT_SYMBOL(alloc_cpt_object);
+EXPORT_SYMBOL(intern_cpt_object);
+EXPORT_SYMBOL(insert_cpt_object);
+EXPORT_SYMBOL(__cpt_object_add);
+EXPORT_SYMBOL(cpt_object_add);
+EXPORT_SYMBOL(cpt_object_get);
+EXPORT_SYMBOL(lookup_cpt_object);
+EXPORT_SYMBOL(lookup_cpt_obj_bypos);
diff --git a/kernel/cpt/cpt_files.c b/kernel/cpt/cpt_files.c
new file mode 100644
index 0000000..927a4e3
--- /dev/null
+++ b/kernel/cpt/cpt_files.c
@@ -0,0 +1,1782 @@
+/*
+ *
+ * kernel/cpt/cpt_files.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <linux/pagemap.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/ve_proto.h>
+#include <bc/kmem.h>
+#include <linux/cpt_image.h>
+#include <linux/if_tun.h>
+#include <linux/fdtable.h>
+#include <linux/shm.h>
+#include <linux/signalfd.h>
+#include <linux/nsproxy.h>
+#include <linux/fs_struct.h>
+#include <linux/miscdevice.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+static inline int is_signalfd_file(struct file *file)
+{
+ /* no other users of it yet */
+ return file->f_op == &signalfd_fops;
+}
+
+void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt)
+{
+ char *path;
+ struct path p;
+ unsigned long pg = __get_free_page(GFP_KERNEL);
+
+ if (!pg)
+ return;
+
+ p.dentry = d;
+ p.mnt = mnt;
+ path = d_path(&p, (char *)pg, PAGE_SIZE);
+
+ if (!IS_ERR(path))
+ eprintk("<%s>", path);
+ free_page(pg);
+}
+
+int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt,
+ int verify, cpt_context_t *ctx)
+{
+ if (d->d_inode->i_sb->s_magic == FSMAGIC_PROC &&
+ proc_dentry_of_dead_task(d))
+ return 0;
+
+ if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) {
+ struct nameidata nd;
+ if (path_lookup(path, 0, &nd)) {
+ eprintk_ctx("d_path cannot be looked up %s\n", path);
+ return -EINVAL;
+ }
+ if (nd.path.dentry != d || (verify && nd.path.mnt != mnt)) {
+ if (!strcmp(path, "/dev/null")) {
+				/*
+				 * epic kludge to work around the case when
+				 * init opens /dev/null and then udevd
+				 * overmounts /dev with tmpfs
+				 */
+ path_put(&nd.path);
+ return 0;
+ }
+
+ eprintk_ctx("d_path is invisible %s\n", path);
+ path_put(&nd.path);
+ return -EINVAL;
+ }
+ path_put(&nd.path);
+ }
+ return 0;
+}
+
+static int
+cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx)
+{
+ int result = 0;
+
+#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE)
+ char *path;
+ unsigned long pg;
+ struct dentry * renamed_dentry;
+ struct path p;
+
+ if (de->d_sb->s_magic != FSMAGIC_VEFS)
+ return 0;
+ if (de->d_inode->i_nlink != 0 ||
+ atomic_read(&de->d_inode->i_writecount) > 0)
+ return 0;
+
+ renamed_dentry = vefs_replaced_dentry(de);
+ if (renamed_dentry == NULL)
+ return 0;
+
+ pg = __get_free_page(GFP_KERNEL);
+ if (!pg)
+ return 0;
+
+ p.dentry = de;
+ p.mnt = mnt;
+ path = d_path(&p, (char *)pg, PAGE_SIZE);
+ if (!IS_ERR(path)) {
+ int len;
+ struct nameidata nd;
+
+ len = pg + PAGE_SIZE - 1 - (unsigned long)path;
+ if (len >= sizeof("(deleted) ") - 1 &&
+ !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) {
+ len -= sizeof("(deleted) ") - 1;
+ path += sizeof("(deleted) ") - 1;
+ }
+
+ if (path_lookup(path, 0, &nd) == 0) {
+ if (mnt == nd.path.mnt &&
+ vefs_is_renamed_dentry(nd.path.dentry, renamed_dentry))
+ result = 1;
+ path_put(&nd.path);
+ }
+ }
+ free_page(pg);
+#endif
+ return result;
+}
+
+static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt,
+ int replaced, int verify, cpt_context_t *ctx)
+{
+ int len;
+ char *path;
+ struct path p;
+ char *pg = cpt_get_buf(ctx);
+ loff_t saved;
+
+ p.dentry = d;
+ p.mnt = mnt;
+ path = d_path(&p, pg, PAGE_SIZE);
+ len = PTR_ERR(path);
+
+ if (IS_ERR(path)) {
+ struct cpt_object_hdr o;
+ char tmp[1];
+
+		/* VZ changes d_path() to return -EINVAL when the path
+		 * is not supposed to be visible inside a VE.
+		 * This differs from the mainstream d_path() behaviour,
+		 * e.g. d_path() fails on any kind of shared memory.
+		 * There may be other cases, but this is the only one
+		 * known so far.  So we just ignore the error on shmem
+		 * mounts and proceed.  Otherwise checkpointing would be
+		 * prohibited because of a reference to an invisible file.
+		 */
+ if (len != -EINVAL ||
+ mnt != get_exec_env()->shmem_mnt)
+ eprintk_ctx("d_path err=%d\n", len);
+ else
+ len = 0;
+
+ cpt_push_object(&saved, ctx);
+ cpt_open_object(NULL, ctx);
+ o.cpt_next = CPT_NULL;
+ o.cpt_object = CPT_OBJ_NAME;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_NAME;
+ tmp[0] = 0;
+
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->write(tmp, 1, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved, ctx);
+
+ __cpt_release_buf(ctx);
+ return len;
+ } else {
+ struct cpt_object_hdr o;
+
+ len = pg + PAGE_SIZE - 1 - path;
+ if (replaced &&
+ len >= sizeof("(deleted) ") - 1 &&
+ !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) {
+ len -= sizeof("(deleted) ") - 1;
+ path += sizeof("(deleted) ") - 1;
+ }
+ o.cpt_next = CPT_NULL;
+ o.cpt_object = CPT_OBJ_NAME;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_NAME;
+ path[len] = 0;
+
+ if (cpt_verify_overmount(path, d, mnt, verify, ctx)) {
+ __cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+
+ cpt_push_object(&saved, ctx);
+ cpt_open_object(NULL, ctx);
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->write(path, len+1, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved, ctx);
+ __cpt_release_buf(ctx);
+ }
+ return 0;
+}
+
+int cpt_dump_string(const char *s, struct cpt_context *ctx)
+{
+ int len;
+ struct cpt_object_hdr o;
+
+ cpt_open_object(NULL, ctx);
+ len = strlen(s);
+ o.cpt_next = CPT_NULL;
+ o.cpt_object = CPT_OBJ_NAME;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_NAME;
+
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->write(s, len+1, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ return 0;
+}
+
+static int
+cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx)
+{
+ return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, 1, ctx);
+}
+
+int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_inode_image *v = cpt_get_buf(ctx);
+ struct kstat sbuf;
+
+ v->cpt_next = sizeof(*v);
+ v->cpt_object = CPT_OBJ_INODE;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+
+ v->cpt_dev = d->d_inode->i_sb->s_dev;
+ v->cpt_ino = d->d_inode->i_ino;
+ v->cpt_mode = sbuf.mode;
+ v->cpt_nlink = sbuf.nlink;
+ v->cpt_uid = sbuf.uid;
+ v->cpt_gid = sbuf.gid;
+ v->cpt_rdev = d->d_inode->i_rdev;
+ v->cpt_size = sbuf.size;
+ v->cpt_atime = cpt_timespec_export(&sbuf.atime);
+ v->cpt_mtime = cpt_timespec_export(&sbuf.mtime);
+ v->cpt_ctime = cpt_timespec_export(&sbuf.ctime);
+ v->cpt_blksize = sbuf.blksize;
+ v->cpt_blocks = sbuf.blocks;
+ v->cpt_sb = d->d_inode->i_sb->s_magic;
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+ return 0;
+}
+
+int cpt_collect_files(cpt_context_t * ctx)
+{
+ int err;
+ cpt_object_t *obj;
+ int index = 0;
+
+ /* Collect process fd sets */
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL)
+ return -ENOMEM;
+ }
+
+ /* Collect files from fd sets */
+ for_each_object(obj, CPT_OBJ_FILES) {
+ int fd;
+ struct files_struct *f = obj->o_obj;
+
+ cpt_obj_setindex(obj, index++, ctx);
+
+ if (obj->o_count != atomic_read(&f->count)) {
+ eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count));
+ return -EBUSY;
+ }
+
+ for (fd = 0; fd < f->fdt->max_fds; fd++) {
+ struct file *file = fcheck_files(f, fd);
+ if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL)
+ return -ENOMEM;
+ }
+ }
+
+ /* Collect files queued by AF_UNIX sockets. */
+ if ((err = cpt_collect_passedfds(ctx)) < 0)
+ return err;
+
+ /* OK. At this point we should count all the references. */
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+ struct file *parent;
+ cpt_object_t *ino_obj;
+
+ if (obj->o_count != atomic_long_read(&file->f_count)) {
+ eprintk_ctx("file struct is referenced outside %d %ld\n", obj->o_count, atomic_long_read(&file->f_count));
+ cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
+ return -EBUSY;
+ }
+
+ switch (file->f_dentry->d_inode->i_sb->s_magic) {
+ case FSMAGIC_FUTEX:
+ case FSMAGIC_MQUEUE:
+ case FSMAGIC_BDEV:
+#ifndef CONFIG_INOTIFY_USER
+ case FSMAGIC_INOTIFY:
+#endif
+ eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic);
+ return -EBUSY;
+ }
+
+ /* Collect inode. It is necessary mostly to resolve deleted
+ * hard links. */
+ ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
+ if (ino_obj == NULL)
+ return -ENOMEM;
+
+ parent = ino_obj->o_parent;
+ if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry)))
+ ino_obj->o_parent = file;
+
+ if (S_ISCHR(file->f_dentry->d_inode->i_mode)) {
+ int maj = imajor(file->f_dentry->d_inode);
+ if (maj == PTY_MASTER_MAJOR ||
+ (maj >= UNIX98_PTY_MASTER_MAJOR &&
+ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
+ maj == PTY_SLAVE_MAJOR ||
+ maj == UNIX98_PTY_SLAVE_MAJOR ||
+ maj == TTYAUX_MAJOR) {
+ err = cpt_collect_tty(file, ctx);
+ if (err)
+ return err;
+ }
+ }
+
+ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
+ err = cpt_collect_socket(file, ctx);
+ if (err)
+ return err;
+ }
+ }
+
+ err = cpt_index_sockets(ctx);
+
+ return err;
+}
+
+/* /dev/ptmx is special: all its files share one inode, but the real tty
+ * backend is attached via file->private_data.
+ */
+
+static inline int is_cloning_inode(struct inode *ino)
+{
+ return S_ISCHR(ino->i_mode) &&
+ ino->i_rdev == MKDEV(TTYAUX_MAJOR,2);
+}
+
+static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx)
+{
+ pid_t pid;
+ struct cpt_flock_image *v = cpt_get_buf(ctx);
+
+ v->cpt_next = sizeof(*v);
+ v->cpt_object = CPT_OBJ_FLOCK;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_VOID;
+
+ v->cpt_owner = owner;
+
+ pid = fl->fl_pid;
+ if (pid) {
+ pid = pid_to_vpid(fl->fl_pid);
+ if (pid == -1) {
+ if (!(fl->fl_flags&FL_FLOCK)) {
+ eprintk_ctx("posix lock from another container?\n");
+ cpt_release_buf(ctx);
+ return -EBUSY;
+ }
+ pid = 0;
+ }
+ }
+
+ v->cpt_pid = pid;
+ v->cpt_start = fl->fl_start;
+ v->cpt_end = fl->fl_end;
+ v->cpt_flags = fl->fl_flags;
+ v->cpt_type = fl->fl_type;
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+ return 0;
+}
+
+
+int cpt_dump_flock(struct file *file, struct cpt_context *ctx)
+{
+ int err = 0;
+ struct file_lock *fl;
+
+ lock_kernel();
+ for (fl = file->f_dentry->d_inode->i_flock;
+ fl; fl = fl->fl_next) {
+ if (file != fl->fl_file)
+ continue;
+ if (fl->fl_flags & FL_LEASE) {
+ eprintk_ctx("lease lock is not supported\n");
+ err = -EINVAL;
+ break;
+ }
+ if (fl->fl_flags & FL_POSIX) {
+ cpt_object_t *obj;
+ obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx);
+ if (obj) {
+ dump_one_flock(fl, obj->o_index, ctx);
+ continue;
+ } else {
+ eprintk_ctx("unknown lock owner %p\n", fl->fl_owner);
+ err = -EINVAL;
+ }
+ }
+ if (fl->fl_flags & FL_FLOCK) {
+ dump_one_flock(fl, -1, ctx);
+ continue;
+ }
+ }
+ unlock_kernel();
+ return err;
+}
+
+static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx)
+{
+ int err = 0;
+ cpt_object_t *iobj;
+ struct cpt_file_image *v = cpt_get_buf(ctx);
+ struct kstat sbuf;
+ int replaced = 0;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_FILE;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_flags = file->f_flags;
+ v->cpt_mode = file->f_mode;
+ v->cpt_pos = file->f_pos;
+ v->cpt_uid = file->f_cred->uid;
+ v->cpt_gid = file->f_cred->gid;
+
+ vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf);
+
+ v->cpt_i_mode = sbuf.mode;
+ v->cpt_lflags = 0;
+
+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) {
+ v->cpt_lflags |= CPT_DENTRY_PROC;
+ if (proc_dentry_of_dead_task(file->f_dentry))
+ v->cpt_lflags |= CPT_DENTRY_PROCPID_DEAD;
+ }
+
+ if (IS_ROOT(file->f_dentry))
+ v->cpt_lflags |= CPT_DENTRY_ROOT;
+ else if (d_unhashed(file->f_dentry)) {
+ if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) {
+ v->cpt_lflags |= CPT_DENTRY_REPLACED;
+ replaced = 1;
+ } else if (!(v->cpt_lflags & CPT_DENTRY_PROCPID_DEAD))
+ v->cpt_lflags |= CPT_DENTRY_DELETED;
+ }
+ if (is_cloning_inode(file->f_dentry->d_inode))
+ v->cpt_lflags |= CPT_DENTRY_CLONING;
+
+ v->cpt_inode = CPT_NULL;
+ if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) {
+ iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
+ if (iobj) {
+ v->cpt_inode = iobj->o_pos;
+ if (iobj->o_flags & CPT_INODE_HARDLINKED)
+ v->cpt_lflags |= CPT_DENTRY_HARDLINKED;
+ }
+ }
+ v->cpt_priv = CPT_NULL;
+ v->cpt_fown_fd = -1;
+ if (S_ISCHR(v->cpt_i_mode)) {
+ iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx);
+ if (iobj) {
+ v->cpt_priv = iobj->o_pos;
+ if (file->f_flags&FASYNC)
+ v->cpt_fown_fd = cpt_tty_fasync(file, ctx);
+ }
+ if (imajor(file->f_dentry->d_inode) == MISC_MAJOR &&
+ iminor(file->f_dentry->d_inode) == TUN_MINOR)
+ v->cpt_lflags |= CPT_DENTRY_TUNTAP;
+ }
+ if (S_ISSOCK(v->cpt_i_mode)) {
+ if (obj->o_index < 0) {
+ eprintk_ctx("BUG: no socket index\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ v->cpt_priv = obj->o_index;
+ if (file->f_flags&FASYNC)
+ v->cpt_fown_fd = cpt_socket_fasync(file, ctx);
+ }
+ if (file->f_op == &eventpoll_fops) {
+ v->cpt_priv = file->f_dentry->d_inode->i_ino;
+ v->cpt_lflags |= CPT_DENTRY_EPOLL;
+ }
+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) {
+ v->cpt_priv = file->f_dentry->d_inode->i_ino;
+ v->cpt_lflags |= CPT_DENTRY_INOTIFY;
+ }
+
+ v->cpt_fown_pid = (file->f_owner.pid == NULL ?
+ CPT_FOWN_STRAY_PID : pid_vnr(file->f_owner.pid));
+ v->cpt_fown_uid = file->f_owner.uid;
+ v->cpt_fown_euid = file->f_owner.euid;
+ v->cpt_fown_signo = file->f_owner.signum;
+
+ if (is_signalfd_file(file)) {
+ struct signalfd_ctx *ctx = file->private_data;
+ v->cpt_lflags |= CPT_DENTRY_SIGNALFD;
+ v->cpt_priv = cpt_sigset_export(&ctx->sigmask);
+ }
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ if (!S_ISSOCK(v->cpt_i_mode)) {
+ err = cpt_dump_filename(file, replaced, ctx);
+ if (err)
+ return err;
+ if ((file->f_mode & FMODE_WRITE) &&
+ file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS)
+ vefs_track_notify(file->f_dentry, 1);
+ }
+
+ if (file->f_dentry->d_inode->i_flock)
+ err = cpt_dump_flock(file, ctx);
+
+ cpt_close_object(ctx);
+
+ return err;
+}
+
+/* About this weird function... Crappy code dealing with SYSV shared memory
+ * defines a TMPFS inode and a file whose f_op does only mmap. So...
+ * maybe this is wrong and leaks something. Clearly, access to SYSV shmem
+ * via mmap is quite unusual and impossible from user space.
+ */
+static int dump_content_shm(struct file *file, struct cpt_context *ctx)
+{
+ struct cpt_obj_bits *v;
+ loff_t saved_pos;
+ unsigned long addr;
+
+ addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size,
+ PROT_READ, MAP_SHARED, 0);
+ if (IS_ERR((void*)addr))
+ return PTR_ERR((void*)addr);
+
+ cpt_push_object(&saved_pos, ctx);
+ cpt_open_object(NULL, ctx);
+ v = cpt_get_buf(ctx);
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_BITS;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_DATA;
+ v->cpt_size = file->f_dentry->d_inode->i_size;
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+ ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx);
+ ctx->align(ctx);
+ do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size);
+
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_pos, ctx);
+ return 0;
+}
+
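+/* Return 1 if the buffer contains only zero bytes, so that all-zero pages
+ * can be skipped when dumping regular file content.
+ */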
+static int data_is_zero(char *addr, int len)
+{
+ int i;
+ unsigned long zerolong = 0;
+
+ for (i=0; i<len/sizeof(unsigned long); i++) {
+ if (((unsigned long*)(addr))[i] != 0)
+ return 0;
+ }
+ i = len % sizeof(unsigned long);
+ if (!i)
+ return 1;
+ return memcmp(addr + len - i, &zerolong, i) == 0;
+}
+
+
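+/* Dump the contents of a regular file page by page.  Only non-zero pages
+ * are written, as CPT_OBJ_PAGES blocks; SYSV/tmpfs shared memory is handled
+ * specially, and files without read permission (or opened with O_DIRECT)
+ * are reopened read-only first.
+ */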
+static int dump_content_regular(struct file *file, struct cpt_context *ctx)
+{
+ loff_t saved_pos;
+ loff_t pos = 0;
+ loff_t obj_opened = CPT_NULL;
+ struct cpt_page_block pgb;
+ ssize_t (*do_read)(struct file *, char __user *, size_t, loff_t *);
+
+ if (file->f_op == NULL)
+ return -EINVAL;
+
+ do_read = file->f_op->read;
+
+ if (file->f_op == &shm_file_operations ||
+ file->f_op == &shmem_file_operations) {
+
+ /* shmget uses shm ops */
+ if (file->f_op == &shm_file_operations) {
+ struct shm_file_data *sfd = file->private_data;
+ file = sfd->file;
+ }
+
+ cpt_dump_content_sysvshm(file, ctx);
+
+ do_read = file->f_dentry->d_inode->i_fop->read;
+ if (!do_read) {
+ wprintk_ctx("TMPFS is not configured?\n");
+ return dump_content_shm(file, ctx);
+ }
+ }
+
+ if (!(file->f_mode & FMODE_READ) ||
+ (file->f_flags & O_DIRECT)) {
+ struct file *filp;
+ filp = dentry_open(dget(file->f_dentry),
+ mntget(file->f_vfsmnt),
+ O_RDONLY | O_LARGEFILE,
+ NULL /* not checked */);
+ if (IS_ERR(filp)) {
+ cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
+ eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(filp));
+ return PTR_ERR(filp);
+ }
+ file = filp;
+ } else {
+ atomic_long_inc(&file->f_count);
+ }
+
+ for (;;) {
+ mm_segment_t oldfs;
+ int err;
+
+ (void)cpt_get_buf(ctx);
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos);
+ set_fs(oldfs);
+ if (err < 0) {
+ eprintk_ctx("dump_content_regular: do_read: %d", err);
+ fput(file);
+ __cpt_release_buf(ctx);
+ return err;
+ }
+ if (err == 0) {
+ __cpt_release_buf(ctx);
+ break;
+ }
+ if (data_is_zero(ctx->tmpbuf, err)) {
+ if (obj_opened != CPT_NULL) {
+ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_pos, ctx);
+ obj_opened = CPT_NULL;
+ }
+ } else {
+ if (obj_opened == CPT_NULL) {
+ cpt_push_object(&saved_pos, ctx);
+ cpt_open_object(NULL, ctx);
+ obj_opened = ctx->file->f_pos;
+ pgb.cpt_next = CPT_NULL;
+ pgb.cpt_object = CPT_OBJ_PAGES;
+ pgb.cpt_hdrlen = sizeof(pgb);
+ pgb.cpt_content = CPT_CONTENT_DATA;
+ pgb.cpt_start = pos - err;
+ pgb.cpt_end = pgb.cpt_start;
+ ctx->write(&pgb, sizeof(pgb), ctx);
+ }
+ ctx->write(ctx->tmpbuf, err, ctx);
+ pgb.cpt_end += err;
+ }
+ __cpt_release_buf(ctx);
+ }
+
+ fput(file);
+
+ if (obj_opened != CPT_NULL) {
+ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_pos, ctx);
+ obj_opened = CPT_NULL;
+ }
+ return 0;
+}
+
+
+static int dump_content_chrdev(struct file *file, struct cpt_context *ctx)
+{
+ struct inode *ino = file->f_dentry->d_inode;
+ int maj;
+
+ maj = imajor(ino);
+ if (maj == MEM_MAJOR) {
+ /* Well, OK. */
+ return 0;
+ }
+ if (maj == PTY_MASTER_MAJOR ||
+ (maj >= UNIX98_PTY_MASTER_MAJOR &&
+ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
+ maj == PTY_SLAVE_MAJOR ||
+ maj == UNIX98_PTY_SLAVE_MAJOR ||
+ maj == TTYAUX_MAJOR) {
+ return cpt_dump_content_tty(file, ctx);
+ }
+ if (maj == MISC_MAJOR && iminor(ino) == TUN_MINOR)
+ return 0;
+
+ eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino));
+ return -EINVAL;
+}
+
+static int dump_content_blkdev(struct file *file, struct cpt_context *ctx)
+{
+ struct inode *ino = file->f_dentry->d_inode;
+
+ /* We are not going to transfer them. */
+ eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino));
+ return -EINVAL;
+}
+
+static int dump_content_fifo(struct file *file, struct cpt_context *ctx)
+{
+ struct inode *ino = file->f_dentry->d_inode;
+ cpt_object_t *obj;
+ loff_t saved_pos;
+ int readers;
+ int writers;
+ int anon = 0;
+
+ mutex_lock(&ino->i_mutex);
+ readers = ino->i_pipe->readers;
+ writers = ino->i_pipe->writers;
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file1 = obj->o_obj;
+ if (file1->f_dentry->d_inode == ino) {
+ if (file1->f_mode & FMODE_READ)
+ readers--;
+ if (file1->f_mode & FMODE_WRITE)
+ writers--;
+ }
+ }
+ mutex_unlock(&ino->i_mutex);
+ if (readers || writers) {
+ struct dentry *dr = file->f_dentry->d_sb->s_root;
+ if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0)
+ anon = 1;
+
+ if (anon) {
+ eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers);
+ return -EBUSY;
+ }
+		/* If the fifo has external readers/writers, we are in trouble.
+		 * If the buffer is not empty, we must move its content.
+		 * But if the fifo is owned by a service, we cannot do
+		 * this.
+		 *
+		 * For now we assume that if the fifo is opened by another
+		 * process, we do not own it and hence migrate without
+		 * data.
+		 */
+ return 0;
+ }
+
+ /* OK, we must save fifo state. No semaphores required. */
+
+ if (ino->i_pipe->nrbufs) {
+ struct cpt_obj_bits *v = cpt_get_buf(ctx);
+ struct pipe_inode_info *info;
+ int count, buf, nrbufs;
+
+ mutex_lock(&ino->i_mutex);
+ info = ino->i_pipe;
+ count = 0;
+ buf = info->curbuf;
+ nrbufs = info->nrbufs;
+ while (--nrbufs >= 0) {
+ if (!info->bufs[buf].ops->can_merge) {
+ mutex_unlock(&ino->i_mutex);
+ eprintk_ctx("unknown format of pipe buffer\n");
+ return -EINVAL;
+ }
+ count += info->bufs[buf].len;
+ buf = (buf+1) & (PIPE_BUFFERS-1);
+ }
+
+ if (!count) {
+ mutex_unlock(&ino->i_mutex);
+ return 0;
+ }
+
+ cpt_push_object(&saved_pos, ctx);
+ cpt_open_object(NULL, ctx);
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_BITS;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_DATA;
+ v->cpt_size = count;
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ count = 0;
+ buf = info->curbuf;
+ nrbufs = info->nrbufs;
+ while (--nrbufs >= 0) {
+ struct pipe_buffer *b = info->bufs + buf;
+ /* need to ->pin first? */
+ void * addr = b->ops->map(info, b, 0);
+ ctx->write(addr + b->offset, b->len, ctx);
+ b->ops->unmap(info, b, addr);
+ buf = (buf+1) & (PIPE_BUFFERS-1);
+ }
+
+ mutex_unlock(&ino->i_mutex);
+
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_pos, ctx);
+ }
+
+ return 0;
+}
+
+static int dump_content_socket(struct file *file, struct cpt_context *ctx)
+{
+ return 0;
+}
+
+struct cpt_dirent {
+ unsigned long ino;
+ char *name;
+ int namelen;
+ int found;
+};
+
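+/* readdir callback: remember the name of the directory entry whose inode
+ * number matches the one we are looking for.
+ */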
+static int cpt_filldir(void * __buf, const char * name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
+{
+ struct cpt_dirent * dirent = __buf;
+
+ if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) {
+ memcpy(dirent->name, name, namelen);
+ dirent->name[namelen] = '\0';
+ dirent->namelen = namelen;
+ dirent->found = 1;
+ return 1;
+ }
+ return 0;
+}
+
+static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt,
+ struct inode *ino, struct cpt_context *ctx)
+{
+ int err = -EBUSY;
+ struct file *f = NULL;
+ struct cpt_dirent entry;
+ struct dentry *de, *found = NULL;
+
+ dprintk_ctx("deleted reference to existing inode, try to find file\n");
+	/* 1. Try to find a non-deleted dentry in the ino->i_dentry list */
+ spin_lock(&dcache_lock);
+ list_for_each_entry(de, &ino->i_dentry, d_alias) {
+ if (!IS_ROOT(de) && d_unhashed(de))
+ continue;
+ found = de;
+ dget_locked(found);
+ break;
+ }
+ spin_unlock(&dcache_lock);
+ if (found) {
+ err = cpt_dump_dentry(found, mnt, 0, 1, ctx);
+ dput(found);
+ if (!err) {
+ dprintk_ctx("dentry found in aliases\n");
+ return 0;
+ }
+ }
+
+ /* 2. Try to find file in current dir */
+ de = dget_parent(d);
+ if (!de)
+ return -EINVAL;
+
+ mntget(mnt);
+ f = dentry_open(de, mnt, O_RDONLY | O_LARGEFILE, NULL);
+ if (IS_ERR(f))
+ return PTR_ERR(f);
+
+ entry.ino = ino->i_ino;
+ entry.name = cpt_get_buf(ctx);
+ entry.found = 0;
+ err = vfs_readdir(f, cpt_filldir, &entry);
+ if (err || !entry.found) {
+ err = err ? err : -ENOENT;
+ goto err_readdir;
+ }
+
+ found = lookup_one_len(entry.name, de, entry.namelen);
+ if (IS_ERR(found)) {
+ err = PTR_ERR(found);
+ goto err_readdir;
+ }
+
+ err = -ENOENT;
+ if (found->d_inode != ino)
+ goto err_lookup;
+
+ dprintk_ctx("dentry found in dir\n");
+ __cpt_release_buf(ctx);
+ err = cpt_dump_dentry(found, mnt, 0, 1, ctx);
+
+err_lookup:
+ dput(found);
+err_readdir:
+ fput(f);
+ __cpt_release_buf(ctx);
+ return err;
+}
+
+static struct dentry *find_linkdir(struct vfsmount *mnt, struct cpt_context *ctx)
+{
+ int i;
+
+ for (i = 0; i < ctx->linkdirs_num; i++)
+ if (ctx->linkdirs[i]->f_vfsmnt == mnt)
+ return ctx->linkdirs[i]->f_dentry;
+ return NULL;
+}
+
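+/* Create a temporary ".cpt_hardlink.NNN" entry (a new link to d, or a new
+ * file if d is NULL) in the link directory registered for this mount, so
+ * that an unlinked but still referenced inode can be dumped by name.
+ */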
+struct dentry *cpt_fake_link(struct dentry *d, struct vfsmount *mnt,
+ struct inode *ino, struct cpt_context *ctx)
+{
+ int err;
+ int order = 8;
+ const char *prefix = ".cpt_hardlink.";
+ int preflen = strlen(prefix) + order;
+ char name[preflen + 1];
+ struct dentry *dirde, *hardde;
+
+ dirde = find_linkdir(mnt, ctx);
+ if (!dirde) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ ctx->linkcnt++;
+ snprintf(name, sizeof(name), "%s%0*u", prefix, order, ctx->linkcnt);
+
+ mutex_lock(&dirde->d_inode->i_mutex);
+ hardde = lookup_one_len(name, dirde, strlen(name));
+ if (IS_ERR(hardde)) {
+ err = PTR_ERR(hardde);
+ goto out_unlock;
+ }
+
+ if (hardde->d_inode) {
+		/* Userspace should clean up hardlinked files left over
+		 * from a previous dump/undump.
+		 */
+ eprintk_ctx("Hardlinked file already exists: %s\n", name);
+ err = -EEXIST;
+ goto out_put;
+ }
+
+ if (d == NULL)
+ err = vfs_create(dirde->d_inode, hardde, 0600, NULL);
+ else
+ err = vfs_link(d, dirde->d_inode, hardde);
+ if (err) {
+ eprintk_ctx("error hardlink %s, %d\n", name, err);
+ goto out_put;
+ }
+
+out_unlock:
+ mutex_unlock(&dirde->d_inode->i_mutex);
+out:
+ return err ? ERR_PTR(err) : hardde;
+
+out_put:
+ dput(hardde);
+ goto out_unlock;
+}
+
+static int create_dump_hardlink(struct dentry *d, struct vfsmount *mnt,
+ struct inode *ino, struct cpt_context *ctx)
+{
+ int err;
+ struct dentry *hardde;
+
+ hardde = cpt_fake_link(d, mnt, ino, ctx);
+ if (IS_ERR(hardde))
+ return PTR_ERR(hardde);
+
+ err = cpt_dump_dentry(hardde, mnt, 0, 1, ctx);
+ dput(hardde);
+
+ return err;
+}
+
+static int dump_one_inode(struct file *file, struct dentry *d,
+ struct vfsmount *mnt, struct cpt_context *ctx)
+{
+ int err = 0;
+ struct inode *ino = d->d_inode;
+ cpt_object_t *iobj;
+ int dump_it = 0;
+
+ iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx);
+ if (!iobj)
+ return -EINVAL;
+
+ if (iobj->o_pos >= 0)
+ return 0;
+
+ if (ino->i_sb->s_magic == FSMAGIC_PROC &&
+ proc_dentry_of_dead_task(d))
+ return 0;
+
+ if ((!IS_ROOT(d) && d_unhashed(d)) &&
+ !cpt_replaced(d, mnt, ctx))
+ dump_it = 1;
+ if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) {
+ if (file->f_op == &eventpoll_fops ||
+ is_signalfd_file(file))
+ return 0;
+ dump_it = 1;
+ }
+
+ if (!dump_it)
+ return 0;
+
+ cpt_open_object(iobj, ctx);
+ cpt_dump_inode(d, mnt, ctx);
+
+ if (!IS_ROOT(d) && d_unhashed(d)) {
+ struct file *parent;
+ parent = iobj->o_parent;
+ if (!parent ||
+ (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) {
+			/* The inode is not deleted, but it has no
+			 * references from inside the checkpointed
+			 * process group. */
+ if (ino->i_nlink != 0) {
+ err = find_linked_dentry(d, mnt, ino, ctx);
+ if (err && S_ISREG(ino->i_mode)) {
+ err = create_dump_hardlink(d, mnt, ino, ctx);
+ iobj->o_flags |= CPT_INODE_HARDLINKED;
+ } else if (S_ISCHR(ino->i_mode) ||
+ S_ISBLK(ino->i_mode) ||
+ S_ISFIFO(ino->i_mode))
+ err = 0;
+
+ if (err) {
+ eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err);
+ return -EBUSY;
+ }
+ if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode))
+ dump_it = 0;
+ }
+ } else {
+ /* Refer to _another_ file name. */
+ err = cpt_dump_filename(parent, 0, ctx);
+ if (err)
+ return err;
+ if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode))
+ dump_it = 0;
+ }
+ }
+ if (dump_it) {
+ if (S_ISREG(ino->i_mode)) {
+ if ((err = dump_content_regular(file, ctx)) != 0) {
+ eprintk_ctx("dump_content_regular ");
+ cpt_printk_dentry(d, mnt);
+ }
+ } else if (S_ISDIR(ino->i_mode)) {
+ /* We cannot do anything. The directory should be
+ * empty, so it is not a big deal.
+ */
+ } else if (S_ISCHR(ino->i_mode)) {
+ err = dump_content_chrdev(file, ctx);
+ } else if (S_ISBLK(ino->i_mode)) {
+ err = dump_content_blkdev(file, ctx);
+ } else if (S_ISFIFO(ino->i_mode)) {
+ err = dump_content_fifo(file, ctx);
+ } else if (S_ISSOCK(ino->i_mode)) {
+ err = dump_content_socket(file, ctx);
+ } else {
+ eprintk_ctx("unknown inode mode %o, magic 0x%lx\n", ino->i_mode & S_IFMT, ino->i_sb->s_magic);
+ err = -EINVAL;
+ }
+ }
+ cpt_close_object(ctx);
+
+ return err;
+}
+
+int cpt_dump_files(struct cpt_context *ctx)
+{
+ int epoll_nr, inotify_nr;
+ cpt_object_t *obj;
+
+ cpt_open_section(ctx, CPT_SECT_TTY);
+ for_each_object(obj, CPT_OBJ_TTY) {
+ int err;
+
+ if ((err = cpt_dump_tty(obj, ctx)) != 0)
+ return err;
+ }
+ cpt_close_section(ctx);
+
+ cpt_open_section(ctx, CPT_SECT_INODE);
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+ int err;
+
+ if ((err = dump_one_inode(file, file->f_dentry,
+ file->f_vfsmnt, ctx)) != 0)
+ return err;
+ }
+ for_each_object(obj, CPT_OBJ_FS) {
+ struct fs_struct *fs = obj->o_obj;
+ int err;
+
+ if (fs->root.dentry &&
+ (err = dump_one_inode(NULL, fs->root.dentry, fs->root.mnt, ctx)) != 0)
+ return err;
+ if (fs->pwd.dentry &&
+ (err = dump_one_inode(NULL, fs->pwd.dentry, fs->pwd.mnt, ctx)) != 0)
+ return err;
+ }
+ cpt_close_section(ctx);
+
+ epoll_nr = 0;
+ inotify_nr = 0;
+ cpt_open_section(ctx, CPT_SECT_FILES);
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+ int err;
+
+ if ((err = dump_one_file(obj, file, ctx)) != 0)
+ return err;
+ if (file->f_op == &eventpoll_fops)
+ epoll_nr++;
+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY)
+ inotify_nr++;
+ }
+ cpt_close_section(ctx);
+
+ if (epoll_nr) {
+ cpt_open_section(ctx, CPT_SECT_EPOLL);
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+ if (file->f_op == &eventpoll_fops) {
+ int err;
+ if ((err = cpt_dump_epolldev(obj, ctx)) != 0)
+ return err;
+ }
+ }
+ cpt_close_section(ctx);
+ }
+
+ if (inotify_nr) {
+ cpt_open_section(ctx, CPT_SECT_INOTIFY);
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) {
+ int err = -EINVAL;
+#ifdef CONFIG_INOTIFY_USER
+ if ((err = cpt_dump_inotify(obj, ctx)) != 0)
+#endif
+ return err;
+ }
+ }
+ cpt_close_section(ctx);
+ }
+
+ cpt_open_section(ctx, CPT_SECT_SOCKET);
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ int err;
+
+ if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0)
+ return err;
+ }
+ cpt_close_section(ctx);
+
+ return 0;
+}
+
+static int dump_filedesc(int fd, struct file *file,
+ struct files_struct *f, struct cpt_context *ctx)
+{
+ struct cpt_fd_image *v = cpt_get_buf(ctx);
+ cpt_object_t *obj;
+
+ cpt_open_object(NULL, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_FILEDESC;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_VOID;
+
+ v->cpt_fd = fd;
+ obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx);
+	BUG_ON(obj == NULL);
+ v->cpt_file = obj->o_pos;
+ v->cpt_flags = 0;
+ if (FD_ISSET(fd, f->fdt->close_on_exec))
+ v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC;
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+ cpt_close_object(ctx);
+
+ return 0;
+}
+
+static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct files_struct *f = obj->o_obj;
+ struct cpt_files_struct_image *v = cpt_get_buf(ctx);
+ int fd;
+ loff_t saved_obj;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_FILES;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_index = obj->o_index;
+ v->cpt_max_fds = f->fdt->max_fds;
+ v->cpt_next_fd = f->next_fd;
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ for (fd = 0; fd < f->fdt->max_fds; fd++) {
+ struct file *file = fcheck_files(f, fd);
+ if (file)
+ dump_filedesc(fd, file, f, ctx);
+ }
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+
+ return 0;
+}
+
+int cpt_dump_files_struct(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ cpt_open_section(ctx, CPT_SECT_FILES_STRUCT);
+
+ for_each_object(obj, CPT_OBJ_FILES) {
+ int err;
+
+ if ((err = dump_one_file_struct(obj, ctx)) != 0)
+ return err;
+ }
+
+ cpt_close_section(ctx);
+ return 0;
+}
+
+int cpt_collect_fs(cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ if (tsk->fs) {
+ if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL)
+ return -ENOMEM;
+ if (tsk->fs->pwd.dentry &&
+ cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd.dentry->d_inode, ctx) == NULL)
+ return -ENOMEM;
+ if (tsk->fs->root.dentry &&
+ cpt_object_add(CPT_OBJ_INODE, tsk->fs->root.dentry->d_inode, ctx) == NULL)
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
+{
+ struct file file;
+
+ memset(&file, 0, sizeof(file));
+
+ file.f_dentry = d;
+ file.f_vfsmnt = mnt;
+ file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK;
+ file.f_cred = current->cred;
+
+ return dump_one_file(NULL, &file, ctx);
+}
+
+static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct fs_struct *fs = obj->o_obj;
+ struct cpt_fs_struct_image *v = cpt_get_buf(ctx);
+ loff_t saved_obj;
+ int err;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_FS;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_umask = fs->umask;
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ err = cpt_dump_dir(fs->root.dentry, fs->root.mnt, ctx);
+ if (!err)
+ err = cpt_dump_dir(fs->pwd.dentry, fs->pwd.mnt, ctx);
+
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+
+ return err;
+}
+
+int cpt_dump_fs_struct(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ cpt_open_section(ctx, CPT_SECT_FS);
+
+ for_each_object(obj, CPT_OBJ_FS) {
+ int err;
+
+ if ((err = dump_one_fs(obj, ctx)) != 0)
+ return err;
+ }
+
+ cpt_close_section(ctx);
+ return 0;
+}
+
+static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ int err = 0;
+ struct mnt_namespace *n = obj->o_obj;
+ struct list_head *p;
+ char *path_buf, *path;
+
+ path_buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!path_buf)
+ return -ENOMEM;
+
+ down_read(&namespace_sem);
+ list_for_each(p, &n->list) {
+ struct path pt;
+ struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list);
+
+ pt.dentry = mnt->mnt_root;
+ pt.mnt = mnt;
+ path = d_path(&pt, path_buf, PAGE_SIZE);
+ if (IS_ERR(path))
+ continue;
+
+ if (check_one_vfsmount(mnt)) {
+ eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name);
+ err = -EINVAL;
+ break;
+ }
+ }
+ up_read(&namespace_sem);
+
+ free_page((unsigned long) path_buf);
+
+ return err;
+}
+
+int cpt_collect_namespace(cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ if (tsk->nsproxy && tsk->nsproxy->mnt_ns &&
+ cpt_object_add(CPT_OBJ_NAMESPACE,
+ tsk->nsproxy->mnt_ns, ctx) == NULL)
+ return -ENOMEM;
+ }
+
+ for_each_object(obj, CPT_OBJ_NAMESPACE) {
+ int err;
+ if ((err = check_one_namespace(obj, ctx)) != 0)
+ return err;
+ }
+
+ return 0;
+}
+
+struct args_t
+{
+ int* pfd;
+ char* path;
+ envid_t veid;
+};
+
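+/* Runs in a forked kernel thread: enter the target VE, redirect stdout to
+ * the pipe and exec /bin/tar to stream the tmpfs contents to the dumper.
+ */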
+static int dumptmpfs(void *arg)
+{
+ int i;
+ struct args_t *args = arg;
+ int *pfd = args->pfd;
+ int fd0, fd2;
+ char *path = args->path;
+ char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL };
+
+ i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
+ if (i < 0) {
+ eprintk("cannot enter ve to dump tmpfs\n");
+ module_put(THIS_MODULE);
+ return 255 << 8;
+ }
+
+ if (pfd[1] != 1)
+ sc_dup2(pfd[1], 1);
+ set_fs(KERNEL_DS);
+ fd0 = sc_open("/dev/null", O_RDONLY, 0);
+ fd2 = sc_open("/dev/null", O_WRONLY, 0);
+ if (fd0 < 0 || fd2 < 0) {
+		eprintk("cannot open /dev/null for tar: %d %d\n", fd0, fd2);
+ module_put(THIS_MODULE);
+ return 255 << 8;
+ }
+ if (fd0 != 0)
+ sc_dup2(fd0, 0);
+ if (fd2 != 2)
+ sc_dup2(fd2, 2);
+
+ for (i = 3; i < current->files->fdt->max_fds; i++) {
+ sc_close(i);
+ }
+
+ module_put(THIS_MODULE);
+
+ i = sc_execve("/bin/tar", argv, NULL);
+ eprintk("failed to exec /bin/tar: %d\n", i);
+ return 255 << 8;
+}
+
+static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx)
+{
+ int err;
+ int pid;
+ int pfd[2];
+ struct file *f;
+ struct cpt_object_hdr v;
+ char buf[16];
+ int n;
+ loff_t saved_obj;
+ struct args_t args;
+ int status;
+ mm_segment_t oldfs;
+ sigset_t ignore, blocked;
+ struct ve_struct *oldenv;
+
+ err = sc_pipe(pfd);
+ if (err < 0)
+ return err;
+ args.pfd = pfd;
+ args.path = path;
+ args.veid = VEID(get_exec_env());
+ ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+ sigprocmask(SIG_BLOCK, &ignore, &blocked);
+ oldenv = set_exec_env(get_ve0());
+ err = pid = local_kernel_thread(dumptmpfs, (void*)&args,
+ SIGCHLD | CLONE_VFORK, 0);
+ set_exec_env(oldenv);
+ if (err < 0) {
+ eprintk_ctx("tmpfs local_kernel_thread: %d\n", err);
+ goto out;
+ }
+ f = fget(pfd[0]);
+ sc_close(pfd[1]);
+ sc_close(pfd[0]);
+
+ cpt_push_object(&saved_obj, ctx);
+ cpt_open_object(NULL, ctx);
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NAME;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_NAME;
+
+ ctx->write(&v, sizeof(v), ctx);
+
+ do {
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
+ set_fs(oldfs);
+ if (n > 0)
+ ctx->write(buf, n, ctx);
+ } while (n > 0);
+
+ fput(f);
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if ((err = sc_waitx(pid, 0, &status)) < 0)
+ eprintk_ctx("wait4: %d\n", err);
+ else if ((status & 0x7f) == 0) {
+ err = (status & 0xff00) >> 8;
+ if (err != 0) {
+ eprintk_ctx("tar exited with %d\n", err);
+ err = -EINVAL;
+ }
+ } else {
+ eprintk_ctx("tar terminated\n");
+ err = -EINVAL;
+ }
+ set_fs(oldfs);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+ buf[0] = 0;
+ ctx->write(buf, 1, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ return n ? : err;
+
+out:
+ if (pfd[1] >= 0)
+ sc_close(pfd[1]);
+ if (pfd[0] >= 0)
+ sc_close(pfd[0]);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+ return err;
+}
+
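+/* Return 1 if an earlier vfsmount in the namespace shares this mount's
+ * superblock, i.e. the mount is effectively a bind mount of another one.
+ */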
+static int loopy_root(struct vfsmount *mnt)
+{
+ struct list_head *p;
+
+ list_for_each(p, &mnt->mnt_ns->list) {
+ struct vfsmount * m = list_entry(p, struct vfsmount, mnt_list);
+ if (m == mnt)
+ return 0;
+ if (m->mnt_sb == mnt->mnt_sb)
+ return 1;
+ }
+ /* Cannot happen */
+ return 0;
+}
+
+static int cpt_dump_bind_mnt(struct vfsmount * mnt, cpt_context_t * ctx)
+{
+ struct list_head *p;
+ int err = -EINVAL;
+
+ /* One special case: mount --bind /a /a */
+ if (mnt->mnt_root == mnt->mnt_mountpoint)
+ return cpt_dump_dentry(mnt->mnt_root, mnt, 0, 0, ctx);
+
+ list_for_each_prev(p, &mnt->mnt_list) {
+ struct vfsmount * m;
+
+ if (p == &mnt->mnt_ns->list)
+ break;
+
+ m = list_entry(p, struct vfsmount, mnt_list);
+
+ if (m->mnt_sb != mnt->mnt_sb)
+ continue;
+
+ err = cpt_dump_dentry(mnt->mnt_root, m, 0, 1, ctx);
+ if (err == 0)
+ break;
+ }
+ return err;
+}
+
+static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx)
+{
+ int err = 0;
+ struct cpt_vfsmount_image v;
+ loff_t saved_obj;
+ char *path_buf, *path;
+ struct path p;
+
+ path_buf = (char *) __get_free_page(GFP_KERNEL);
+ if (!path_buf)
+ return -ENOMEM;
+
+ p.dentry = mnt->mnt_root;
+ p.mnt = mnt;
+ path = d_path(&p, path_buf, PAGE_SIZE);
+ if (IS_ERR(path)) {
+ free_page((unsigned long) path_buf);
+ return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path);
+ }
+
+ cpt_open_object(NULL, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_VFSMOUNT;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_ARRAY;
+
+ v.cpt_mntflags = mnt->mnt_flags;
+ if (top_beancounter(slab_ub(mnt)) != top_beancounter(get_exec_ub())) {
+ v.cpt_mntflags |= CPT_MNT_EXT;
+ } else {
+ if (mnt->mnt_root != mnt->mnt_sb->s_root || loopy_root(mnt))
+ v.cpt_mntflags |= CPT_MNT_BIND;
+ }
+ v.cpt_flags = mnt->mnt_sb->s_flags;
+
+ ctx->write(&v, sizeof(v), ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ cpt_dump_string(mnt->mnt_devname ? : "none", ctx);
+ cpt_dump_string(path, ctx);
+ cpt_dump_string(mnt->mnt_sb->s_type->name, ctx);
+
+ if (v.cpt_mntflags & CPT_MNT_BIND) {
+ err = cpt_dump_bind_mnt(mnt, ctx);
+
+ /* Temporary solution for Ubuntu 8.04 */
+ if (err == -EINVAL && !strcmp(path, "/dev/.static/dev")) {
+ cpt_dump_string("/dev", ctx);
+ err = 0;
+ }
+ }
+ else if (!(v.cpt_mntflags & CPT_MNT_EXT)) {
+
+ if (mnt->mnt_sb->s_type->fs_flags & FS_REQUIRES_DEV) {
+ eprintk_ctx("Checkpoint supports only nodev fs: %s\n",
+ mnt->mnt_sb->s_type->name);
+ err = -EXDEV;
+ } else if (!strcmp(mnt->mnt_sb->s_type->name, "tmpfs")) {
+ mntget(mnt);
+ up_read(&namespace_sem);
+ err = cpt_dump_tmpfs(path, ctx);
+ down_read(&namespace_sem);
+ if (!err && list_empty(&mnt->mnt_list))
+ err = -EBUSY;
+ mntput(mnt);
+ }
+ }
+
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+ if (!err && mnt->mnt_sb->s_magic == FSMAGIC_VEFS)
+ vefs_track_force_stop(mnt->mnt_sb);
+
+ free_page((unsigned long) path_buf);
+
+ return err;
+}
+
+static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct mnt_namespace *n = obj->o_obj;
+ struct cpt_object_hdr v;
+ struct vfsmount *rootmnt, *p;
+ loff_t saved_obj;
+ int err = 0;
+
+ cpt_open_object(obj, ctx);
+
+ v.cpt_next = -1;
+ v.cpt_object = CPT_OBJ_NAMESPACE;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_ARRAY;
+
+ ctx->write(&v, sizeof(v), ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+
+ down_read(&namespace_sem);
+ rootmnt = n->root;
+ for (p = rootmnt; p; p = next_mnt(p, rootmnt)) {
+ err = dump_vfsmount(p, ctx);
+ if (err)
+ break;
+ }
+ up_read(&namespace_sem);
+
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+
+ return err;
+}
+
+int cpt_dump_namespace(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ cpt_open_section(ctx, CPT_SECT_NAMESPACE);
+
+ for_each_object(obj, CPT_OBJ_NAMESPACE) {
+ int err;
+
+ if ((err = dump_one_namespace(obj, ctx)) != 0)
+ return err;
+ }
+
+ cpt_close_section(ctx);
+ return 0;
+}
diff --git a/kernel/cpt/cpt_files.h b/kernel/cpt/cpt_files.h
new file mode 100644
index 0000000..bc66731
--- /dev/null
+++ b/kernel/cpt/cpt_files.h
@@ -0,0 +1,77 @@
+int cpt_collect_files(cpt_context_t *);
+int cpt_collect_fs(cpt_context_t *);
+int cpt_collect_namespace(cpt_context_t *);
+int cpt_collect_sysvsem_undo(cpt_context_t *);
+int cpt_collect_tty(struct file *, cpt_context_t *);
+int cpt_dump_files(struct cpt_context *ctx);
+int cpt_dump_files_struct(struct cpt_context *ctx);
+int cpt_dump_fs_struct(struct cpt_context *ctx);
+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx);
+int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx);
+int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx);
+struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx);
+struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx);
+struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx);
+__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx);
+
+int rst_posix_locks(struct cpt_context *ctx);
+
+struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx);
+int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_files_std(struct cpt_task_image *ti, struct cpt_context *ctx);
+__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_restore_fs(struct cpt_context *ctx);
+
+int cpt_collect_sysv(cpt_context_t *);
+int cpt_dump_sysvsem(struct cpt_context *ctx);
+int cpt_dump_sysvmsg(struct cpt_context *ctx);
+int rst_sysv_ipc(struct cpt_context *ctx);
+int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
+__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+
+int cpt_dump_namespace(struct cpt_context *ctx);
+int rst_root_namespace(struct cpt_context *ctx);
+
+int rst_stray_files(struct cpt_context *ctx);
+int rst_tty_jobcontrol(struct cpt_context *ctx);
+
+void rst_flush_filejobs(struct cpt_context *);
+int rst_do_filejobs(struct cpt_context *);
+
+extern struct file_operations eventpoll_fops;
+extern struct file_operations signalfd_fops;
+
+int rst_eventpoll(struct cpt_context *);
+struct file *cpt_open_epolldev(struct cpt_file_image *fi,
+ unsigned flags,
+ struct cpt_context *ctx);
+int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *);
+
+int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx);
+int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp,
+ loff_t *pos, struct cpt_context *ctx);
+
+int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx);
+int rst_inotify(cpt_context_t *ctx);
+struct file *rst_open_inotify(struct cpt_file_image *fi,
+ unsigned flags,
+ struct cpt_context *ctx);
+
+struct dentry *cpt_fake_link(struct dentry *d, struct vfsmount *mnt,
+ struct inode *ino, struct cpt_context *ctx);
+
+int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt,
+ int verify, cpt_context_t *ctx);
+
+#define check_one_vfsmount(mnt) \
+ (strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0 && \
+ strcmp(mnt->mnt_sb->s_type->name, "binfmt_misc") != 0)
diff --git a/kernel/cpt/cpt_fsmagic.h b/kernel/cpt/cpt_fsmagic.h
new file mode 100644
index 0000000..7e79789
--- /dev/null
+++ b/kernel/cpt/cpt_fsmagic.h
@@ -0,0 +1,17 @@
+/* Collected from kernel sources. */
+
+#define FSMAGIC_TMPFS 0x01021994
+#define FSMAGIC_PIPEFS 0x50495045
+#define FSMAGIC_SOCKFS 0x534F434B
+#define FSMAGIC_PFMFS 0xa0b4d889
+#define FSMAGIC_BDEV 0x62646576
+#define FSMAGIC_FUTEX 0x0BAD1DEA
+#define FSMAGIC_INOTIFY 0x2BAD1DEA
+#define FSMAGIC_MQUEUE 0x19800202
+#define FSMAGIC_PROC 0x9fa0
+#define FSMAGIC_DEVPTS 0x1CD1
+#define FSMAGIC_AUTOFS 0x0187
+#define FSMAGIC_EXT2 0xEF53
+#define FSMAGIC_REISER 0x52654973
+#define FSMAGIC_VEFS 0x565a4653
+#define FSMAGIC_ANON 0x09041934
diff --git a/kernel/cpt/cpt_inotify.c b/kernel/cpt/cpt_inotify.c
new file mode 100644
index 0000000..4f2abb0
--- /dev/null
+++ b/kernel/cpt/cpt_inotify.c
@@ -0,0 +1,174 @@
+/*
+ *
+ * kernel/cpt/cpt_inotify.c
+ *
+ * Copyright (C) 2000-2007 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/inotify.h>
+#include <linux/cpt_image.h>
+#include <linux/fsnotify_backend.h>
+
+#include "../../fs/notify/inotify/inotify.h"
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+static int dump_watch_inode(struct path *path, cpt_context_t *ctx)
+{
+ int err;
+ struct dentry *d;
+
+ d = path->dentry;
+ if (IS_ROOT(d) || !d_unhashed(d))
+ goto dump_dir;
+
+ d = cpt_fake_link(d->d_inode->i_nlink ? d : NULL,
+ path->mnt, d->d_inode, ctx);
+
+ if (IS_ERR(d))
+ return PTR_ERR(d);
+
+dump_dir:
+ err = cpt_dump_dir(d, path->mnt, ctx);
+ if (d != path->dentry)
+ dput(d);
+
+ return err;
+}
+
+static int cpt_dump_watches(struct fsnotify_group *g, struct cpt_context *ctx)
+{
+ int err = 0;
+ struct fsnotify_mark_entry *fse;
+ struct inotify_inode_mark_entry *ie;
+ struct cpt_inotify_wd_image wi;
+ loff_t saved_obj;
+
+ /* FIXME locking */
+ list_for_each_entry(fse, &g->mark_entries, g_list) {
+ struct path path;
+
+ ie = container_of(fse, struct inotify_inode_mark_entry,
+ fsn_entry);
+
+ cpt_open_object(NULL, ctx);
+
+ wi.cpt_next = CPT_NULL;
+ wi.cpt_object = CPT_OBJ_INOTIFY_WATCH;
+ wi.cpt_hdrlen = sizeof(wi);
+ wi.cpt_content = CPT_CONTENT_ARRAY;
+ wi.cpt_wd = ie->wd;
+ wi.cpt_mask = fse->mask;
+
+ ctx->write(&wi, sizeof(wi), ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ spin_lock(&fse->lock);
+ if (ie->path.dentry == NULL) {
+ err = -EINVAL;
+ eprintk_ctx("inotify mark without path\n");
+ spin_unlock(&fse->lock);
+ break;
+ }
+
+ path = ie->path;
+ path_get(&path);
+ spin_unlock(&fse->lock);
+
+ err = dump_watch_inode(&path, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ path_put(&path);
+
+ if (err)
+ break;
+
+ cpt_close_object(ctx);
+ }
+
+ return err;
+}
+
+static int cpt_dump_events(struct fsnotify_group *g, struct cpt_context *ctx)
+{
+ /* FIXME - implement */
+ if (!list_empty(&g->notification_list))
+ wprintk_ctx("Inotify events are lost. Sorry...\n");
+
+ return 0;
+}
+
+int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx)
+{
+ int err;
+ struct file *file = obj->o_obj;
+ struct fsnotify_group *group;
+ struct cpt_inotify_image ii;
+ loff_t saved_obj;
+
+ if (file->f_op != &inotify_fops) {
+ eprintk_ctx("bad inotify file\n");
+ return -EINVAL;
+ }
+
+ group = file->private_data;
+ if (unlikely(group == NULL)) {
+ eprintk_ctx("bad inotify group\n");
+ return -EINVAL;
+ }
+
+ if (group->inotify_data.fa != NULL) {
+ eprintk_ctx("inotify with fasync\n");
+ return -ENOTSUPP;
+ }
+
+ cpt_open_object(NULL, ctx);
+
+ ii.cpt_next = CPT_NULL;
+ ii.cpt_object = CPT_OBJ_INOTIFY;
+ ii.cpt_hdrlen = sizeof(ii);
+ ii.cpt_content = CPT_CONTENT_ARRAY;
+ ii.cpt_file = obj->o_pos;
+ ii.cpt_user = group->inotify_data.user->uid;
+ ii.cpt_max_events = group->max_events;
+ ii.cpt_last_wd = group->max_events;
+
+ ctx->write(&ii, sizeof(ii), ctx);
+ cpt_push_object(&saved_obj, ctx);
+
+ err = cpt_dump_watches(group, ctx);
+ if (err == 0)
+ err = cpt_dump_events(group, ctx);
+
+ cpt_pop_object(&saved_obj, ctx);
+ cpt_close_object(ctx);
+
+ return err;
+}
diff --git a/kernel/cpt/cpt_kernel.c b/kernel/cpt/cpt_kernel.c
new file mode 100644
index 0000000..10fa5d6
--- /dev/null
+++ b/kernel/cpt/cpt_kernel.c
@@ -0,0 +1,185 @@
+/*
+ *
+ * kernel/cpt/cpt_kernel.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#define __KERNEL_SYSCALLS__ 1
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+#include <linux/cpt_image.h>
+#include <linux/virtinfo.h>
+#include <linux/virtinfoscp.h>
+
+#include "cpt_kernel.h"
+#include "cpt_syscalls.h"
+
+int debug_level = 1;
+
+#ifdef CONFIG_X86_32
+
+/*
+ * Create a kernel thread
+ */
+extern void kernel_thread_helper(void);
+int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
+{
+ struct pt_regs regs;
+
+ memset(&regs, 0, sizeof(regs));
+
+ regs.bx = (unsigned long) fn;
+ regs.dx = (unsigned long) arg;
+
+ regs.ds = __USER_DS;
+ regs.es = __USER_DS;
+ regs.fs = __KERNEL_PERCPU;
+ regs.gs = __KERNEL_STACK_CANARY;
+ regs.orig_ax = -1;
+ regs.ip = (unsigned long) kernel_thread_helper;
+ regs.cs = __KERNEL_CS | get_kernel_rpl();
+ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+ /* Ok, create the new process.. */
+ return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL, pid);
+}
+#endif
+
+#ifdef CONFIG_IA64
+pid_t
+asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid)
+{
+ extern void start_kernel_thread (void);
+ unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread;
+ struct {
+ struct switch_stack sw;
+ struct pt_regs pt;
+ } regs;
+
+ memset(&regs, 0, sizeof(regs));
+ regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */
+ regs.pt.r1 = helper_fptr[1]; /* set GP */
+ regs.pt.r9 = (unsigned long) fn; /* 1st argument */
+ regs.pt.r11 = (unsigned long) arg; /* 2nd argument */
+ /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */
+ regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
+ regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */
+ regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
+ regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
+ regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/);
+ return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL, pid);
+}
+#endif
+
+int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
+{
+ pid_t ret;
+
+ if (current->fs == NULL) {
+ /* do_fork_pid() hates processes without fs, oopses. */
+ printk("CPT BUG: local_kernel_thread: current->fs==NULL\n");
+ return -EINVAL;
+ }
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+ while ((ret = asm_kernel_thread(fn, arg, flags, pid)) ==
+ -ERESTARTNOINTR)
+ cond_resched();
+ if (ret < 0)
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+#ifdef __i386__
+int __execve(const char *file, char **argv, char **envp)
+{
+ long res;
+ __asm__ volatile ("int $0x80"
+ : "=a" (res)
+ : "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)),
+ "d" ((long)(envp)) : "memory");
+ return (int)res;
+}
+#endif
+
+int sc_execve(char *cmd, char **argv, char **env)
+{
+ int ret;
+#ifndef __i386__
+ ret = kernel_execve(cmd, argv, env);
+#else
+ ret = __execve(cmd, argv, env);
+#endif
+ return ret;
+}
+
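+/* Collect the CPU features and checkpointing capabilities of this host;
+ * restore compares them against the destination (see test_one_flag()).
+ */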
+unsigned int test_cpu_caps_and_features(void)
+{
+ unsigned int flags = 0;
+
+#ifdef CONFIG_X86
+ if (boot_cpu_has(X86_FEATURE_CMOV))
+ flags |= 1 << CPT_CPU_X86_CMOV;
+ if (cpu_has_fxsr)
+ flags |= 1 << CPT_CPU_X86_FXSR;
+ if (cpu_has_xmm)
+ flags |= 1 << CPT_CPU_X86_SSE;
+#ifndef CONFIG_X86_64
+ if (cpu_has_xmm2)
+#endif
+ flags |= 1 << CPT_CPU_X86_SSE2;
+ if (cpu_has_mmx)
+ flags |= 1 << CPT_CPU_X86_MMX;
+ if (boot_cpu_has(X86_FEATURE_3DNOW))
+ flags |= 1 << CPT_CPU_X86_3DNOW;
+ if (boot_cpu_has(X86_FEATURE_3DNOWEXT))
+ flags |= 1 << CPT_CPU_X86_3DNOW2;
+ if (boot_cpu_has(X86_FEATURE_SYSCALL))
+ flags |= 1 << CPT_CPU_X86_SYSCALL;
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_SYSCALL) &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ flags |= 1 << CPT_CPU_X86_SYSCALL32;
+#endif
+ if (boot_cpu_has(X86_FEATURE_SEP)
+#ifdef CONFIG_X86_64
+ && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
+#endif
+ )
+ flags |= ((1 << CPT_CPU_X86_SEP) | (1 << CPT_CPU_X86_SEP32));
+#ifdef CONFIG_X86_64
+ flags |= 1 << CPT_CPU_X86_EMT64;
+#endif
+#endif
+#ifdef CONFIG_IA64
+ flags |= 1 << CPT_CPU_X86_IA64;
+ flags |= 1 << CPT_CPU_X86_FXSR;
+#endif
+ if (virtinfo_notifier_call(VITYPE_SCP,
+ VIRTINFO_SCP_TEST, NULL) & NOTIFY_FAIL)
+ flags |= 1 << CPT_SLM_DMPRST;
+ return flags;
+}
+
+unsigned int test_kernel_config(void)
+{
+ unsigned int flags = 0;
+#ifdef CONFIG_X86
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+ flags |= 1 << CPT_KERNEL_CONFIG_PAE;
+#endif
+#endif
+ return flags;
+}
diff --git a/kernel/cpt/cpt_kernel.h b/kernel/cpt/cpt_kernel.h
new file mode 100644
index 0000000..8bbd402
--- /dev/null
+++ b/kernel/cpt/cpt_kernel.h
@@ -0,0 +1,99 @@
+/* Interface to kernel vars which we had to _add_. */
+
+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
+#define TASK_TRACED TASK_STOPPED
+#define unix_peer(sk) ((sk)->sk_pair)
+#define page_mapcount(pg) ((pg)->mapcount)
+#else
+#define unix_peer(sk) (unix_sk(sk)->peer)
+#endif
+
+#ifdef CONFIG_IA64
+#define cpu_has_fxsr 1
+#endif
+
+#define CPT_SIG_IGNORE_MASK (\
+ (1 << (SIGCONT - 1)) | (1 << (SIGCHLD - 1)) | \
+ (1 << (SIGWINCH - 1)) | (1 << (SIGURG - 1)))
+
+static inline void do_gettimespec(struct timespec *ts)
+{
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ ts->tv_sec = tv.tv_sec;
+ ts->tv_nsec = tv.tv_usec*1000;
+}
+
+int local_kernel_thread(int (*fn)(void *),
+ void * arg,
+ unsigned long flags,
+ pid_t pid);
+int asm_kernel_thread(int (*fn)(void *),
+ void * arg,
+ unsigned long flags,
+ pid_t pid);
+
+#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE)
+void vefs_track_force_stop(struct super_block *super);
+
+void vefs_track_notify(struct dentry *vdentry, int track_cow);
+
+struct dentry * vefs_replaced_dentry(struct dentry *de);
+int vefs_is_renamed_dentry(struct dentry *vde, struct dentry *pde);
+#else
+static inline void vefs_track_force_stop(struct super_block *super) { };
+
+static inline void vefs_track_notify(struct dentry *vdentry, int track_cow) { };
+#endif
+
+unsigned int test_cpu_caps_and_features(void);
+unsigned int test_kernel_config(void);
+
+#define test_one_flag_old(src, dst, flag, message, ret) \
+if (src & (1 << flag)) \
+ if (!(dst & (1 << flag))) { \
+ wprintk("Destination cpu does not have " message "\n"); \
+ ret = 1; \
+ }
+#define test_one_flag(src, dst, flag, message, ret) \
+if (src & (1 << flag)) \
+ if (!(dst & (1 << flag))) { \
+ eprintk_ctx("Destination cpu does not have " message "\n"); \
+ ret = 1; \
+ }
+
+static inline void
+_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
+{
+ while (nsec >= NSEC_PER_SEC) {
+ nsec -= NSEC_PER_SEC;
+ ++sec;
+ }
+ while (nsec < 0) {
+ nsec += NSEC_PER_SEC;
+ --sec;
+ }
+ ts->tv_sec = sec;
+ ts->tv_nsec = nsec;
+}
+
+static inline struct timespec
+_ns_to_timespec(const s64 nsec)
+{
+ struct timespec ts;
+ s32 rem;
+
+ if (!nsec)
+ return (struct timespec) {0, 0};
+
+ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
+ if (unlikely(rem < 0)) {
+ ts.tv_sec--;
+ rem += NSEC_PER_SEC;
+ }
+ ts.tv_nsec = rem;
+
+ return ts;
+}
diff --git a/kernel/cpt/cpt_mm.c b/kernel/cpt/cpt_mm.c
new file mode 100644
index 0000000..1164358
--- /dev/null
+++ b/kernel/cpt/cpt_mm.c
@@ -0,0 +1,923 @@
+/*
+ *
+ * kernel/cpt/cpt_mm.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/errno.h>
+#include <linux/ve.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#ifdef CONFIG_X86
+#include <asm/ldt.h>
+#endif
+#include <asm/mmu.h>
+#include <linux/cpt_image.h>
+#include <linux/shm.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+#include "cpt_pagein.h"
+#endif
+#include "cpt_ubc.h"
+
+static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
+ cpt_context_t *ctx)
+{
+ if (!list_empty(&aio_ctx->run_list)) {
+ /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */
+ eprintk_ctx("run list is not empty, cannot suspend AIO\n");
+ return -EBUSY;
+ }
+
+	/* Wait for pending IOCBs. Linux AIO is mostly _fake_.
+	 * It is actually synchronous, except for direct IO and
+	 * some funny raw USB things, which cannot happen inside a VE.
+	 * However, we do this for the future.
+	 *
+	 * Later note: in 2.6.16 we may allow O_DIRECT, so this
+	 * code is not meaningless.
+	 */
+ wait_for_all_aios(aio_ctx);
+
+ if (!list_empty(&aio_ctx->run_list) ||
+ !list_empty(&aio_ctx->active_reqs) ||
+ aio_ctx->reqs_active) {
+ eprintk_ctx("were not able to suspend AIO\n");
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx)
+{
+ struct vm_area_struct *vma;
+ struct hlist_node *n;
+ struct kioctx *aio_ctx;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma->vm_file) {
+ if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL)
+ return -ENOMEM;
+ }
+ }
+
+ if (mm->exe_file &&
+ cpt_object_add(CPT_OBJ_FILE, mm->exe_file, ctx) == NULL)
+ return -ENOMEM;
+
+#ifdef CONFIG_BEANCOUNTERS
+ if (cpt_add_ubc(mm->mm_ub, ctx) == NULL)
+ return -ENOMEM;
+#endif
+
+ hlist_for_each_entry(aio_ctx, n, &mm->ioctx_list, list) {
+ int err;
+
+ if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
+ return err;
+ }
+
+ return 0;
+}
+
+int cpt_collect_mm(cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+ int err;
+ int index;
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL)
+ return -ENOMEM;
+ }
+
+ index = 1;
+ for_each_object(obj, CPT_OBJ_MM) {
+ struct mm_struct *mm = obj->o_obj;
+ if (obj->o_count != atomic_read(&mm->mm_users)) {
+ eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users));
+ return -EAGAIN;
+ }
+ cpt_obj_setindex(obj, index++, ctx);
+
+ if ((err = collect_one_mm(mm, ctx)) != 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int zcnt, scnt, scnt0, ucnt;
+
+/* Function where_is_anon_page() returns the address of an anonymous page in
+ * the mm of an already dumped process. This happens e.g. after fork(). We do
+ * not use this right now, just keep statistics; it is difficult to restore
+ * such state, but the most direct use is to save space in the dumped image. */
+
+
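+/* Compute the virtual address at which @page would be mapped in @vma,
+ * judging by page->index; the low bit is set if it falls outside the vma. */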
+static inline unsigned long
+vma_address0(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ unsigned long address;
+
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ address |= 1;
+ return address;
+}
+
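+/* Walk the page tables of vma->vm_mm and check that @page is really
+ * mapped at @address there. */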
+static int really_this_one(struct vm_area_struct *vma, unsigned long address,
+ struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
+ int result;
+
+ pgd = pgd_offset(mm, address);
+ if (unlikely(!pgd_present(*pgd)))
+ return 0;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, address);
+ if (unlikely(!pmd_present(*pmd)))
+ return 0;
+
+ result = 0;
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return 0;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte))
+ result = 1;
+ pte_unmap_unlock(pte, ptl);
+ return result;
+}
+
+static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr,
+ struct page *page, cpt_context_t * ctx)
+{
+ loff_t mmptr = CPT_NULL;
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *vma;
+ int idx = mmobj->o_index;
+
+ if (!PageAnon(page))
+ return CPT_NULL;
+
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ return CPT_NULL;
+
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ unsigned long addr = vma_address0(page, vma);
+ cpt_object_t *obj;
+
+ /* We do not try to support mremapped regions (addr != mapaddr),
+ * only mmaps directly inherited via fork().
+ * With this limitation we may check self-consistency of
+ * vmas (vm_start, vm_pgoff, anon_vma) before
+ * doing __copy_page_range() in rst_mm.
+ */
+ if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) {
+ obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx);
+ if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) {
+ if (really_this_one(vma, addr, page)) {
+ mmptr = obj->o_pos;
+ idx = obj->o_index;
+ }
+ }
+ }
+ }
+ page_unlock_anon_vma(anon_vma);
+
+ return mmptr;
+}
+
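+/* Accumulates a run of consecutive pages with the same descriptor type,
+ * so that they can be dumped as a single block. */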
+struct page_area
+{
+ int type;
+ unsigned long start;
+ unsigned long end;
+ pgoff_t pgoff;
+ loff_t mm;
+ __u64 list[16];
+};
+
+struct page_desc
+{
+ int type;
+ pgoff_t index;
+ loff_t mm;
+ int shared;
+};
+
+enum {
+ PD_ABSENT,
+ PD_COPY,
+ PD_ZERO,
+ PD_CLONE,
+ PD_FUNKEY,
+ PD_LAZY,
+ PD_ITER,
+ PD_ITERYOUNG,
+};
+
+/* PD_ABSENT (0): page can be obtained from the backing store, or is a still
+      unmapped anonymous page, or something else which does not require a copy.
+   PD_COPY   (1): page requires a copy.
+   PD_ZERO   (2): page requires a copy but its content is zero. Quite useless.
+   PD_CLONE  (3): wp page is shared after fork(). It is to be COWed when modified.
+   PD_FUNKEY (4): page is something unsupported... We copy it right now.
+ */
+
+
+
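+/* Classify the page at @addr in @vma: fill @pdesc with its type and index
+ * and, for cloned anonymous pages, the image position of the owning mm. */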
+static void page_get_desc(cpt_object_t *mmobj,
+ struct vm_area_struct *vma, unsigned long addr,
+ struct page_desc *pdesc, cpt_context_t * ctx)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ struct page *pg = NULL;
+ pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
+
+ pdesc->index = linear_index;
+ pdesc->shared = 0;
+ pdesc->mm = CPT_NULL;
+
+ if (vma->vm_flags & VM_IO) {
+ pdesc->type = PD_ABSENT;
+ return;
+ }
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto out_absent;
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ goto out_absent;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto out_absent;
+#ifdef CONFIG_X86
+ if (pmd_huge(*pmd)) {
+ eprintk_ctx("page_huge\n");
+ goto out_unsupported;
+ }
+#endif
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+retry:
+#endif
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = *ptep;
+ pte_unmap(ptep);
+
+ if (pte_none(pte))
+ goto out_absent_unlock;
+
+ if (!pte_present(pte)) {
+ if (pte_file(pte)) {
+ pdesc->index = pte_to_pgoff(pte);
+ goto out_absent_unlock;
+ }
+ if (vma->vm_flags & VM_SHARED) {
+ /* It is impossible: shared mappings cannot be in swap */
+ eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos);
+ goto out_unsupported_unlock;
+ }
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ /* Otherwise it is in swap. */
+ if (!ctx->lazy_vm) {
+ int err;
+ /* If lazy transfer is not enabled,
+ * raise it from swap now, so that we
+ * save at least when the page is shared.
+ */
+ spin_unlock(ptl);
+ err = handle_mm_fault(mm, vma, addr, 0);
+ if (err == VM_FAULT_SIGBUS)
+ goto out_absent;
+ if (err == VM_FAULT_OOM)
+ goto out_absent;
+ err = 0;
+ goto retry;
+ }
+#endif
+ pdesc->type = PD_LAZY;
+ goto out_unlock;
+ }
+
+ if ((pg = vm_normal_page(vma, addr, pte)) == NULL) {
+ pdesc->type = PD_COPY;
+ goto out_unlock;
+ }
+
+ get_page(pg);
+ spin_unlock(ptl);
+
+ if (pg->mapping && !PageAnon(pg)) {
+ if (vma->vm_file == NULL) {
+ eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
+ goto out_unsupported;
+ }
+ if (vma->vm_file->f_mapping != pg->mapping) {
+ eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n",
+ addr, vma->vm_file->f_mapping, pg->mapping,
+ mmobj->o_pos);
+ goto out_unsupported;
+ }
+ pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
+		/* Page is in the backing store. For us it is as if
+		 * it were not present.
+		 */
+ goto out_absent;
+ }
+
+ if (PageReserved(pg)) {
+		/* Special case: ZERO_PAGE is used when an
+		 * anonymous page is accessed but not written. */
+ if (pg == ZERO_PAGE(addr)) {
+ if (pte_write(pte)) {
+ eprintk_ctx("not funny already, writable ZERO_PAGE\n");
+ goto out_unsupported;
+ }
+ zcnt++;
+ goto out_absent;
+ }
+ eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index,
+ addr, mmobj->o_pos);
+ goto out_unsupported;
+ }
+
+ if (pg == ZERO_PAGE(addr)) {
+ wprintk_ctx("that's how it works now\n");
+ }
+
+ if (!pg->mapping) {
+ eprintk_ctx("page without mapping at %08lx@%Ld\n", addr,
+ mmobj->o_pos);
+ goto out_unsupported;
+ }
+
+ if (pg->mapping && page_mapcount(pg) > 1) {
+ pdesc->shared = 1;
+ pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx);
+ if (pdesc->mm != CPT_NULL) {
+ scnt0++;
+ pdesc->type = PD_CLONE;
+ goto out_put;
+ } else {
+ scnt++;
+ }
+ }
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+ if (ctx->iter_done &&
+ test_bit(PG_checkpointed, &pg->flags)) {
+ if (pte_write(pte)) {
+ wprintk_ctx("writable PG_checkpointed page\n");
+ }
+ pdesc->index = page_to_pfn(pg);
+ pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER;
+ goto out_put;
+ }
+#endif
+ pdesc->type = pte_young(pte) ? PD_COPY : PD_LAZY;
+
+out_put:
+ if (pg)
+ put_page(pg);
+ return;
+
+out_unlock:
+ spin_unlock(ptl);
+ goto out_put;
+
+out_absent_unlock:
+ spin_unlock(ptl);
+out_absent:
+ pdesc->type = PD_ABSENT;
+ goto out_put;
+
+out_unsupported_unlock:
+ spin_unlock(ptl);
+out_unsupported:
+ ucnt++;
+ pdesc->type = PD_FUNKEY;
+ goto out_put;
+}
+
+/* ATTN: We pass "current" to get_user_pages(). This is wrong, but get_user_pages()
+ * does not really need it; it just stores some page fault stats there.
+ *
+ * BUG: some archs (e.g. sparc64, but not Intel*) require flushing cache pages
+ * before accessing the vma.
+ */
+void dump_pages(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct cpt_context *ctx)
+{
+#define MAX_PAGE_BATCH 16
+ struct page *pg[MAX_PAGE_BATCH];
+ int npages = (end - start)/PAGE_SIZE;
+ int count = 0;
+
+ while (count < npages) {
+ int copy = npages - count;
+ int n;
+
+ if (copy > MAX_PAGE_BATCH)
+ copy = MAX_PAGE_BATCH;
+ n = get_user_pages(current, vma->vm_mm, start, copy,
+ 0, 1, pg, NULL);
+ if (n == copy) {
+ int i;
+ for (i=0; i<n; i++) {
+ char *maddr = kmap(pg[i]);
+ ctx->write(maddr, PAGE_SIZE, ctx);
+ kunmap(pg[i]);
+ }
+ } else {
+ eprintk_ctx("get_user_pages fault");
+ for ( ; n > 0; n--)
+ page_cache_release(pg[n-1]);
+ return;
+ }
+ start += n*PAGE_SIZE;
+ count += n;
+ for ( ; n > 0; n--)
+ page_cache_release(pg[n-1]);
+ }
+ return;
+}
+
+int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb,
+ int copy,
+ struct cpt_context *ctx)
+{
+ loff_t saved_object;
+
+ cpt_push_object(&saved_object, ctx);
+
+ pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES;
+ pgb->cpt_hdrlen = sizeof(*pgb);
+ pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? CPT_CONTENT_DATA : CPT_CONTENT_VOID;
+
+ ctx->write(pgb, sizeof(*pgb), ctx);
+ if (copy == PD_COPY || copy == PD_LAZY)
+ dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_object, ctx);
+ return 0;
+}
+
+int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa,
+ struct cpt_context *ctx)
+{
+ struct cpt_remappage_block pgb;
+ loff_t saved_object;
+
+ cpt_push_object(&saved_object, ctx);
+
+ pgb.cpt_object = CPT_OBJ_REMAPPAGES;
+ pgb.cpt_hdrlen = sizeof(pgb);
+ pgb.cpt_content = CPT_CONTENT_VOID;
+ pgb.cpt_start = pa->start;
+ pgb.cpt_end = pa->end;
+ pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1;
+
+ ctx->write(&pgb, sizeof(pgb), ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_object, ctx);
+ return 0;
+}
+
+int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa,
+ struct cpt_context *ctx)
+{
+ struct cpt_copypage_block pgb;
+ loff_t saved_object;
+
+ cpt_push_object(&saved_object, ctx);
+
+ pgb.cpt_object = CPT_OBJ_COPYPAGES;
+ pgb.cpt_hdrlen = sizeof(pgb);
+ pgb.cpt_content = CPT_CONTENT_VOID;
+ pgb.cpt_start = pa->start;
+ pgb.cpt_end = pa->end;
+ pgb.cpt_source = pa->mm;
+
+ ctx->write(&pgb, sizeof(pgb), ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_object, ctx);
+ return 0;
+}
+
+int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa,
+ cpt_context_t *ctx)
+{
+ struct cpt_lazypage_block pgb;
+ loff_t saved_object;
+
+ cpt_push_object(&saved_object, ctx);
+
+ pgb.cpt_object = CPT_OBJ_LAZYPAGES;
+ pgb.cpt_hdrlen = sizeof(pgb);
+ pgb.cpt_content = CPT_CONTENT_VOID;
+ pgb.cpt_start = pa->start;
+ pgb.cpt_end = pa->end;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start,
+ (pa->end-pa->start)/PAGE_SIZE, ctx);
+#endif
+ ctx->write(&pgb, sizeof(pgb), ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_object, ctx);
+ return 0;
+}
+
+int dump_iterpage_block(struct vm_area_struct *vma, struct page_area *pa,
+ cpt_context_t *ctx)
+{
+ struct cpt_iterpage_block pgb;
+ loff_t saved_object;
+
+ cpt_push_object(&saved_object, ctx);
+
+ pgb.cpt_object = pa->type == PD_ITER ? CPT_OBJ_ITERPAGES :
+ CPT_OBJ_ITERYOUNGPAGES;
+ pgb.cpt_hdrlen = sizeof(pgb);
+ pgb.cpt_content = CPT_CONTENT_VOID;
+ pgb.cpt_start = pa->start;
+ pgb.cpt_end = pa->end;
+ ctx->write(&pgb, sizeof(pgb), ctx);
+
+ ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx);
+
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_object, ctx);
+ return 0;
+}
+
+
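+/* Decide whether the page described by @pd can be merged into the
+ * currently accumulated run @pa. */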
+static int can_expand(struct page_area *pa, struct page_desc *pd)
+{
+ if (pa->start == pa->end)
+ return 1;
+ if (pa->type != pd->type)
+ return 0;
+ if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) {
+ if (pa->end - pa->start >= PAGE_SIZE*16)
+ return 0;
+ pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index;
+ }
+ if (pa->type == PD_ABSENT)
+ return pd->index == pa->pgoff + 1;
+ if (pa->type == PD_CLONE)
+ return pd->mm == pa->mm;
+ return 1;
+}
+
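+/* Write a cpt_vma_image header for @vma, then walk it page by page,
+ * coalescing pages of the same type into blocks via struct page_area. */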
+static int dump_one_vma(cpt_object_t *mmobj,
+ struct vm_area_struct *vma, struct cpt_context *ctx)
+{
+ struct cpt_vma_image *v = cpt_get_buf(ctx);
+ unsigned long addr;
+ loff_t saved_object;
+ struct cpt_page_block pgb;
+ struct page_area pa;
+ int cloned_pages = 0;
+
+ cpt_push_object(&saved_object, ctx);
+
+ v->cpt_object = CPT_OBJ_VMA;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_start = vma->vm_start;
+ v->cpt_end = vma->vm_end;
+ v->cpt_flags = vma->vm_flags;
+ if (vma->vm_flags&VM_HUGETLB) {
+ eprintk_ctx("huge TLB VMAs are still not supported\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ v->cpt_pgprot = vma->vm_page_prot.pgprot;
+ v->cpt_pgoff = vma->vm_pgoff;
+ v->cpt_file = CPT_NULL;
+#ifndef CONFIG_IA64
+ if ((void *)vma->vm_start == vma->vm_mm->context.vdso &&
+ vma->vm_ops == &special_mapping_vmops)
+ v->cpt_type = CPT_VMA_VDSO;
+ else
+#endif
+ v->cpt_type = CPT_VMA_TYPE_0;
+ v->cpt_anonvma = 0;
+
+	/* We have to remember which VMAs are bound to one anon_vma.
+	 * So we store an identifier of the group of VMAs. It is handy
+	 * to use the absolute address of the anon_vma as this identifier. */
+ v->cpt_anonvmaid = (unsigned long)vma->anon_vma;
+
+ if (vma->vm_file) {
+ struct file *filp;
+ cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx);
+ if (obj == NULL) BUG();
+ filp = obj->o_obj;
+ if (filp->f_op == &shm_file_operations) {
+ struct shm_file_data *sfd = filp->private_data;
+
+ v->cpt_type = CPT_VMA_TYPE_SHM;
+ obj = lookup_cpt_object(CPT_OBJ_FILE, sfd->file, ctx);
+ }
+ v->cpt_file = obj->o_pos;
+ }
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+ if (v->cpt_type == CPT_VMA_VDSO)
+ goto out;
+
+ pa.type = PD_ABSENT;
+ pa.pgoff = vma->vm_pgoff;
+ pa.mm = CPT_NULL;
+ pa.start = vma->vm_start;
+ pa.end = vma->vm_start;
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+ struct page_desc pd;
+
+ page_get_desc(mmobj, vma, addr, &pd, ctx);
+ cloned_pages += pd.shared;
+
+ if (pd.type == PD_FUNKEY) {
+ eprintk_ctx("dump_one_vma: funkey page\n");
+ return -EINVAL;
+ }
+
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ if (pd.type == PD_LAZY &&
+ (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED)))
+ pd.type = PD_COPY;
+#else
+ if (pd.type == PD_LAZY)
+ pd.type = PD_COPY;
+#endif
+
+ if (!can_expand(&pa, &pd)) {
+ if (pa.type == PD_COPY ||
+ pa.type == PD_ZERO) {
+ pgb.cpt_start = pa.start;
+ pgb.cpt_end = pa.end;
+ dump_page_block(vma, &pgb, pa.type, ctx);
+ } else if (pa.type == PD_CLONE) {
+ dump_copypage_block(vma, &pa, ctx);
+ cloned_pages++;
+ } else if (pa.type == PD_LAZY) {
+ dump_lazypage_block(vma, &pa, ctx);
+ } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) {
+ dump_iterpage_block(vma, &pa, ctx);
+ cloned_pages++;
+ } else if (pa.type == PD_ABSENT &&
+ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
+ dump_remappage_block(vma, &pa, ctx);
+ }
+ pa.start = addr;
+ }
+ pa.type = pd.type;
+ pa.end = addr + PAGE_SIZE;
+ pa.pgoff = pd.index;
+ if (addr == pa.start)
+ pa.list[0] = pd.index;
+ pa.mm = pd.mm;
+ }
+
+ if (pa.end > pa.start) {
+ if (pa.type == PD_COPY ||
+ pa.type == PD_ZERO) {
+ pgb.cpt_start = pa.start;
+ pgb.cpt_end = pa.end;
+ dump_page_block(vma, &pgb, pa.type, ctx);
+ } else if (pa.type == PD_CLONE) {
+ dump_copypage_block(vma, &pa, ctx);
+ cloned_pages++;
+ } else if (pa.type == PD_LAZY) {
+ dump_lazypage_block(vma, &pa, ctx);
+ } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) {
+ dump_iterpage_block(vma, &pa, ctx);
+ cloned_pages++;
+ } else if (pa.type == PD_ABSENT &&
+ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
+ dump_remappage_block(vma, &pa, ctx);
+ }
+ }
+
+ if (cloned_pages) {
+ __u32 anonvma = 1;
+ loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma);
+ ctx->pwrite(&anonvma, 4, ctx, anonpos);
+ }
+
+out:
+ cpt_close_object(ctx);
+
+ cpt_pop_object(&saved_object, ctx);
+
+ return 0;
+}
+
+static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
+ cpt_context_t *ctx)
+{
+ loff_t saved_object;
+ struct cpt_aio_ctx_image aimg;
+
+ if (!list_empty(&aio_ctx->run_list) ||
+ !list_empty(&aio_ctx->active_reqs) ||
+ aio_ctx->reqs_active) {
+ eprintk_ctx("AIO is active after suspend\n");
+ return -EBUSY;
+ }
+
+ cpt_push_object(&saved_object, ctx);
+
+ aimg.cpt_next = CPT_ALIGN(sizeof(aimg));
+ aimg.cpt_object = CPT_OBJ_AIO_CONTEXT;
+ aimg.cpt_hdrlen = sizeof(aimg);
+ aimg.cpt_content = CPT_CONTENT_ARRAY;
+
+ aimg.cpt_max_reqs = aio_ctx->max_reqs;
+ aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages;
+ aimg.cpt_nr = aio_ctx->ring_info.nr;
+ aimg.cpt_tail = aio_ctx->ring_info.tail;
+ aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base;
+
+ ctx->write(&aimg, sizeof(aimg), ctx);
+
+ cpt_pop_object(&saved_object, ctx);
+ return 0;
+}
+
+static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct mm_struct *mm = obj->o_obj;
+ struct vm_area_struct *vma;
+ struct cpt_mm_image *v = cpt_get_buf(ctx);
+ struct kioctx *aio_ctx;
+ struct hlist_node *n;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_next = -1;
+ v->cpt_object = CPT_OBJ_MM;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_start_code = mm->start_code;
+ v->cpt_end_code = mm->end_code;
+ v->cpt_start_data = mm->start_data;
+ v->cpt_end_data = mm->end_data;
+ v->cpt_start_brk = mm->start_brk;
+ v->cpt_brk = mm->brk;
+ v->cpt_start_stack = mm->start_stack;
+ v->cpt_start_arg = mm->arg_start;
+ v->cpt_end_arg = mm->arg_end;
+ v->cpt_start_env = mm->env_start;
+ v->cpt_end_env = mm->env_end;
+ v->cpt_def_flags = mm->def_flags;
+#ifdef CONFIG_BEANCOUNTERS
+ v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx);
+#endif
+ /* FIXME when coredump mask exceeds 8 bits */
+ WARN_ON(mm->flags >> 8);
+ v->cpt_dumpable = mm->flags;
+ v->cpt_vps_dumpable = mm->vps_dumpable;
+ v->cpt_used_hugetlb = 0; /* not used */
+#ifndef CONFIG_IA64
+ v->cpt_vdso = (__u32)(unsigned long)mm->context.vdso;
+#endif
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+#ifdef CONFIG_X86
+ if (mm->context.size) {
+ loff_t saved_object;
+ struct cpt_obj_bits b;
+ int size;
+
+ dprintk_ctx("nontrivial LDT\n");
+
+ cpt_push_object(&saved_object, ctx);
+
+ cpt_open_object(NULL, ctx);
+ b.cpt_next = CPT_NULL;
+ b.cpt_object = CPT_OBJ_BITS;
+ b.cpt_hdrlen = sizeof(b);
+ b.cpt_content = CPT_CONTENT_MM_CONTEXT;
+ b.cpt_size = mm->context.size*LDT_ENTRY_SIZE;
+
+ ctx->write(&b, sizeof(b), ctx);
+
+ size = mm->context.size*LDT_ENTRY_SIZE;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_XEN) || \
+ LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19)
+ ctx->write(mm->context.ldt, size, ctx);
+#else
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ int nr = i / PAGE_SIZE, bytes;
+ char *kaddr = kmap(mm->context.ldt_pages[nr]);
+
+ bytes = size - i;
+ if (bytes > PAGE_SIZE)
+ bytes = PAGE_SIZE;
+ ctx->write(kaddr, bytes, ctx);
+ kunmap(mm->context.ldt_pages[nr]);
+ }
+#endif
+
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_object, ctx);
+ }
+#endif
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ int err;
+
+ if ((err = dump_one_vma(obj, vma, ctx)) != 0)
+ return err;
+ }
+
+ hlist_for_each_entry(aio_ctx, n, &mm->ioctx_list, list) {
+ int err;
+
+ if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
+ return err;
+ }
+
+ cpt_close_object(ctx);
+
+ return 0;
+}
+
+int cpt_dump_vm(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ scnt = scnt0 = zcnt = 0;
+
+ cpt_open_section(ctx, CPT_SECT_MM);
+
+ for_each_object(obj, CPT_OBJ_MM) {
+ int err;
+
+ if ((err = dump_one_mm(obj, ctx)) != 0)
+ return err;
+ }
+
+ cpt_close_section(ctx);
+
+ if (scnt)
+ dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt);
+ if (scnt0)
+ dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0);
+ if (zcnt)
+ dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt);
+ return 0;
+}
diff --git a/kernel/cpt/cpt_mm.h b/kernel/cpt/cpt_mm.h
new file mode 100644
index 0000000..dc2c483
--- /dev/null
+++ b/kernel/cpt/cpt_mm.h
@@ -0,0 +1,35 @@
+int cpt_collect_mm(cpt_context_t *);
+
+int cpt_dump_vm(struct cpt_context *ctx);
+
+__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx);
+int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
+
+int cpt_mm_prepare(unsigned long veid);
+
+int cpt_free_pgin_dir(struct cpt_context *);
+int cpt_start_pagein(struct cpt_context *);
+int rst_setup_pagein(struct cpt_context *);
+int rst_complete_pagein(struct cpt_context *, int);
+int rst_pageind(struct cpt_context *);
+int cpt_iteration(cpt_context_t *ctx);
+int rst_iteration(cpt_context_t *ctx);
+void rst_drop_iter_dir(cpt_context_t *ctx);
+int rst_iter(struct vm_area_struct *vma, u64 pfn,
+ unsigned long addr, cpt_context_t * ctx);
+
+int rst_swapoff(struct cpt_context *);
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack,
+ unsigned long map_address);
+#endif
+
+#ifdef CONFIG_X86
+extern struct page *vdso32_pages[1];
+#define vsyscall_addr page_address(vdso32_pages[0])
+#endif
+
+extern struct vm_operations_struct special_mapping_vmops;
diff --git a/kernel/cpt/cpt_net.c b/kernel/cpt/cpt_net.c
new file mode 100644
index 0000000..473a294
--- /dev/null
+++ b/kernel/cpt/cpt_net.c
@@ -0,0 +1,652 @@
+/*
+ *
+ * kernel/cpt/cpt_net.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/rtnetlink.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+#include <linux/cpt_image.h>
+#include <linux/if_tun.h>
+#include <linux/veth.h>
+#include <linux/fdtable.h>
+
+#include <linux/cpt_export.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_kernel.h"
+#include "cpt_syscalls.h"
+
+static void cpt_dump_netstats(struct net_device *dev, struct cpt_context * ctx)
+{
+ struct cpt_netstats_image *n;
+ struct net_device_stats *stats;
+
+ if (!dev->netdev_ops->ndo_get_stats)
+ return;
+
+ n = cpt_get_buf(ctx);
+ stats = dev->netdev_ops->ndo_get_stats(dev);
+ cpt_open_object(NULL, ctx);
+
+ n->cpt_next = CPT_NULL;
+ n->cpt_object = CPT_OBJ_NET_STATS;
+ n->cpt_hdrlen = sizeof(*n);
+ n->cpt_content = CPT_CONTENT_VOID;
+
+ n->cpt_rx_packets = stats->rx_packets;
+ n->cpt_tx_packets = stats->tx_packets;
+ n->cpt_rx_bytes = stats->rx_bytes;
+ n->cpt_tx_bytes = stats->tx_bytes;
+ n->cpt_rx_errors = stats->rx_errors;
+ n->cpt_tx_errors = stats->tx_errors;
+ n->cpt_rx_dropped = stats->rx_dropped;
+ n->cpt_tx_dropped = stats->tx_dropped;
+ n->cpt_multicast = stats->multicast;
+ n->cpt_collisions = stats->collisions;
+ n->cpt_rx_length_errors = stats->rx_length_errors;
+ n->cpt_rx_over_errors = stats->rx_over_errors;
+ n->cpt_rx_crc_errors = stats->rx_crc_errors;
+ n->cpt_rx_frame_errors = stats->rx_frame_errors;
+ n->cpt_rx_fifo_errors = stats->rx_fifo_errors;
+ n->cpt_rx_missed_errors = stats->rx_missed_errors;
+ n->cpt_tx_aborted_errors = stats->tx_aborted_errors;
+ n->cpt_tx_carrier_errors = stats->tx_carrier_errors;
+ n->cpt_tx_fifo_errors = stats->tx_fifo_errors;
+ n->cpt_tx_heartbeat_errors = stats->tx_heartbeat_errors;
+ n->cpt_tx_window_errors = stats->tx_window_errors;
+ n->cpt_rx_compressed = stats->rx_compressed;
+ n->cpt_tx_compressed = stats->tx_compressed;
+
+ ctx->write(n, sizeof(*n), ctx);
+ cpt_close_object(ctx);
+ cpt_release_buf(ctx);
+ return;
+}
+
+int cpt_dump_link(struct cpt_context * ctx)
+{
+ struct net *net = get_exec_env()->ve_netns;
+ struct net_device *dev;
+
+ cpt_open_section(ctx, CPT_SECT_NET_DEVICE);
+ for_each_netdev(net, dev) {
+ struct cpt_netdev_image v;
+ struct cpt_hwaddr_image hw;
+ loff_t saved_obj;
+
+ if (dev->netdev_ops->ndo_cpt == NULL) {
+ eprintk_ctx("unsupported netdev %s\n", dev->name);
+ cpt_close_section(ctx);
+ return -EBUSY;
+ }
+
+ cpt_open_object(NULL, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_DEVICE;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_ARRAY;
+
+ v.cpt_index = dev->ifindex;
+ v.cpt_flags = dev->flags;
+ memcpy(v.cpt_name, dev->name, IFNAMSIZ);
+ ctx->write(&v, sizeof(v), ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+
+ cpt_open_object(NULL, ctx);
+ dev->netdev_ops->ndo_cpt(dev, &cpt_ops, ctx);
+
+ /* Dump hardware address */
+ cpt_open_object(NULL, ctx);
+ hw.cpt_next = CPT_NULL;
+ hw.cpt_object = CPT_OBJ_NET_HWADDR;
+ hw.cpt_hdrlen = sizeof(hw);
+ hw.cpt_content = CPT_CONTENT_VOID;
+
+ if (dev->dev_addrs.count != 1) {
+ eprintk_ctx("multiple hwaddrs on %s\n", dev->name);
+ return -EINVAL;
+ }
+
+ BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) != MAX_ADDR_LEN);
+ memcpy(hw.cpt_dev_addr, dev->dev_addr, sizeof(hw.cpt_dev_addr));
+ ctx->write(&hw, sizeof(hw), ctx);
+ cpt_close_object(ctx);
+
+ cpt_dump_netstats(dev, ctx);
+
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+ }
+ cpt_close_section(ctx);
+ return 0;
+}
+
+int cpt_suspend_network(struct cpt_context *ctx)
+{
+ get_exec_env()->disable_net = 1;
+ synchronize_net();
+ return 0;
+}
+
+int cpt_resume_network(struct cpt_context *ctx)
+{
+ struct ve_struct *env;
+ env = get_ve_by_id(ctx->ve_id);
+ if (!env)
+ return -ESRCH;
+ env->disable_net = 0;
+ put_ve(env);
+ return 0;
+}
+
+int cpt_dump_ifaddr(struct cpt_context * ctx)
+{
+ struct net *net = get_exec_env()->ve_netns;
+ struct net_device *dev;
+
+ cpt_open_section(ctx, CPT_SECT_NET_IFADDR);
+ for_each_netdev(net, dev) {
+ struct in_device *idev = in_dev_get(dev);
+ struct in_ifaddr *ifa;
+
+ if (!idev)
+ continue;
+
+ for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ struct cpt_ifaddr_image v;
+ cpt_open_object(NULL, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_IFADDR;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ v.cpt_index = dev->ifindex;
+ v.cpt_family = AF_INET;
+ v.cpt_masklen = ifa->ifa_prefixlen;
+ v.cpt_flags = ifa->ifa_flags;
+ v.cpt_scope = ifa->ifa_scope;
+ memset(&v.cpt_address, 0, sizeof(v.cpt_address));
+ memset(&v.cpt_peer, 0, sizeof(v.cpt_peer));
+ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
+ v.cpt_address[0] = ifa->ifa_local;
+ v.cpt_peer[0] = ifa->ifa_address;
+ v.cpt_broadcast[0] = ifa->ifa_broadcast;
+ memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ);
+ ctx->write(&v, sizeof(v), ctx);
+ cpt_close_object(ctx);
+ }
+ in_dev_put(idev);
+ }
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ for_each_netdev(net, dev) {
+ struct inet6_dev *idev = in6_dev_get(dev);
+ struct inet6_ifaddr *ifa;
+
+ if (!idev)
+ continue;
+
+ for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) {
+ struct cpt_ifaddr_image v;
+
+ if (dev == net->loopback_dev &&
+ ifa->prefix_len == 128 &&
+ ifa->addr.s6_addr32[0] == 0 &&
+ ifa->addr.s6_addr32[1] == 0 &&
+ ifa->addr.s6_addr32[2] == 0 &&
+ ifa->addr.s6_addr32[3] == htonl(1))
+ continue;
+
+ cpt_open_object(NULL, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_IFADDR;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ v.cpt_index = dev->ifindex;
+ v.cpt_family = AF_INET6;
+ v.cpt_masklen = ifa->prefix_len;
+ v.cpt_flags = ifa->flags;
+ v.cpt_scope = ifa->scope;
+ v.cpt_valid_lft = ifa->valid_lft;
+ v.cpt_prefered_lft = ifa->prefered_lft;
+ memcpy(&v.cpt_address, &ifa->addr, 16);
+ memcpy(&v.cpt_peer, &ifa->addr, 16);
+ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
+ memcpy(v.cpt_label, dev->name, IFNAMSIZ);
+ ctx->write(&v, sizeof(v), ctx);
+ cpt_close_object(ctx);
+ }
+ in6_dev_put(idev);
+ }
+#endif
+ cpt_close_section(ctx);
+ return 0;
+}
+
+#ifdef CONFIG_IP_FIB_TRIE
+#error "Trie fib rules are known not to be restored proprly yet"
+#endif
+
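+/* Dump the routing tables: send RTM_GETROUTE dump requests over an
+ * in-kernel netlink socket and copy the replies into the image. */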
+static int cpt_dump_route(struct cpt_context * ctx)
+{
+ int err;
+ struct socket *sock;
+ struct msghdr msg;
+ struct iovec iov;
+ struct {
+ struct nlmsghdr nlh;
+ struct rtgenmsg g;
+ } req;
+ struct sockaddr_nl nladdr;
+ struct cpt_object_hdr v;
+ mm_segment_t oldfs;
+ char *pg;
+
+ err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
+ if (err)
+ return err;
+
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+
+ req.nlh.nlmsg_len = sizeof(req);
+ req.nlh.nlmsg_type = RTM_GETROUTE;
+ req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
+ req.nlh.nlmsg_pid = 0;
+ req.g.rtgen_family = AF_INET;
+
+ iov.iov_base=&req;
+ iov.iov_len=sizeof(req);
+ msg.msg_name=&nladdr;
+ msg.msg_namelen=sizeof(nladdr);
+ msg.msg_iov=&iov;
+ msg.msg_iovlen=1;
+ msg.msg_control=NULL;
+ msg.msg_controllen=0;
+ msg.msg_flags=MSG_DONTWAIT;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = sock_sendmsg(sock, &msg, sizeof(req));
+ set_fs(oldfs);
+
+ if (err < 0)
+ goto out_sock;
+
+ pg = (char*)__get_free_page(GFP_KERNEL);
+ if (pg == NULL) {
+ err = -ENOMEM;
+ goto out_sock;
+ }
+
+ cpt_open_section(ctx, CPT_SECT_NET_ROUTE);
+ cpt_open_object(NULL, ctx);
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_ROUTE;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_NLMARRAY;
+
+ ctx->write(&v, sizeof(v), ctx);
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+restart:
+#endif
+ for (;;) {
+ struct nlmsghdr *h;
+
+ iov.iov_base = pg;
+ iov.iov_len = PAGE_SIZE;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+ set_fs(oldfs);
+
+ if (err < 0)
+ goto out_sock_pg;
+ if (msg.msg_flags & MSG_TRUNC) {
+ err = -ENOBUFS;
+ goto out_sock_pg;
+ }
+
+ h = (struct nlmsghdr*)pg;
+ while (NLMSG_OK(h, err)) {
+ if (h->nlmsg_type == NLMSG_DONE) {
+ err = 0;
+ goto done;
+ }
+ if (h->nlmsg_type == NLMSG_ERROR) {
+ struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h);
+ err = errm->error;
+ eprintk_ctx("NLMSG error: %d\n", errm->error);
+ goto done;
+ }
+ if (h->nlmsg_type != RTM_NEWROUTE) {
+ eprintk_ctx("NLMSG: %d\n", h->nlmsg_type);
+ err = -EINVAL;
+ goto done;
+ }
+ ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx);
+ h = NLMSG_NEXT(h, err);
+ }
+ if (err) {
+ eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type);
+ err = -EINVAL;
+ break;
+ }
+ }
+done:
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (!err && req.g.rtgen_family == AF_INET) {
+ req.g.rtgen_family = AF_INET6;
+ iov.iov_base=&req;
+ iov.iov_len=sizeof(req);
+ msg.msg_name=&nladdr;
+ msg.msg_namelen=sizeof(nladdr);
+ msg.msg_iov=&iov;
+ msg.msg_iovlen=1;
+ msg.msg_control=NULL;
+ msg.msg_controllen=0;
+ msg.msg_flags=MSG_DONTWAIT;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = sock_sendmsg(sock, &msg, sizeof(req));
+ set_fs(oldfs);
+
+ if (err > 0)
+ goto restart;
+ }
+#endif
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_close_section(ctx);
+
+out_sock_pg:
+ free_page((unsigned long)pg);
+out_sock:
+ sock_release(sock);
+ return err;
+}
+
+struct args_t
+{
+ int* pfd;
+ envid_t veid;
+};
+
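+/* Helper thread: enter the VE, redirect stdout into the pipe, close all
+ * other descriptors and exec iptables-save; the caller reads its output. */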
+static int dumpfn(void *arg)
+{
+ int i;
+ struct args_t *args = arg;
+ int *pfd = args->pfd;
+ char *argv[] = { "iptables-save", "-c", NULL };
+
+ i = real_env_create(args->veid, VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
+ if (i < 0) {
+ eprintk("cannot enter ve to dump iptables\n");
+ module_put(THIS_MODULE);
+ return 255 << 8;
+ }
+
+ if (pfd[1] != 1)
+ sc_dup2(pfd[1], 1);
+
+ for (i=0; i<current->files->fdt->max_fds; i++) {
+ if (i != 1)
+ sc_close(i);
+ }
+
+ module_put(THIS_MODULE);
+
+ set_fs(KERNEL_DS);
+ i = sc_execve("/sbin/iptables-save", argv, NULL);
+ if (i == -ENOENT)
+ i = sc_execve("/usr/sbin/iptables-save", argv, NULL);
+ eprintk("failed to exec iptables-save: %d\n", i);
+ return 255 << 8;
+}
+
+
+static int cpt_dump_iptables(struct cpt_context * ctx)
+{
+ int err = 0;
+#ifdef CONFIG_VE_IPTABLES
+ int pid;
+ int pfd[2];
+ struct file *f;
+ struct cpt_object_hdr v;
+ char buf[16];
+ loff_t pos;
+ int n;
+ int status;
+ mm_segment_t oldfs;
+ sigset_t ignore, blocked;
+ struct args_t args;
+ struct ve_struct *oldenv;
+
+ if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD))
+ return 0;
+
+ err = sc_pipe(pfd);
+ if (err < 0) {
+ eprintk_ctx("sc_pipe: %d\n", err);
+ return err;
+ }
+ args.pfd = pfd;
+ args.veid = VEID(get_exec_env());
+ ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+ sigprocmask(SIG_BLOCK, &ignore, &blocked);
+ oldenv = set_exec_env(get_ve0());
+ err = pid = local_kernel_thread(dumpfn, (void*)&args,
+ SIGCHLD | CLONE_VFORK, 0);
+ set_exec_env(oldenv);
+ if (err < 0) {
+ eprintk_ctx("local_kernel_thread: %d\n", err);
+ goto out;
+ }
+
+ f = fget(pfd[0]);
+ sc_close(pfd[1]);
+ sc_close(pfd[0]);
+
+ cpt_open_section(ctx, CPT_SECT_NET_IPTABLES);
+
+ cpt_open_object(NULL, ctx);
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NAME;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_NAME;
+
+ ctx->write(&v, sizeof(v), ctx);
+
+ pos = ctx->file->f_pos;
+ do {
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
+ set_fs(oldfs);
+ if (n > 0)
+ ctx->write(buf, n, ctx);
+ } while (n > 0);
+
+ if (n < 0)
+ eprintk_ctx("read: %d\n", n);
+
+ fput(f);
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if ((err = sc_waitx(pid, 0, &status)) < 0)
+ eprintk_ctx("wait4: %d\n", err);
+ else if ((status & 0x7f) == 0) {
+ err = (status & 0xff00) >> 8;
+ if (err != 0) {
+ eprintk_ctx("iptables-save exited with %d\n", err);
+ err = -EINVAL;
+ }
+ } else {
+ eprintk_ctx("iptables-save terminated\n");
+ err = -EINVAL;
+ }
+ set_fs(oldfs);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+ if (ctx->file->f_pos != pos) {
+ buf[0] = 0;
+ ctx->write(buf, 1, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_close_section(ctx);
+ } else {
+ pos = ctx->current_section;
+ cpt_close_object(ctx);
+ cpt_close_section(ctx);
+ ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL;
+ ctx->file->f_pos = pos;
+ }
+ return n ? : err;
+
+out:
+ if (pfd[1] >= 0)
+ sc_close(pfd[1]);
+ if (pfd[0] >= 0)
+ sc_close(pfd[0]);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+#endif
+ return err;
+}
+
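+/* Sum one per-cpu SNMP counter (at offset @offt) over all possible CPUs
+ * and over both halves of the mib[] array. */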
+static unsigned long fold_field(void *mib[], int offt)
+{
+ unsigned long res = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
+ res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
+ }
+ return res;
+}
+
+static void cpt_dump_snmp_stat(struct cpt_context *ctx, void *mib[], int n)
+{
+ int i;
+ struct cpt_object_hdr o;
+ __u32 *stats;
+
+ stats = cpt_get_buf(ctx);
+
+ cpt_open_object(NULL, ctx);
+
+ for (i = 0; i < n; i++)
+ stats[i] = fold_field(mib, i);
+
+ o.cpt_next = CPT_NULL;
+ o.cpt_object = CPT_OBJ_BITS;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_DATA;
+
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->write(stats, n * sizeof(*stats), ctx);
+ ctx->align(ctx);
+
+ cpt_close_object(ctx);
+
+ cpt_release_buf(ctx);
+}
+
+static void cpt_dump_snmp_stub(struct cpt_context *ctx)
+{
+ struct cpt_object_hdr o;
+
+ cpt_open_object(NULL, ctx);
+ o.cpt_next = CPT_NULL;
+ o.cpt_object = CPT_OBJ_BITS;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_VOID;
+ ctx->write(&o, sizeof(o), ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+}
+
+static int cpt_dump_snmp(struct cpt_context *ctx)
+{
+ struct ve_struct *ve;
+ struct net *net;
+
+ ve = get_exec_env();
+ net = ve->ve_netns;
+
+ cpt_open_section(ctx, CPT_SECT_SNMP_STATS);
+
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.net_statistics,
+ LINUX_MIB_MAX);
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.ip_statistics,
+ IPSTATS_MIB_MAX);
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.tcp_statistics,
+ TCP_MIB_MAX);
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.udp_statistics,
+ UDP_MIB_MAX);
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmp_statistics,
+ ICMP_MIB_MAX);
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmpmsg_statistics,
+ ICMPMSG_MIB_MAX);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.ipv6_statistics,
+ IPSTATS_MIB_MAX);
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.udp_stats_in6,
+ UDP_MIB_MAX);
+ cpt_dump_snmp_stat(ctx, (void **)&net->mib.icmpv6_statistics,
+ ICMP6_MIB_MAX);
+#else
+ cpt_dump_snmp_stub(ctx);
+ cpt_dump_snmp_stub(ctx);
+ cpt_dump_snmp_stub(ctx);
+#endif
+ cpt_close_section(ctx);
+
+ return 0;
+}
+
+int cpt_dump_ifinfo(struct cpt_context * ctx)
+{
+ int err;
+
+ rtnl_lock();
+ err = cpt_dump_link(ctx);
+ if (!err)
+ err = cpt_dump_ifaddr(ctx);
+ rtnl_unlock();
+ if (!err)
+ err = cpt_dump_route(ctx);
+ if (!err)
+ err = cpt_dump_iptables(ctx);
+ if (!err)
+ err = cpt_dump_snmp(ctx);
+ return err;
+}
diff --git a/kernel/cpt/cpt_net.h b/kernel/cpt/cpt_net.h
new file mode 100644
index 0000000..5d33877
--- /dev/null
+++ b/kernel/cpt/cpt_net.h
@@ -0,0 +1,7 @@
+int cpt_dump_ifinfo(struct cpt_context *ctx);
+int rst_restore_net(struct cpt_context *ctx);
+int cpt_suspend_network(struct cpt_context *ctx);
+int cpt_resume_network(struct cpt_context *ctx);
+int rst_resume_network(struct cpt_context *ctx);
+int cpt_dump_ip_conntrack(struct cpt_context *ctx);
+int rst_restore_ip_conntrack(struct cpt_context * ctx);
diff --git a/kernel/cpt/cpt_obj.c b/kernel/cpt/cpt_obj.c
new file mode 100644
index 0000000..341d2ab
--- /dev/null
+++ b/kernel/cpt/cpt_obj.c
@@ -0,0 +1,163 @@
+/*
+ *
+ * kernel/cpt/cpt_obj.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ obj = kmalloc(sizeof(cpt_object_t), gfp);
+ if (obj) {
+ INIT_LIST_HEAD(&obj->o_list);
+ INIT_LIST_HEAD(&obj->o_hash);
+ INIT_LIST_HEAD(&obj->o_alist);
+ obj->o_count = 1;
+ obj->o_pos = CPT_NULL;
+ obj->o_lock = 0;
+ obj->o_parent = NULL;
+ obj->o_index = CPT_NOINDEX;
+ obj->o_obj = NULL;
+ obj->o_image = NULL;
+ obj->o_flags = 0;
+ ctx->objcount++;
+ }
+ return obj;
+}
+
+void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
+{
+ list_del(&obj->o_alist);
+ kfree(obj);
+ ctx->objcount--;
+}
+
+void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
+{
+ list_add_tail(&obj->o_list, &ctx->object_array[type]);
+}
+
+void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
+ cpt_object_t *head, cpt_context_t *ctx)
+{
+ list_add(&obj->o_list, &head->o_list);
+}
+
+cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
+ unsigned gfp_mask, cpt_context_t *ctx)
+{
+ cpt_object_t *obj;
+
+ obj = lookup_cpt_object(type, p, ctx);
+
+ if (obj) {
+ obj->o_count++;
+ return obj;
+ }
+
+ if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
+ if (p)
+ cpt_obj_setobj(obj, p, ctx);
+ intern_cpt_object(type, obj, ctx);
+ return obj;
+ }
+ return NULL;
+}
+
+cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+ return __cpt_object_add(type, p, GFP_KERNEL, ctx);
+}
+
+cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
+{
+ cpt_object_t *obj;
+
+ obj = lookup_cpt_object(type, p, ctx);
+
+ if (obj)
+ obj->o_count++;
+
+ return obj;
+}
+
+int cpt_object_init(cpt_context_t *ctx)
+{
+ int i;
+
+ for (i=0; i<CPT_OBJ_MAX; i++) {
+ INIT_LIST_HEAD(&ctx->object_array[i]);
+ }
+ return 0;
+}
+
+int cpt_object_destroy(cpt_context_t *ctx)
+{
+ int i;
+
+ for (i=0; i<CPT_OBJ_MAX; i++) {
+ while (!list_empty(&ctx->object_array[i])) {
+ struct list_head *head = ctx->object_array[i].next;
+ cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
+ list_del(head);
+ if (obj->o_image)
+ kfree(obj->o_image);
+ free_cpt_object(obj, ctx);
+ }
+ }
+ if (ctx->objcount != 0)
+ eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount);
+ return 0;
+}
+
+cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, type) {
+ if (obj->o_obj == p)
+ return obj;
+ }
+ return NULL;
+}
+
+cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, type) {
+ if (obj->o_pos == pos)
+ return obj;
+ }
+ return NULL;
+}
+
+cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, type) {
+ if (obj->o_index == index)
+ return obj;
+ }
+ return NULL;
+}
diff --git a/kernel/cpt/cpt_obj.h b/kernel/cpt/cpt_obj.h
new file mode 100644
index 0000000..2dca39b
--- /dev/null
+++ b/kernel/cpt/cpt_obj.h
@@ -0,0 +1,64 @@
+#ifndef __CPT_OBJ_H_
+#define __CPT_OBJ_H_ 1
+
+#include <linux/list.h>
+#include <linux/cpt_image.h>
+
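+/* Generic handle tying a kernel object (o_obj) to its position in the
+ * dump image (o_pos) and its index; kept in per-type lists in the context. */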
+typedef struct _cpt_object
+{
+ struct list_head o_list;
+ struct list_head o_hash;
+ int o_count;
+ int o_index;
+ int o_lock;
+ loff_t o_pos;
+ loff_t o_ppos;
+ void *o_obj;
+ void *o_image;
+ void *o_parent;
+ struct list_head o_alist;
+ unsigned int o_flags;
+#define CPT_INODE_HARDLINKED 0x1
+} cpt_object_t;
+
+struct cpt_context;
+
+#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list)
+
+
+extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx);
+extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx);
+
+cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
+cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx);
+cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx);
+
+static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx)
+{
+ cpt->o_pos = pos;
+ /* Add to pos hash table */
+}
+
+static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx)
+{
+ cpt->o_obj = ptr;
+ /* Add to hash table */
+}
+
+static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx)
+{
+ cpt->o_index = index;
+ /* Add to index hash table */
+}
+
+
+extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx);
+extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx);
+extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
+extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx);
+extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
+
+extern int cpt_object_init(struct cpt_context *ctx);
+extern int cpt_object_destroy(struct cpt_context *ctx);
+
+#endif /* __CPT_OBJ_H_ */
diff --git a/kernel/cpt/cpt_proc.c b/kernel/cpt/cpt_proc.c
new file mode 100644
index 0000000..a7d2d82
--- /dev/null
+++ b/kernel/cpt/cpt_proc.c
@@ -0,0 +1,623 @@
+/*
+ *
+ * kernel/cpt/cpt_proc.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_ioctl.h>
+#include <linux/delay.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_dump.h"
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+
+MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>");
+MODULE_LICENSE("GPL");
+
+/* List of contexts and lock protecting the list */
+static struct list_head cpt_context_list;
+static spinlock_t cpt_context_lock;
+
+static int proc_read(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ off_t pos = 0;
+ off_t begin = 0;
+ int len = 0;
+ cpt_context_t *ctx;
+
+ len += sprintf(buffer, "Ctx Id VE State\n");
+
+ spin_lock(&cpt_context_lock);
+
+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+ len += sprintf(buffer+len,"%p %08x %-8u %d",
+ ctx,
+ ctx->contextid,
+ ctx->ve_id,
+ ctx->ctx_state
+ );
+
+ buffer[len++] = '\n';
+
+ pos = begin+len;
+ if (pos < offset) {
+ len = 0;
+ begin = pos;
+ }
+ if (pos > offset+length)
+ goto done;
+ }
+ *eof = 1;
+
+done:
+ spin_unlock(&cpt_context_lock);
+ *start = buffer + (offset - begin);
+ len -= (offset - begin);
+ if(len > length)
+ len = length;
+ if(len < 0)
+ len = 0;
+ return len;
+}
+
+void cpt_context_release(cpt_context_t *ctx)
+{
+ int i;
+
+ list_del(&ctx->ctx_list);
+ spin_unlock(&cpt_context_lock);
+
+ if (ctx->ctx_state > 0)
+ cpt_resume(ctx);
+ ctx->ctx_state = CPT_CTX_ERROR;
+
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ if (ctx->pgin_task)
+ put_task_struct(ctx->pgin_task);
+ if (ctx->pgin_dir)
+ cpt_free_pgin_dir(ctx);
+ if (ctx->pagein_file_out)
+ fput(ctx->pagein_file_out);
+ if (ctx->pagein_file_in)
+ fput(ctx->pagein_file_in);
+#endif
+ if (ctx->objcount)
+ eprintk_ctx("%d objects leaked\n", ctx->objcount);
+ if (ctx->file)
+ fput(ctx->file);
+ cpt_flush_error(ctx);
+ if (ctx->errorfile) {
+ fput(ctx->errorfile);
+ ctx->errorfile = NULL;
+ }
+ for (i = 0; i < ctx->linkdirs_num; i++)
+ fput(ctx->linkdirs[i]);
+ if (ctx->error_msg) {
+ free_page((unsigned long)ctx->error_msg);
+ ctx->error_msg = NULL;
+ }
+ if (ctx->statusfile)
+ fput(ctx->statusfile);
+ if (ctx->lockfile)
+ fput(ctx->lockfile);
+ kfree(ctx);
+
+ spin_lock(&cpt_context_lock);
+}
+
+static void __cpt_context_put(cpt_context_t *ctx)
+{
+ if (!--ctx->refcount)
+ cpt_context_release(ctx);
+}
+
+static void cpt_context_put(cpt_context_t *ctx)
+{
+ spin_lock(&cpt_context_lock);
+ __cpt_context_put(ctx);
+ spin_unlock(&cpt_context_lock);
+}
+
+cpt_context_t * cpt_context_open(void)
+{
+ cpt_context_t *ctx;
+
+ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
+ cpt_context_init(ctx);
+ spin_lock(&cpt_context_lock);
+ list_add_tail(&ctx->ctx_list, &cpt_context_list);
+ spin_unlock(&cpt_context_lock);
+ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
+ if (ctx->error_msg != NULL)
+ ctx->error_msg[0] = 0;
+ }
+ return ctx;
+}
+
+static cpt_context_t * cpt_context_lookup(unsigned int contextid)
+{
+ cpt_context_t *ctx;
+
+ spin_lock(&cpt_context_lock);
+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+ if (ctx->contextid == contextid) {
+ ctx->refcount++;
+ spin_unlock(&cpt_context_lock);
+ return ctx;
+ }
+ }
+ spin_unlock(&cpt_context_lock);
+ return NULL;
+}
+
+int cpt_context_lookup_veid(unsigned int veid)
+{
+ cpt_context_t *ctx;
+
+ spin_lock(&cpt_context_lock);
+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+ if (ctx->ve_id == veid && ctx->ctx_state > 0) {
+ spin_unlock(&cpt_context_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&cpt_context_lock);
+ return 0;
+}
+
+static int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
+{
+ int err = 0;
+ cpt_context_t *ctx;
+ struct file *dfile = NULL;
+ int try;
+
+ unlock_kernel();
+
+ if (cmd == CPT_VMPREP) {
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ err = cpt_mm_prepare(arg);
+#else
+ err = -EINVAL;
+#endif
+ goto out_lock;
+ }
+
+ if (cmd == CPT_TEST_CAPS) {
+ unsigned int src_flags, dst_flags = arg;
+
+ err = 0;
+ src_flags = test_cpu_caps_and_features();
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err);
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err);
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err);
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err);
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err);
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err);
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err);
+ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err);
+ goto out_lock;
+ }
+
+ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
+ cpt_context_t *old_ctx;
+
+ ctx = NULL;
+ if (cmd == CPT_JOIN_CONTEXT) {
+ err = -ENOENT;
+ ctx = cpt_context_lookup(arg);
+ if (!ctx)
+ goto out_lock;
+ }
+
+ spin_lock(&cpt_context_lock);
+ old_ctx = (cpt_context_t*)file->private_data;
+ file->private_data = ctx;
+
+ if (old_ctx) {
+ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
+ old_ctx->sticky = 0;
+ old_ctx->refcount--;
+ }
+ __cpt_context_put(old_ctx);
+ }
+ spin_unlock(&cpt_context_lock);
+ err = 0;
+ goto out_lock;
+ }
+
+ spin_lock(&cpt_context_lock);
+ ctx = (cpt_context_t*)file->private_data;
+ if (ctx)
+ ctx->refcount++;
+ spin_unlock(&cpt_context_lock);
+
+ if (!ctx) {
+ cpt_context_t *old_ctx;
+
+ err = -ENOMEM;
+ ctx = cpt_context_open();
+ if (!ctx)
+ goto out_lock;
+
+ spin_lock(&cpt_context_lock);
+ old_ctx = (cpt_context_t*)file->private_data;
+ if (!old_ctx) {
+ ctx->refcount++;
+ file->private_data = ctx;
+ } else {
+ old_ctx->refcount++;
+ }
+ if (old_ctx) {
+ __cpt_context_put(ctx);
+ ctx = old_ctx;
+ }
+ spin_unlock(&cpt_context_lock);
+ }
+
+ if (cmd == CPT_GET_CONTEXT) {
+ unsigned int contextid = (unsigned int)arg;
+
+ if (ctx->contextid && ctx->contextid != contextid) {
+ err = -EINVAL;
+ goto out_nosem;
+ }
+ if (!ctx->contextid) {
+ cpt_context_t *c1 = cpt_context_lookup(contextid);
+ if (c1) {
+ cpt_context_put(c1);
+ err = -EEXIST;
+ goto out_nosem;
+ }
+ ctx->contextid = contextid;
+ }
+ spin_lock(&cpt_context_lock);
+ if (!ctx->sticky) {
+ ctx->sticky = 1;
+ ctx->refcount++;
+ }
+ spin_unlock(&cpt_context_lock);
+ goto out_nosem;
+ }
+
+ down(&ctx->main_sem);
+
+ err = -EBUSY;
+ if (ctx->ctx_state < 0)
+ goto out;
+
+ err = 0;
+ switch (cmd) {
+ case CPT_SET_DUMPFD:
+ if (ctx->ctx_state == CPT_CTX_DUMPING) {
+ err = -EBUSY;
+ break;
+ }
+ if (arg >= 0) {
+ err = -EBADF;
+ dfile = fget(arg);
+ if (dfile == NULL)
+ break;
+ if (dfile->f_op == NULL ||
+ dfile->f_op->write == NULL) {
+ fput(dfile);
+ break;
+ }
+ err = 0;
+ }
+ if (ctx->file)
+ fput(ctx->file);
+ ctx->file = dfile;
+ break;
+ case CPT_LINKDIR_ADD:
+ if (ctx->linkdirs_num >= CPT_MAX_LINKDIRS) {
+ err = -EMLINK;
+ break;
+ }
+
+ dfile = fget(arg);
+ if (!dfile) {
+ err = -EBADFD;
+ break;
+ }
+
+ if (!S_ISDIR(dfile->f_dentry->d_inode->i_mode)) {
+ err = -ENOTDIR;
+ fput(dfile);
+ break;
+ }
+
+ ctx->linkdirs[ctx->linkdirs_num++] = dfile;
+ break;
+ case CPT_SET_ERRORFD:
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->errorfile)
+ fput(ctx->errorfile);
+ ctx->errorfile = dfile;
+ break;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ case CPT_SET_PAGEINFDIN:
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->pagein_file_in)
+ fput(ctx->pagein_file_in);
+ ctx->pagein_file_in = dfile;
+ break;
+ case CPT_SET_PAGEINFDOUT:
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->pagein_file_out)
+ fput(ctx->pagein_file_out);
+ ctx->pagein_file_out = dfile;
+ break;
+ case CPT_SET_LAZY:
+ ctx->lazy_vm = arg;
+ break;
+ case CPT_ITER:
+ err = cpt_iteration(ctx);
+ break;
+ case CPT_PAGEIND:
+ err = cpt_start_pagein(ctx);
+ break;
+#endif
+ case CPT_SET_VEID:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ ctx->ve_id = arg;
+ break;
+ case CPT_SET_CPU_FLAGS:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ ctx->dst_cpu_flags = arg;
+ ctx->src_cpu_flags = test_cpu_caps_and_features();
+ break;
+ case CPT_SUSPEND:
+ if (cpt_context_lookup_veid(ctx->ve_id) ||
+ ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ ctx->ctx_state = CPT_CTX_SUSPENDING;
+ try = 0;
+ do {
+ err = cpt_vps_suspend(ctx);
+ if (err)
+ cpt_resume(ctx);
+ if (err == -EAGAIN)
+ msleep(1000);
+ try++;
+ } while (err == -EAGAIN && try < 3);
+ if (err) {
+ ctx->ctx_state = CPT_CTX_IDLE;
+ } else {
+ ctx->ctx_state = CPT_CTX_SUSPENDED;
+ }
+ break;
+ case CPT_DUMP:
+ if (!ctx->ctx_state) {
+ err = -ENOENT;
+ break;
+ }
+ if (!ctx->file) {
+ err = -EBADF;
+ break;
+ }
+ err = cpt_dump(ctx);
+ break;
+ case CPT_RESUME:
+ if (ctx->ctx_state == CPT_CTX_IDLE) {
+ err = -ENOENT;
+ break;
+ }
+ err = cpt_resume(ctx);
+ if (!err)
+ ctx->ctx_state = CPT_CTX_IDLE;
+ break;
+ case CPT_KILL:
+ if (ctx->ctx_state == CPT_CTX_IDLE) {
+ err = -ENOENT;
+ break;
+ }
+ err = cpt_kill(ctx);
+ if (!err)
+ ctx->ctx_state = CPT_CTX_IDLE;
+ break;
+ case CPT_TEST_VECAPS:
+ {
+ __u32 dst_flags = arg;
+ __u32 src_flags;
+
+ err = cpt_vps_caps(ctx, &src_flags);
+ if (err)
+ break;
+
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err);
+ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err);
+ if (dst_flags & (1 << CPT_SLM_DMPRST)) {
+ eprintk_ctx("SLM is enabled on destination node, but slm_dmprst module is not loaded\n");
+ err = 1;
+ }
+
+ if (src_flags & CPT_UNSUPPORTED_MASK)
+ err = 2;
+ break;
+ }
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+out:
+ cpt_flush_error(ctx);
+ up(&ctx->main_sem);
+out_nosem:
+ cpt_context_put(ctx);
+out_lock:
+ lock_kernel();
+ if (err == -ERESTARTSYS || err == -ERESTARTNOINTR ||
+ err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK)
+ err = -EINTR;
+ return err;
+}
+
+static int cpt_open(struct inode *inode, struct file *file)
+{
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ return 0;
+}
+
+static int cpt_release(struct inode * inode, struct file * file)
+{
+ cpt_context_t *ctx;
+
+ spin_lock(&cpt_context_lock);
+ ctx = (cpt_context_t*)file->private_data;
+ file->private_data = NULL;
+
+ if (ctx)
+ __cpt_context_put(ctx);
+ spin_unlock(&cpt_context_lock);
+
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+
+static struct file_operations cpt_fops = {
+ .owner = THIS_MODULE,
+ .open = cpt_open,
+ .release = cpt_release,
+ .ioctl = cpt_ioctl,
+};
+
+static struct proc_dir_entry *proc_ent;
+
+static struct ctl_table_header *ctl_header;
+
+static ctl_table debug_table[] = {
+ {
+ .procname = "cpt",
+ .data = &debug_level,
+ .maxlen = sizeof(debug_level),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+static ctl_table root_table[] = {
+ {
+ .ctl_name = CTL_DEBUG,
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static int __init init_cpt(void)
+{
+ int err;
+
+ err = -ENOMEM;
+ ctl_header = register_sysctl_table(root_table);
+ if (!ctl_header)
+ goto err_mon;
+
+ spin_lock_init(&cpt_context_lock);
+ INIT_LIST_HEAD(&cpt_context_list);
+
+ err = -EINVAL;
+ proc_ent = proc_create("cpt", 0600, NULL, NULL);
+ if (!proc_ent)
+ goto err_out;
+
+ cpt_fops.read = proc_ent->proc_fops->read;
+ cpt_fops.write = proc_ent->proc_fops->write;
+ cpt_fops.llseek = proc_ent->proc_fops->llseek;
+ proc_ent->proc_fops = &cpt_fops;
+
+ proc_ent->read_proc = proc_read;
+ proc_ent->data = NULL;
+ return 0;
+
+err_out:
+ unregister_sysctl_table(ctl_header);
+err_mon:
+ return err;
+}
+module_init(init_cpt);
+
+static void __exit exit_cpt(void)
+{
+ remove_proc_entry("cpt", NULL);
+ unregister_sysctl_table(ctl_header);
+
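+	/* Any context still on the list at module unload should be held only
+	 * by its sticky reference (the BUG_ON below checks this); drop that
+	 * reference so the context is finally freed. */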
+ spin_lock(&cpt_context_lock);
+ while (!list_empty(&cpt_context_list)) {
+ cpt_context_t *ctx;
+ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
+
+ if (!ctx->sticky)
+ ctx->refcount++;
+ ctx->sticky = 0;
+
+ BUG_ON(ctx->refcount != 1);
+
+ __cpt_context_put(ctx);
+ }
+ spin_unlock(&cpt_context_lock);
+}
+module_exit(exit_cpt);
diff --git a/kernel/cpt/cpt_process.c b/kernel/cpt/cpt_process.c
new file mode 100644
index 0000000..6314bee
--- /dev/null
+++ b/kernel/cpt/cpt_process.c
@@ -0,0 +1,1379 @@
+/*
+ *
+ * kernel/cpt/cpt_process.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/compat.h>
+#include <linux/cpt_image.h>
+#include <linux/nsproxy.h>
+#include <linux/futex.h>
+#include <linux/posix-timers.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_ubc.h"
+#include "cpt_process.h"
+#include "cpt_kernel.h"
+
+#ifdef CONFIG_X86_32
+#undef task_pt_regs
+#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1)
+#endif
+
+int check_task_state(struct task_struct *tsk, struct cpt_context *ctx)
+{
+#ifdef CONFIG_X86_64
+ if (!(task_thread_info(tsk)->flags&_TIF_IA32)) {
+ if (task_pt_regs(tsk)->ip >= VSYSCALL_START &&
+ task_pt_regs(tsk)->ip < VSYSCALL_END) {
+			eprintk_ctx(CPT_FID " cannot be checkpointed while in vsyscall, try again later\n", CPT_TID(tsk));
+ return -EAGAIN;
+ }
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_X86
+
+static u32 encode_segment(u32 segreg)
+{
+ segreg &= 0xFFFF;
+
+ if (segreg == 0)
+ return CPT_SEG_ZERO;
+ if ((segreg & 3) != 3) {
+ wprintk("Invalid RPL of a segment reg %x\n", segreg);
+ return CPT_SEG_ZERO;
+ }
+
+ /* LDT descriptor, it is just an index to LDT array */
+ if (segreg & 4)
+ return CPT_SEG_LDT + (segreg >> 3);
+
+ /* TLS descriptor. */
+ if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
+ (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
+ return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
+
+	/* One of the standard descriptors */
+#ifdef CONFIG_X86_64
+ if (segreg == __USER32_DS)
+ return CPT_SEG_USER32_DS;
+ if (segreg == __USER32_CS)
+ return CPT_SEG_USER32_CS;
+ if (segreg == __USER_DS)
+ return CPT_SEG_USER64_DS;
+ if (segreg == __USER_CS)
+ return CPT_SEG_USER64_CS;
+#else
+ if (segreg == __USER_DS)
+ return CPT_SEG_USER32_DS;
+ if (segreg == __USER_CS)
+ return CPT_SEG_USER32_CS;
+#endif
+ wprintk("Invalid segment reg %x\n", segreg);
+ return CPT_SEG_ZERO;
+}
+
+#ifdef CONFIG_X86_64
+static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s,
+ struct task_struct *tsk)
+{
+ d->cpt_ebp = s->bp;
+ d->cpt_ebx = s->bx;
+ d->cpt_eax = s->ax;
+ d->cpt_ecx = s->cx;
+ d->cpt_edx = s->dx;
+ d->cpt_esi = s->si;
+ d->cpt_edi = s->di;
+ d->cpt_orig_eax = s->orig_ax;
+ d->cpt_eip = s->ip;
+ d->cpt_xcs = encode_segment(s->cs);
+ d->cpt_eflags = s->flags;
+ d->cpt_esp = s->sp;
+ d->cpt_xss = encode_segment(s->ss);
+ d->cpt_xds = encode_segment(tsk->thread.ds);
+ d->cpt_xes = encode_segment(tsk->thread.es);
+}
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ cpt_open_object(NULL, ctx);
+
+ if (task_thread_info(tsk)->flags & _TIF_IA32) {
+ struct cpt_x86_regs ri;
+ ri.cpt_next = sizeof(ri);
+ ri.cpt_object = CPT_OBJ_X86_REGS;
+ ri.cpt_hdrlen = sizeof(ri);
+ ri.cpt_content = CPT_CONTENT_VOID;
+
+ ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+ ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+ ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+ ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+ ri.cpt_debugreg[4] = 0;
+ ri.cpt_debugreg[5] = 0;
+ ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+ ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+ ri.cpt_fs = encode_segment(tsk->thread.fsindex);
+ ri.cpt_gs = encode_segment(tsk->thread.gsindex);
+
+ xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk);
+
+ ctx->write(&ri, sizeof(ri), ctx);
+ } else {
+ struct cpt_x86_64_regs ri;
+ ri.cpt_next = sizeof(ri);
+ ri.cpt_object = CPT_OBJ_X86_64_REGS;
+ ri.cpt_hdrlen = sizeof(ri);
+ ri.cpt_content = CPT_CONTENT_VOID;
+
+ ri.cpt_fsbase = tsk->thread.fs;
+ ri.cpt_gsbase = tsk->thread.gs;
+ ri.cpt_fsindex = encode_segment(tsk->thread.fsindex);
+ ri.cpt_gsindex = encode_segment(tsk->thread.gsindex);
+ ri.cpt_ds = encode_segment(tsk->thread.ds);
+ ri.cpt_es = encode_segment(tsk->thread.es);
+ ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+ ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+ ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+ ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+ ri.cpt_debugreg[4] = 0;
+ ri.cpt_debugreg[5] = 0;
+ ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+ ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+
+ memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs));
+
+ ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs);
+ ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss);
+
+ ctx->write(&ri, sizeof(ri), ctx);
+
+ }
+ cpt_close_object(ctx);
+
+ return 0;
+}
+
+#else
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct cpt_x86_regs ri;
+ struct pt_regs *pt_regs;
+
+ cpt_open_object(NULL, ctx);
+
+ ri.cpt_next = sizeof(ri);
+ ri.cpt_object = CPT_OBJ_X86_REGS;
+ ri.cpt_hdrlen = sizeof(ri);
+ ri.cpt_content = CPT_CONTENT_VOID;
+
+ ri.cpt_debugreg[0] = tsk->thread.debugreg0;
+ ri.cpt_debugreg[1] = tsk->thread.debugreg1;
+ ri.cpt_debugreg[2] = tsk->thread.debugreg2;
+ ri.cpt_debugreg[3] = tsk->thread.debugreg3;
+ ri.cpt_debugreg[6] = tsk->thread.debugreg6;
+ ri.cpt_debugreg[7] = tsk->thread.debugreg7;
+
+ pt_regs = task_pt_regs(tsk);
+
+ ri.cpt_fs = encode_segment(pt_regs->fs);
+ ri.cpt_gs = encode_segment(tsk->thread.gs);
+ ri.cpt_ugs = encode_segment(task_user_gs(tsk));
+
+ ri.cpt_ebx = pt_regs->bx;
+ ri.cpt_ecx = pt_regs->cx;
+ ri.cpt_edx = pt_regs->dx;
+ ri.cpt_esi = pt_regs->si;
+ ri.cpt_edi = pt_regs->di;
+ ri.cpt_ebp = pt_regs->bp;
+ ri.cpt_eax = pt_regs->ax;
+	ri.cpt_orig_eax = pt_regs->orig_ax;
+	ri.cpt_eip = pt_regs->ip;
+	ri.cpt_eflags = pt_regs->flags;
+	ri.cpt_esp = pt_regs->sp;
+
+	/* Segment registers are stored in their encoded form. */
+	ri.cpt_xcs = encode_segment(pt_regs->cs);
+	ri.cpt_xss = encode_segment(pt_regs->ss);
+	ri.cpt_xds = encode_segment(pt_regs->ds);
+	ri.cpt_xes = encode_segment(pt_regs->es);
+
+ ctx->write(&ri, sizeof(ri), ctx);
+ cpt_close_object(ctx);
+
+ return 0;
+}
+#endif
+#endif
+
+#ifdef CONFIG_IA64
+
+/*
+ PMD?
+ */
+
+#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \
+ CPT_TID(tsk), err); return -EINVAL; } } while (0)
+
+static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk,
+ struct cpt_context *ctx)
+{
+ int err;
+ struct unw_frame_info info;
+ struct ia64_fpreg fpval;
+ int i;
+
+ unw_init_from_blocked_task(&info, tsk);
+ _C(unw_unwind_to_user(&info));
+
+ /* NAT_BITS */
+ do {
+ unsigned long scratch_unat;
+
+ scratch_unat = info.sw->caller_unat;
+ if (info.pri_unat_loc)
+ scratch_unat = *info.pri_unat_loc;
+
+ r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat);
+		/* Just to be on the safe side. */
+ r->nat[0] &= 0xFFFFFFFFUL;
+ } while (0);
+
+ /* R4-R7 */
+ for (i = 4; i <= 7; i++) {
+ char nat = 0;
+ _C(unw_access_gr(&info, i, &r->gr[i], &nat, 0));
+ r->nat[0] |= (nat != 0) << i;
+ }
+
+ /* B1-B5 */
+ for (i = 1; i <= 5; i++) {
+ _C(unw_access_br(&info, i, &r->br[i], 0));
+ }
+
+ /* AR_EC, AR_LC */
+ _C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0));
+ _C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0));
+
+ /* F2..F5, F16..F31 */
+ for (i = 2; i <= 5; i++) {
+ _C(unw_get_fr(&info, i, &fpval));
+ memcpy(&r->fr[i*2], &fpval, 16);
+ }
+ for (i = 16; i <= 31; i++) {
+ _C(unw_get_fr(&info, i, &fpval));
+ memcpy(&r->fr[i*2], &fpval, 16);
+ }
+ return 0;
+}
+
+#undef _C
+
+static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ int err;
+ unsigned long pg;
+ struct cpt_ia64_regs *r;
+ struct ia64_psr *psr;
+ struct switch_stack *sw;
+ struct pt_regs *pt;
+ void *krbs = (void *)tsk + IA64_RBS_OFFSET;
+ unsigned long reg;
+
+ if (tsk->exit_state)
+ return 0;
+
+ pt = task_pt_regs(tsk);
+
+ sw = (struct switch_stack *) (tsk->thread.ksp + 16);
+
+ if ((pg = __get_free_page(GFP_KERNEL)) == 0)
+ return -ENOMEM;
+
+ r = (void*)pg;
+ /* To catch if we forgot some register */
+ memset(r, 0xA5, sizeof(*r));
+
+ r->gr[0] = 0;
+ r->fr[0] = r->fr[1] = 0;
+ r->fr[2] = 0x8000000000000000UL;
+ r->fr[3] = 0xffff;
+
+ r->nat[0] = r->nat[1] = 0;
+
+ err = ass_to_mouth(r, tsk, ctx);
+ if (err) {
+ printk("ass_to_mouth error %d\n", err);
+ goto out;
+ }
+
+ /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */
+ memcpy(&r->gr[1], &pt->r1, 8*(2-1));
+ memcpy(&r->gr[2], &pt->r2, 8*(4-2));
+ memcpy(&r->gr[8], &pt->r8, 8*(12-8));
+ memcpy(&r->gr[12], &pt->r12, 8*(14-12));
+ memcpy(&r->gr[14], &pt->r14, 8*(15-14));
+ memcpy(&r->gr[15], &pt->r15, 8*(16-15));
+ memcpy(&r->gr[16], &pt->r16, 8*(32-16));
+
+ r->br[0] = pt->b0;
+ r->br[6] = pt->b6;
+ r->br[7] = pt->b7;
+
+ r->ar_bspstore = pt->ar_bspstore;
+ r->ar_unat = pt->ar_unat;
+ r->ar_pfs = pt->ar_pfs;
+ r->ar_ccv = pt->ar_ccv;
+ r->ar_fpsr = pt->ar_fpsr;
+ r->ar_csd = pt->ar_csd;
+ r->ar_ssd = pt->ar_ssd;
+ r->ar_rsc = pt->ar_rsc;
+
+ r->cr_iip = pt->cr_iip;
+ r->cr_ipsr = pt->cr_ipsr;
+
+ r->pr = pt->pr;
+
+ r->cfm = pt->cr_ifs;
+ r->ar_rnat = pt->ar_rnat;
+
+ /* fpregs 6..9,10..11 are in pt_regs */
+ memcpy(&r->fr[2*6], &pt->f6, 16*(10-6));
+ memcpy(&r->fr[2*10], &pt->f10, 16*(12-10));
+ /* fpreg 12..15 are on switch stack */
+ memcpy(&r->fr[2*12], &sw->f12, 16*(16-12));
+ /* fpregs 32...127 */
+ psr = ia64_psr(task_pt_regs(tsk));
+ preempt_disable();
+ if (ia64_is_local_fpu_owner(tsk) && psr->mfh) {
+ psr->mfh = 0;
+ tsk->thread.flags |= IA64_THREAD_FPH_VALID;
+ ia64_save_fpu(&tsk->thread.fph[0]);
+ }
+ preempt_enable();
+ memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32));
+
+ if (tsk->thread.flags & IA64_THREAD_DBG_VALID) {
+ memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr));
+		memcpy(r->dbr, tsk->thread.dbr, sizeof(r->dbr));
+ } else {
+ memset(r->ibr, 0, sizeof(r->ibr));
+ memset(r->dbr, 0, sizeof(r->dbr));
+ }
+
+ r->loadrs = pt->loadrs;
+ r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19));
+ if ((long)pt->cr_ifs > 0)
+ r->num_regs += (pt->cr_ifs & 0x7f);
+
+	if (r->num_regs > 96) {
+		eprintk_ctx(CPT_FID " too many RSE regs %lu\n",
+			    CPT_TID(tsk), r->num_regs);
+		err = -EINVAL;
+		goto out;
+	}
+
+ for (reg = 0; reg < r->num_regs; reg++) {
+ unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
+ unsigned long *rnatp = ia64_rse_rnat_addr(ptr);
+
+ r->gr[32+reg] = *ptr;
+
+ if ((unsigned long)rnatp >= sw->ar_bspstore)
+ rnatp = &sw->ar_rnat;
+ if (*rnatp & (1UL<<ia64_rse_slot_num(ptr))) {
+ if (reg < 32)
+ r->nat[0] |= (1UL<<(reg+32));
+ else
+ r->nat[1] |= (1UL<<(reg-32));
+ }
+ }
+ if (r->nat[0] | r->nat[1])
+ wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk),
+ r->nat[1], r->nat[0]);
+
+ cpt_open_object(NULL, ctx);
+ r->cpt_next = sizeof(*r);
+ r->cpt_object = CPT_OBJ_IA64_REGS;
+ r->cpt_hdrlen = sizeof(*r);
+ r->cpt_content = CPT_CONTENT_VOID;
+ ctx->write(r, sizeof(*r), ctx);
+ cpt_close_object(ctx);
+ err = 0;
+
+out:
+ free_page(pg);
+ return err;
+}
+#endif
+
+static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct cpt_obj_bits hdr;
+ unsigned long size;
+ void *start;
+
+ cpt_open_object(NULL, ctx);
+
+#ifdef CONFIG_X86_64
+ size = tsk->thread.sp0 - tsk->thread.sp;
+ start = (void*)tsk->thread.sp;
+#elif defined(CONFIG_X86_32)
+ size = tsk->thread.sp0 - tsk->thread.sp;
+ start = (void*)tsk->thread.sp;
+#elif defined(CONFIG_IA64)
+ size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp;
+ start = (void*)tsk->thread.ksp;
+#else
+#error Arch is not supported
+#endif
+
+ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
+ hdr.cpt_object = CPT_OBJ_BITS;
+ hdr.cpt_hdrlen = sizeof(hdr);
+ hdr.cpt_content = CPT_CONTENT_STACK;
+ hdr.cpt_size = size;
+
+ ctx->write(&hdr, sizeof(hdr), ctx);
+ ctx->write(start, size, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ return 0;
+}
+
+#ifdef CONFIG_X86
+/* Formats of i387_fxsave_struct are the same for x86_64
+ * and i386. Plain luck. */
+
+static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ struct cpt_obj_bits hdr;
+ unsigned long size;
+ int type;
+
+ if (!tsk->thread.xstate)
+ return 0;
+
+ cpt_open_object(NULL, ctx);
+
+ type = CPT_CONTENT_X86_FPUSTATE;
+ size = sizeof(struct i387_fxsave_struct);
+#ifndef CONFIG_X86_64
+ if (!cpu_has_fxsr) {
+ size = sizeof(struct i387_fsave_struct);
+ type = CPT_CONTENT_X86_FPUSTATE_OLD;
+ }
+#endif
+
+ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
+ hdr.cpt_object = CPT_OBJ_BITS;
+ hdr.cpt_hdrlen = sizeof(hdr);
+ hdr.cpt_content = type;
+ hdr.cpt_size = size;
+
+ ctx->write(&hdr, sizeof(hdr), ctx);
+ ctx->write(tsk->thread.xstate, size, ctx);
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_IA64
+
+static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
+{
+ return 0;
+}
+#endif
+
+static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info)
+{
+ si->cpt_signo = info->si_signo;
+ si->cpt_errno = info->si_errno;
+ si->cpt_code = info->si_code;
+
+ switch(si->cpt_code & __SI_MASK) {
+ case __SI_TIMER:
+ si->cpt_pid = info->si_tid;
+ si->cpt_uid = info->si_overrun;
+ si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr);
+ si->cpt_utime = info->si_sys_private;
+ break;
+ case __SI_POLL:
+ si->cpt_pid = info->si_band;
+ si->cpt_uid = info->si_fd;
+ break;
+ case __SI_FAULT:
+ si->cpt_sigval = cpt_ptr_export(info->si_addr);
+#ifdef __ARCH_SI_TRAPNO
+ si->cpt_pid = info->si_trapno;
+#endif
+ break;
+ case __SI_CHLD:
+ si->cpt_pid = info->si_pid;
+ si->cpt_uid = info->si_uid;
+ si->cpt_sigval = info->si_status;
+ si->cpt_stime = info->si_stime;
+ si->cpt_utime = info->si_utime;
+ break;
+ case __SI_KILL:
+ case __SI_RT:
+ case __SI_MESGQ:
+ default:
+ si->cpt_pid = info->si_pid;
+ si->cpt_uid = info->si_uid;
+ si->cpt_sigval = cpt_ptr_export(info->si_ptr);
+ break;
+ }
+ return 0;
+}
+
+static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx)
+{
+ struct sigqueue *q;
+ loff_t saved_obj;
+
+ if (list_empty(&list->list))
+ return 0;
+
+ cpt_push_object(&saved_obj, ctx);
+ list_for_each_entry(q, &list->list, list) {
+ struct cpt_siginfo_image si;
+
+ si.cpt_next = sizeof(si);
+ si.cpt_object = CPT_OBJ_SIGINFO;
+ si.cpt_hdrlen = sizeof(si);
+ si.cpt_content = CPT_CONTENT_VOID;
+
+ si.cpt_qflags = q->flags;
+ si.cpt_user = q->user->uid;
+
+ if (encode_siginfo(&si, &q->info))
+ return -EINVAL;
+
+ ctx->write(&si, sizeof(si), ctx);
+ }
+ cpt_pop_object(&saved_obj, ctx);
+ return 0;
+}
+
+
+
+static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct signal_struct *sig = obj->o_obj;
+ struct cpt_signal_image *v = cpt_get_buf(ctx);
+ struct task_struct *tsk;
+ int i;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_SIGNAL_STRUCT;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_pgrp_type = CPT_PGRP_NORMAL;
+ v->cpt_pgrp = 0;
+
+#if 0 /* the code below seems to be unneeded */
+ if (sig->__pgrp <= 0) {
+ eprintk_ctx("bad pgid\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid_ns(sig->__pgrp, &init_pid_ns);
+ if (tsk == NULL)
+ v->cpt_pgrp_type = CPT_PGRP_ORPHAN;
+ read_unlock(&tasklist_lock);
+ v->cpt_pgrp = pid_to_vpid(sig->__pgrp);
+#endif
+
+ v->cpt_old_pgrp = 0;
+/* if (!sig->tty_old_pgrp) {
+ eprintk_ctx("bad tty_old_pgrp\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }*/
+ if (sig->tty_old_pgrp) {
+ v->cpt_old_pgrp_type = CPT_PGRP_NORMAL;
+ read_lock(&tasklist_lock);
+ tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PID);
+ if (tsk == NULL) {
+ v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN;
+ tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PGID);
+ }
+ read_unlock(&tasklist_lock);
+ if (tsk == NULL) {
+ eprintk_ctx("tty_old_pgrp does not exist anymore\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ v->cpt_old_pgrp = pid_vnr(sig->tty_old_pgrp);
+ if ((int)v->cpt_old_pgrp < 0) {
+ dprintk_ctx("stray tty_old_pgrp %d\n", pid_nr(sig->tty_old_pgrp));
+ v->cpt_old_pgrp = -1;
+ v->cpt_old_pgrp_type = CPT_PGRP_STRAY;
+ }
+ }
+
+ v->cpt_session_type = CPT_PGRP_NORMAL;
+ v->cpt_session = 0;
+
+#if 0 /* the code below seems to be unneeded */
+ if (sig->__session <= 0) {
+ eprintk_ctx("bad session\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid_ns(sig->__session, &init_pid_ns);
+ if (tsk == NULL)
+ v->cpt_session_type = CPT_PGRP_ORPHAN;
+ read_unlock(&tasklist_lock);
+ v->cpt_session = pid_to_vpid(sig->__session);
+#endif
+
+ v->cpt_leader = sig->leader;
+ v->cpt_ctty = CPT_NULL;
+ if (sig->tty) {
+ cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx);
+ if (cobj)
+ v->cpt_ctty = cobj->o_pos;
+ else {
+ eprintk_ctx("controlling tty is not found\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ }
+ memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8);
+
+ v->cpt_curr_target = 0;
+ if (sig->curr_target)
+ v->cpt_curr_target = task_pid_vnr(sig->curr_target);
+ v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0);
+ v->cpt_group_exit_code = sig->group_exit_code;
+ v->cpt_group_exit_task = 0;
+ if (sig->group_exit_task)
+ v->cpt_group_exit_task = task_pid_vnr(sig->group_exit_task);
+ v->cpt_notify_count = sig->notify_count;
+ v->cpt_group_stop_count = sig->group_stop_count;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8)
+ v->cpt_utime = sig->utime;
+ v->cpt_stime = sig->stime;
+ v->cpt_cutime = sig->cutime;
+ v->cpt_cstime = sig->cstime;
+ v->cpt_nvcsw = sig->nvcsw;
+ v->cpt_nivcsw = sig->nivcsw;
+ v->cpt_cnvcsw = sig->cnvcsw;
+ v->cpt_cnivcsw = sig->cnivcsw;
+ v->cpt_min_flt = sig->min_flt;
+ v->cpt_maj_flt = sig->maj_flt;
+ v->cpt_cmin_flt = sig->cmin_flt;
+ v->cpt_cmaj_flt = sig->cmaj_flt;
+
+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+ __asm__("undefined\n");
+
+ for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+ if (i < RLIM_NLIMITS) {
+ v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur;
+ v->cpt_rlim_max[i] = sig->rlim[i].rlim_max;
+ } else {
+ v->cpt_rlim_cur[i] = CPT_NULL;
+ v->cpt_rlim_max[i] = CPT_NULL;
+ }
+ }
+#endif
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ dump_sigqueue(&sig->shared_pending, ctx);
+
+ cpt_close_object(ctx);
+ return 0;
+}
+
+int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx)
+{
+#ifdef CONFIG_KEYS
+ if (tsk->cred->request_key_auth || tsk->cred->thread_keyring) {
+ eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk));
+ return -EBUSY;
+ }
+#endif
+#ifdef CONFIG_NUMA
+ if (tsk->mempolicy) {
+ eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk));
+ return -EBUSY;
+ }
+#endif
+#ifdef CONFIG_TUX
+ if (tsk->tux_info) {
+ eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk));
+ return -EBUSY;
+ }
+#endif
+ return 0;
+}
+
+static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct task_struct *tsk = obj->o_obj;
+ const struct cred *cred;
+ int last_thread;
+ struct cpt_task_image *v = cpt_get_buf(ctx);
+ cpt_object_t *tobj;
+ cpt_object_t *tg_obj;
+ loff_t saved_obj;
+ int i;
+ int err;
+ struct timespec delta;
+ struct mm_struct * tsk_mm;
+ struct files_struct * tsk_files;
+ struct fs_struct * tsk_fs;
+ struct mnt_namespace * tsk_ns;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_signal = CPT_NULL;
+ tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx);
+ if (!tg_obj) BUG();
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_TASK;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_state = tsk->state;
+ if (tsk->state == EXIT_ZOMBIE) {
+		eprintk_ctx("invalid zombie state on " CPT_FID "\n", CPT_TID(tsk));
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ } else if (tsk->state == EXIT_DEAD) {
+ if (tsk->exit_state != EXIT_DEAD &&
+ tsk->exit_state != EXIT_ZOMBIE) {
+			eprintk_ctx("invalid exit_state %d on " CPT_FID "\n", tsk->exit_state, CPT_TID(tsk));
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ }
+ if (tsk->exit_state) {
+ v->cpt_state = tsk->exit_state;
+ if (tsk->state != TASK_DEAD) {
+			eprintk_ctx("invalid tsk->state %ld/%d on " CPT_FID "\n",
+ tsk->state, tsk->exit_state, CPT_TID(tsk));
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ }
+ if (cpt_check_unsupported(tsk, ctx)) {
+ cpt_release_buf(ctx);
+ return -EBUSY;
+ }
+
+ v->cpt_flags = tsk->flags & CPT_TASK_FLAGS_MASK;
+ v->cpt_ptrace = tsk->ptrace;
+ v->cpt_prio = tsk->prio;
+ v->cpt_exit_code = tsk->exit_code;
+ v->cpt_exit_signal = tsk->exit_signal;
+ v->cpt_pdeath_signal = tsk->pdeath_signal;
+ v->cpt_static_prio = tsk->static_prio;
+ v->cpt_rt_priority = tsk->rt_priority;
+ v->cpt_policy = tsk->policy;
+ if (v->cpt_policy != SCHED_NORMAL) {
+ eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+
+	/* Unpleasant moment. When the leader of a thread group exits,
+	 * it remains in zombie state until the whole group exits.
+	 * We save non-NULL pointers to the process mm/files/fs, so
+	 * that we can restore this thread group.
+ */
+ tsk_mm = tsk->mm;
+ tsk_files = tsk->files;
+ tsk_fs = tsk->fs;
+ tsk_ns = tsk->nsproxy ? tsk->nsproxy->mnt_ns : NULL;
+
+ if (tsk->exit_state && !thread_group_empty(tsk) &&
+ thread_group_leader(tsk)) {
+ struct task_struct * p = tsk;
+
+ read_lock(&tasklist_lock);
+ do {
+ if (p->mm)
+ tsk_mm = p->mm;
+ if (p->files)
+ tsk_files = p->files;
+ if (p->fs)
+ tsk_fs = p->fs;
+ if (p->nsproxy && p->nsproxy->mnt_ns)
+ tsk_ns = p->nsproxy->mnt_ns;
+ p = next_thread(p);
+ } while (p != tsk);
+ read_unlock(&tasklist_lock);
+ }
+
+ v->cpt_mm = CPT_NULL;
+ if (tsk_mm) {
+ tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx);
+ if (!tobj) BUG();
+ v->cpt_mm = tobj->o_pos;
+ }
+ v->cpt_files = CPT_NULL;
+ if (tsk_files) {
+ tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx);
+ if (!tobj) BUG();
+ v->cpt_files = tobj->o_pos;
+ }
+ v->cpt_fs = CPT_NULL;
+ if (tsk_fs) {
+ tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx);
+ if (!tobj) BUG();
+ v->cpt_fs = tobj->o_pos;
+ }
+ v->cpt_namespace = CPT_NULL;
+ if (tsk_ns) {
+ tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx);
+ if (!tobj) BUG();
+ v->cpt_namespace = tobj->o_pos;
+
+ if (tsk_ns != current->nsproxy->mnt_ns)
+			eprintk_ctx("namespaces are not supported: "
+ "process " CPT_FID "\n", CPT_TID(tsk));
+ }
+ v->cpt_sysvsem_undo = CPT_NULL;
+ if (tsk->sysvsem.undo_list && !tsk->exit_state) {
+ tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx);
+ if (!tobj) BUG();
+ v->cpt_sysvsem_undo = tobj->o_pos;
+ }
+ v->cpt_sighand = CPT_NULL;
+ if (tsk->sighand) {
+ tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx);
+ if (!tobj) BUG();
+ v->cpt_sighand = tobj->o_pos;
+ }
+ v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked);
+ v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked);
+ v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask);
+
+ v->cpt_pid = task_pid_vnr(tsk);
+ v->cpt_tgid = task_tgid_vnr(tsk);
+ v->cpt_ppid = 0;
+ if (tsk->parent) {
+ if (tsk->parent != tsk->real_parent &&
+ !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) {
+ eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, task_pid_vnr(tsk), tsk->comm);
+ cpt_release_buf(ctx);
+ return -EBUSY;
+ }
+ v->cpt_ppid = task_pid_vnr(tsk->parent);
+ }
+ v->cpt_rppid = tsk->real_parent ? task_pid_vnr(tsk->real_parent) : 0;
+ v->cpt_pgrp = task_pgrp_vnr(tsk);
+ v->cpt_session = task_session_vnr(tsk);
+ v->cpt_old_pgrp = 0;
+ if (tsk->signal->tty_old_pgrp)
+ v->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp);
+ v->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0;
+ v->cpt_set_tid = (unsigned long)tsk->set_child_tid;
+ v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid;
+ memcpy(v->cpt_comm, tsk->comm, 16);
+
+ cred = tsk->cred;
+ v->cpt_user = cred->user->uid;
+ v->cpt_uid = cred->uid;
+ v->cpt_euid = cred->euid;
+ v->cpt_suid = cred->suid;
+ v->cpt_fsuid = cred->fsuid;
+ v->cpt_gid = cred->gid;
+ v->cpt_egid = cred->egid;
+ v->cpt_sgid = cred->sgid;
+ v->cpt_fsgid = cred->fsgid;
+ v->cpt_ngids = 0;
+ if (cred->group_info && cred->group_info->ngroups != 0) {
+ int i = cred->group_info->ngroups;
+ if (i > 32) {
+ /* Shame... I did a simplified version and _forgot_
+ * about this. Later, later. */
+			eprintk_ctx("too many groups " CPT_FID "\n", CPT_TID(tsk));
+			cpt_release_buf(ctx);
+			return -EINVAL;
+ }
+ v->cpt_ngids = i;
+ for (i--; i>=0; i--)
+ v->cpt_gids[i] = cred->group_info->small_block[i];
+ }
+ v->cpt_prctl_uac = 0;
+ v->cpt_prctl_fpemu = 0;
+ v->__cpt_pad1 = 0;
+#ifdef CONFIG_IA64
+ v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT;
+ v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT;
+#endif
+ memcpy(&v->cpt_ecap, &cred->cap_effective, 8);
+ memcpy(&v->cpt_icap, &cred->cap_inheritable, 8);
+ memcpy(&v->cpt_pcap, &cred->cap_permitted, 8);
+ v->cpt_keepcap = cred->securebits;
+
+ v->cpt_did_exec = tsk->did_exec;
+ v->cpt_exec_domain = -1;
+ v->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
+ v->cpt_64bit = 0;
+#ifdef CONFIG_X86_64
+ /* Clear x86_64 specific flags */
+ v->cpt_thrflags &= ~(_TIF_FORK|_TIF_IA32);
+ if (!(task_thread_info(tsk)->flags & _TIF_IA32)) {
+ ctx->tasks64++;
+ v->cpt_64bit = 1;
+ }
+#endif
+#ifdef CONFIG_IA64
+ /* Clear ia64 specific flags */
+ //// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
+ if (!IS_IA32_PROCESS(task_pt_regs(tsk))) {
+ ctx->tasks64++;
+ v->cpt_64bit = 1;
+ }
+#endif
+ v->cpt_thrstatus = task_thread_info(tsk)->status;
+ v->cpt_addr_limit = -1;
+
+ v->cpt_personality = tsk->personality;
+
+#ifdef CONFIG_X86
+ for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) {
+ if (i>=3) {
+ eprintk_ctx("too many tls descs\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a;
+ }
+#endif
+
+ v->cpt_restart.fn = CPT_RBL_0;
+ if (task_thread_info(tsk)->restart_block.fn != task_thread_info(current)->restart_block.fn) {
+ struct restart_block *rb = &task_thread_info(tsk)->restart_block;
+ ktime_t e;
+
+ if (rb->fn == hrtimer_nanosleep_restart) {
+ v->cpt_restart.fn = CPT_RBL_NANOSLEEP;
+
+ e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
+ e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+ v->cpt_restart.arg0 = rb->arg0;
+ v->cpt_restart.arg1 = rb->arg1;
+ v->cpt_restart.arg2 = ktime_to_ns(e);
+ v->cpt_restart.arg3 = 0;
+ dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
+ goto continue_dump;
+ }
+#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
+ if (rb->fn == compat_nanosleep_restart) {
+ v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP;
+
+ e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
+ e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+ v->cpt_restart.arg0 = rb->arg0;
+ v->cpt_restart.arg1 = rb->arg1;
+ v->cpt_restart.arg2 = ktime_to_ns(e);
+ v->cpt_restart.arg3 = 0;
+ dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
+ goto continue_dump;
+ }
+#endif
+ if (rb->fn == do_restart_poll) {
+ u64 timeout_jiffies;
+
+ timeout_jiffies = ((u64)rb->arg3 << 32)|(u64)rb->arg2;
+ e.tv64 = timeout_jiffies * TICK_NSEC;
+
+ v->cpt_restart.fn = CPT_RBL_POLL;
+ v->cpt_restart.arg0 = rb->arg0;
+ v->cpt_restart.arg1 = rb->arg1;
+ v->cpt_restart.arg2 = ktime_to_ns(e);
+ v->cpt_restart.arg3 = 0;
+ dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
+ goto continue_dump;
+ }
+ if (rb->fn == futex_wait_restart) {
+ v->cpt_restart.fn = CPT_RBL_FUTEX_WAIT;
+
+ e.tv64 = rb->futex.time;
+ e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+ v->cpt_restart.arg0 = (unsigned long)rb->futex.uaddr;
+ v->cpt_restart.arg1 = rb->futex.val;
+ v->cpt_restart.arg2 = ktime_to_ns(e);
+ v->cpt_restart.arg3 = rb->futex.flags;
+ goto continue_dump;
+ }
+		eprintk_ctx("unknown restart block %p\n", rb->fn);
+		cpt_release_buf(ctx);
+		return -EINVAL;
+ }
+
+continue_dump:
+ v->cpt_it_real_incr = 0;
+ v->cpt_it_prof_incr = 0;
+ v->cpt_it_virt_incr = 0;
+ v->cpt_it_real_value = 0;
+ v->cpt_it_prof_value = 0;
+ v->cpt_it_virt_value = 0;
+ if (thread_group_leader(tsk) && tsk->exit_state == 0) {
+ ktime_t rem;
+
+ v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr);
+ v->cpt_it_prof_incr = tsk->signal->it[CPUCLOCK_PROF].incr;
+ v->cpt_it_virt_incr = tsk->signal->it[CPUCLOCK_VIRT].incr;
+
+ rem = hrtimer_get_remaining(&tsk->signal->real_timer);
+
+ if (hrtimer_active(&tsk->signal->real_timer)) {
+ if (rem.tv64 <= 0)
+ rem.tv64 = NSEC_PER_USEC;
+ v->cpt_it_real_value = ktime_to_ns(rem);
+ dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value);
+ }
+ v->cpt_it_prof_value = tsk->signal->it[CPUCLOCK_PROF].expires;
+ v->cpt_it_virt_value = tsk->signal->it[CPUCLOCK_VIRT].expires;
+ }
+ v->cpt_used_math = (tsk_used_math(tsk) != 0);
+
+ if (tsk->notifier) {
+ eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+
+ v->cpt_utime = tsk->utime;
+ v->cpt_stime = tsk->stime;
+ delta = tsk->start_time;
+ _set_normalized_timespec(&delta,
+ delta.tv_sec - get_exec_env()->start_timespec.tv_sec,
+ delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
+ v->cpt_starttime = cpt_timespec_export(&delta);
+ v->cpt_nvcsw = tsk->nvcsw;
+ v->cpt_nivcsw = tsk->nivcsw;
+ v->cpt_min_flt = tsk->min_flt;
+ v->cpt_maj_flt = tsk->maj_flt;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
+ v->cpt_cutime = tsk->cutime;
+ v->cpt_cstime = tsk->cstime;
+ v->cpt_cnvcsw = tsk->cnvcsw;
+ v->cpt_cnivcsw = tsk->cnivcsw;
+ v->cpt_cmin_flt = tsk->cmin_flt;
+ v->cpt_cmaj_flt = tsk->cmaj_flt;
+
+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+ __asm__("undefined\n");
+
+ for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+ if (i < RLIM_NLIMITS) {
+ v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur;
+ v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max;
+ } else {
+ v->cpt_rlim_cur[i] = CPT_NULL;
+ v->cpt_rlim_max[i] = CPT_NULL;
+ }
+ }
+#else
+ v->cpt_cutime = tsk->signal->cutime;
+ v->cpt_cstime = tsk->signal->cstime;
+ v->cpt_cnvcsw = tsk->signal->cnvcsw;
+ v->cpt_cnivcsw = tsk->signal->cnivcsw;
+ v->cpt_cmin_flt = tsk->signal->cmin_flt;
+ v->cpt_cmaj_flt = tsk->signal->cmaj_flt;
+
+ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
+ __asm__("undefined\n");
+
+ for (i=0; i<CPT_RLIM_NLIMITS; i++) {
+ if (i < RLIM_NLIMITS) {
+ v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur;
+ v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max;
+ } else {
+ v->cpt_rlim_cur[i] = CPT_NULL;
+ v->cpt_rlim_max[i] = CPT_NULL;
+ }
+ }
+#endif
+
+#ifdef CONFIG_BEANCOUNTERS
+ if (tsk->mm)
+ v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx);
+ else
+ v->cpt_mm_ub = CPT_NULL;
+ v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx);
+ v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx);
+ v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx);
+#endif
+
+ v->cpt_ptrace_message = tsk->ptrace_message;
+ v->cpt_pn_state = tsk->pn_state;
+ v->cpt_stopped_state = tsk->stopped_state;
+ v->cpt_sigsuspend_state = 0;
+
+#ifdef CONFIG_X86_32
+ if (tsk->thread.vm86_info) {
+ eprintk_ctx("vm86 task is running\n");
+ cpt_release_buf(ctx);
+ return -EBUSY;
+ }
+#endif
+
+ v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal);
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ dump_kstack(tsk, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ err = dump_registers(tsk, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ if (err)
+ return err;
+
+ if (tsk_used_math(tsk)) {
+ cpt_push_object(&saved_obj, ctx);
+ dump_fpustate(tsk, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+
+ if (tsk->last_siginfo) {
+ struct cpt_siginfo_image si;
+ cpt_push_object(&saved_obj, ctx);
+
+ si.cpt_next = sizeof(si);
+ si.cpt_object = CPT_OBJ_LASTSIGINFO;
+ si.cpt_hdrlen = sizeof(si);
+ si.cpt_content = CPT_CONTENT_VOID;
+
+ if (encode_siginfo(&si, tsk->last_siginfo))
+ return -EINVAL;
+
+ ctx->write(&si, sizeof(si), ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+
+ if (tsk->sas_ss_size) {
+ struct cpt_sigaltstack_image si;
+ cpt_push_object(&saved_obj, ctx);
+
+ si.cpt_next = sizeof(si);
+ si.cpt_object = CPT_OBJ_SIGALTSTACK;
+ si.cpt_hdrlen = sizeof(si);
+ si.cpt_content = CPT_CONTENT_VOID;
+
+ si.cpt_stack = tsk->sas_ss_sp;
+ si.cpt_stacksize = tsk->sas_ss_size;
+
+ ctx->write(&si, sizeof(si), ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+
+ if (tsk->robust_list
+#ifdef CONFIG_COMPAT
+ || tsk->compat_robust_list
+#endif
+ ) {
+ struct cpt_task_aux_image ai;
+ cpt_push_object(&saved_obj, ctx);
+
+ ai.cpt_next = sizeof(ai);
+ ai.cpt_object = CPT_OBJ_TASK_AUX;
+ ai.cpt_hdrlen = sizeof(ai);
+ ai.cpt_content = CPT_CONTENT_VOID;
+
+ ai.cpt_robust_list = (unsigned long)tsk->robust_list;
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_COMPAT
+ if (task_thread_info(tsk)->flags & _TIF_IA32)
+ ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list;
+#endif
+#endif
+ ctx->write(&ai, sizeof(ai), ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+
+ dump_sigqueue(&tsk->pending, ctx);
+
+ last_thread = 1;
+ read_lock(&tasklist_lock);
+ do {
+ struct task_struct * next = next_thread(tsk);
+ if (next != tsk && !thread_group_leader(next))
+ last_thread = 0;
+ } while (0);
+ read_unlock(&tasklist_lock);
+
+ if (last_thread) {
+ struct task_struct *prev_tsk;
+ int err;
+ loff_t pos = ctx->file->f_pos;
+
+ cpt_push_object(&saved_obj, ctx);
+ err = dump_one_signal_struct(tg_obj, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ if (err)
+ return err;
+
+ prev_tsk = tsk;
+ for (;;) {
+ if (prev_tsk->tgid == tsk->tgid) {
+ loff_t tg_pos;
+
+ tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal);
+ ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos);
+ if (thread_group_leader(prev_tsk))
+ break;
+ }
+
+ if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) {
+ eprintk_ctx("bug: thread group leader is lost\n");
+ return -EINVAL;
+ }
+
+ obj = list_entry(obj->o_list.prev, cpt_object_t, o_list);
+ prev_tsk = obj->o_obj;
+ }
+ }
+
+ cpt_close_object(ctx);
+ return 0;
+}
+
+int cpt_dump_tasks(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ cpt_open_section(ctx, CPT_SECT_TASKS);
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ int err;
+
+ if ((err = dump_one_process(obj, ctx)) != 0)
+ return err;
+ }
+
+ cpt_close_section(ctx);
+ return 0;
+}
+
+int cpt_collect_signals(cpt_context_t *ctx)
+{
+ cpt_object_t *obj;
+
+ /* Collect process fd sets */
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) {
+ eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, task_pid_vnr(tsk), tsk->comm);
+ return -EBUSY;
+ }
+ if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL)
+ return -ENOMEM;
+ if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL)
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+
+static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct sighand_struct *sig = obj->o_obj;
+ struct cpt_sighand_image *v = cpt_get_buf(ctx);
+ int i;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_SIGHAND_STRUCT;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ for (i=0; i< _NSIG; i++) {
+ if (sig->action[i].sa.sa_handler != SIG_DFL ||
+ sig->action[i].sa.sa_flags) {
+ loff_t saved_obj;
+ struct cpt_sighandler_image *o = cpt_get_buf(ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ cpt_open_object(NULL, ctx);
+
+ o->cpt_next = CPT_NULL;
+ o->cpt_object = CPT_OBJ_SIGHANDLER;
+ o->cpt_hdrlen = sizeof(*o);
+ o->cpt_content = CPT_CONTENT_VOID;
+
+ o->cpt_signo = i;
+ o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler;
+ o->cpt_restorer = 0;
+#ifdef CONFIG_X86
+ o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer;
+#endif
+ o->cpt_flags = sig->action[i].sa.sa_flags;
+ memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8);
+ ctx->write(o, sizeof(*o), ctx);
+ cpt_release_buf(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+ }
+
+ cpt_close_object(ctx);
+ return 0;
+}
+
+int cpt_dump_sighand(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT);
+
+ for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) {
+ int err;
+
+ if ((err = dump_one_sighand_struct(obj, ctx)) != 0)
+ return err;
+ }
+
+ cpt_close_section(ctx);
+ return 0;
+}
diff --git a/kernel/cpt/cpt_process.h b/kernel/cpt/cpt_process.h
new file mode 100644
index 0000000..b9f28af
--- /dev/null
+++ b/kernel/cpt/cpt_process.h
@@ -0,0 +1,13 @@
+int cpt_collect_signals(cpt_context_t *);
+int cpt_dump_signal(struct cpt_context *);
+int cpt_dump_sighand(struct cpt_context *);
+int cpt_dump_tasks(struct cpt_context *);
+
+int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx);
+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
+
+int rst_restore_process(struct cpt_context *ctx);
+int rst_process_linkage(struct cpt_context *ctx);
+
+int check_task_state(struct task_struct *tsk, struct cpt_context *ctx);
+struct pid *alloc_vpid_safe(pid_t vnr);
diff --git a/kernel/cpt/cpt_socket.c b/kernel/cpt/cpt_socket.c
new file mode 100644
index 0000000..3943b60
--- /dev/null
+++ b/kernel/cpt/cpt_socket.c
@@ -0,0 +1,802 @@
+/*
+ *
+ * kernel/cpt/cpt_socket.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <net/scm.h>
+#include <net/af_unix.h>
+#include <net/tcp.h>
+#include <net/netlink_sock.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+
+static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx);
+
+
+/* Sockets are quite different from other kinds of files.
+ * There is one simplification: only one struct file can refer to a socket,
+ * so we could store information about a socket directly in section FILES as
+ * a description of a file and append, e.g., an array of not-yet-accepted
+ * connections of a listening socket as an array of auxiliary data.
+ *
+ * Complications are:
+ * 1. TCP sockets can be orphans. We have to relocate orphans as well,
+ *    so we have to create a special section for orphans.
+ * 2. AF_UNIX sockets are distinguished objects: the set of links between
+ *    AF_UNIX sockets is quite arbitrary.
+ *    A. Each socket can refer to many files due to FD passing.
+ *    B. Each socket, except for connected ones, can have skbs in its queue
+ *       sent by any other socket.
+ *
+ * 2A is relatively easy: after our tasks are frozen we make an additional
+ *    recursive pass through the set of collected files and pick up the files
+ *    referenced via FD passing. After the recursion ends, all the files are
+ *    treated in the same way; they will all be stored in section FILES.
+ *
+ * 2B. We have to resolve all those references at some point.
+ * It is the place where pipe-like approach to image fails.
+ *
+ * All this makes socket checkpointing quite cumbersome.
+ * Right now we collect all the sockets and assign some numeric index value
+ * to each of them. The socket section is separate and put after section FILES,
+ * so section FILES refers to sockets by index, section SOCKET refers to FILES
+ * as usual by position in the image. All the refs inside the socket section
+ * are by index. When restoring we read the socket section and create objects
+ * to hold index <-> position mappings. On the second pass we open sockets
+ * (simultaneously with their pairs) and create FILE objects.
+ */
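+
+/* Illustrative sketch only, not part of the CPT implementation: one possible
+ * shape of the index <-> image-position table built on the first restore
+ * pass described above. The names below are hypothetical; loff_t is assumed
+ * to come from <linux/types.h>.
+ */
+struct sock_map_entry {
+	int	index;		/* numeric index assigned while dumping */
+	loff_t	pos;		/* position of the socket object in the image */
+};
+
+/* Resolve an index recorded in the image into the position collected on the
+ * first pass; -1 means the peer was outside the checkpointed set. */
+static inline loff_t sock_map_lookup(const struct sock_map_entry *map,
+				     int nr_entries, int index)
+{
+	int i;
+
+	for (i = 0; i < nr_entries; i++)
+		if (map[i].index == index)
+			return map[i].pos;
+	return -1;
+}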
+
+
+/* ====== FD passing ====== */
+
+/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we
+ * have to implement this. A problem is that in the general case we receive
+ * skbs from an unknown context, so new files can arrive at the checkpointed
+ * set of processes even after they are stopped. We are simply going to
+ * ignore unknown fds while doing the real checkpointing. It is fair because
+ * links outside the checkpointed set are going to fail anyway.
+ *
+ * ATTN: the procedure is recursive. We linearize the recursion by adding
+ * newly found files to the end of the file list, so they will be analyzed
+ * in the same loop.
+ */
+
+static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct socket *sock;
+ struct sock *sk;
+ struct sk_buff *skb;
+
+ if (!S_ISSOCK(inode->i_mode))
+ return -ENOTSOCK;
+
+ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+ if (sock->ops->family != AF_UNIX)
+ return 0;
+
+ sk = sock->sk;
+
+ /* Subtle locking issue. skbs cannot be removed while
+ * we are scanning, because all the processes are stopped.
+	 * They can still be added to the tail of the queue. Locking while
+ * we dereference skb->next is enough to resolve this.
+ * See above about collision with skbs added after we started
+ * checkpointing.
+ */
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+ if (UNIXCB(skb).fp && skb->sk &&
+ (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) {
+ struct scm_fp_list *fpl = UNIXCB(skb).fp;
+ int i;
+
+ for (i = fpl->count-1; i >= 0; i--) {
+ if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL)
+ return -ENOMEM;
+ }
+ }
+
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb->next;
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ }
+
+ return 0;
+}
+
+int cpt_collect_passedfds(cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+
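+	/* collect_one_passedfd() adds any files found in passed-fd skbs to the
+	 * same CPT_OBJ_FILE list, so this loop also visits files discovered
+	 * while it is already running (the linearized recursion above). */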
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+
+ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
+ int err;
+
+ if ((err = collect_one_passedfd(file, ctx)) < 0)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* ====== End of FD passing ====== */
+
+/* Must be called under bh_lock_sock() */
+
+void clear_backlog(struct sock *sk)
+{
+ struct sk_buff *skb = sk->sk_backlog.head;
+
+ sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
+ while (skb) {
+ struct sk_buff *next = skb->next;
+
+ skb->next = NULL;
+ kfree_skb(skb);
+ skb = next;
+ }
+}
+
+void release_sock_nobacklog(struct sock *sk)
+{
+ spin_lock_bh(&(sk->sk_lock.slock));
+ clear_backlog(sk);
+ sk->sk_lock.owned = 0;
+ if (waitqueue_active(&(sk->sk_lock.wq)))
+ wake_up(&(sk->sk_lock.wq));
+ spin_unlock_bh(&(sk->sk_lock.slock));
+}
+
+int cpt_dump_skb(int type, int owner, struct sk_buff *skb,
+ struct sock *sk, struct cpt_context *ctx)
+{
+ struct cpt_skb_image *v = cpt_get_buf(ctx);
+ loff_t saved_obj;
+ struct timeval tmptv;
+
+ cpt_push_object(&saved_obj, ctx);
+ cpt_open_object(NULL, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_SKB;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_owner = owner;
+ v->cpt_queue = type;
+ skb_get_timestamp(skb, &tmptv);
+ v->cpt_stamp = cpt_timeval_export(&tmptv);
+ v->cpt_hspace = skb->data - skb->head;
+ v->cpt_tspace = skb->end - skb->tail;
+ v->cpt_h = skb_transport_header(skb) - skb->head;
+ v->cpt_nh = skb_network_header(skb) - skb->head;
+ v->cpt_mac = skb_mac_header(skb) - skb->head;
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb));
+ memset(v->cpt_cb, 0, sizeof(v->cpt_cb));
+#if !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE)
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ /* Save control block according to tcp_skb_cb with IPv6 */
+ BUG_ON(sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm) >
+ sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm));
+ memcpy(v->cpt_cb, skb->cb, sizeof(struct inet_skb_parm));
+ memcpy((void *)v->cpt_cb + sizeof(struct inet6_skb_parm),
+ skb->cb + sizeof(struct inet_skb_parm),
+ sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm));
+ } else
+#endif
+ memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb));
+ if (sizeof(skb->cb) > sizeof(v->cpt_cb)) {
+ int i;
+ for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) {
+ if (skb->cb[i]) {
+ wprintk_ctx("dirty skb cb");
+ break;
+ }
+ }
+ }
+ v->cpt_len = skb->len;
+ v->cpt_mac_len = skb->mac_len;
+ v->cpt_csum = skb->csum;
+ v->cpt_local_df = skb->local_df;
+ v->cpt_pkt_type = skb->pkt_type;
+ v->cpt_ip_summed = skb->ip_summed;
+ v->cpt_priority = skb->priority;
+ v->cpt_protocol = skb->protocol;
+ v->cpt_security = 0;
+ v->cpt_gso_segs = skb_shinfo(skb)->gso_segs;
+ v->cpt_gso_size = skb_shinfo(skb)->gso_size;
+ if (skb_shinfo(skb)->gso_type) {
+		eprintk_ctx("skb ufo is not supported\n");
+		cpt_release_buf(ctx);
+		return -EINVAL;
+ }
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ if (skb->len + (skb->data - skb->head) > 0) {
+ struct cpt_obj_bits ob;
+ loff_t saved_obj2;
+
+ cpt_push_object(&saved_obj2, ctx);
+ cpt_open_object(NULL, ctx);
+ ob.cpt_next = CPT_NULL;
+ ob.cpt_object = CPT_OBJ_BITS;
+ ob.cpt_hdrlen = sizeof(ob);
+ ob.cpt_content = CPT_CONTENT_DATA;
+ ob.cpt_size = skb->len + v->cpt_hspace;
+
+ ctx->write(&ob, sizeof(ob), ctx);
+
+ ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx);
+ if (skb->data_len) {
+ int offset = skb->len - skb->data_len;
+ while (offset < skb->len) {
+ int copy = skb->len - offset;
+ if (copy > PAGE_SIZE)
+ copy = PAGE_SIZE;
+ (void)cpt_get_buf(ctx);
+ if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy))
+ BUG();
+ ctx->write(ctx->tmpbuf, copy, ctx);
+ __cpt_release_buf(ctx);
+ offset += copy;
+ }
+ }
+
+ ctx->align(ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_obj2, ctx);
+ }
+
+ if (skb->sk && skb->sk->sk_family == AF_UNIX) {
+ struct scm_fp_list *fpl = UNIXCB(skb).fp;
+
+ if (fpl) {
+ int i;
+
+ for (i = 0; i < fpl->count; i++) {
+ struct cpt_fd_image v;
+ cpt_object_t *obj;
+ loff_t saved_obj2;
+
+ obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx);
+
+ if (!obj) {
+ eprintk_ctx("lost passed FD\n");
+ return -EINVAL;
+ }
+
+ cpt_push_object(&saved_obj2, ctx);
+ cpt_open_object(NULL, ctx);
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_FILEDESC;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ v.cpt_fd = i;
+ v.cpt_file = obj->o_pos;
+ v.cpt_flags = 0;
+ ctx->write(&v, sizeof(v), ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_obj2, ctx);
+ }
+ }
+ }
+
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ return 0;
+}
+
+static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+ struct sk_buff *skb;
+ struct sock *sk_cache = NULL;
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+ int err;
+
+ if (sk->sk_family == AF_UNIX) {
+ cpt_object_t *obj;
+ if (skb->sk != sk_cache) {
+ idx = -1;
+ sk_cache = NULL;
+ obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx);
+ if (obj) {
+ idx = obj->o_index;
+ sk_cache = skb->sk;
+ } else if (unix_peer(sk) != skb->sk)
+ goto next_skb;
+ }
+ }
+
+ err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, sk, ctx);
+ if (err)
+ return err;
+
+next_skb:
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb->next;
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ }
+ return 0;
+}
+
+static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+ struct sk_buff *skb;
+
+ skb = skb_peek(&sk->sk_write_queue);
+ while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) {
+ int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, sk, ctx);
+ if (err)
+ return err;
+
+ spin_lock_irq(&sk->sk_write_queue.lock);
+ skb = skb->next;
+ spin_unlock_irq(&sk->sk_write_queue.lock);
+ }
+ return 0;
+}
+
+void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx)
+{
+ loff_t saved_obj;
+ if (sk->sk_filter) {
+ struct cpt_obj_bits v;
+
+ cpt_push_object(&saved_obj, ctx);
+ cpt_open_object(NULL, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_SKFILTER;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_DATA;
+ v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter);
+
+ ctx->write(&v, sizeof(v), ctx);
+ ctx->write(sk->sk_filter->insns, v.cpt_size, ctx);
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+ cpt_push_object(&saved_obj, ctx);
+ cpt_dump_mcfilter(sk, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+}
+
+/* Dump socket content */
+
+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx)
+{
+ struct cpt_sock_image *v = cpt_get_buf(ctx);
+ struct socket *sock;
+ struct timeval tmptv;
+
+ cpt_open_object(obj, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_SOCKET;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_file = CPT_NULL;
+ sock = sk->sk_socket;
+ if (sock && sock->file) {
+ cpt_object_t *tobj;
+ tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx);
+ if (tobj)
+ v->cpt_file = tobj->o_pos;
+ }
+ v->cpt_index = index;
+ v->cpt_parent = parent;
+
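+	/* For INET sockets take the socket lock once (o_lock remembers that it
+	 * is held) and keep it held, so the socket state and queues cannot
+	 * change while they are being dumped; lockdep is bypassed around the
+	 * call because the lock is not released in this function. */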
+ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+ if (sock && !obj->o_lock) {
+ lockdep_off();
+ lock_sock(sk);
+ lockdep_on();
+ obj->o_lock = 1;
+ }
+ }
+
+ /* Some bits stored in inode */
+ v->cpt_ssflags = sock ? sock->flags : 0;
+ v->cpt_sstate = sock ? sock->state : 0;
+ v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0;
+
+ /* Common data */
+ v->cpt_family = sk->sk_family;
+ v->cpt_type = sk->sk_type;
+ v->cpt_state = sk->sk_state;
+ v->cpt_reuse = sk->sk_reuse;
+ v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED);
+ v->cpt_shutdown = sk->sk_shutdown;
+ v->cpt_userlocks = sk->sk_userlocks;
+ v->cpt_no_check = sk->sk_no_check;
+ v->cpt_zapped = sock_flag(sk, SOCK_DBG);
+ v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+ v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE);
+ v->cpt_protocol = sk->sk_protocol;
+ v->cpt_err = sk->sk_err;
+ v->cpt_err_soft = sk->sk_err_soft;
+ v->cpt_max_ack_backlog = sk->sk_max_ack_backlog;
+ v->cpt_priority = sk->sk_priority;
+ v->cpt_rcvlowat = sk->sk_rcvlowat;
+ v->cpt_rcvtimeo = CPT_NULL;
+ if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT)
+ v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo;
+ v->cpt_sndtimeo = CPT_NULL;
+ if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT)
+ v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo;
+ v->cpt_rcvbuf = sk->sk_rcvbuf;
+ v->cpt_sndbuf = sk->sk_sndbuf;
+ v->cpt_bound_dev_if = sk->sk_bound_dev_if;
+ v->cpt_flags = sk->sk_flags;
+ v->cpt_lingertime = CPT_NULL;
+ if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT)
+ v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime;
+ v->cpt_peer_pid = sk->sk_peercred.pid;
+ v->cpt_peer_uid = sk->sk_peercred.uid;
+ v->cpt_peer_gid = sk->sk_peercred.gid;
+ tmptv = ktime_to_timeval(sk->sk_stamp);
+ v->cpt_stamp = cpt_timeval_export(&tmptv);
+
+ v->cpt_peer = -1;
+ v->cpt_socketpair = 0;
+ v->cpt_deleted = 0;
+
+ v->cpt_laddrlen = 0;
+ if (sock) {
+ int alen = sizeof(v->cpt_laddr);
+ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+ v->cpt_laddrlen = alen;
+ }
+ v->cpt_raddrlen = 0;
+ if (sock) {
+ int alen = sizeof(v->cpt_raddr);
+ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2);
+ if (!err)
+ v->cpt_raddrlen = alen;
+ }
+
+ if (sk->sk_family == AF_UNIX) {
+ if (unix_sk(sk)->dentry) {
+ struct dentry *d = unix_sk(sk)->dentry;
+ v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d);
+ if (!v->cpt_deleted) {
+ int err = 0;
+ char *path;
+ struct path p;
+ unsigned long pg = __get_free_page(GFP_KERNEL);
+
+ if (!pg) {
+ cpt_release_buf(ctx);
+ return -ENOMEM;
+ }
+
+ p.dentry = d;
+ p.mnt = unix_sk(sk)->mnt;
+ path = d_path(&p, (char *)pg, PAGE_SIZE);
+
+ if (!IS_ERR(path)) {
+ int len = strlen(path);
+ if (len < 126) {
+ strcpy(((char*)v->cpt_laddr)+2, path);
+ v->cpt_laddrlen = len + 2;
+ } else {
+ wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2);
+ }
+ err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, 1, ctx);
+ } else {
+ eprintk_ctx("cannot get path of an af_unix socket\n");
+ err = PTR_ERR(path);
+ }
+ free_page(pg);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+ }
+ }
+
+ /* If the socket is connected, find its peer. If peer is not
+		 * in our table, the socket is connected to an external process
+ * and we consider it disconnected.
+ */
+ if (unix_peer(sk)) {
+ cpt_object_t *pobj;
+ pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx);
+ if (pobj)
+ v->cpt_peer = pobj->o_index;
+ else
+ v->cpt_shutdown = SHUTDOWN_MASK;
+
+ if (unix_peer(unix_peer(sk)) == sk)
+ v->cpt_socketpair = 1;
+ }
+
+		/* If the socket shares its address with another socket, it is
+		 * a child of some listening socket. Find and record it. */
+ if (unix_sk(sk)->addr &&
+ atomic_read(&unix_sk(sk)->addr->refcnt) > 1 &&
+ sk->sk_state != TCP_LISTEN) {
+ cpt_object_t *pobj;
+ for_each_object(pobj, CPT_OBJ_SOCKET) {
+ struct sock *psk = pobj->o_obj;
+ if (psk->sk_family == AF_UNIX &&
+ psk->sk_state == TCP_LISTEN &&
+ unix_sk(psk)->addr == unix_sk(sk)->addr) {
+ v->cpt_parent = pobj->o_index;
+ break;
+ }
+ }
+ }
+ }
+
+ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+ cpt_dump_socket_in(v, sk, ctx);
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ cpt_dump_sock_attr(sk, ctx);
+
+ dump_rqueue(index, sk, ctx);
+ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
+ dump_wqueue(index, sk, ctx);
+ cpt_dump_ofo_queue(index, sk, ctx);
+ }
+
+ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+ && sk->sk_state == TCP_LISTEN)
+ cpt_dump_synwait_queue(sk, index, ctx);
+
+ cpt_close_object(ctx);
+
+ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
+ && sk->sk_state == TCP_LISTEN)
+ cpt_dump_accept_queue(sk, index, ctx);
+
+ return 0;
+}
+
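+/* Walk the TCP established hash and dump sockets that belong to the
+ * current VE, are marked SOCK_DEAD and have no struct socket attached:
+ * orphans that were closed by their owner but are still completing the
+ * TCP state machine.  Each one is dumped with the socket lock marked
+ * as owned, then shut down with tcp_done(), and the hash chain is
+ * rescanned from the beginning.
+ */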
+int cpt_dump_orphaned_sockets(struct cpt_context *ctx)
+{
+ int i;
+
+ cpt_open_section(ctx, CPT_SECT_ORPHANS);
+
+ for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
+retry:
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) {
+
+ if (sk->owner_env != get_exec_env())
+ continue;
+ if (sk->sk_socket)
+ continue;
+ if (!sock_flag(sk, SOCK_DEAD))
+ continue;
+ if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx))
+ continue;
+ sock_hold(sk);
+ spin_unlock_bh(lock);
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ eprintk_ctx("BUG: sk locked by whom?\n");
+ sk->sk_lock.owned = 1;
+ bh_unlock_sock(sk);
+ local_bh_enable();
+
+ cpt_dump_socket(NULL, sk, -1, -1, ctx);
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ sk->sk_lock.owned = 0;
+ clear_backlog(sk);
+ tcp_done(sk);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+
+ goto retry;
+ }
+ spin_unlock_bh(lock);
+ }
+ cpt_close_section(ctx);
+ return 0;
+}
+
+static int can_dump(struct sock *sk, cpt_context_t *ctx)
+{
+ switch (sk->sk_family) {
+ case AF_NETLINK:
+ if (((struct netlink_sock *)sk)->cb) {
+ eprintk_ctx("netlink socket has active callback\n");
+ return 0;
+ }
+ break;
+ }
+ return 1;
+}
+
+/* We are not going to block suspend when we have external AF_UNIX connections.
+ * But we cannot stop the feed of new packets/connections into our environment
+ * from outside. Taking into account that it is intrinsically unreliable,
+ * we collect some amount of data, but on checkpoint/restore we drop
+ * everything that does not make sense to keep: skbs sent by outside
+ * processes, connections from outside etc.
+ */
+
+/* The first pass. When we see a socket referenced by a file, we just
+ * add it to the socket table. */
+int cpt_collect_socket(struct file *file, cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+ struct socket *sock;
+ struct sock *sk;
+
+ if (!S_ISSOCK(file->f_dentry->d_inode->i_mode))
+ return -ENOTSOCK;
+ sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket;
+ sk = sock->sk;
+ if (!can_dump(sk, ctx))
+ return -EAGAIN;
+ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL)
+ return -ENOMEM;
+ obj->o_parent = file;
+
+ return 0;
+}
+
+/*
+ * We should end up with a table containing:
+ * * all sockets opened by our processes;
+ * * all the sockets queued in listening queues of _our_ listening sockets
+ *   which are connected to our opened sockets.
+ */
+
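+/* Walk the receive queue of a listening AF_UNIX socket: every queued
+ * skb carries an embryonic (not yet accepted) connection in skb->sk.
+ * Such a child is added to the object table only if its peer is one of
+ * the sockets we have already collected.
+ */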
+static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx)
+{
+ struct sock *sk = obj->o_obj;
+ cpt_object_t *cobj;
+ struct sk_buff *skb;
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
+ struct sock *lsk = skb->sk;
+ if (unix_peer(lsk) &&
+ lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) {
+ if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL)
+ return -ENOMEM;
+ cobj->o_parent = obj->o_parent;
+ }
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb->next;
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ }
+
+ return 0;
+}
+
+int cpt_index_sockets(cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+ unsigned long index = 0;
+
+ /* Collect not-yet-accepted children of listening sockets. */
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ struct sock *sk = obj->o_obj;
+
+ if (sk->sk_state != TCP_LISTEN)
+ continue;
+
+ if (sk->sk_family == AF_UNIX)
+ collect_one_unix_listening_sock(obj, ctx);
+ }
+
+ /* Assign indices to all the sockets. */
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ struct sock *sk = obj->o_obj;
+ cpt_obj_setindex(obj, index++, ctx);
+
+ if (sk->sk_socket && sk->sk_socket->file) {
+ cpt_object_t *tobj;
+ tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx);
+ if (tobj)
+ cpt_obj_setindex(tobj, obj->o_index, ctx);
+ }
+ }
+
+ return 0;
+}
+
+void cpt_unlock_sockets(cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+
+ lockdep_off();
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ struct sock *sk = obj->o_obj;
+ if (sk && obj->o_lock) {
+ if (sk->sk_socket)
+ release_sock(sk);
+ }
+ }
+ lockdep_on();
+}
+
+void cpt_kill_sockets(cpt_context_t * ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ struct sock *sk = obj->o_obj;
+ if (sk && obj->o_lock) {
+ struct ve_struct *old_env;
+ old_env = set_exec_env(sk->owner_env);
+ cpt_kill_socket(sk, ctx);
+ if (sk->sk_socket)
+ release_sock_nobacklog(sk);
+ set_exec_env(old_env);
+ }
+ }
+}
+
+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx)
+{
+ struct fasync_struct *fa;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct socket *sock;
+
+ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
+
+ for (fa = sock->fasync_list; fa; fa = fa->fa_next) {
+ if (fa->fa_file == file)
+ return fa->fa_fd;
+ }
+ return -1;
+}
diff --git a/kernel/cpt/cpt_socket.h b/kernel/cpt/cpt_socket.h
new file mode 100644
index 0000000..9c64399
--- /dev/null
+++ b/kernel/cpt/cpt_socket.h
@@ -0,0 +1,37 @@
+struct sock;
+
+int cpt_collect_passedfds(cpt_context_t *);
+int cpt_index_sockets(cpt_context_t *);
+int cpt_collect_socket(struct file *, cpt_context_t *);
+int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx);
+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx);
+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx);
+int rst_sockets(struct cpt_context *ctx);
+int rst_sockets_complete(struct cpt_context *ctx);
+int cpt_dump_orphaned_sockets(struct cpt_context *ctx);
+
+int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx);
+struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner,
+ __u32 *queue, struct cpt_context *ctx);
+
+void cpt_unlock_sockets(cpt_context_t *);
+void cpt_kill_sockets(cpt_context_t *);
+
+
+int cpt_kill_socket(struct sock *, cpt_context_t *);
+int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*);
+int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx);
+int rst_listen_socket_in(struct sock *sk, struct cpt_sock_image *si,
+ loff_t pos, struct cpt_context *ctx);
+__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx);
+int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *);
+int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx);
+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx);
+int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct sock *sk,
+ struct cpt_context *ctx);
+int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx);
+
+int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
+ loff_t pos, cpt_context_t *ctx);
+int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
+ loff_t pos, cpt_context_t *ctx);
diff --git a/kernel/cpt/cpt_socket_in.c b/kernel/cpt/cpt_socket_in.c
new file mode 100644
index 0000000..d565745
--- /dev/null
+++ b/kernel/cpt/cpt_socket_in.c
@@ -0,0 +1,448 @@
+/*
+ *
+ * kernel/cpt/cpt_socket_in.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/if_inet6.h>
+#include <linux/igmp.h>
+#include <linux/ipv6.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+
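+/* Timer values are stored in the image as signed deltas relative to
+ * the current jiffies (or tcp_time_stamp) instead of absolute values,
+ * so that they can be re-armed against a different clock base on
+ * restore.
+ */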
+static inline __u32 jiffies_export(unsigned long tmo)
+{
+ __s32 delta = (long)(tmo - jiffies);
+ return delta;
+}
+
+static inline __u32 tcp_jiffies_export(__u32 tmo)
+{
+ __s32 delta = tmo - tcp_time_stamp;
+ return delta;
+}
+
+int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx)
+{
+ struct sk_buff *skb;
+ struct tcp_sock *tp;
+
+ if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP)
+ return 0;
+
+ tp = tcp_sk(sk);
+
+ skb = skb_peek(&tp->out_of_order_queue);
+ while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) {
+ int err;
+
+ err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, sk, ctx);
+ if (err)
+ return err;
+
+ spin_lock_irq(&tp->out_of_order_queue.lock);
+ skb = skb->next;
+ spin_unlock_irq(&tp->out_of_order_queue.lock);
+ }
+ return 0;
+}
+
+static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk,
+ struct cpt_context *ctx)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ si->cpt_pred_flags = tp->pred_flags;
+ si->cpt_rcv_nxt = tp->rcv_nxt;
+ si->cpt_snd_nxt = tp->snd_nxt;
+ si->cpt_snd_una = tp->snd_una;
+ si->cpt_snd_sml = tp->snd_sml;
+ si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp);
+ si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime);
+ si->cpt_tcp_header_len = tp->tcp_header_len;
+ si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending;
+ si->cpt_quick = inet_csk(sk)->icsk_ack.quick;
+ si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong;
+ si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked;
+ si->cpt_ato = inet_csk(sk)->icsk_ack.ato;
+ si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout);
+ si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime);
+ si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size;
+ si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss;
+ si->cpt_snd_wl1 = tp->snd_wl1;
+ si->cpt_snd_wnd = tp->snd_wnd;
+ si->cpt_max_window = tp->max_window;
+ si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie;
+ si->cpt_mss_cache = tp->mss_cache;
+	si->cpt_mss_cache_std = tp->mss_cache; /* FIXME: was tp->mss_cache_std */
+ si->cpt_mss_clamp = tp->rx_opt.mss_clamp;
+ si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len;
+ si->cpt_ext2_header_len = 0;
+ si->cpt_ca_state = inet_csk(sk)->icsk_ca_state;
+ si->cpt_retransmits = inet_csk(sk)->icsk_retransmits;
+ si->cpt_reordering = tp->reordering;
+ si->cpt_frto_counter = tp->frto_counter;
+ si->cpt_frto_highmark = tp->frto_highmark;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+ // // si->cpt_adv_cong = tp->adv_cong;
+#endif
+ si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept;
+ si->cpt_backoff = inet_csk(sk)->icsk_backoff;
+ si->cpt_srtt = tp->srtt;
+ si->cpt_mdev = tp->mdev;
+ si->cpt_mdev_max = tp->mdev_max;
+ si->cpt_rttvar = tp->rttvar;
+ si->cpt_rtt_seq = tp->rtt_seq;
+ si->cpt_rto = inet_csk(sk)->icsk_rto;
+ si->cpt_packets_out = tp->packets_out;
+ si->cpt_left_out = tp->sacked_out + tp->lost_out;
+ si->cpt_retrans_out = tp->retrans_out;
+ si->cpt_lost_out = tp->lost_out;
+ si->cpt_sacked_out = tp->sacked_out;
+ si->cpt_fackets_out = tp->fackets_out;
+ si->cpt_snd_ssthresh = tp->snd_ssthresh;
+ si->cpt_snd_cwnd = tp->snd_cwnd;
+ si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt;
+ si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp;
+ si->cpt_snd_cwnd_used = tp->snd_cwnd_used;
+ si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp);
+ si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout);
+ si->cpt_ka_timeout = 0;
+ si->cpt_rcv_wnd = tp->rcv_wnd;
+ si->cpt_rcv_wup = tp->rcv_wup;
+ si->cpt_write_seq = tp->write_seq;
+ si->cpt_pushed_seq = tp->pushed_seq;
+ si->cpt_copied_seq = tp->copied_seq;
+ si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok;
+ si->cpt_wscale_ok = tp->rx_opt.wscale_ok;
+ si->cpt_sack_ok = tp->rx_opt.sack_ok;
+ si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp;
+ si->cpt_snd_wscale = tp->rx_opt.snd_wscale;
+ si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale;
+ si->cpt_nonagle = tp->nonagle;
+ si->cpt_keepalive_probes = tp->keepalive_probes;
+ si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval;
+ si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr;
+ si->cpt_ts_recent = tp->rx_opt.ts_recent;
+ si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ si->cpt_user_mss = tp->rx_opt.user_mss;
+ si->cpt_dsack = tp->rx_opt.dsack;
+ si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq;
+ si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq;
+ si->cpt_sack_array[2] = tp->selective_acks[0].start_seq;
+ si->cpt_sack_array[3] = tp->selective_acks[0].end_seq;
+ si->cpt_sack_array[4] = tp->selective_acks[1].start_seq;
+ si->cpt_sack_array[5] = tp->selective_acks[1].end_seq;
+ si->cpt_sack_array[6] = tp->selective_acks[2].start_seq;
+ si->cpt_sack_array[7] = tp->selective_acks[2].end_seq;
+ si->cpt_sack_array[8] = tp->selective_acks[3].start_seq;
+ si->cpt_sack_array[9] = tp->selective_acks[3].end_seq;
+ si->cpt_window_clamp = tp->window_clamp;
+ si->cpt_rcv_ssthresh = tp->rcv_ssthresh;
+ si->cpt_probes_out = inet_csk(sk)->icsk_probes_out;
+ si->cpt_num_sacks = tp->rx_opt.num_sacks;
+ si->cpt_advmss = tp->advmss;
+ si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries;
+ si->cpt_ecn_flags = tp->ecn_flags;
+ si->cpt_prior_ssthresh = tp->prior_ssthresh;
+ si->cpt_high_seq = tp->high_seq;
+ si->cpt_retrans_stamp = tp->retrans_stamp;
+ si->cpt_undo_marker = tp->undo_marker;
+ si->cpt_undo_retrans = tp->undo_retrans;
+ si->cpt_urg_seq = tp->urg_seq;
+ si->cpt_urg_data = tp->urg_data;
+ si->cpt_pending = inet_csk(sk)->icsk_pending;
+ si->cpt_snd_up = tp->snd_up;
+ si->cpt_keepalive_time = tp->keepalive_time;
+ si->cpt_keepalive_intvl = tp->keepalive_intvl;
+ si->cpt_linger2 = tp->linger2;
+
+ if (sk->sk_state != TCP_LISTEN &&
+ sk->sk_state != TCP_CLOSE &&
+ sock_flag(sk, SOCK_KEEPOPEN)) {
+ si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires);
+ }
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ {
+ extern struct inet_connection_sock_af_ops ipv6_mapped;
+ if (sk->sk_family == AF_INET6 &&
+ inet_csk(sk)->icsk_af_ops == &ipv6_mapped)
+ si->cpt_mapped = 1;
+ }
+#endif
+
+ return 0;
+}
+
+
+int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk,
+ struct cpt_context *ctx)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (sk->sk_family == AF_INET) {
+ struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr);
+ sin->sin_family = AF_INET;
+ sin->sin_port = inet->sport;
+ sin->sin_addr.s_addr = inet->rcv_saddr;
+ si->cpt_laddrlen = sizeof(*sin);
+ } else if (sk->sk_family == AF_INET6) {
+ struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = inet->sport;
+ memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16);
+ si->cpt_laddrlen = sizeof(*sin6);
+ }
+ if (!inet->num)
+ si->cpt_laddrlen = 0;
+
+ si->cpt_daddr = inet->daddr;
+ si->cpt_dport = inet->dport;
+ si->cpt_saddr = inet->saddr;
+ si->cpt_rcv_saddr = inet->rcv_saddr;
+ si->cpt_sport = inet->sport;
+ si->cpt_uc_ttl = inet->uc_ttl;
+ si->cpt_tos = inet->tos;
+ si->cpt_cmsg_flags = inet->cmsg_flags;
+ si->cpt_mc_index = inet->mc_index;
+ si->cpt_mc_addr = inet->mc_addr;
+ si->cpt_hdrincl = inet->hdrincl;
+ si->cpt_mc_ttl = inet->mc_ttl;
+ si->cpt_mc_loop = inet->mc_loop;
+ si->cpt_pmtudisc = inet->pmtudisc;
+ si->cpt_recverr = inet->recverr;
+ si->cpt_freebind = inet->freebind;
+ si->cpt_idcounter = inet->id;
+
+ si->cpt_cork_flags = inet->cork.flags;
+ si->cpt_cork_fragsize = 0;
+ si->cpt_cork_length = inet->cork.length;
+ si->cpt_cork_addr = inet->cork.addr;
+ si->cpt_cork_saddr = inet->cork.fl.fl4_src;
+ si->cpt_cork_daddr = inet->cork.fl.fl4_dst;
+ si->cpt_cork_oif = inet->cork.fl.oif;
+ if (inet->cork.dst) {
+ struct rtable *rt = (struct rtable *)inet->cork.dst;
+ si->cpt_cork_fragsize = inet->cork.fragsize;
+ si->cpt_cork_saddr = rt->fl.fl4_src;
+ si->cpt_cork_daddr = rt->fl.fl4_dst;
+ si->cpt_cork_oif = rt->fl.oif;
+ }
+
+ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
+ struct udp_sock *up = udp_sk(sk);
+ si->cpt_udp_pending = up->pending;
+ si->cpt_udp_corkflag = up->corkflag;
+ si->cpt_udp_encap = up->encap_type;
+ si->cpt_udp_len = up->len;
+ }
+
+ if (sk->sk_family == AF_INET6) {
+ memcpy(si->cpt_saddr6, &np->saddr, 16);
+ memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16);
+ memcpy(si->cpt_daddr6, &np->daddr, 16);
+ si->cpt_flow_label6 = np->flow_label;
+ si->cpt_frag_size6 = np->frag_size;
+ si->cpt_hop_limit6 = np->hop_limit;
+ si->cpt_mcast_hops6 = np->mcast_hops;
+ si->cpt_mcast_oif6 = np->mcast_oif;
+ si->cpt_rxopt6 = np->rxopt.all;
+ si->cpt_mc_loop6 = np->mc_loop;
+ si->cpt_recverr6 = np->recverr;
+ si->cpt_sndflow6 = np->sndflow;
+ si->cpt_pmtudisc6 = np->pmtudisc;
+ si->cpt_ipv6only6 = np->ipv6only;
+ si->cpt_mapped = 0;
+ }
+
+ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
+ cpt_dump_socket_tcp(si, sk, ctx);
+
+ return 0;
+}
+
+int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx)
+{
+ struct request_sock *req;
+
+ for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next)
+ cpt_dump_socket(NULL, req->sk, -1, index, ctx);
+ return 0;
+}
+
+
+static int dump_openreq(struct request_sock *req, struct sock *sk, int index,
+ struct cpt_context *ctx)
+{
+ struct cpt_openreq_image *v = cpt_get_buf(ctx);
+
+ cpt_open_object(NULL, ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_OPENREQ;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_VOID;
+
+ v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn;
+ v->cpt_snt_isn = tcp_rsk(req)->snt_isn;
+ v->cpt_rmt_port = inet_rsk(req)->rmt_port;
+ v->cpt_mss = req->mss;
+ v->cpt_family = req->rsk_ops->family;
+ v->cpt_retrans = req->retrans;
+ v->cpt_snd_wscale = inet_rsk(req)->snd_wscale;
+ v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale;
+ v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok;
+ v->cpt_sack_ok = inet_rsk(req)->sack_ok;
+ v->cpt_wscale_ok = inet_rsk(req)->wscale_ok;
+ v->cpt_ecn_ok = inet_rsk(req)->ecn_ok;
+ v->cpt_acked = inet_rsk(req)->acked;
+ v->cpt_window_clamp = req->window_clamp;
+ v->cpt_rcv_wnd = req->rcv_wnd;
+ v->cpt_ts_recent = req->ts_recent;
+ v->cpt_expires = jiffies_export(req->expires);
+
+ if (v->cpt_family == AF_INET) {
+ memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4);
+ memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4);
+ } else {
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16);
+ memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16);
+ v->cpt_iif = inet6_rsk(req)->iif;
+#endif
+ }
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ cpt_close_object(ctx);
+ return 0;
+}
+
+int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx)
+{
+ struct inet_connection_sock *icsk;
+ struct listen_sock *lopt;
+ struct request_sock *req;
+ int nr_entries;
+ int i;
+
+ icsk = inet_csk(sk);
+ lopt = icsk->icsk_accept_queue.listen_opt;
+ nr_entries = icsk->icsk_accept_queue.listen_opt->nr_table_entries;
+
+ for (i=0; i < nr_entries; i++) {
+ for (req=lopt->syn_table[i]; req; req=req->dl_next) {
+ loff_t saved_obj;
+ cpt_push_object(&saved_obj, ctx);
+ dump_openreq(req, sk, index, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ }
+ }
+ return 0;
+}
+
+
+int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx)
+{
+ if (sk->sk_state != TCP_CLOSE &&
+ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
+ sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_state != TCP_LISTEN)
+ tcp_set_state(sk, TCP_CLOSE);
+ else
+ sk->sk_prot->disconnect(sk, 0);
+ }
+ return 0;
+}
+
+int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ip_mc_socklist *iml;
+
+ for (iml = inet->mc_list; iml; iml = iml->next) {
+ struct cpt_sockmc_image smi;
+ int scnt = 0;
+ int i;
+
+ if (iml->sflist)
+ scnt = iml->sflist->sl_count*16;
+
+ smi.cpt_next = sizeof(smi) + scnt;
+ smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+ smi.cpt_hdrlen = sizeof(smi);
+ smi.cpt_content = CPT_CONTENT_DATA;
+
+ smi.cpt_family = AF_INET;
+ smi.cpt_mode = iml->sfmode;
+ smi.cpt_ifindex = iml->multi.imr_ifindex;
+ memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr));
+ smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr;
+
+ ctx->write(&smi, sizeof(smi), ctx);
+
+ for (i = 0; i < scnt; i++) {
+ u32 addr[4];
+ memset(&addr, 0, sizeof(addr));
+ addr[0] = iml->sflist->sl_addr[i];
+ ctx->write(&addr, sizeof(addr), ctx);
+ }
+ }
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_mc_socklist *mcl;
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) {
+ struct cpt_sockmc_image smi;
+ int scnt = 0;
+ int i;
+
+ if (mcl->sflist)
+ scnt = mcl->sflist->sl_count*16;
+
+ smi.cpt_next = sizeof(smi) + scnt;
+ smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
+ smi.cpt_hdrlen = sizeof(smi);
+ smi.cpt_content = CPT_CONTENT_DATA;
+
+ smi.cpt_family = AF_INET6;
+ smi.cpt_mode = mcl->sfmode;
+ smi.cpt_ifindex = mcl->ifindex;
+ memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr));
+
+ ctx->write(&smi, sizeof(smi), ctx);
+ for (i = 0; i < scnt; i++)
+ ctx->write(&mcl->sflist->sl_addr[i], 16, ctx);
+ }
+ }
+#endif
+ return 0;
+}
diff --git a/kernel/cpt/cpt_syscalls.h b/kernel/cpt/cpt_syscalls.h
new file mode 100644
index 0000000..8cc0925
--- /dev/null
+++ b/kernel/cpt/cpt_syscalls.h
@@ -0,0 +1,101 @@
+#include <linux/unistd.h>
+#include <linux/syscalls.h>
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+
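+/*
+ * WRAP() simply forwards to the corresponding sys_* call.  WRAP2()
+ * additionally switches the address-space limit to KERNEL_DS around
+ * the call, so that syscalls taking __user pointers can be fed
+ * kernel-space buffers from the checkpoint/restore code.
+ */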
+#define WRAP(c, args) return sys_##c args
+#define WRAP2(c, args) int err; mm_segment_t oldfs; \
+ oldfs = get_fs(); set_fs(KERNEL_DS); \
+ err = sys_##c args ;\
+ set_fs(oldfs); \
+ return err
+
+static inline int sc_close(int fd)
+{
+ WRAP(close, (fd));
+}
+
+static inline int sc_dup2(int fd1, int fd2)
+{
+ WRAP(dup2, (fd1, fd2));
+}
+
+static inline int sc_unlink(char *name)
+{
+ WRAP2(unlink, (name));
+}
+
+static inline int sc_pipe(int *pfd)
+{
+ return do_pipe_flags(pfd, 0);
+}
+
+static inline int sc_mknod(char *name, int mode, int dev)
+{
+ WRAP2(mknod, (name, mode, dev));
+}
+
+static inline int sc_chmod(char *name, int mode)
+{
+	WRAP2(chmod, (name, mode));
+}
+
+static inline int sc_chown(char *name, int uid, int gid)
+{
+ WRAP2(chown, (name, uid, gid));
+}
+
+static inline int sc_mkdir(char *name, int mode)
+{
+ WRAP2(mkdir, (name, mode));
+}
+
+static inline int sc_rmdir(char *name)
+{
+ WRAP2(rmdir, (name));
+}
+
+static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags)
+{
+ WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL));
+}
+
+static inline int sc_mprotect(unsigned long start, size_t len,
+ unsigned long prot)
+{
+ WRAP(mprotect, (start, len, prot));
+}
+
+static inline int sc_mlock(unsigned long start, size_t len)
+{
+ WRAP(mlock, (start, len));
+}
+
+static inline int sc_munlock(unsigned long start, size_t len)
+{
+ WRAP(munlock, (start, len));
+}
+
+static inline int sc_remap_file_pages(unsigned long start, size_t len,
+ unsigned long prot, unsigned long pgoff,
+ unsigned long flags)
+{
+ WRAP(remap_file_pages, (start, len, prot, pgoff, flags));
+}
+
+static inline int sc_waitx(int pid, int opt, int *stat_addr)
+{
+ WRAP(wait4, (pid, stat_addr, opt, NULL));
+}
+
+static inline int sc_flock(int fd, int flags)
+{
+ WRAP(flock, (fd, flags));
+}
+
+static inline int sc_open(char* path, int flags, int mode)
+{
+ WRAP(open, (path, flags, mode));
+}
+
+extern int sc_execve(char *cms, char **argv, char **env);
diff --git a/kernel/cpt/cpt_sysvipc.c b/kernel/cpt/cpt_sysvipc.c
new file mode 100644
index 0000000..820f1ac
--- /dev/null
+++ b/kernel/cpt/cpt_sysvipc.c
@@ -0,0 +1,403 @@
+/*
+ *
+ * kernel/cpt/cpt_sysvipc.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/shm.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_image.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_kernel.h"
+
+struct _warg {
+ struct file *file;
+ struct cpt_sysvshm_image *v;
+};
+
+static int dump_one_shm(struct shmid_kernel *shp, void *arg)
+{
+ struct _warg *warg = arg;
+ struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v;
+
+ if (shp->shm_file != warg->file)
+ return 0;
+
+ v->cpt_key = shp->shm_perm.key;
+ v->cpt_uid = shp->shm_perm.uid;
+ v->cpt_gid = shp->shm_perm.gid;
+ v->cpt_cuid = shp->shm_perm.cuid;
+ v->cpt_cgid = shp->shm_perm.cgid;
+ v->cpt_mode = shp->shm_perm.mode;
+ v->cpt_seq = shp->shm_perm.seq;
+
+ v->cpt_id = shp->shm_perm.id;
+ v->cpt_segsz = shp->shm_segsz;
+ v->cpt_atime = shp->shm_atim;
+ v->cpt_ctime = shp->shm_ctim;
+ v->cpt_dtime = shp->shm_dtim;
+ v->cpt_creator = shp->shm_cprid;
+ v->cpt_last = shp->shm_lprid;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+ v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1;
+#else
+ v->cpt_mlockuser = -1;
+#endif
+ return 1;
+}
+
+int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx)
+{
+ struct cpt_sysvshm_image *v = cpt_get_buf(ctx);
+ struct _warg warg;
+
+ v->cpt_next = sizeof(*v);
+ v->cpt_object = CPT_OBJ_SYSV_SHM;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_VOID;
+
+ warg.file = file;
+ warg.v = v;
+ if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) {
+ cpt_release_buf(ctx);
+ return -ESRCH;
+ }
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+ return 0;
+}
+
+
+int match_sem(int id, struct sem_array *sema, void *arg)
+{
+ if (id != (unsigned long)arg)
+ return 0;
+ return sema->sem_nsems + 1;
+}
+
+static int get_sem_nsem(int id, cpt_context_t *ctx)
+{
+ int res;
+ res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id);
+ if (res > 0)
+ return res - 1;
+ eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id);
+ return -ESRCH;
+}
+
+static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx)
+{
+ struct cpt_sysvsem_undo_image v;
+ loff_t saved_obj;
+
+ cpt_open_object(NULL, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_SEMUNDO;
+ v.cpt_id = su->semid;
+ v.cpt_nsem = get_sem_nsem(su->semid, ctx);
+ if ((int)v.cpt_nsem < 0)
+ return -ESRCH;
+
+ ctx->write(&v, sizeof(v), ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx);
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+ return 0;
+}
+
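+/* One semaphore set is converted into the buffer per sysvipc_walk_sem()
+ * pass; last_id remembers the slot of the set dumped last, so the next
+ * pass skips everything up to and including it.  This lets the caller
+ * write the buffer out without holding the IPC lock.
+ */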
+struct sem_warg {
+ int last_id;
+ struct cpt_sysvsem_image *v;
+};
+
+static int dump_one_sem(int id, struct sem_array *sma, void *arg)
+{
+ struct sem_warg * warg = (struct sem_warg *)arg;
+ struct cpt_sysvsem_image *v = warg->v;
+ int i;
+
+ if (warg->last_id != -1) {
+ if ((id % IPCMNI) <= warg->last_id)
+ return 0;
+ }
+
+ v->cpt_next = sizeof(*v);
+ v->cpt_object = CPT_OBJ_SYSV_SEM;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_SEMARRAY;
+
+ v->cpt_key = sma->sem_perm.key;
+ v->cpt_uid = sma->sem_perm.uid;
+ v->cpt_gid = sma->sem_perm.gid;
+ v->cpt_cuid = sma->sem_perm.cuid;
+ v->cpt_cgid = sma->sem_perm.cgid;
+ v->cpt_mode = sma->sem_perm.mode;
+ v->cpt_seq = sma->sem_perm.seq;
+
+ v->cpt_id = id;
+ v->cpt_ctime = sma->sem_ctime;
+ v->cpt_otime = sma->sem_otime;
+
+ for (i=0; i<sma->sem_nsems; i++) {
+ struct {
+ __u32 semval;
+ __u32 sempid;
+ } *s = (void*)v + v->cpt_next;
+ if (v->cpt_next >= PAGE_SIZE - sizeof(*s))
+ return -EINVAL;
+ s->semval = sma->sem_base[i].semval;
+ s->sempid = sma->sem_base[i].sempid;
+ v->cpt_next += sizeof(*s);
+ }
+
+ warg->last_id = id % IPCMNI;
+ return 1;
+}
+
+
+int cpt_dump_sysvsem(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+ struct sem_warg warg;
+
+ /* Dumping semaphores is quite tricky because we cannot
+ * write to dump file under lock inside sysvipc_walk_sem().
+ */
+ cpt_open_section(ctx, CPT_SECT_SYSV_SEM);
+ warg.last_id = -1;
+ warg.v = cpt_get_buf(ctx);
+ for (;;) {
+ if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0)
+ break;
+ ctx->write(warg.v, warg.v->cpt_next, ctx);
+ }
+ cpt_release_buf(ctx);
+ cpt_close_section(ctx);
+
+ cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO);
+ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
+ struct sem_undo_list *semu = obj->o_obj;
+ struct sem_undo *su;
+ struct cpt_object_hdr v;
+ loff_t saved_obj;
+
+ cpt_open_object(obj, ctx);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_ARRAY;
+
+ ctx->write(&v, sizeof(v), ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ list_for_each_entry(su, &semu->list_proc, list_proc) {
+ if (su->semid != -1) {
+ int err;
+ err = dump_one_semundo(su, ctx);
+ if (err < 0)
+ return err;
+ }
+ }
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+ }
+ cpt_close_section(ctx);
+ return 0;
+}
+
+struct msg_warg {
+ int last_id;
+ struct msg_queue *msq;
+ struct cpt_sysvmsg_image *v;
+};
+
+static int dump_one_msg(int id, struct msg_queue *msq, void *arg)
+{
+ struct msg_warg * warg = (struct msg_warg *)arg;
+ struct cpt_sysvmsg_image *v = warg->v;
+
+ if (warg->last_id != -1) {
+ if ((id % IPCMNI) <= warg->last_id)
+ return 0;
+ }
+
+ v->cpt_next = sizeof(*v);
+ v->cpt_object = CPT_OBJ_SYSVMSG;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_key = msq->q_perm.key;
+ v->cpt_uid = msq->q_perm.uid;
+ v->cpt_gid = msq->q_perm.gid;
+ v->cpt_cuid = msq->q_perm.cuid;
+ v->cpt_cgid = msq->q_perm.cgid;
+ v->cpt_mode = msq->q_perm.mode;
+ v->cpt_seq = msq->q_perm.seq;
+
+ v->cpt_id = id;
+ v->cpt_stime = msq->q_stime;
+ v->cpt_rtime = msq->q_rtime;
+ v->cpt_ctime = msq->q_ctime;
+ v->cpt_last_sender = msq->q_lspid;
+ v->cpt_last_receiver = msq->q_lrpid;
+ v->cpt_qbytes = msq->q_qbytes;
+
+ warg->msq = msq;
+ warg->last_id = id % IPCMNI;
+ return 1;
+}
+
+static int do_store(void * src, int len, int offset, void * data)
+{
+ cpt_context_t * ctx = data;
+ ctx->write(src, len, ctx);
+ return 0;
+}
+
+static void cpt_dump_one_sysvmsg(struct msg_msg *m, cpt_context_t * ctx)
+{
+ loff_t saved_obj;
+ struct cpt_sysvmsg_msg_image mv;
+
+ cpt_open_object(NULL, ctx);
+ mv.cpt_next = CPT_NULL;
+ mv.cpt_object = CPT_OBJ_SYSVMSG_MSG;
+ mv.cpt_hdrlen = sizeof(mv);
+ mv.cpt_content = CPT_CONTENT_DATA;
+
+ mv.cpt_type = m->m_type;
+ mv.cpt_size = m->m_ts;
+
+ ctx->write(&mv, sizeof(mv), ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ sysv_msg_store(m, do_store, m->m_ts, ctx);
+ cpt_pop_object(&saved_obj, ctx);
+ cpt_close_object(ctx);
+}
+
+int cpt_dump_sysvmsg(struct cpt_context *ctx)
+{
+ struct msg_warg warg;
+
+ /* Dumping msg queues is tricky because we cannot
+ * write to dump file under lock inside sysvipc_walk_msg().
+ *
+ * And even worse, we have to access msg list in an unserialized
+ * context. It is fragile. But VE is still frozen, remember?
+ */
+ cpt_open_section(ctx, CPT_SECT_SYSV_MSG);
+ warg.last_id = -1;
+ warg.v = cpt_get_buf(ctx);
+ for (;;) {
+ loff_t saved_obj;
+ struct msg_msg * m;
+
+ if (sysvipc_walk_msg(dump_one_msg, &warg) <= 0)
+ break;
+
+ cpt_open_object(NULL, ctx);
+
+ ctx->write(warg.v, warg.v->cpt_next, ctx);
+
+ cpt_push_object(&saved_obj, ctx);
+ list_for_each_entry(m, &warg.msq->q_messages, m_list) {
+ cpt_dump_one_sysvmsg(m, ctx);
+ }
+ cpt_pop_object(&saved_obj, ctx);
+
+ cpt_close_object(ctx);
+ }
+ cpt_release_buf(ctx);
+ cpt_close_section(ctx);
+ return 0;
+}
+
+static int cpt_collect_sysvsem_undo(cpt_context_t *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ if (tsk->exit_state) {
+ /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list
+ * on exit. Grrr... */
+ continue;
+ }
+ if (tsk->sysvsem.undo_list &&
+ cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL)
+ return -ENOMEM;
+ }
+
+ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
+ struct sem_undo_list *semu = obj->o_obj;
+
+ if (atomic_read(&semu->refcnt) != obj->o_count) {
+ eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt));
+ return -EBUSY;
+ }
+ }
+ return 0;
+}
+
+static int collect_one_shm(struct shmid_kernel *shp, void *arg)
+{
+ cpt_context_t *ctx = arg;
+
+ if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+int cpt_collect_sysvshm(cpt_context_t * ctx)
+{
+ int err;
+
+ err = sysvipc_walk_shm(collect_one_shm, ctx);
+
+ return err < 0 ? err : 0;
+}
+
+int cpt_collect_sysv(cpt_context_t * ctx)
+{
+ int err;
+
+ err = cpt_collect_sysvsem_undo(ctx);
+ if (err)
+ return err;
+ err = cpt_collect_sysvshm(ctx);
+ if (err)
+ return err;
+
+ return 0;
+}
diff --git a/kernel/cpt/cpt_tty.c b/kernel/cpt/cpt_tty.c
new file mode 100644
index 0000000..8ac9417
--- /dev/null
+++ b/kernel/cpt/cpt_tty.c
@@ -0,0 +1,215 @@
+/*
+ *
+ * kernel/cpt/cpt_tty.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/tty.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_image.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+/* We must support at least N_TTY. */
+
+int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx)
+{
+ struct tty_struct *tty = file->private_data;
+ cpt_object_t *obj;
+ struct cpt_obj_ref o;
+ loff_t saved_pos;
+
+ obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx);
+ if (!obj)
+ return -EINVAL;
+
+ cpt_push_object(&saved_pos, ctx);
+
+ o.cpt_next = sizeof(o);
+ o.cpt_object = CPT_OBJ_REF;
+ o.cpt_hdrlen = sizeof(o);
+ o.cpt_content = CPT_CONTENT_VOID;
+ o.cpt_pos = obj->o_pos;
+ ctx->write(&o, sizeof(o), ctx);
+
+ cpt_pop_object(&saved_pos, ctx);
+
+ return 0;
+}
+
+int cpt_collect_tty(struct file *file, cpt_context_t * ctx)
+{
+ struct tty_struct *tty = file->private_data;
+
+ if (tty) {
+ if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL)
+ return -ENOMEM;
+ if (tty->link) {
+ cpt_object_t *obj;
+
+ obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx);
+ if (obj == NULL)
+ return -ENOMEM;
+ /* Undo o_count, tty->link is not a reference */
+ obj->o_count--;
+ }
+ }
+ return 0;
+}
+
+int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct tty_struct *tty = obj->o_obj;
+ struct cpt_tty_image *v;
+
+ if (tty->link) {
+ if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) {
+ eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE);
+ return -EINVAL;
+ }
+ if (tty->link->link != tty) {
+ eprintk_ctx("bad pty pair\n");
+ return -EINVAL;
+ }
+ if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+ tty->driver->subtype == PTY_TYPE_SLAVE &&
+ tty->link->count)
+ obj->o_count++;
+ }
+ if (obj->o_count != tty->count) {
+ eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count);
+ return -EBUSY;
+ }
+
+ cpt_open_object(obj, ctx);
+
+ v = cpt_get_buf(ctx);
+ v->cpt_next = -1;
+ v->cpt_object = CPT_OBJ_TTY;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ v->cpt_index = tty->index;
+ v->cpt_link = -1;
+ if (tty->link)
+ v->cpt_link = tty->link->index;
+ v->cpt_drv_type = tty->driver->type;
+ v->cpt_drv_subtype = tty->driver->subtype;
+ v->cpt_drv_flags = tty->driver->flags;
+ v->cpt_packet = tty->packet;
+ v->cpt_stopped = tty->stopped;
+ v->cpt_hw_stopped = tty->hw_stopped;
+ v->cpt_flow_stopped = tty->flow_stopped;
+ v->cpt_flags = tty->flags;
+ v->cpt_ctrl_status = tty->ctrl_status;
+ v->cpt_canon_data = tty->canon_data;
+ v->cpt_canon_head = tty->canon_head - tty->read_tail;
+ v->cpt_canon_column = tty->canon_column;
+ v->cpt_column = tty->column;
+ v->cpt_erasing = tty->erasing;
+ v->cpt_lnext = tty->lnext;
+ v->cpt_icanon = tty->icanon;
+ v->cpt_raw = tty->raw;
+ v->cpt_real_raw = tty->real_raw;
+ v->cpt_closing = tty->closing;
+ v->cpt_minimum_to_wake = tty->minimum_to_wake;
+ v->cpt_pgrp = 0;
+ if (tty->pgrp) {
+ v->cpt_pgrp = pid_vnr(tty->pgrp);
+ if ((int)v->cpt_pgrp < 0) {
+ dprintk_ctx("cannot map tty->pgrp %d -> %d\n", pid_vnr(tty->pgrp), (int)v->cpt_pgrp);
+ v->cpt_pgrp = -1;
+ }
+ }
+ v->cpt_session = 0;
+ if (tty->session) {
+ v->cpt_session = pid_vnr(tty->session);
+ if ((int)v->cpt_session < 0) {
+ eprintk_ctx("cannot map tty->session %d -> %d\n", pid_nr(tty->session), (int)v->cpt_session);
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ }
+ memcpy(v->cpt_name, tty->name, 64);
+ v->cpt_ws_row = tty->winsize.ws_row;
+ v->cpt_ws_col = tty->winsize.ws_col;
+ v->cpt_ws_prow = tty->winsize.ws_ypixel;
+ v->cpt_ws_pcol = tty->winsize.ws_xpixel;
+ if (tty->termios == NULL) {
+ eprintk_ctx("NULL termios");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ v->cpt_c_line = tty->termios->c_line;
+ v->cpt_c_iflag = tty->termios->c_iflag;
+ v->cpt_c_oflag = tty->termios->c_oflag;
+ v->cpt_c_cflag = tty->termios->c_cflag;
+ v->cpt_c_lflag = tty->termios->c_lflag;
+ memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS);
+ if (NCCS < 32)
+ memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS);
+ memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags));
+
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ if (tty->read_buf && tty->read_cnt) {
+ struct cpt_obj_bits *v = cpt_get_buf(ctx);
+ loff_t saved_pos;
+
+ cpt_push_object(&saved_pos, ctx);
+ cpt_open_object(NULL, ctx);
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_BITS;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_DATA;
+ v->cpt_size = tty->read_cnt;
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_release_buf(ctx);
+
+ if (tty->read_cnt) {
+ int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail);
+ ctx->write(tty->read_buf + tty->read_tail, n, ctx);
+ if (tty->read_cnt > n)
+ ctx->write(tty->read_buf, tty->read_cnt-n, ctx);
+ ctx->align(ctx);
+ }
+
+ cpt_close_object(ctx);
+ cpt_pop_object(&saved_pos, ctx);
+ }
+
+ cpt_close_object(ctx);
+
+ return 0;
+}
+
+__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx)
+{
+ struct tty_struct * tty;
+ struct fasync_struct *fa;
+
+ tty = (struct tty_struct *)file->private_data;
+
+ for (fa = tty->fasync; fa; fa = fa->fa_next) {
+ if (fa->fa_file == file)
+ return fa->fa_fd;
+ }
+ return -1;
+}
diff --git a/kernel/cpt/cpt_ubc.c b/kernel/cpt/cpt_ubc.c
new file mode 100644
index 0000000..0fc4f5f
--- /dev/null
+++ b/kernel/cpt/cpt_ubc.c
@@ -0,0 +1,135 @@
+/*
+ *
+ * kernel/cpt/cpt_ubc.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/types.h>
+#include <bc/beancounter.h>
+#include <asm/signal.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx);
+ if (obj != NULL) {
+ if (obj->o_count == 1)
+ get_beancounter(bc);
+ if (bc->parent != NULL && obj->o_parent == NULL)
+ obj->o_parent = cpt_add_ubc(bc->parent, ctx);
+ }
+ return obj;
+}
+
+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx);
+ if (obj == NULL) {
+ char buf[48];
+ print_ub_uid(bc, buf, sizeof(buf));
+ eprintk("CPT: unknown ub %s (%p)\n", buf, bc);
+ dump_stack();
+ return CPT_NULL;
+ }
+ return obj->o_pos;
+}
+
+static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm,
+ int held)
+{
+ dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL);
+ dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL);
+ dmp->held = (held ? prm->held : CPT_NULL);
+ dmp->maxheld = prm->maxheld;
+ dmp->minheld = prm->minheld;
+ dmp->failcnt = prm->failcnt;
+}
+
+static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct user_beancounter *bc;
+ struct cpt_beancounter_image *v;
+ int i;
+
+ bc = obj->o_obj;
+ v = cpt_get_buf(ctx);
+
+ v->cpt_next = CPT_NULL;
+ v->cpt_object = CPT_OBJ_UBC;
+ v->cpt_hdrlen = sizeof(*v);
+ v->cpt_content = CPT_CONTENT_ARRAY;
+
+ if (obj->o_parent != NULL)
+ v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos;
+ else
+ v->cpt_parent = CPT_NULL;
+ v->cpt_id = (obj->o_parent != NULL) ? bc->ub_uid : 0;
+ v->cpt_ub_resources = UB_RESOURCES;
+ BUILD_BUG_ON(ARRAY_SIZE(v->cpt_parms) < UB_RESOURCES * 2);
+ for (i = 0; i < UB_RESOURCES; i++) {
+ dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0);
+ dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1);
+ }
+ memset(v->cpt_parms + UB_RESOURCES * 2, 0,
+ sizeof(v->cpt_parms)
+ - UB_RESOURCES * 2 * sizeof(v->cpt_parms[0]));
+
+ cpt_open_object(obj, ctx);
+ ctx->write(v, sizeof(*v), ctx);
+ cpt_close_object(ctx);
+
+ cpt_release_buf(ctx);
+ return 0;
+}
+
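+/* Beancounters are written out parents first: a child is skipped while
+ * its parent has no position assigned yet, and the loop below repeats
+ * until nothing is left pending.  More than one top-level beancounter
+ * in the dumped set is treated as an error.
+ */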
+int cpt_dump_ubc(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+ int skipped;
+ int top;
+
+ cpt_open_section(ctx, CPT_SECT_UBC);
+
+ do {
+ skipped = 0;
+ top = 0;
+ for_each_object(obj, CPT_OBJ_UBC) {
+ if (obj->o_parent == NULL)
+ top++;
+ if (obj->o_pos != CPT_NULL)
+ continue;
+ if (obj->o_parent != NULL &&
+ ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL)
+ skipped++;
+ else
+ dump_one_bc(obj, ctx);
+ }
+ } while (skipped && (top < 2));
+
+ cpt_close_section(ctx);
+ if (top > 1) {
+		eprintk_ctx("More than one top level ub exists\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void cpt_finish_ubc(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_UBC)
+ put_beancounter(obj->o_obj);
+}
diff --git a/kernel/cpt/cpt_ubc.h b/kernel/cpt/cpt_ubc.h
new file mode 100644
index 0000000..645ba79
--- /dev/null
+++ b/kernel/cpt/cpt_ubc.h
@@ -0,0 +1,23 @@
+#ifdef CONFIG_BEANCOUNTERS
+cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
+int cpt_dump_ubc(struct cpt_context *ctx);
+
+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx);
+int rst_undump_ubc(struct cpt_context *ctx);
+
+void cpt_finish_ubc(struct cpt_context *ctx);
+void rst_finish_ubc(struct cpt_context *ctx);
+void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id);
+void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id);
+#else
+static int inline cpt_dump_ubc(struct cpt_context *ctx)
+{ return 0; }
+static int inline rst_undump_ubc(struct cpt_context *ctx)
+{ return 0; }
+static void inline cpt_finish_ubc(struct cpt_context *ctx)
+{ return; }
+static void inline rst_finish_ubc(struct cpt_context *ctx)
+{ return; }
+#endif
+
diff --git a/kernel/cpt/cpt_x8664.S b/kernel/cpt/cpt_x8664.S
new file mode 100644
index 0000000..0d5e361
--- /dev/null
+++ b/kernel/cpt/cpt_x8664.S
@@ -0,0 +1,67 @@
+#define ASSEMBLY 1
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/errno.h>
+
+ .code64
+
+ .macro FAKE_STACK_FRAME child_rip
+ /* push in order ss, rsp, eflags, cs, rip */
+ xorq %rax, %rax
+ pushq %rax /* ss */
+ pushq %rax /* rsp */
+ pushq $(1<<9) /* eflags - interrupts on */
+ pushq $__KERNEL_CS /* cs */
+ pushq \child_rip /* rip */
+ pushq %rax /* orig rax */
+ .endm
+
+ .macro UNFAKE_STACK_FRAME
+ addq $8*6, %rsp
+ .endm
+
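+/*
+ * asm_kernel_thread() builds a fake interrupt-style stack frame, saves
+ * the registers as a struct pt_regs and calls do_fork_pid() to create
+ * a kernel thread.  The new thread starts at child_rip, which calls
+ * the requested function and then falls into do_exit().
+ */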
+ENTRY(asm_kernel_thread)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $child_rip
+ SAVE_ALL
+
+ # rdi: flags, rsi: usp, rdx: will be &pt_regs
+ movq %rdx,%rdi
+ orq $0x00800000,%rdi
+ movq $-1, %rsi
+ movq %rsp, %rdx
+
+ xorl %r8d,%r8d
+ xorl %r9d,%r9d
+ pushq %rcx
+ call do_fork_pid
+ addq $8, %rsp
+ /* call do_fork */
+ movq %rax,RAX(%rsp)
+ xorl %edi,%edi
+ RESTORE_ALL
+ UNFAKE_STACK_FRAME
+ ret
+ CFI_ENDPROC
+ENDPROC(asm_kernel_thread)
+
+child_rip:
+ pushq $0 # fake return address
+ CFI_STARTPROC
+ movq %rdi, %rax
+ movq %rsi, %rdi
+ call *%rax
+ movq %rax, %rdi
+ call do_exit
+ CFI_ENDPROC
+ENDPROC(child_rip)
+
diff --git a/kernel/cpt/rst_conntrack.c b/kernel/cpt/rst_conntrack.c
new file mode 100644
index 0000000..b863ac4
--- /dev/null
+++ b/kernel/cpt/rst_conntrack.c
@@ -0,0 +1,328 @@
+/*
+ *
+ * kernel/cpt/rst_conntrack.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/unistd.h>
+#include <linux/ve.h>
+#include <linux/vzcalluser.h>
+#include <linux/cpt_image.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+
+#if defined(CONFIG_VE_IPTABLES) && \
+ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+
+#define ASSERT_READ_LOCK(x) do { } while (0)
+#define ASSERT_WRITE_LOCK(x) do { } while (0)
+
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+struct ct_holder
+{
+ struct ct_holder *next;
+ struct ip_conntrack *ct;
+ int index;
+};
+
+static int decode_tuple(struct cpt_ipct_tuple *v,
+ struct ip_conntrack_tuple *tuple, int dir,
+ cpt_context_t *ctx)
+{
+ tuple->dst.ip = v->cpt_dst;
+ tuple->dst.u.all = v->cpt_dstport;
+ if (ctx->image_version < CPT_VERSION_16) {
+		/* In the 2.6.9 kernel protonum had a short type */
+ __u16 protonum = *(__u16 *)&v->cpt_protonum;
+ if (protonum > 0xff && protonum < 0xffff) {
+ eprintk_ctx("tuple: protonum > 255: %u\n", protonum);
+ return -EINVAL;
+ }
+ tuple->dst.protonum = protonum;
+ tuple->dst.dir = dir;
+ } else {
+ tuple->dst.protonum = v->cpt_protonum;
+ tuple->dst.dir = v->cpt_dir;
+ if (dir != tuple->dst.dir) {
+ eprintk_ctx("dir != tuple->dst.dir\n");
+ return -EINVAL;
+ }
+ }
+
+ tuple->src.ip = v->cpt_src;
+ tuple->src.u.all = v->cpt_srcport;
+ return 0;
+}
+
+
+static int undump_expect_list(struct ip_conntrack *ct,
+ struct cpt_ip_conntrack_image *ci,
+ loff_t pos, struct ct_holder *ct_list,
+ cpt_context_t *ctx)
+{
+ loff_t end;
+ int err;
+
+ end = pos + ci->cpt_next;
+ pos += ci->cpt_hdrlen;
+ while (pos < end) {
+ struct cpt_ip_connexpect_image v;
+ struct ip_conntrack_expect *exp;
+ struct ip_conntrack *sibling;
+
+ err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx);
+ if (err)
+ return err;
+
+ sibling = NULL;
+ if (v.cpt_sibling_conntrack) {
+ struct ct_holder *c;
+
+ for (c = ct_list; c; c = c->next) {
+ if (c->index == v.cpt_sibling_conntrack) {
+ sibling = c->ct;
+ break;
+ }
+ }
+ if (!sibling) {
+ eprintk_ctx("lost sibling of expectation\n");
+ return -EINVAL;
+ }
+ }
+
+ write_lock_bh(&ip_conntrack_lock);
+
+		/* This is possible: the helper module could have just been
+		 * unregistered; if the expectation were on the list, it would
+		 * have been destroyed. */
+ if (ct->helper == NULL) {
+ write_unlock_bh(&ip_conntrack_lock);
+ dprintk_ctx("conntrack: no helper and non-trivial expectation\n");
+ continue;
+ }
+
+ exp = ip_conntrack_expect_alloc(NULL);
+ if (exp == NULL) {
+ write_unlock_bh(&ip_conntrack_lock);
+ return -ENOMEM;
+ }
+
+ if (decode_tuple(&v.cpt_tuple, &exp->tuple, 0, ctx) ||
+ decode_tuple(&v.cpt_mask, &exp->mask, 0, ctx)) {
+ ip_conntrack_expect_put(exp);
+ write_unlock_bh(&ip_conntrack_lock);
+ return -EINVAL;
+ }
+
+ exp->master = ct;
+ nf_conntrack_get(&ct->ct_general);
+ ip_conntrack_expect_insert(exp);
+#if 0
+ if (sibling) {
+ exp->sibling = sibling;
+ sibling->master = exp;
+ LIST_DELETE(&ve_ip_conntrack_expect_list, exp);
+ ct->expecting--;
+ nf_conntrack_get(&master_ct(sibling)->infos[0]);
+ } else
+#endif
+ if (ct->helper->timeout) {
+ mod_timer(&exp->timeout, jiffies + v.cpt_timeout);
+ }
+ write_unlock_bh(&ip_conntrack_lock);
+
+ ip_conntrack_expect_put(exp);
+
+ pos += v.cpt_next;
+ }
+ return 0;
+}
+
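+/* Every restored conntrack is also linked into a temporary ct_holder
+ * list keyed by its image index, so that expectations can resolve
+ * references to sibling conntracks and so that the timeout timers are
+ * armed only after the whole section has been read back.
+ */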
+static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos,
+ struct ct_holder **ct_list, cpt_context_t *ctx)
+{
+ int err = 0;
+ struct ip_conntrack *conntrack;
+ struct ct_holder *c;
+ struct ip_conntrack_tuple orig, repl;
+
+ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
+ if (c == NULL)
+ return -ENOMEM;
+
+ if (decode_tuple(&ci->cpt_tuple[0], &orig, 0, ctx) ||
+ decode_tuple(&ci->cpt_tuple[1], &repl, 1, ctx)) {
+ kfree(c);
+ return -EINVAL;
+ }
+
+ conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub);
+ if (!conntrack || IS_ERR(conntrack)) {
+ kfree(c);
+ return -ENOMEM;
+ }
+
+ c->ct = conntrack;
+ c->next = *ct_list;
+ *ct_list = c;
+ c->index = ci->cpt_index;
+
+ conntrack->status = ci->cpt_status;
+
+ memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto));
+ memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help));
+
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+ conntrack->mark = ci->cpt_mark;
+#endif
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
+ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
+ conntrack->nat.masq_index = ci->cpt_masq_index;
+#endif
+ if (ci->cpt_initialized) {
+ conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos;
+ conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before;
+ conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after;
+ conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos;
+ conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before;
+ conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after;
+ }
+ if (conntrack->status & IPS_NAT_DONE_MASK)
+ ip_nat_hash_conntrack(conntrack);
+#endif
+
+ if (ci->cpt_ct_helper) {
+ conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple);
+ if (conntrack->helper == NULL) {
+ eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n");
+ err = -EINVAL;
+ }
+ }
+
+ ip_conntrack_hash_insert(conntrack);
+ conntrack->timeout.expires = jiffies + ci->cpt_timeout;
+
+ if (err == 0 && ci->cpt_next > ci->cpt_hdrlen)
+ err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx);
+
+ if (conntrack->helper)
+ ip_conntrack_helper_put(conntrack->helper);
+
+ return err;
+}
+
+static void convert_conntrack_image(struct cpt_ip_conntrack_image *ci)
+{
+ struct cpt_ip_conntrack_image_compat img;
+
+ memcpy(&img, ci, sizeof(struct cpt_ip_conntrack_image_compat));
+	/*
+	 * The size of cpt_help_data in the 2.6.9 kernel is 16 bytes, while
+	 * in 2.6.18 it is 24 bytes, so zero the remaining 8 bytes.
+	 */
+ memset(ci->cpt_help_data + 4, 0, 8);
+ ci->cpt_initialized = img.cpt_initialized;
+ ci->cpt_num_manips = img.cpt_num_manips;
+ memcpy(ci->cpt_nat_manips, img.cpt_nat_manips, sizeof(img.cpt_nat_manips));
+ memcpy(ci->cpt_nat_seq, img.cpt_nat_seq, sizeof(img.cpt_nat_seq));
+ ci->cpt_masq_index = img.cpt_masq_index;
+ /* Id will be assigned in ip_conntrack_hash_insert(), so make it 0 here */
+ ci->cpt_id = 0;
+ /* mark was not supported in 2.6.9, so set it to default 0 value */
+ ci->cpt_mark = 0;
+
+}
+
+int rst_restore_ip_conntrack(struct cpt_context * ctx)
+{
+ int err = 0;
+ loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_ip_conntrack_image ci;
+ struct ct_holder *c;
+ struct ct_holder *ct_list = NULL;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) {
+ eprintk_ctx("conntrack module ct->proto version mismatch\n");
+ return -EINVAL;
+ }
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx);
+ if (err)
+ break;
+ if (ctx->image_version < CPT_VERSION_16)
+ convert_conntrack_image(&ci);
+ err = undump_one_ct(&ci, sec, &ct_list, ctx);
+ if (err)
+ break;
+ sec += ci.cpt_next;
+ }
+
+ while ((c = ct_list) != NULL) {
+ ct_list = c->next;
+ if (c->ct)
+ add_timer(&c->ct->timeout);
+ kfree(c);
+ }
+
+ return err;
+}
+
+#else
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+int rst_restore_ip_conntrack(struct cpt_context * ctx)
+{
+ if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL)
+ return -EINVAL;
+ return 0;
+}
+
+#endif
diff --git a/kernel/cpt/rst_context.c b/kernel/cpt/rst_context.c
new file mode 100644
index 0000000..0007197
--- /dev/null
+++ b/kernel/cpt/rst_context.c
@@ -0,0 +1,331 @@
+/*
+ *
+ * kernel/cpt/rst_context.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_files.h"
+
+static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx)
+{
+ mm_segment_t oldfs;
+ ssize_t err = -EBADF;
+ struct file *file = ctx->file;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (file)
+ err = file->f_op->read(file, addr, count, &file->f_pos);
+ set_fs(oldfs);
+ if (err != count)
+ return err >= 0 ? -EIO : err;
+ return 0;
+}
+
+static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
+{
+ mm_segment_t oldfs;
+ ssize_t err = -EBADF;
+ struct file *file = ctx->file;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (file)
+ err = file->f_op->read(file, addr, count, &pos);
+ set_fs(oldfs);
+ if (err != count)
+ return err >= 0 ? -EIO : err;
+ return 0;
+}
+
+static void file_align(struct cpt_context *ctx)
+{
+ struct file *file = ctx->file;
+
+ if (file)
+ file->f_pos = CPT_ALIGN(file->f_pos);
+}
+
+int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end)
+{
+ struct cpt_section_hdr hdr;
+ int err;
+ loff_t pos;
+
+ pos = ctx->sections[type];
+ *start = *end = pos;
+
+ if (pos != CPT_NULL) {
+ if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0)
+ return err;
+ if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr))
+ return -EINVAL;
+ *start = pos + hdr.cpt_hdrlen;
+ *end = pos + hdr.cpt_next;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(rst_get_section);
+
+void rst_context_init(struct cpt_context *ctx)
+{
+ int i;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ init_MUTEX(&ctx->main_sem);
+ ctx->refcount = 1;
+
+ ctx->current_section = -1;
+ ctx->current_object = -1;
+ ctx->pagesize = PAGE_SIZE;
+ ctx->read = file_read;
+ ctx->pread = file_pread;
+ ctx->align = file_align;
+ for (i=0; i < CPT_SECT_MAX; i++)
+ ctx->sections[i] = CPT_NULL;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ init_completion(&ctx->pgin_notify);
+#endif
+ cpt_object_init(ctx);
+}
+
+static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx)
+{
+ struct cpt_section_hdr h;
+
+ while (start < end) {
+ int err;
+
+ err = ctx->pread(&h, sizeof(h), ctx, start);
+ if (err)
+ return err;
+ if (h.cpt_hdrlen < sizeof(h) ||
+ h.cpt_next < h.cpt_hdrlen ||
+ start + h.cpt_next > end)
+ return -EINVAL;
+ if (h.cpt_section >= CPT_SECT_MAX)
+ return -EINVAL;
+ ctx->sections[h.cpt_section] = start;
+ start += h.cpt_next;
+ }
+ return 0;
+}
+
+int rst_open_dumpfile(struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_major_tail *v;
+ struct cpt_major_hdr h;
+ unsigned long size;
+
+ err = -EBADF;
+ if (!ctx->file)
+ goto err_out;
+
+ err = -ENOMEM;
+ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
+ if (ctx->tmpbuf == NULL)
+ goto err_out;
+ __cpt_release_buf(ctx);
+
+ size = ctx->file->f_dentry->d_inode->i_size;
+
+ if (size & 7) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ if (size < sizeof(struct cpt_major_hdr) +
+ sizeof(struct cpt_major_tail)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ err = ctx->pread(&h, sizeof(h), ctx, 0);
+ if (err) {
+ eprintk_ctx("too short image 1 %d\n", err);
+ goto err_out;
+ }
+ if (h.cpt_signature[0] != CPT_SIGNATURE0 ||
+ h.cpt_signature[1] != CPT_SIGNATURE1 ||
+ h.cpt_signature[2] != CPT_SIGNATURE2 ||
+ h.cpt_signature[3] != CPT_SIGNATURE3) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ if (h.cpt_hz != HZ) {
+ err = -EINVAL;
+ eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ);
+ goto err_out;
+ }
+ ctx->virt_jiffies64 = h.cpt_start_jiffies64;
+ ctx->start_time.tv_sec = h.cpt_start_sec;
+ ctx->start_time.tv_nsec = h.cpt_start_nsec;
+ ctx->kernel_config_flags = h.cpt_kernel_config[0];
+ ctx->iptables_mask = h.cpt_iptables_mask;
+ if (h.cpt_image_version > CPT_CURRENT_VERSION ||
+ CPT_VERSION_MINOR(h.cpt_image_version) >
+ CPT_VERSION_MINOR(CPT_CURRENT_VERSION)) {
+ eprintk_ctx("Unknown image version: %x. Can't restore.\n",
+ h.cpt_image_version);
+ err = -EINVAL;
+ goto err_out;
+ }
+ ctx->image_version = h.cpt_image_version;
+ ctx->features = (__u64)((__u64)h.cpt_ve_features2<<32 | h.cpt_ve_features);
+ ctx->image_arch = h.cpt_os_arch;
+
+ v = cpt_get_buf(ctx);
+ err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v));
+ if (err) {
+ eprintk_ctx("too short image 2 %d\n", err);
+ cpt_release_buf(ctx);
+ goto err_out;
+ }
+ if (v->cpt_signature[0] != CPT_SIGNATURE0 ||
+ v->cpt_signature[1] != CPT_SIGNATURE1 ||
+ v->cpt_signature[2] != CPT_SIGNATURE2 ||
+ v->cpt_signature[3] != CPT_SIGNATURE3 ||
+ v->cpt_nsect != CPT_SECT_MAX_INDEX) {
+ err = -EINVAL;
+ cpt_release_buf(ctx);
+ goto err_out;
+ }
+ if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) {
+ cpt_release_buf(ctx);
+ goto err_out;
+ }
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ ctx->lazypages = v->cpt_lazypages;
+#endif
+ ctx->tasks64 = v->cpt_64bit;
+ cpt_release_buf(ctx);
+ return 0;
+
+err_out:
+ if (ctx->tmpbuf) {
+ free_page((unsigned long)ctx->tmpbuf);
+ ctx->tmpbuf = NULL;
+ }
+ return err;
+}
+
+void rst_close_dumpfile(struct cpt_context *ctx)
+{
+ if (ctx->file) {
+ fput(ctx->file);
+ ctx->file = NULL;
+ }
+ if (ctx->tmpbuf) {
+ free_page((unsigned long)ctx->tmpbuf);
+ ctx->tmpbuf = NULL;
+ }
+}
+
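+/* Every object in the image starts with a cpt_object_hdr: cpt_object is the
+ * object type, cpt_hdrlen the length of the fixed header part and cpt_next
+ * the distance to the next object.  _rst_get_object() copies at most
+ * min(size, cpt_hdrlen) bytes of that fixed part into the caller's buffer;
+ * variable-length payload (names, page data, nested objects) follows the
+ * header and is read separately by the callers. */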
+int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_object_hdr *hdr = tmp;
+ err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos);
+ if (err)
+ return err;
+ if (type > 0 && type != hdr->cpt_object)
+ return -EINVAL;
+ if (hdr->cpt_hdrlen > hdr->cpt_next)
+ return -EINVAL;
+ if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr))
+ return -EINVAL;
+ if (size < sizeof(*hdr))
+ return -EINVAL;
+ if (size > hdr->cpt_hdrlen)
+ size = hdr->cpt_hdrlen;
+ if (size > sizeof(*hdr))
+ err = ctx->pread(hdr+1, size - sizeof(*hdr),
+ ctx, pos + sizeof(*hdr));
+ return err;
+}
+EXPORT_SYMBOL(_rst_get_object);
+
+void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx)
+{
+ int err;
+ void *tmp;
+ struct cpt_object_hdr hdr;
+ err = ctx->pread(&hdr, sizeof(hdr), ctx, pos);
+ if (err)
+ return NULL;
+ if (type > 0 && type != hdr.cpt_object)
+ return NULL;
+ if (hdr.cpt_hdrlen > hdr.cpt_next)
+ return NULL;
+ if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr))
+ return NULL;
+ tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+ err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos);
+ if (!err)
+ return tmp;
+ kfree(tmp);
+ return NULL;
+}
+EXPORT_SYMBOL(__rst_get_object);
+
+__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_object_hdr hdr;
+ __u8 *name;
+
+ err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx);
+ if (err)
+ return NULL;
+ if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE)
+ return NULL;
+ name = (void*)__get_free_page(GFP_KERNEL);
+ if (!name)
+ return NULL;
+ err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen,
+ ctx, *pos_p + hdr.cpt_hdrlen);
+ if (err) {
+ free_page((unsigned long)name);
+ return NULL;
+ }
+ *pos_p += hdr.cpt_next;
+ return name;
+}
+
+__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx)
+{
+ return __rst_get_name(&pos, ctx);
+}
+
+void rst_put_name(__u8 *name, struct cpt_context *ctx)
+{
+ unsigned long addr = (unsigned long)name;
+
+ if (addr)
+ free_page(addr&~(PAGE_SIZE-1));
+}
+
+struct rst_ops rst_ops = {
+ .get_object = _rst_get_object,
+ .rst_file = rst_file,
+};
diff --git a/kernel/cpt/rst_epoll.c b/kernel/cpt/rst_epoll.c
new file mode 100644
index 0000000..0ac4cae
--- /dev/null
+++ b/kernel/cpt/rst_epoll.c
@@ -0,0 +1,169 @@
+/*
+ *
+ * kernel/cpt/rst_epoll.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/eventpoll.h>
+#include <linux/cpt_image.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+/* These functions are static in fs/eventpoll.c */
+extern int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+ struct file *tfile, int fd);
+extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
+extern void ep_release_epitem(struct epitem *epi);
+
+
+struct file *cpt_open_epolldev(struct cpt_file_image *fi,
+ unsigned flags,
+ struct cpt_context *ctx)
+{
+ struct file *file;
+ int efd;
+
+ /* Argument "size" is ignored, use just 1 */
+ efd = sys_epoll_create(1);
+ if (efd < 0)
+ return ERR_PTR(efd);
+
+ file = fget(efd);
+ sys_close(efd);
+ return file;
+}
+
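+/* Re-populate a restored epoll file: every CPT_OBJ_EPOLL_FILE record refers
+ * (by image position) to a file that has already been restored, so look it
+ * up in the object table, re-insert it with ep_insert() under ep->mtx and,
+ * if it was ready at checkpoint time, put the epitem back on the ready
+ * list. */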
+static int restore_one_epoll(cpt_object_t *obj,
+ loff_t pos,
+ struct cpt_epoll_image *ebuf,
+ cpt_context_t *ctx)
+{
+ int err = 0;
+ loff_t endpos;
+ struct file *file = obj->o_obj;
+ struct eventpoll *ep;
+
+ if (file->f_op != &eventpoll_fops) {
+ eprintk_ctx("bad epoll file\n");
+ return -EINVAL;
+ }
+
+ ep = file->private_data;
+
+ if (unlikely(ep == NULL)) {
+ eprintk_ctx("bad epoll device\n");
+ return -EINVAL;
+ }
+
+ endpos = pos + ebuf->cpt_next;
+ pos += ebuf->cpt_hdrlen;
+ while (pos < endpos) {
+ struct cpt_epoll_file_image efi;
+ struct epoll_event epds;
+
+ cpt_object_t *tobj;
+
+ err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx);
+ if (err)
+ return err;
+ tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx);
+ if (!tobj) {
+ eprintk_ctx("epoll file not found\n");
+ return -EINVAL;
+ }
+ epds.events = efi.cpt_events;
+ epds.data = efi.cpt_data;
+ mutex_lock(&ep->mtx);
+ err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd);
+ if (!err) {
+ struct epitem *epi;
+ epi = ep_find(ep, tobj->o_obj, efi.cpt_fd);
+ if (epi) {
+ if (efi.cpt_ready) {
+ unsigned long flags;
+ spin_lock_irqsave(&ep->lock, flags);
+ if (list_empty(&epi->rdllink))
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ spin_unlock_irqrestore(&ep->lock, flags);
+ }
+ }
+ }
+ mutex_unlock(&ep->mtx);
+ if (err)
+ break;
+ pos += efi.cpt_next;
+ }
+ return err;
+}
+
+int rst_eventpoll(cpt_context_t *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_EPOLL];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ cpt_object_t *obj;
+ struct cpt_epoll_image *ebuf = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx);
+ if (obj == NULL) {
+ eprintk_ctx("cannot find epoll file object\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ err = restore_one_epoll(obj, sec, ebuf, ctx);
+ cpt_release_buf(ctx);
+ if (err)
+ return err;
+ sec += ebuf->cpt_next;
+ }
+
+ return 0;
+
+}
diff --git a/kernel/cpt/rst_files.c b/kernel/cpt/rst_files.c
new file mode 100644
index 0000000..a84e3d3
--- /dev/null
+++ b/kernel/cpt/rst_files.c
@@ -0,0 +1,1779 @@
+/*
+ *
+ * kernel/cpt/rst_files.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/nsproxy.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/fs_struct.h>
+#include <linux/mman.h>
+#include <linux/mount.h>
+#include <linux/tty.h>
+#include <linux/namei.h>
+#include <linux/vmalloc.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <asm/uaccess.h>
+#include <bc/kmem.h>
+#include <linux/cpt_image.h>
+#include <linux/mnt_namespace.h>
+#include <linux/fdtable.h>
+#include <linux/shm.h>
+#include <linux/signalfd.h>
+#include <linux/proc_fs.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+
+#include "cpt_syscalls.h"
+
+
+struct filejob {
+ struct filejob *next;
+ int pid;
+ loff_t fdi;
+};
+
+static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx)
+{
+ struct filejob *j;
+
+ j = kmalloc(sizeof(*j), GFP_KERNEL);
+ if (j == NULL)
+ return -ENOMEM;
+ j->pid = current->pid;
+ j->fdi = pos;
+ j->next = ctx->filejob_queue;
+ ctx->filejob_queue = j;
+ return 0;
+}
+
+static void _anon_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct page *page = buf->page;
+
+ /*
+ * If nobody else uses this page, and we don't already have a
+ * temporary page, let's keep track of it as a one-deep
+ * allocation cache. (Otherwise just release our reference to it)
+ */
+ if (page_count(page) == 1 && !pipe->tmp_page)
+ pipe->tmp_page = page;
+ else
+ page_cache_release(page);
+
+ module_put(THIS_MODULE);
+}
+
+static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf, int atomic)
+{
+ if (atomic) {
+ buf->flags |= PIPE_BUF_FLAG_ATOMIC;
+ return kmap_atomic(buf->page, KM_USER0);
+ }
+
+ return kmap(buf->page);
+}
+
+static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf, void *map_data)
+{
+ if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
+ buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
+ kunmap_atomic(map_data, KM_USER0);
+ } else
+ kunmap(buf->page);
+}
+
+static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct page *page = buf->page;
+
+ if (page_count(page) == 1) {
+ lock_page(page);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+ page_cache_get(buf->page);
+}
+
+static int _anon_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+ return 0;
+}
+
+static struct pipe_buf_operations _anon_pipe_buf_ops = {
+ .can_merge = 1,
+ .map = _anon_pipe_buf_map,
+ .unmap = _anon_pipe_buf_unmap,
+ .release = _anon_pipe_buf_release,
+ .confirm = _anon_pipe_buf_confirm,
+ .get = _anon_pipe_buf_get,
+ .steal = _anon_pipe_buf_steal,
+};
+
+/* Sorta ugly... Multiple readers/writers of a named pipe rewrite the buffer
+ * many times. We need to mark it in the CPT_OBJ_INODE table in some way.
+ */
+static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi,
+ struct cpt_context *ctx)
+{
+ struct inode *ino = file->f_dentry->d_inode;
+ struct cpt_inode_image ii;
+ struct cpt_obj_bits b;
+ struct pipe_inode_info *info;
+ int err;
+ int count;
+
+ if (!S_ISFIFO(ino->i_mode)) {
+ eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode);
+ return -EINVAL;
+ }
+ if (fi->cpt_inode == CPT_NULL)
+ return 0;
+
+ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
+ if (err)
+ return err;
+
+ if (ii.cpt_next <= ii.cpt_hdrlen)
+ return 0;
+
+ err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx);
+ if (err)
+ return err;
+
+ if (b.cpt_size == 0)
+ return 0;
+
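+	/* The saved FIFO contents follow the CPT_OBJ_BITS header in the
+	 * image; copy them back, at most a page at a time, into freshly
+	 * allocated pipe buffers owned by _anon_pipe_buf_ops. */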
+ mutex_lock(&ino->i_mutex);
+ info = ino->i_pipe;
+ if (info->nrbufs) {
+ mutex_unlock(&ino->i_mutex);
+ eprintk("pipe buffer is restored already\n");
+ return -EINVAL;
+ }
+ info->curbuf = 0;
+ count = 0;
+ while (count < b.cpt_size) {
+ struct pipe_buffer *buf = info->bufs + info->nrbufs;
+ void * addr;
+ int chars;
+
+ chars = b.cpt_size - count;
+ if (chars > PAGE_SIZE)
+ chars = PAGE_SIZE;
+ if (!try_module_get(THIS_MODULE)) {
+ err = -EBUSY;
+ break;
+ }
+
+ buf->page = alloc_page(GFP_HIGHUSER);
+ if (buf->page == NULL) {
+ err = -ENOMEM;
+ break;
+ }
+ buf->ops = &_anon_pipe_buf_ops;
+ buf->offset = 0;
+ buf->len = chars;
+ info->nrbufs++;
+ addr = kmap(buf->page);
+ err = ctx->pread(addr, chars, ctx,
+ fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count);
+ if (err)
+ break;
+ count += chars;
+ }
+ mutex_unlock(&ino->i_mutex);
+
+ return err;
+}
+
+static int make_flags(struct cpt_file_image *fi)
+{
+ int flags = O_NOFOLLOW;
+ switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ|FMODE_WRITE:
+ flags |= O_RDWR; break;
+ case FMODE_WRITE:
+ flags |= O_WRONLY; break;
+ case FMODE_READ:
+ flags |= O_RDONLY; break;
+ default: break;
+ }
+ flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC);
+ flags |= O_NONBLOCK|O_NOCTTY;
+ return flags;
+}
+
+static struct file *open_pipe(char *name,
+ struct cpt_file_image *fi,
+ unsigned flags,
+ struct cpt_context *ctx)
+{
+ int err;
+ cpt_object_t *obj;
+ struct cpt_inode_image ii;
+ struct file *rf, *wf;
+
+ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
+ if (err)
+ return ERR_PTR(err);
+
+ if (ii.cpt_sb == FSMAGIC_PIPEFS) {
+ int pfd[2];
+
+ if ((err = sc_pipe(pfd)) < 0)
+ return ERR_PTR(err);
+
+ rf = fcheck(pfd[0]);
+ wf = fcheck(pfd[1]);
+ get_file(rf);
+ get_file(wf);
+ sc_close(pfd[0]);
+ sc_close(pfd[1]);
+
+ if (fi->cpt_mode&FMODE_READ) {
+ struct file *tf;
+ tf = wf; wf = rf; rf = tf;
+ }
+ } else {
+ if (fi->cpt_mode&FMODE_READ) {
+ rf = filp_open(name, flags, 0);
+ if (IS_ERR(rf)) {
+ dprintk_ctx("filp_open\n");
+ return rf;
+ }
+ dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current),
+ (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode);
+ return rf;
+ }
+
+ dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode);
+
+ rf = filp_open(name, O_RDWR|O_NONBLOCK, 0);
+ if (IS_ERR(rf))
+ return rf;
+ wf = dentry_open(dget(rf->f_dentry),
+ mntget(rf->f_vfsmnt), flags, NULL);
+ }
+
+ /* Add pipe inode to obj table. */
+ obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx);
+ if (obj == NULL) {
+ fput(rf); fput(wf);
+ return ERR_PTR(-ENOMEM);
+ }
+ cpt_obj_setpos(obj, fi->cpt_inode, ctx);
+ obj->o_parent = rf;
+
+	/* Add the other side of the pipe to the obj table; it will not be used
+	 * (o_pos = CPT_NULL), other processes opening the pipe will find the
+	 * inode and open it with dentry_open(). */
+ obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx);
+ if (obj == NULL) {
+ fput(wf);
+ return ERR_PTR(-ENOMEM);
+ }
+ return wf;
+}
+
+static struct file *open_special(struct cpt_file_image *fi,
+ unsigned flags,
+ int deleted,
+ struct cpt_context *ctx)
+{
+ struct cpt_inode_image *ii;
+ struct file *file;
+
+ /* Directories and named pipes are not special actually */
+ if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode))
+ return NULL;
+
+ /* No support for block devices at the moment. */
+ if (S_ISBLK(fi->cpt_i_mode))
+ return ERR_PTR(-EINVAL);
+
+ if (S_ISSOCK(fi->cpt_i_mode)) {
+ eprintk_ctx("bug: socket is not open\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Support only (some) character devices at the moment. */
+ if (!S_ISCHR(fi->cpt_i_mode))
+ return ERR_PTR(-EINVAL);
+
+ ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
+ if (ii == NULL)
+ return ERR_PTR(-ENOMEM);
+
+	/* Do not worry about this right now: /dev/null, zero and *random live here.
+	 * Should we prohibit at least /dev/mem?
+	 */
+ if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) {
+ kfree(ii);
+ return NULL;
+ }
+
+ /* /dev/net/tun will be opened by caller */
+ if (fi->cpt_lflags & CPT_DENTRY_TUNTAP) {
+ kfree(ii);
+ return NULL;
+ }
+
+ file = rst_open_tty(fi, ii, flags, ctx);
+ kfree(ii);
+ return file;
+}
+
+static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx)
+{
+ struct file_lock lock;
+ cpt_object_t *obj;
+
+ memset(&lock, 0, sizeof(lock));
+ lock.fl_type = fli->cpt_type;
+ lock.fl_flags = fli->cpt_flags & ~FL_SLEEP;
+ lock.fl_start = fli->cpt_start;
+ lock.fl_end = fli->cpt_end;
+ obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx);
+ if (!obj) {
+ eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner);
+ return -EINVAL;
+ }
+ lock.fl_owner = obj->o_obj;
+ lock.fl_pid = vpid_to_pid(fli->cpt_pid);
+ if (lock.fl_pid < 0) {
+ eprintk_ctx("unknown lock pid %d\n", lock.fl_pid);
+ return -EINVAL;
+ }
+ lock.fl_file = file;
+
+ if (lock.fl_owner == NULL)
+ eprintk_ctx("no lock owner\n");
+ return posix_lock_file(file, &lock, NULL);
+}
+
+static int restore_flock(struct file *file, struct cpt_flock_image *fli,
+ cpt_context_t *ctx)
+{
+ int cmd, err, fd;
+ fd = get_unused_fd();
+ if (fd < 0) {
+ eprintk_ctx("BSD flock cannot be restored\n");
+ return fd;
+ }
+ get_file(file);
+ fd_install(fd, file);
+ if (fli->cpt_type == F_RDLCK) {
+ cmd = LOCK_SH;
+ } else if (fli->cpt_type == F_WRLCK) {
+ cmd = LOCK_EX;
+ } else {
+ eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type);
+ sc_close(fd);
+ return -EINVAL;
+ }
+
+ err = sc_flock(fd, LOCK_NB | cmd);
+ sc_close(fd);
+ return err;
+}
+
+
+static int fixup_posix_locks(struct file *file,
+ struct cpt_file_image *fi,
+ loff_t pos, struct cpt_context *ctx)
+{
+ int err;
+ loff_t end;
+ struct cpt_flock_image fli;
+
+ end = pos + fi->cpt_next;
+ pos += fi->cpt_hdrlen;
+ while (pos < end) {
+ err = rst_get_object(-1, pos, &fli, ctx);
+ if (err)
+ return err;
+ if (fli.cpt_object == CPT_OBJ_FLOCK &&
+ (fli.cpt_flags&FL_POSIX)) {
+ err = restore_posix_lock(file, &fli, ctx);
+ if (err)
+ return err;
+ dprintk_ctx("posix lock restored\n");
+ }
+ pos += fli.cpt_next;
+ }
+ return 0;
+}
+
+int rst_posix_locks(struct cpt_context *ctx)
+{
+ int err;
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+ struct cpt_file_image fi;
+
+ if (obj->o_pos == CPT_NULL)
+ continue;
+
+ err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx);
+ if (err < 0)
+ return err;
+ if (fi.cpt_next > fi.cpt_hdrlen)
+ fixup_posix_locks(file, &fi, obj->o_pos, ctx);
+ }
+ return 0;
+}
+
+static int fixup_flocks(struct file *file,
+ struct cpt_file_image *fi,
+ loff_t pos, struct cpt_context *ctx)
+{
+ int err;
+ loff_t end;
+ struct cpt_flock_image fli;
+
+ end = pos + fi->cpt_next;
+ pos += fi->cpt_hdrlen;
+ while (pos < end) {
+ err = rst_get_object(-1, pos, &fli, ctx);
+ if (err)
+ return err;
+ if (fli.cpt_object == CPT_OBJ_FLOCK &&
+ (fli.cpt_flags&FL_FLOCK)) {
+ err = restore_flock(file, &fli, ctx);
+ if (err)
+ return err;
+ dprintk_ctx("bsd lock restored\n");
+ }
+ pos += fli.cpt_next;
+ }
+ return 0;
+}
+
+
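+/* Write the saved contents of a regular file back into it.  The data is
+ * stored as a sequence of CPT_OBJ_PAGES blocks, each describing the byte
+ * range [cpt_start, cpt_end) it covers; the bytes are pushed through the
+ * file's ->write method, reopening the dentry O_WRONLY if the original
+ * descriptor was not writable or was opened O_DIRECT. */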
+static int fixup_reg_data(struct file *file, loff_t pos, loff_t end,
+ struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_page_block pgb;
+ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos);
+
+ do_write = file->f_op->write;
+ if (do_write == NULL) {
+ eprintk_ctx("no write method. Cannot restore contents of the file.\n");
+ return -EINVAL;
+ }
+
+ atomic_long_inc(&file->f_count);
+
+ while (pos < end) {
+ loff_t opos;
+ loff_t ipos;
+ int count;
+
+ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx);
+ if (err)
+ goto out;
+ dprintk_ctx("restoring file data block: %08x-%08x\n",
+ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
+ ipos = pos + pgb.cpt_hdrlen;
+ opos = pgb.cpt_start;
+ count = pgb.cpt_end-pgb.cpt_start;
+ while (count > 0) {
+ mm_segment_t oldfs;
+ int copy = count;
+
+ if (copy > PAGE_SIZE)
+ copy = PAGE_SIZE;
+ (void)cpt_get_buf(ctx);
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
+ set_fs(oldfs);
+ if (err) {
+ __cpt_release_buf(ctx);
+ goto out;
+ }
+ if (!(file->f_mode & FMODE_WRITE) ||
+ (file->f_flags&O_DIRECT)) {
+ fput(file);
+ file = dentry_open(dget(file->f_dentry),
+ mntget(file->f_vfsmnt),
+ O_WRONLY | O_LARGEFILE, NULL);
+ if (IS_ERR(file)) {
+ __cpt_release_buf(ctx);
+ return PTR_ERR(file);
+ }
+ }
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ ipos += copy;
+ err = do_write(file, ctx->tmpbuf, copy, &opos);
+ set_fs(oldfs);
+ __cpt_release_buf(ctx);
+ if (err != copy) {
+ if (err >= 0)
+ err = -EIO;
+ goto out;
+ }
+ count -= copy;
+ }
+ pos += pgb.cpt_next;
+ }
+ err = 0;
+
+out:
+ fput(file);
+ return err;
+}
+
+
+static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi,
+ struct cpt_inode_image *ii,
+ struct cpt_context *ctx)
+{
+ int err;
+ struct file *file = *file_p;
+ struct iattr newattrs;
+
+ if (!S_ISREG(fi->cpt_i_mode))
+ return 0;
+
+ if (file == NULL) {
+ file = shmem_file_setup("dev/zero", ii->cpt_size, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ *file_p = file;
+ }
+
+ if (ii->cpt_next > ii->cpt_hdrlen) {
+ struct cpt_object_hdr hdr;
+ err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen);
+ if (err)
+ return err;
+ if (hdr.cpt_object == CPT_OBJ_PAGES) {
+ err = fixup_reg_data(file, fi->cpt_inode+ii->cpt_hdrlen,
+ fi->cpt_inode+ii->cpt_next, ctx);
+ if (err)
+ return err;
+ }
+ }
+
+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
+ /* stage 1 - update size like do_truncate does */
+ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+ newattrs.ia_size = ii->cpt_size;
+ cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime);
+ err = notify_change(file->f_dentry, &newattrs);
+ if (err)
+ goto out;
+
+ /* stage 2 - update times, owner and mode */
+ newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME |
+ ATTR_ATIME_SET | ATTR_MTIME_SET |
+ ATTR_MODE | ATTR_UID | ATTR_GID;
+ newattrs.ia_uid = ii->cpt_uid;
+ newattrs.ia_gid = ii->cpt_gid;
+ newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT;
+ newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT);
+ cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime);
+ cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime);
+ err = notify_change(file->f_dentry, &newattrs);
+
+out:
+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+ return err;
+}
+
+static int fixup_file_flags(struct file *file, struct cpt_file_image *fi,
+ int was_dentry_open, loff_t pos,
+ cpt_context_t *ctx)
+{
+ const struct cred *cred = current_cred() /* should be valid already */;
+
+ if (fi->cpt_pos != file->f_pos) {
+ int err = -ESPIPE;
+ if (file->f_op->llseek)
+ err = file->f_op->llseek(file, fi->cpt_pos, 0);
+ if (err < 0) {
+ dprintk_ctx("file %Ld lseek %Ld - %Ld\n",
+ (long long)pos,
+ (long long)file->f_pos,
+ (long long)fi->cpt_pos);
+ file->f_pos = fi->cpt_pos;
+ }
+ }
+
+ if (cred->uid != fi->cpt_uid || cred->gid != fi->cpt_gid)
+ wprintk_ctx("fixup_file_flags: oops... creds mismatch\n");
+
+	/*
+	 * This is wrong, but with the current cpt_file_image there is
+	 * nothing we can do.
+	 */
+
+ put_cred(file->f_cred);
+ file->f_cred = get_cred(cred);
+
+ file->f_owner.pid = 0;
+ if (fi->cpt_fown_pid != CPT_FOWN_STRAY_PID) {
+ file->f_owner.pid = find_get_pid(fi->cpt_fown_pid);
+ if (file->f_owner.pid == NULL) {
+ wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n",
+ fi->cpt_fown_pid);
+ return -EINVAL;
+ }
+ }
+ file->f_owner.uid = fi->cpt_fown_uid;
+ file->f_owner.euid = fi->cpt_fown_euid;
+ file->f_owner.signum = fi->cpt_fown_signo;
+
+ if (file->f_mode != fi->cpt_mode) {
+ if (was_dentry_open &&
+ ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) {
+ file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK);
+ file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK);
+ }
+ if (file->f_mode != fi->cpt_mode)
+ wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode);
+ }
+ if (file->f_flags != fi->cpt_flags) {
+ if (!(fi->cpt_flags&O_NOFOLLOW))
+ file->f_flags &= ~O_NOFOLLOW;
+ if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) {
+ file->f_flags &= ~O_NONBLOCK;
+ file->f_flags |= fi->cpt_flags&O_NONBLOCK;
+ }
+ if (fi->cpt_flags&FASYNC) {
+ if (fi->cpt_fown_fd == -1) {
+ wprintk_ctx("No fd for FASYNC\n");
+ return -EINVAL;
+ } else if (file->f_op && file->f_op->fasync) {
+ if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) {
+ wprintk_ctx("FASYNC problem\n");
+ return -EINVAL;
+ } else {
+ file->f_flags |= FASYNC;
+ }
+ }
+ }
+ if (file->f_flags != fi->cpt_flags) {
+ eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
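+/* Restore a file that was unlinked at checkpoint time: strip the
+ * " (deleted)" part of the saved name, re-create the object under that name
+ * (mknod for FIFOs and character devices, mkdir for directories,
+ * O_CREAT|O_EXCL for regular files), open it and unlink it again.  On name
+ * collisions a time-derived suffix is appended; if creation keeps failing,
+ * the whole dance is retried under /tmp/rst<pid>. */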
+static struct file *
+open_deleted(char *name, unsigned flags, struct cpt_file_image *fi,
+ struct cpt_inode_image *ii, cpt_context_t *ctx)
+{
+ struct file * file;
+ char *suffix = NULL;
+ int attempt = 0;
+ int tmp_pass = 0;
+ mode_t mode = fi->cpt_i_mode;
+
+ /* Strip (deleted) part... */
+ if (strlen(name) > strlen(" (deleted)")) {
+ if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) {
+ suffix = &name[strlen(name) - strlen(" (deleted)")];
+ *suffix = 0;
+ } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) {
+ memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1);
+ suffix = name + strlen(name);
+ }
+ }
+
+try_again:
+ for (;;) {
+ if (attempt) {
+ if (attempt > 1000) {
+ eprintk_ctx("open_deleted: failed after %d attempts\n", attempt);
+ return ERR_PTR(-EEXIST);
+ }
+ if (suffix == NULL) {
+ eprintk_ctx("open_deleted: no suffix\n");
+ return ERR_PTR(-EEXIST);
+ }
+ sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt));
+ }
+ attempt++;
+
+ if (S_ISFIFO(mode)) {
+ int err;
+ err = sc_mknod(name, S_IFIFO|(mode&017777), 0);
+ if (err == -EEXIST)
+ continue;
+ if (err < 0 && !tmp_pass)
+ goto change_dir;
+ if (err < 0)
+ return ERR_PTR(err);
+ file = open_pipe(name, fi, flags, ctx);
+ sc_unlink(name);
+ } else if (S_ISCHR(mode)) {
+ int err;
+ err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev));
+ if (err == -EEXIST)
+ continue;
+ if (err < 0 && !tmp_pass)
+ goto change_dir;
+ if (err < 0)
+ return ERR_PTR(err);
+ file = filp_open(name, flags, mode&017777);
+ sc_unlink(name);
+ } else if (S_ISDIR(mode)) {
+ int err;
+ err = sc_mkdir(name, mode&017777);
+ if (err == -EEXIST)
+ continue;
+ if (err < 0 && !tmp_pass)
+ goto change_dir;
+ if (err < 0)
+ return ERR_PTR(err);
+ file = filp_open(name, flags, mode&017777);
+ sc_rmdir(name);
+ } else {
+ file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777);
+ if (IS_ERR(file)) {
+ if (PTR_ERR(file) == -EEXIST)
+ continue;
+ if (!tmp_pass)
+ goto change_dir;
+ } else {
+ sc_unlink(name);
+ }
+ }
+ break;
+ }
+
+ if (IS_ERR(file)) {
+ eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file));
+ return file;
+ } else {
+ dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode);
+ }
+ return file;
+
+change_dir:
+ sprintf(name, "/tmp/rst%u", current->pid);
+ suffix = name + strlen(name);
+ attempt = 1;
+ tmp_pass = 1;
+ goto try_again;
+}
+
+#ifdef CONFIG_SIGNALFD
+static struct file *open_signalfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx)
+{
+ sigset_t mask;
+ mm_segment_t old_fs;
+ int fd;
+ struct file *file;
+
+ cpt_sigset_import(&mask, fi->cpt_priv);
+
+ old_fs = get_fs(); set_fs(KERNEL_DS);
+ fd = do_signalfd(-1, &mask, flags & (O_CLOEXEC | O_NONBLOCK));
+ set_fs(old_fs);
+
+ if (fd < 0)
+ return ERR_PTR(fd);
+
+ file = fget(fd);
+ sys_close(fd);
+
+ return file;
+}
+#else
+static struct file *open_signalfd(struct cpt_file_image *fi, int flags, struct cpt_context *ctx)
+{
+ return ERR_PTR(-EINVAL);
+}
+#endif
+
+struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx)
+{
+ int err;
+ int was_dentry_open = 0;
+ cpt_object_t *obj;
+ cpt_object_t *iobj;
+ struct cpt_file_image fi;
+ __u8 *name = NULL;
+ struct file *file;
+ struct proc_dir_entry *proc_dead_file;
+ int flags;
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx);
+ if (obj) {
+ file = obj->o_obj;
+ if (obj->o_index >= 0) {
+ dprintk_ctx("file is attached to a socket\n");
+ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
+ if (err < 0)
+ goto err_out;
+ fixup_file_flags(file, &fi, 0, pos, ctx);
+ }
+ get_file(file);
+ return file;
+ }
+
+ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
+ if (err < 0)
+ goto err_out;
+
+ flags = make_flags(&fi);
+
+ /* Easy way, inode has been already open. */
+ if (fi.cpt_inode != CPT_NULL &&
+ !(fi.cpt_lflags & CPT_DENTRY_CLONING) &&
+ (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL &&
+ iobj->o_parent) {
+ struct file *filp = iobj->o_parent;
+ file = dentry_open(dget(filp->f_dentry),
+ mntget(filp->f_vfsmnt), flags, NULL);
+ dprintk_ctx("rst_file: file obtained by dentry_open\n");
+ was_dentry_open = 1;
+ goto map_file;
+ }
+
+ if (fi.cpt_next > fi.cpt_hdrlen)
+ name = rst_get_name(pos + sizeof(fi), ctx);
+
+ if (!name) {
+ eprintk_ctx("no name for file?\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ if (fi.cpt_lflags & CPT_DENTRY_DELETED) {
+ struct cpt_inode_image ii;
+ if (fi.cpt_inode == CPT_NULL) {
+ eprintk_ctx("deleted file and no inode.\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx);
+ if (err)
+ goto err_out;
+
+ if (ii.cpt_next > ii.cpt_hdrlen) {
+ struct cpt_object_hdr hdr;
+ err = ctx->pread(&hdr, sizeof(hdr), ctx,
+ fi.cpt_inode + ii.cpt_hdrlen);
+ if (err)
+ goto err_out;
+ if (hdr.cpt_object == CPT_OBJ_NAME) {
+ rst_put_name(name, ctx);
+ name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen,
+ ctx);
+ if (!name) {
+ eprintk_ctx("no name for link?\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+ if ((fi.cpt_lflags & CPT_DENTRY_HARDLINKED) &&
+ !ctx->hardlinked_on) {
+ eprintk_ctx("Open hardlinked is off\n");
+ err = -EPERM;
+ goto err_out;
+ }
+ goto open_file;
+ }
+ }
+
+ /* One very special case... */
+ if (S_ISREG(fi.cpt_i_mode) &&
+ (!name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) {
+			/* MAP_ANON|MAP_SHARED mapping.
+			 * The kernel handles this in a damn ugly way: the file
+			 * that the user passed to mmap does not match the file
+			 * finally attached to the VMA. Ok, rst_mm
+			 * has to take care of this. Otherwise, it will fail.
+			 */
+ file = NULL;
+ } else if (S_ISREG(fi.cpt_i_mode) ||
+ S_ISCHR(fi.cpt_i_mode) ||
+ S_ISFIFO(fi.cpt_i_mode) ||
+ S_ISDIR(fi.cpt_i_mode)) {
+ if (S_ISCHR(fi.cpt_i_mode)) {
+ file = open_special(&fi, flags, 1, ctx);
+ if (file != NULL)
+ goto map_file;
+ }
+ file = open_deleted(name, flags, &fi, &ii, ctx);
+ if (IS_ERR(file))
+ goto out;
+ } else {
+ eprintk_ctx("not a regular deleted file.\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = fixup_file_content(&file, &fi, &ii, ctx);
+ if (err)
+ goto err_put;
+ goto map_file;
+ } else {
+open_file:
+ if (!name[0]) {
+ eprintk_ctx("empty name for file?\n");
+ err = -EINVAL;
+ goto err_out;
+ }
+ if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) &&
+ (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL)
+ goto map_file;
+#ifdef CONFIG_INOTIFY_USER
+ if ((fi.cpt_lflags & CPT_DENTRY_INOTIFY) &&
+ (file = rst_open_inotify(&fi, flags, ctx)) != NULL)
+ goto map_file;
+#else
+ if (fi.cpt_lflags & CPT_DENTRY_INOTIFY) {
+ err = -EINVAL;
+ goto err_out;
+ }
+#endif
+ if ((fi.cpt_lflags & CPT_DENTRY_SIGNALFD) &&
+ (file = open_signalfd(&fi, flags, ctx)) != NULL)
+ goto map_file;
+ if (S_ISFIFO(fi.cpt_i_mode) &&
+ (file = open_pipe(name, &fi, flags, ctx)) != NULL)
+ goto map_file;
+ if (!S_ISREG(fi.cpt_i_mode) &&
+ (file = open_special(&fi, flags, 0, ctx)) != NULL)
+ goto map_file;
+ }
+
+	/* This hook is needed to open a file /proc/<pid>/<somefile>
+	 * when there is no process with pid <pid>.
+	 */
+ proc_dead_file = NULL;
+ if (fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD) {
+ sprintf(name, "/proc/rst_dead_pid_file_%d", task_pid_vnr(current));
+
+ proc_dead_file = create_proc_entry(name + 6, S_IRUGO|S_IWUGO,
+ NULL);
+ if (!proc_dead_file) {
+ eprintk_ctx("can't create proc entry %s\n", name);
+ err = -ENOMEM;
+ goto err_out;
+ }
+#ifdef CONFIG_PROC_FS
+ proc_dead_file->proc_fops = &dummy_proc_pid_file_operations;
+#endif
+ }
+
+ file = filp_open(name, flags, 0);
+
+ if (proc_dead_file) {
+ remove_proc_entry(proc_dead_file->name, NULL);
+ if (!IS_ERR(file))
+ d_drop(file->f_dentry);
+ }
+map_file:
+ if (!IS_ERR(file)) {
+ fixup_file_flags(file, &fi, was_dentry_open, pos, ctx);
+
+ if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) {
+ err = fixup_pipe_data(file, &fi, ctx);
+ if (err)
+ goto err_put;
+ }
+
+		/* This is a very special hack. Logically, cwd/root are
+		 * nothing but open directories. Nevertheless, this causes
+		 * restore failures when the number of open files in a VE
+		 * is close to the limit. So, if this is rst_file() of cwd/root
+		 * (fd = -2) and the directory is not deleted, we skip
+		 * adding the files to the object table. If the directory is
+		 * not unlinked, this cannot cause any problems.
+		 */
+ if (fd != -2 ||
+ !S_ISDIR(file->f_dentry->d_inode->i_mode) ||
+ (fi.cpt_lflags & CPT_DENTRY_DELETED)) {
+ obj = cpt_object_get(CPT_OBJ_FILE, file, ctx);
+ if (!obj) {
+ obj = cpt_object_add(CPT_OBJ_FILE, file, ctx);
+ if (obj)
+ get_file(file);
+ }
+ if (obj)
+ cpt_obj_setpos(obj, pos, ctx);
+
+ obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
+ if (obj) {
+ cpt_obj_setpos(obj, fi.cpt_inode, ctx);
+ if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED))
+ obj->o_parent = file;
+ }
+ }
+
+ if (fi.cpt_next > fi.cpt_hdrlen) {
+ err = fixup_flocks(file, &fi, pos, ctx);
+ if (err)
+ goto err_put;
+ }
+ } else {
+ if ((fi.cpt_lflags & CPT_DENTRY_PROC) &&
+ !(fi.cpt_lflags & CPT_DENTRY_PROCPID_DEAD)) {
+ dprintk_ctx("rst_file /proc delayed\n");
+ file = NULL;
+ } else if (name)
+ eprintk_ctx("can't open file %s\n", name);
+ }
+
+out:
+ if (name)
+ rst_put_name(name, ctx);
+ return file;
+
+err_put:
+ if (file)
+ fput(file);
+err_out:
+ if (name)
+ rst_put_name(name, ctx);
+ return ERR_PTR(err);
+}
+
+
+__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ __u32 flag = 0;
+
+ if (ti->cpt_files == CPT_NULL ||
+ lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx))
+ flag |= CLONE_FILES;
+ if (ti->cpt_fs == CPT_NULL ||
+ lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx))
+ flag |= CLONE_FS;
+ return flag;
+}
+
+static void local_close_files(struct files_struct * files)
+{
+ int i, j;
+
+ j = 0;
+ for (;;) {
+ unsigned long set;
+ i = j * __NFDBITS;
+ if (i >= files->fdt->max_fds)
+ break;
+ set = files->fdt->open_fds->fds_bits[j];
+ while (set) {
+ if (set & 1) {
+ struct file * file = xchg(&files->fdt->fd[i], NULL);
+ if (file)
+ filp_close(file, files);
+ }
+ i++;
+ set >>= 1;
+ }
+ files->fdt->open_fds->fds_bits[j] = 0;
+ files->fdt->close_on_exec->fds_bits[j] = 0;
+ j++;
+ }
+}
+
+extern int expand_fdtable(struct files_struct *files, int nr);
+
+
+static int rst_files(struct cpt_task_image *ti, struct cpt_context *ctx,
+ int from, int to)
+{
+ struct cpt_files_struct_image fi;
+ struct files_struct *f = current->files;
+ cpt_object_t *obj;
+ loff_t pos, endpos;
+ int err;
+
+ if (ti->cpt_files == CPT_NULL) {
+ current->files = NULL;
+ if (f)
+ put_files_struct(f);
+ return 0;
+ }
+
+ if (from == 3) {
+ err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx);
+ if (err)
+ return err;
+
+ goto just_do_it;
+ }
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx);
+ if (obj) {
+ if (obj->o_obj != f) {
+ put_files_struct(f);
+ f = obj->o_obj;
+ atomic_inc(&f->count);
+ current->files = f;
+ }
+ return 0;
+ }
+
+ err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx);
+ if (err)
+ return err;
+
+ local_close_files(f);
+
+ if (fi.cpt_max_fds > f->fdt->max_fds) {
+ spin_lock(&f->file_lock);
+ err = expand_fdtable(f, fi.cpt_max_fds-1);
+ spin_unlock(&f->file_lock);
+ if (err < 0)
+ return err;
+ }
+
+just_do_it:
+ pos = ti->cpt_files + fi.cpt_hdrlen;
+ endpos = ti->cpt_files + fi.cpt_next;
+ while (pos < endpos) {
+ struct cpt_fd_image fdi;
+ struct file *filp;
+
+ err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx);
+ if (err)
+ return err;
+ if (fdi.cpt_fd < from || fdi.cpt_fd > to)
+ goto skip;
+
+ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
+ if (IS_ERR(filp)) {
+ eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp),
+ (long long)fdi.cpt_file);
+ return PTR_ERR(filp);
+ }
+ if (filp == NULL) {
+ int err = rst_filejob_queue(pos, ctx);
+ if (err)
+ return err;
+ } else {
+ if (fdi.cpt_fd >= f->fdt->max_fds) BUG();
+ f->fdt->fd[fdi.cpt_fd] = filp;
+ FD_SET(fdi.cpt_fd, f->fdt->open_fds);
+ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
+ FD_SET(fdi.cpt_fd, f->fdt->close_on_exec);
+ }
+
+skip:
+ pos += fdi.cpt_next;
+ }
+ f->next_fd = fi.cpt_next_fd;
+
+ obj = cpt_object_add(CPT_OBJ_FILES, f, ctx);
+ if (obj) {
+ cpt_obj_setpos(obj, ti->cpt_files, ctx);
+ cpt_obj_setindex(obj, fi.cpt_index, ctx);
+ }
+ return 0;
+}
+
+int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ return rst_files(ti, ctx, (ti->cpt_pid == 1) ? 3 : 0, INT_MAX);
+}
+
+int rst_files_std(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ return rst_files(ti, ctx, 0, 2);
+}
+
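+/* Some files cannot be opened while the fd tables are being restored
+ * (rst_file() returned NULL, e.g. delayed /proc files); they were queued by
+ * rst_filejob_queue().  Once all tasks exist, walk the queue, find the
+ * owning task by pid and install each file into its fd table. */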
+int rst_do_filejobs(cpt_context_t *ctx)
+{
+ struct filejob *j;
+
+ while ((j = ctx->filejob_queue) != NULL) {
+ int err;
+ struct task_struct *tsk;
+ struct cpt_fd_image fdi;
+ struct file *filp;
+
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(j->pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (!tsk)
+ return -EINVAL;
+
+ err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx);
+ if (err) {
+ put_task_struct(tsk);
+ return err;
+ }
+
+ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG();
+ if (tsk->files->fdt->fd[fdi.cpt_fd] ||
+ FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) {
+ eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi);
+ put_task_struct(tsk);
+ return -EBUSY;
+ }
+
+ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
+ if (IS_ERR(filp)) {
+ eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file);
+ put_task_struct(tsk);
+ return PTR_ERR(filp);
+ }
+ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG();
+ tsk->files->fdt->fd[fdi.cpt_fd] = filp;
+ FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds);
+ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
+ FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec);
+
+ dprintk_ctx("filejob %Ld done\n", j->fdi);
+
+ put_task_struct(tsk);
+ ctx->filejob_queue = j->next;
+ kfree(j);
+ }
+ return 0;
+}
+
+void rst_flush_filejobs(cpt_context_t *ctx)
+{
+ struct filejob *j;
+
+ while ((j = ctx->filejob_queue) != NULL) {
+ ctx->filejob_queue = j->next;
+ kfree(j);
+ }
+}
+
+int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ struct fs_struct *f = current->fs;
+ cpt_object_t *obj;
+
+ if (ti->cpt_fs == CPT_NULL) {
+ exit_fs(current);
+ return 0;
+ }
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx);
+ if (obj) {
+ if (obj->o_obj != f) {
+ exit_fs(current);
+ f = obj->o_obj;
+ write_lock(&f->lock);
+ f->users++;
+ write_unlock(&f->lock);
+ current->fs = f;
+ }
+ return 0;
+ }
+
+	/* Do _not_ restore root. The image contains absolute pathnames,
+	 * so we fix it in the context of the rst process.
+	 */
+
+ obj = cpt_object_add(CPT_OBJ_FS, f, ctx);
+ if (obj)
+ cpt_obj_setpos(obj, ti->cpt_fs, ctx);
+
+ return 0;
+}
+
+int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp,
+ loff_t *pos, struct cpt_context *ctx)
+{
+ struct cpt_file_image fi;
+ struct file * file;
+ int err;
+
+ err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx);
+ if (err)
+ return err;
+
+ file = rst_file(*pos, -2, ctx);
+ if (IS_ERR(file)) {
+ if (PTR_ERR(file) == -EINVAL && S_ISLNK(fi.cpt_i_mode)) {
+ /* One special case: inotify on symlink */
+ struct nameidata nd;
+ __u8 *name = NULL;
+
+ if (fi.cpt_next > fi.cpt_hdrlen)
+ name = rst_get_name(*pos + sizeof(fi), ctx);
+ if (!name) {
+ eprintk_ctx("can't get name for file\n");
+ return -EINVAL;
+ }
+ if ((err = path_lookup(name, 0, &nd)) != 0) {
+ eprintk_ctx("path_lookup %s: %d\n", name, err);
+ rst_put_name(name, ctx);
+ return -EINVAL;
+ }
+ *dp = nd.path.dentry;
+ *mp = nd.path.mnt;
+ *pos += fi.cpt_next;
+ rst_put_name(name, ctx);
+ return 0;
+ }
+ return PTR_ERR(file);
+ }
+
+ *dp = dget(file->f_dentry);
+ *mp = mntget(file->f_vfsmnt);
+ *pos += fi.cpt_next;
+ fput(file);
+ return 0;
+}
+
+static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt,
+ struct dentry *dentry)
+{
+ struct dentry *old_root;
+ struct vfsmount *old_rootmnt;
+ write_lock(&fs->lock);
+ old_root = fs->root.dentry;
+ old_rootmnt = fs->root.mnt;
+ fs->root.mnt = mnt;
+ fs->root.dentry = dentry;
+ write_unlock(&fs->lock);
+ if (old_root) {
+ dput(old_root);
+ mntput(old_rootmnt);
+ }
+}
+
+static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
+ struct dentry *dentry)
+{
+ struct dentry *old_pwd;
+ struct vfsmount *old_pwdmnt;
+
+ write_lock(&fs->lock);
+ old_pwd = fs->pwd.dentry;
+ old_pwdmnt = fs->pwd.mnt;
+ fs->pwd.mnt = mnt;
+ fs->pwd.dentry = dentry;
+ write_unlock(&fs->lock);
+
+ if (old_pwd) {
+ dput(old_pwd);
+ mntput(old_pwdmnt);
+ }
+}
+
+
+int rst_restore_fs(struct cpt_context *ctx)
+{
+ loff_t pos;
+ cpt_object_t *obj;
+ int err = 0;
+
+ for_each_object(obj, CPT_OBJ_FS) {
+ struct cpt_fs_struct_image fi;
+ struct fs_struct *fs = obj->o_obj;
+ int i;
+ struct dentry *d[3];
+ struct vfsmount *m[3];
+
+ err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx);
+ if (err)
+ return err;
+
+ fs->umask = fi.cpt_umask;
+
+ pos = obj->o_pos + fi.cpt_hdrlen;
+ d[0] = d[1] = d[2] = NULL;
+ m[0] = m[1] = m[2] = NULL;
+ i = 0;
+ while (pos < obj->o_pos + fi.cpt_next && i<3) {
+ err = cpt_get_dentry(d+i, m+i, &pos, ctx);
+ if (err) {
+ eprintk_ctx("cannot get_dir: %d", err);
+ for (--i; i >= 0; i--) {
+ if (d[i])
+ dput(d[i]);
+ if (m[i])
+ mntput(m[i]);
+ }
+ return err;
+ }
+ i++;
+ }
+ if (d[0])
+ __set_fs_root(fs, m[0], d[0]);
+ if (d[1])
+ __set_fs_pwd(fs, m[1], d[1]);
+ if (d[2])
+ wprintk_ctx("altroot arrived...\n");
+ }
+ return err;
+}
+
+int do_one_mount(char *mntpnt, char *mnttype, char *mntbind,
+ unsigned long flags, unsigned long mnt_flags,
+ struct cpt_context *ctx)
+{
+ int err;
+
+ if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0))
+ mntbind = NULL;
+
+ if (mntbind)
+ flags |= MS_BIND;
+ /* Join per-mountpoint flags with global flags */
+ if (mnt_flags & MNT_NOSUID)
+ flags |= MS_NOSUID;
+ if (mnt_flags & MNT_NODEV)
+ flags |= MS_NODEV;
+ if (mnt_flags & MNT_NOEXEC)
+ flags |= MS_NOEXEC;
+
+ err = sc_mount(mntbind, mntpnt, mnttype, flags);
+ if (err < 0) {
+ eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags);
+ return err;
+ }
+ return 0;
+}
+
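+/* Helper thread for rst_restore_tmpfs(): dup the read side of the pipe onto
+ * fd 0, point stdout/stderr at /dev/null (creating the node if it does not
+ * exist yet), close all other descriptors and exec "/bin/tar x -C / -S".
+ * rst_restore_tmpfs() then feeds the tmpfs tarball stored in the image into
+ * the write side of the pipe and waits for tar to exit. */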
+static int undumptmpfs(void *arg)
+{
+ int i;
+ int *pfd = arg;
+ int fd1, fd2, err;
+ char *argv[] = { "tar", "x", "-C", "/", "-S", NULL };
+
+ if (pfd[0] != 0)
+ sc_dup2(pfd[0], 0);
+
+ set_fs(KERNEL_DS);
+ fd1 = sc_open("/dev/null", O_WRONLY, 0);
+ fd2 = sc_open("/dev/null", O_WRONLY, 0);
+try:
+ if (fd1 < 0 || fd2 < 0) {
+ if (fd1 == -ENOENT && fd2 == -ENOENT) {
+ err = sc_mknod("/dev/null", S_IFCHR|0666,
+ new_encode_dev((MEM_MAJOR<<MINORBITS)|3));
+ if (err < 0) {
+ eprintk("can't create /dev/null: %d\n", err);
+ module_put(THIS_MODULE);
+ return 255 << 8;
+ }
+ fd1 = sc_open("/dev/null", O_WRONLY, 0666);
+ fd2 = sc_open("/dev/null", O_WRONLY, 0666);
+ sc_unlink("/dev/null");
+ goto try;
+ }
+ eprintk("can not open /dev/null for tar: %d %d\n", fd1, fd2);
+ module_put(THIS_MODULE);
+ return 255 << 8;
+ }
+ if (fd1 != 1)
+ sc_dup2(fd1, 1);
+ if (fd2 != 2)
+ sc_dup2(fd2, 2);
+
+ for (i = 3; i < current->files->fdt->max_fds; i++)
+ sc_close(i);
+
+ module_put(THIS_MODULE);
+
+ i = sc_execve("/bin/tar", argv, NULL);
+ eprintk("failed to exec /bin/tar: %d\n", i);
+ return 255 << 8;
+}
+
+static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx)
+{
+ int err;
+ int pfd[2];
+ struct file *f;
+ struct cpt_object_hdr v;
+ int n;
+ loff_t end;
+ int pid;
+ int status;
+ mm_segment_t oldfs;
+ sigset_t ignore, blocked;
+
+ err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx);
+ if (err < 0)
+ return err;
+
+ err = sc_pipe(pfd);
+ if (err < 0)
+ return err;
+ ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+ sigprocmask(SIG_BLOCK, &ignore, &blocked);
+ pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0);
+ if (err < 0) {
+ eprintk_ctx("tmpfs local_kernel_thread: %d\n", err);
+ goto out;
+ }
+ f = fget(pfd[1]);
+ sc_close(pfd[1]);
+ sc_close(pfd[0]);
+
+ ctx->file->f_pos = *pos + v.cpt_hdrlen;
+ end = *pos + v.cpt_next;
+ *pos += v.cpt_next;
+ do {
+ char buf[16];
+
+ n = end - ctx->file->f_pos;
+ if (n > sizeof(buf))
+ n = sizeof(buf);
+
+ if (ctx->read(buf, n, ctx))
+ break;
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ f->f_op->write(f, buf, n, &f->f_pos);
+ set_fs(oldfs);
+ } while (ctx->file->f_pos < end);
+
+ fput(f);
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if ((err = sc_waitx(pid, 0, &status)) < 0)
+ eprintk_ctx("wait4: %d\n", err);
+ else if ((status & 0x7f) == 0) {
+ err = (status & 0xff00) >> 8;
+ if (err != 0) {
+ eprintk_ctx("tar exited with %d\n", err);
+ err = -EINVAL;
+ }
+ } else {
+ eprintk_ctx("tar terminated\n");
+ err = -EINVAL;
+ }
+ set_fs(oldfs);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+ return err;
+
+out:
+ if (pfd[1] >= 0)
+ sc_close(pfd[1]);
+ if (pfd[0] >= 0)
+ sc_close(pfd[0]);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+ return err;
+}
+
+int check_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx)
+{
+ struct mnt_namespace *n;
+ struct list_head *p;
+ struct vfsmount *t;
+ char *path, *path_buf;
+ int ret;
+
+ n = current->nsproxy->mnt_ns;
+ ret = -ENOENT;
+ path_buf = cpt_get_buf(ctx);
+ down_read(&namespace_sem);
+ list_for_each(p, &n->list) {
+ struct path pt;
+ t = list_entry(p, struct vfsmount, mnt_list);
+ pt.dentry = t->mnt_root;
+ pt.mnt = t;
+ path = d_path(&pt, path_buf, PAGE_SIZE);
+ if (IS_ERR(path))
+ continue;
+ if (!strcmp(path, mntpnt) &&
+ !strcmp(t->mnt_sb->s_type->name, mnttype)) {
+ ret = 0;
+ break;
+ }
+ }
+ up_read(&namespace_sem);
+ __cpt_release_buf(ctx);
+ return ret;
+}
+
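+/* A CPT_OBJ_VFSMOUNT record is followed by name objects: device, mount
+ * point, filesystem type and, for bind mounts, the bind source.  Ordinary
+ * non-root mounts are re-created via do_one_mount() (with tmpfs contents
+ * then restored from a tarball), while CPT_MNT_EXT entries are only checked
+ * to be present in the current namespace by check_ext_mount(). */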
+int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx)
+{
+ int err;
+ loff_t endpos;
+
+ endpos = pos + mi->cpt_next;
+ pos += mi->cpt_hdrlen;
+
+ while (pos < endpos) {
+ char *mntdev;
+ char *mntpnt;
+ char *mnttype;
+ char *mntbind;
+
+ mntdev = __rst_get_name(&pos, ctx);
+ mntpnt = __rst_get_name(&pos, ctx);
+ mnttype = __rst_get_name(&pos, ctx);
+ mntbind = NULL;
+ if (mi->cpt_mntflags & CPT_MNT_BIND)
+ mntbind = __rst_get_name(&pos, ctx);
+ err = -EINVAL;
+ if (mnttype && mntpnt) {
+ err = 0;
+ if (!(mi->cpt_mntflags & CPT_MNT_EXT) &&
+ strcmp(mntpnt, "/")) {
+ err = do_one_mount(mntpnt, mnttype, mntbind,
+ mi->cpt_flags,
+ mi->cpt_mntflags, ctx);
+ if (!err &&
+ strcmp(mnttype, "tmpfs") == 0 &&
+ !(mi->cpt_mntflags & (CPT_MNT_BIND)))
+ err = rst_restore_tmpfs(&pos, ctx);
+ } else if (mi->cpt_mntflags & CPT_MNT_EXT) {
+ err = check_ext_mount(mntpnt, mnttype, ctx);
+ if (err)
+ eprintk_ctx("mount point is missing: %s\n", mntpnt);
+ }
+ }
+ if (mntdev)
+ rst_put_name(mntdev, ctx);
+ if (mntpnt)
+ rst_put_name(mntpnt, ctx);
+ if (mnttype)
+ rst_put_name(mnttype, ctx);
+ if (mntbind)
+ rst_put_name(mntbind, ctx);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_vfsmount_image mi;
+
+ while (pos < endpos) {
+ err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx);
+ if (err)
+ return err;
+ err = restore_one_vfsmount(&mi, pos, ctx);
+ if (err)
+ return err;
+ pos += mi.cpt_next;
+ }
+ return 0;
+}
+
+int rst_root_namespace(struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_NAMESPACE];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_object_hdr sbuf;
+ int done = 0;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx);
+ if (err)
+ return err;
+ if (done) {
+ eprintk_ctx("multiple namespaces are not supported\n");
+ break;
+ }
+ done++;
+ err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx);
+ if (err)
+ return err;
+ sec += sbuf.cpt_next;
+ }
+
+ return 0;
+}
+
+int rst_stray_files(struct cpt_context *ctx)
+{
+ int err = 0;
+ loff_t sec = ctx->sections[CPT_SECT_FILES];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ struct cpt_object_hdr sbuf;
+ cpt_object_t *obj;
+
+ err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx);
+ if (err)
+ break;
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx);
+ if (!obj) {
+ struct file *file;
+
+ dprintk_ctx("stray file %Ld\n", sec);
+
+ file = rst_sysv_shm_itself(sec, ctx);
+
+ if (IS_ERR(file)) {
+ eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file));
+ return PTR_ERR(file);
+ } else {
+ fput(file);
+ }
+ }
+ sec += sbuf.cpt_next;
+ }
+
+ return err;
+}
diff --git a/kernel/cpt/rst_inotify.c b/kernel/cpt/rst_inotify.c
new file mode 100644
index 0000000..bcea486
--- /dev/null
+++ b/kernel/cpt/rst_inotify.c
@@ -0,0 +1,188 @@
+/*
+ *
+ * kernel/cpt/rst_inotify.c
+ *
+ * Copyright (C) 2000-2007 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mnt_namespace.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/vzcalluser.h>
+#include <linux/inotify.h>
+#include <linux/cpt_image.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+#include "cpt_fsmagic.h"
+#include "cpt_syscalls.h"
+
+struct file *rst_open_inotify(struct cpt_file_image *fi,
+ unsigned flags,
+ struct cpt_context *ctx)
+{
+ struct file *file;
+ int fd;
+
+ fd = sys_inotify_init();
+ if (fd < 0)
+ return ERR_PTR(fd);
+
+ file = fget(fd);
+ sys_close(fd);
+ return file;
+}
+
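+/* Re-create the watches of a restored inotify instance: for every
+ * CPT_OBJ_INOTIFY_WATCH record resolve the watched path from the image and
+ * add a watch with the saved mask and watch descriptor.  Pending
+ * CPT_OBJ_INOTIFY_EVENT records are currently dropped (see the #if 0
+ * block). */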
+static int restore_one_inotify(cpt_object_t *obj,
+ loff_t pos,
+ struct cpt_inotify_image *ibuf,
+ cpt_context_t *ctx)
+{
+ int err = 0;
+ loff_t endpos;
+ struct file *file = obj->o_obj;
+ struct fsnotify_group *group;
+
+ if (file->f_op != &inotify_fops) {
+ eprintk_ctx("bad inotify file\n");
+ return -EINVAL;
+ }
+
+ group = file->private_data;
+
+ if (unlikely(group == NULL)) {
+ eprintk_ctx("bad inotify device\n");
+ return -EINVAL;
+ }
+
+ endpos = pos + ibuf->cpt_next;
+ pos += ibuf->cpt_hdrlen;
+ while (pos < endpos) {
+ union {
+ struct cpt_inotify_wd_image wi;
+ struct cpt_inotify_ev_image ei;
+ } u;
+
+ err = rst_get_object(-1, pos, &u, ctx);
+ if (err) {
+ eprintk_ctx("rst_get_object: %d\n", err);
+ return err;
+ }
+ if (u.wi.cpt_object == CPT_OBJ_INOTIFY_WATCH) {
+ struct path p;
+ loff_t fpos = pos + u.wi.cpt_hdrlen;
+
+ err = cpt_get_dentry(&p.dentry, &p.mnt, &fpos, ctx);
+ if (err) {
+ eprintk_ctx("cpt_get_dentry: %d\n", err);
+ return err;
+ }
+
+ err = __inotify_new_watch(group, &p, u.wi.cpt_mask, u.wi.cpt_wd);
+ path_put(&p);
+ if (err < 0)
+ break;
+
+			err = 0; /* for proper return value */
+ } else if (u.wi.cpt_object == CPT_OBJ_INOTIFY_EVENT) {
+#if 0
+ struct inotify_user_watch dummy_watch;
+ struct inotify_watch *w;
+ char *name = NULL;
+
+ if (u.ei.cpt_namelen) {
+ name = kmalloc(u.ei.cpt_namelen+1, GFP_KERNEL);
+ if (name == NULL) {
+ err = -ENOMEM;
+ break;
+ }
+ name[u.ei.cpt_namelen] = 0;
+ err = ctx->pread(name, u.ei.cpt_namelen, ctx, pos + u.ei.cpt_hdrlen);
+ if (err) {
+ kfree(name);
+ break;
+ }
+ }
+
+ w = &dummy_watch.wdata;
+ dummy_watch.dev = dev;
+ atomic_set(&w->count, 2);
+
+ /* Trick to avoid destruction due to exit event */
+ if (u.ei.cpt_mask & (IN_IGNORED | IN_ONESHOT))
+ atomic_inc(&w->count);
+ dev->ih->in_ops->handle_event(w, u.ei.cpt_wd, u.ei.cpt_mask,
+ u.ei.cpt_cookie, name, NULL);
+ if (name)
+ kfree(name);
+#endif
+ wprintk_ctx("inotify events dropped\n");
+ } else {
+ eprintk_ctx("bad object: %u\n", u.wi.cpt_object);
+ err = -EINVAL;
+ break;
+ }
+ pos += u.wi.cpt_next;
+ }
+ return err;
+}
+
+int rst_inotify(cpt_context_t *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_INOTIFY];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_INOTIFY || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ cpt_object_t *obj;
+ struct cpt_inotify_image ibuf;
+
+ err = rst_get_object(CPT_OBJ_INOTIFY, sec, &ibuf, ctx);
+ if (err)
+ return err;
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ibuf.cpt_file, ctx);
+ if (obj == NULL) {
+ eprintk_ctx("cannot find inotify file object\n");
+ return -EINVAL;
+ }
+ err = restore_one_inotify(obj, sec, &ibuf, ctx);
+ if (err)
+ return err;
+ sec += ibuf.cpt_next;
+ }
+
+ return 0;
+}
diff --git a/kernel/cpt/rst_mm.c b/kernel/cpt/rst_mm.c
new file mode 100644
index 0000000..78627cc
--- /dev/null
+++ b/kernel/cpt/rst_mm.c
@@ -0,0 +1,1150 @@
+/*
+ *
+ * kernel/cpt/rst_mm.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/virtinfo.h>
+#include <linux/virtinfoscp.h>
+#include <linux/hugetlb.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+#include <linux/vmalloc.h>
+#include <linux/rmap.h>
+#include <linux/hash.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <asm/mmu.h>
+#ifdef CONFIG_X86
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#endif
+#include <asm/mmu_context.h>
+#include <asm/vsyscall.h>
+#include <linux/swapops.h>
+#include <linux/cpt_image.h>
+
+#ifdef CONFIG_VE
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#endif
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_files.h"
+#include "cpt_ubc.h"
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+#include "cpt_pagein.h"
+#endif
+
+#include "cpt_syscalls.h"
+
+#define __PAGE_NX (1ULL<<63)
+
+static unsigned long make_prot(struct cpt_vma_image *vmai)
+{
+ unsigned long prot = 0;
+
+ if (vmai->cpt_flags&VM_READ)
+ prot |= PROT_READ;
+ if (vmai->cpt_flags&VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vmai->cpt_flags&VM_EXEC)
+ prot |= PROT_EXEC;
+ if (vmai->cpt_flags&VM_GROWSDOWN)
+ prot |= PROT_GROWSDOWN;
+ if (vmai->cpt_flags&VM_GROWSUP)
+ prot |= PROT_GROWSUP;
+ return prot;
+}
+
+static unsigned long make_flags(struct cpt_vma_image *vmai)
+{
+ unsigned long flags = MAP_FIXED;
+
+ if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE))
+ flags |= MAP_SHARED;
+ else
+ flags |= MAP_PRIVATE;
+
+ if (vmai->cpt_file == CPT_NULL)
+ flags |= MAP_ANONYMOUS;
+ if (vmai->cpt_flags&VM_GROWSDOWN)
+ flags |= MAP_GROWSDOWN;
+#ifdef MAP_GROWSUP
+ if (vmai->cpt_flags&VM_GROWSUP)
+ flags |= MAP_GROWSUP;
+#endif
+ if (vmai->cpt_flags&VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vmai->cpt_flags&VM_EXECUTABLE)
+ flags |= MAP_EXECUTABLE;
+ if (!(vmai->cpt_flags&VM_ACCOUNT))
+ flags |= MAP_NORESERVE;
+ return flags;
+}
+
+#ifdef CONFIG_X86
+#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) \
+ && !defined(CONFIG_XEN)
+static int __alloc_ldt(mm_context_t *pc, int mincount)
+{
+ int oldsize, newsize, nr;
+
+ if (mincount <= pc->size)
+ return 0;
+ /*
+ * LDT got larger - reallocate if necessary.
+ */
+ oldsize = pc->size;
+ mincount = (mincount+511)&(~511);
+ newsize = mincount*LDT_ENTRY_SIZE;
+ for (nr = 0; nr * PAGE_SIZE < newsize; nr++) {
+ BUG_ON(nr * PAGE_SIZE >= 64*1024);
+ if (!pc->ldt_pages[nr]) {
+ pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC);
+ if (!pc->ldt_pages[nr])
+ goto nomem;
+ clear_highpage(pc->ldt_pages[nr]);
+ }
+ }
+ pc->size = mincount;
+ return 0;
+
+nomem:
+ while (--nr >= 0)
+ __free_page(pc->ldt_pages[nr]);
+ pc->size = 0;
+ return -ENOMEM;
+}
+
+static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
+{
+ struct mm_struct *mm = current->mm;
+ int i;
+ int err;
+ int size;
+
+ err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE);
+ if (err)
+ return err;
+
+ size = mm->context.size*LDT_ENTRY_SIZE;
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ int nr = i / PAGE_SIZE, bytes;
+ char *kaddr = kmap(mm->context.ldt_pages[nr]);
+
+ bytes = size - i;
+ if (bytes > PAGE_SIZE)
+ bytes = PAGE_SIZE;
+ err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i);
+ kunmap(mm->context.ldt_pages[nr]);
+ if (err)
+ return err;
+ }
+
+ load_LDT(&mm->context);
+ return 0;
+}
+
+#else
+
+static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
+{
+ struct mm_struct *mm = current->mm;
+ int oldsize = mm->context.size;
+ void *oldldt;
+ void *newldt;
+ int err;
+
+ if (li->cpt_size > PAGE_SIZE)
+ newldt = vmalloc(li->cpt_size);
+ else
+ newldt = kmalloc(li->cpt_size, GFP_KERNEL);
+
+ if (!newldt)
+ return -ENOMEM;
+
+ err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen);
+ if (err)
+ return err;
+
+ oldldt = mm->context.ldt;
+ mm->context.ldt = newldt;
+ mm->context.size = li->cpt_size/LDT_ENTRY_SIZE;
+
+ load_LDT(&mm->context);
+
+ if (oldsize) {
+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(oldldt);
+ else
+ kfree(oldldt);
+ }
+ return 0;
+}
+#endif
+#endif
+
+static int
+restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg)
+{
+ struct aio_ring_info *info = &aio_ctx->ring_info;
+ unsigned nr_events = aio_ctx->max_reqs;
+ unsigned long size;
+ int nr_pages;
+
+	/* Recalculate the ring parameters exactly as fs/aio.c does
+	 * and compare the calculated values with the ones stored in
+	 * the dump. They must match. */
+
+ nr_events += 2;
+
+ size = sizeof(struct aio_ring);
+ size += sizeof(struct io_event) * nr_events;
+ nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ if (nr_pages != aimg->cpt_ring_pages)
+ return -EINVAL;
+
+ info->nr_pages = nr_pages;
+
+ nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+
+ if (nr_events != aimg->cpt_nr)
+ return -EINVAL;
+
+ info->nr = 0;
+ info->ring_pages = info->internal_pages;
+ if (nr_pages > AIO_RING_PAGES) {
+ info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
+ if (!info->ring_pages)
+ return -ENOMEM;
+ memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
+ }
+
+ info->mmap_size = nr_pages * PAGE_SIZE;
+
+	/* The kernel's aio.c does something odd here: it mmap()s some
+	 * pages and then pins them, apparently a leftover of a failed
+	 * attempt to expose the ring to user space. The result is that
+	 * immediately after the AIO context is created the kernel shares
+	 * those pages with user space, which can read and even write them,
+	 * but after the first fork the pages are marked COW, with the
+	 * obvious consequences. (The same mistake was once made in an early
+	 * version of the mmapped packet socket; luckily it never reached
+	 * mainline.)
+	 *
+	 * We do not try to simulate this odd behaviour exactly. For now
+	 * just take the pages from user space. Alternatively, we could
+	 * keep a kernel copy in the AIO context image, which would be
+	 * more correct.
+	 *
+	 * Known problem: if the pages are COWed, the ring is transferred
+	 * incorrectly.
+	 */
+ down_read(&current->mm->mmap_sem);
+ info->mmap_base = aimg->cpt_mmap_base;
+ info->nr_pages = get_user_pages(current, current->mm,
+ info->mmap_base, nr_pages,
+ 1, 0, info->ring_pages, NULL);
+ up_read(&current->mm->mmap_sem);
+
+ if (unlikely(info->nr_pages != nr_pages)) {
+ int i;
+
+ for (i=0; i<info->nr_pages; i++)
+ put_page(info->ring_pages[i]);
+ if (info->ring_pages && info->ring_pages != info->internal_pages)
+ kfree(info->ring_pages);
+ return -EFAULT;
+ }
+
+ aio_ctx->user_id = info->mmap_base;
+
+ info->nr = nr_events;
+ info->tail = aimg->cpt_tail;
+
+ return 0;
+}
+
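+/* Re-create one AIO context in the current mm from its image: allocate the
+ * kioctx, rebuild the ring over the already restored user pages and link the
+ * context into mm->ioctx_list. */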
+static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx)
+{
+ int err;
+ struct kioctx *aio_ctx;
+ extern spinlock_t aio_nr_lock;
+
+ aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
+ if (!aio_ctx)
+ return -ENOMEM;
+
+ memset(aio_ctx, 0, sizeof(*aio_ctx));
+ aio_ctx->max_reqs = aimg->cpt_max_reqs;
+
+ if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) {
+ kmem_cache_free(kioctx_cachep, aio_ctx);
+ eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err);
+ return err;
+ }
+
+ aio_ctx->mm = current->mm;
+ atomic_inc(&aio_ctx->mm->mm_count);
+ atomic_set(&aio_ctx->users, 1);
+ spin_lock_init(&aio_ctx->ctx_lock);
+ spin_lock_init(&aio_ctx->ring_info.ring_lock);
+ init_waitqueue_head(&aio_ctx->wait);
+ INIT_LIST_HEAD(&aio_ctx->active_reqs);
+ INIT_LIST_HEAD(&aio_ctx->run_list);
+ INIT_WORK(&aio_ctx->wq.work, aio_kick_handler);
+
+ spin_lock(&aio_nr_lock);
+ aio_nr += aio_ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
+
+ spin_lock(&aio_ctx->mm->ioctx_lock);
+ hlist_add_head(&aio_ctx->list, &aio_ctx->mm->ioctx_list);
+ spin_unlock(&aio_ctx->mm->ioctx_lock);
+
+ return 0;
+}
+
+struct anonvma_map
+{
+ struct hlist_node list;
+ struct anon_vma *avma;
+ __u64 id;
+};
+
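+/* ctx->anonvmas is a hash table mapping the anonvmaid saved in the image to
+ * the anon_vma recreated on restore.  verify_create_anonvma() attaches the
+ * freshly mmapped vma to the anon_vma recorded for its anonvmaid (creating
+ * and recording one on first use), so that vmas which shared an anon_vma on
+ * the source host share one again after restore. */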
+static int verify_create_anonvma(struct mm_struct *mm,
+ struct cpt_vma_image *vmai,
+ cpt_context_t *ctx)
+{
+ struct anon_vma *avma = NULL;
+ struct anon_vma *new_avma;
+ struct vm_area_struct *vma;
+ int h;
+
+ if (!ctx->anonvmas) {
+ if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE)
+ return -EINVAL;
+ if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+ for (h = 0; h < CPT_ANONVMA_HSIZE; h++)
+ INIT_HLIST_HEAD(&ctx->anonvmas[h]);
+ } else {
+ struct anonvma_map *map;
+ struct hlist_node *elem;
+
+ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
+ hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) {
+ if (map->id == vmai->cpt_anonvmaid) {
+ avma = map->avma;
+ break;
+ }
+ }
+ }
+
+ down_read(&mm->mmap_sem);
+ if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) {
+ up_read(&mm->mmap_sem);
+ return -ESRCH;
+ }
+ if (vma->vm_start != vmai->cpt_start) {
+ up_read(&mm->mmap_sem);
+ eprintk_ctx("vma start mismatch\n");
+ return -EINVAL;
+ }
+ if (vma->vm_pgoff != vmai->cpt_pgoff) {
+ dprintk_ctx("vma pgoff mismatch, fixing\n");
+ if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) {
+ eprintk_ctx("cannot fixup vma pgoff\n");
+ up_read(&mm->mmap_sem);
+ return -EINVAL;
+ }
+ vma->vm_pgoff = vmai->cpt_pgoff;
+ }
+
+ if (!vma->anon_vma) {
+ if (avma) {
+ vma->anon_vma = avma;
+ anon_vma_link(vma);
+ } else {
+ int err;
+
+ err = anon_vma_prepare(vma);
+
+ if (err) {
+ up_read(&mm->mmap_sem);
+ return err;
+ }
+ }
+ } else {
+		/* Note: we _can_ end up in a situation where two different
+		 * anonvmaid's point to the same anon_vma. This happens e.g.
+		 * when mmap() merged a new area into the previous one, so
+		 * they share one anon_vma even though they did not on the
+		 * original host.
+		 *
+		 * THIS IS OK. As far as I understand, we could even merge
+		 * all the anon_vmas, and rmap would simply scan a huge list
+		 * of vmas when searching for a page. It is just suboptimal.
+		 *
+		 * A real problem would arise if the vma already had an
+		 * anon_vma with a different id. That is a very rare case:
+		 * the kernel makes its best effort to merge anon_vmas even
+		 * when some attributes differ. In this case we fall back to
+		 * copying memory.
+		 */
+ if (avma && vma->anon_vma != avma) {
+ up_read(&mm->mmap_sem);
+ wprintk_ctx("anon_vma mismatch\n");
+ return 0;
+ }
+ }
+
+ new_avma = vma->anon_vma;
+ up_read(&mm->mmap_sem);
+
+ if (!avma) {
+ struct anonvma_map *map;
+
+ if (!new_avma)
+ return -EINVAL;
+
+ if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ map->id = vmai->cpt_anonvmaid;
+ map->avma = new_avma;
+ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
+ hlist_add_head(&map->list, &ctx->anonvmas[h]);
+ }
+ return 0;
+}
+
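+/* Fallback used when anon_vma sharing cannot be reproduced: copy the page
+ * contents from the source mm into the current mm page by page through
+ * get_user_pages(), instead of sharing the pages via COW. */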
+static int copy_mm_pages(struct mm_struct *src, unsigned long start,
+ unsigned long end)
+{
+ int err;
+
+ for (; start < end; start += PAGE_SIZE) {
+ struct page *page;
+ struct page *spage;
+ void *maddr, *srcaddr;
+
+ err = get_user_pages(current, current->mm,
+ start, 1, 1, 1, &page, NULL);
+ if (err == 0)
+ err = -EFAULT;
+ if (err < 0)
+ return err;
+
+ err = get_user_pages(current, src,
+ start, 1, 0, 1, &spage, NULL);
+
+ if (err == 0)
+ err = -EFAULT;
+ if (err < 0) {
+ page_cache_release(page);
+ return err;
+ }
+
+ srcaddr = kmap(spage);
+ maddr = kmap(page);
+ memcpy(maddr, srcaddr, PAGE_SIZE);
+ set_page_dirty_lock(page);
+ kunmap(page);
+ kunmap(spage);
+ page_cache_release(page);
+ page_cache_release(spage);
+ }
+ return 0;
+}
+
+#include <linux/proc_fs.h>
+
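+/* Restore a single vma: mmap it at the fixed saved address, split it back if
+ * do_mmap_pgoff() merged it with the previous one, re-create the anon_vma
+ * sharing, and then fill its contents from the per-vma page objects that
+ * follow the CPT_OBJ_VMA record in the image. */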
+static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx)
+{
+ int err = 0;
+ unsigned long addr;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct file *file = NULL;
+ unsigned long prot;
+ int checked = 0;
+
+ if (vmai->cpt_type == CPT_VMA_VDSO) {
+ if (ctx->vdso == NULL) {
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+ err = arch_setup_additional_pages(NULL, 0,
+ vmai->cpt_start);
+#endif
+ goto out;
+ }
+ }
+
+ prot = make_prot(vmai);
+
+ if (vmai->cpt_file != CPT_NULL) {
+ if (vmai->cpt_type == CPT_VMA_TYPE_0) {
+ file = rst_file(vmai->cpt_file, -1, ctx);
+ if (IS_ERR(file)) {
+ eprintk_ctx("do_rst_vma: rst_file: %Ld\n", (unsigned long long)vmai->cpt_file);
+ return PTR_ERR(file);
+ }
+ } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) {
+ file = rst_sysv_shm_vma(vmai, ctx);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ }
+ }
+
+ down_write(&mm->mmap_sem);
+
+ if ((make_flags(vmai) & VM_EXECUTABLE) && mm->exe_file != file)
+ set_mm_exe_file(mm, file);
+
+ addr = do_mmap_pgoff(file, vmai->cpt_start,
+ vmai->cpt_end-vmai->cpt_start,
+ prot, make_flags(vmai),
+ vmai->cpt_pgoff);
+
+ if (addr != vmai->cpt_start) {
+ up_write(&mm->mmap_sem);
+
+ err = -EINVAL;
+ if (IS_ERR((void*)addr))
+ err = addr;
+ goto out;
+ }
+
+ vma = find_vma(mm, vmai->cpt_start);
+ if (vma == NULL) {
+ up_write(&mm->mmap_sem);
+ eprintk_ctx("cannot find mmapped vma\n");
+ err = -ESRCH;
+ goto out;
+ }
+
+	/* do_mmap_pgoff() can merge the new area into the previous one (not
+	 * into the next; we mmap in order, so the rest of the mm is still
+	 * unmapped). This can happen e.g. if flags are to be adjusted later,
+	 * or if two adjacent regions had different anon_vmas. Split it by
+	 * brute force. */
+ if (vma->vm_start != vmai->cpt_start) {
+ dprintk_ctx("vma %Ld merged, split\n", vmapos);
+ err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0);
+ if (err) {
+ up_write(&mm->mmap_sem);
+ eprintk_ctx("cannot split vma\n");
+ goto out;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) {
+ err = verify_create_anonvma(mm, vmai, ctx);
+ if (err) {
+ eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos);
+ goto out;
+ }
+ }
+
+ if (vmai->cpt_type == CPT_VMA_VDSO) {
+ struct page *page;
+ void *maddr;
+
+ err = get_user_pages(current, current->mm,
+ (unsigned long)vmai->cpt_start,
+ 1, 1, 1, &page, NULL);
+ if (err == 0)
+ err = -EFAULT;
+ if (err < 0) {
+ eprintk_ctx("can't get vdso: get_user_pages: %d\n", err);
+ goto out;
+ }
+ err = 0;
+ maddr = kmap(page);
+ memcpy(maddr, ctx->vdso, PAGE_SIZE);
+ set_page_dirty_lock(page);
+ kunmap(page);
+ page_cache_release(page);
+ goto out;
+ }
+
+ if (vmai->cpt_next > vmai->cpt_hdrlen) {
+ loff_t offset = vmapos + vmai->cpt_hdrlen;
+
+ do {
+ union {
+ struct cpt_page_block pb;
+ struct cpt_remappage_block rpb;
+ struct cpt_copypage_block cpb;
+ struct cpt_lazypage_block lpb;
+ struct cpt_iterpage_block ipb;
+ } u;
+ loff_t pos;
+
+ err = rst_get_object(-1, offset, &u, ctx);
+ if (err) {
+ eprintk_ctx("vma fix object: %d\n", err);
+ goto out;
+ }
+ if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) {
+ err = sc_remap_file_pages(u.rpb.cpt_start,
+ u.rpb.cpt_end-u.rpb.cpt_start,
+ 0, u.rpb.cpt_pgoff, 0);
+ if (err < 0) {
+ eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err,
+ (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start),
+ (__u32)u.rpb.cpt_pgoff);
+ goto out;
+ }
+ offset += u.rpb.cpt_next;
+ continue;
+ } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) {
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ unsigned long ptr = u.lpb.cpt_start;
+
+ down_read(&mm->mmap_sem);
+ if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) {
+ up_read(&mm->mmap_sem);
+ eprintk_ctx("lost vm_area_struct\n");
+ err = -ESRCH;
+ goto out;
+ }
+ err = anon_vma_prepare(vma);
+ if (err) {
+ up_read(&mm->mmap_sem);
+ goto out;
+ }
+ while (ptr < u.lpb.cpt_end) {
+ err = rst_pagein(vma, u.lpb.cpt_index + (ptr-u.lpb.cpt_start)/PAGE_SIZE,
+ ptr, ctx);
+ if (err)
+ break;
+ ptr += PAGE_SIZE;
+ }
+ up_read(&mm->mmap_sem);
+#else
+ err = -EINVAL;
+#endif
+ if (err)
+ goto out;
+ offset += u.cpb.cpt_next;
+ continue;
+ } else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) {
+ struct vm_area_struct *vma, *vma1;
+ struct mm_struct *src;
+ struct anon_vma *src_anon;
+ cpt_object_t *mobj;
+
+ if (!vmai->cpt_anonvmaid) {
+ err = -EINVAL;
+ eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n");
+ goto out;
+ }
+
+ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx);
+ if (!mobj) {
+ eprintk_ctx("lost mm_struct to clone pages from\n");
+ err = -ESRCH;
+ goto out;
+ }
+ src = mobj->o_obj;
+
+ down_read(&src->mmap_sem);
+ src_anon = NULL;
+ vma1 = find_vma(src, u.cpb.cpt_start);
+ if (vma1)
+ src_anon = vma1->anon_vma;
+ up_read(&src->mmap_sem);
+
+ if (!vma1) {
+ eprintk_ctx("lost src vm_area_struct\n");
+ err = -ESRCH;
+ goto out;
+ }
+
+ down_read(&mm->mmap_sem);
+ if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) {
+ up_read(&mm->mmap_sem);
+ eprintk_ctx("lost vm_area_struct\n");
+ err = -ESRCH;
+ goto out;
+ }
+
+ if (!src_anon ||
+ !vma->anon_vma ||
+ vma->anon_vma != src_anon ||
+ vma->vm_start - vma1->vm_start !=
+ (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) {
+ up_read(&mm->mmap_sem);
+ wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos);
+ err = copy_mm_pages(mobj->o_obj,
+ u.cpb.cpt_start,
+ u.cpb.cpt_end);
+ } else {
+ err = __copy_page_range(vma, vma1,
+ u.cpb.cpt_start,
+ u.cpb.cpt_end-u.cpb.cpt_start);
+ up_read(&mm->mmap_sem);
+ }
+ if (err) {
+ eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err,
+ (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start),
+ (long)u.cpb.cpt_source);
+ goto out;
+ }
+
+ offset += u.cpb.cpt_next;
+ continue;
+ } else if (u.pb.cpt_object == CPT_OBJ_ITERPAGES ||
+ u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES
+ ) {
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+ unsigned long ptr = u.lpb.cpt_start;
+ u64 page_pos[16];
+ pos = offset + sizeof(u.pb);
+
+ err = ctx->pread(&page_pos,
+ 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE,
+ ctx,
+ pos);
+ if (err) {
+ eprintk_ctx("Oops\n");
+ goto out;
+ }
+
+ down_read(&mm->mmap_sem);
+ if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) {
+ up_read(&mm->mmap_sem);
+ eprintk_ctx("lost vm_area_struct\n");
+ err = -ESRCH;
+ goto out;
+ }
+ err = anon_vma_prepare(vma);
+ if (err) {
+ up_read(&mm->mmap_sem);
+ goto out;
+ }
+ while (ptr < u.lpb.cpt_end) {
+ err = rst_iter(vma,
+ page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE],
+ ptr,
+ ctx);
+ if (err)
+ break;
+ ptr += PAGE_SIZE;
+ }
+ if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) {
+ make_pages_present((unsigned long)u.lpb.cpt_start,
+ (unsigned long)u.lpb.cpt_end);
+ }
+ up_read(&mm->mmap_sem);
+#else
+ err = -EINVAL;
+#endif
+ if (err)
+ goto out;
+ offset += u.cpb.cpt_next;
+ continue;
+ }
+ if (u.pb.cpt_object != CPT_OBJ_PAGES) {
+ eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object);
+ err = -EINVAL;
+ goto out;
+ }
+ pos = offset + sizeof(u.pb);
+ if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) {
+				/* Presumably get_user_pages() has messed
+				 * things up; this happens e.g. when gdb
+				 * inserts breakpoints.
+				 */
+ int i;
+ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) {
+ struct page *page;
+ void *maddr;
+ err = get_user_pages(current, current->mm,
+ (unsigned long)u.pb.cpt_start + i*PAGE_SIZE,
+ 1, 1, 1, &page, NULL);
+ if (err == 0)
+ err = -EFAULT;
+ if (err < 0) {
+ eprintk_ctx("get_user_pages: %d\n", err);
+ goto out;
+ }
+ err = 0;
+ maddr = kmap(page);
+ if (u.pb.cpt_content == CPT_CONTENT_VOID) {
+ memset(maddr, 0, PAGE_SIZE);
+ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
+ err = ctx->pread(maddr, PAGE_SIZE,
+ ctx, pos + i*PAGE_SIZE);
+ if (err) {
+ kunmap(page);
+ goto out;
+ }
+ } else {
+ err = -EINVAL;
+ kunmap(page);
+ goto out;
+ }
+ set_page_dirty_lock(page);
+ kunmap(page);
+ page_cache_release(page);
+ }
+ } else {
+ if (!(prot&PROT_WRITE))
+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
+ if (u.pb.cpt_content == CPT_CONTENT_VOID) {
+ int i;
+ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) {
+ err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i);
+ if (err) {
+ eprintk_ctx("__put_user 2 %d\n", err);
+ goto out;
+ }
+ }
+ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
+ loff_t tpos = pos;
+ err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start),
+ u.pb.cpt_end-u.pb.cpt_start,
+ &tpos);
+ if (err != u.pb.cpt_end-u.pb.cpt_start) {
+ if (err >= 0)
+ err = -EIO;
+ goto out;
+ }
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+ if (!(prot&PROT_WRITE))
+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
+ }
+ err = 0;
+ offset += u.pb.cpt_next;
+ } while (offset < vmapos + vmai->cpt_next);
+ }
+
+check:
+ do {
+ struct vm_area_struct *vma;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+ if (vma) {
+ if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) {
+ VM_ClearReadHint(vma);
+ vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK;
+ }
+ if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) {
+ dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos);
+ up_read(&mm->mmap_sem);
+ if (vma->vm_flags&VM_LOCKED)
+ err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start);
+ else
+ err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start);
+				/* When mlock fails with EFAULT, it means
+				 * that it could not bring in the pages.
+				 * This can happen after mlock() on unreadable
+				 * VMAs. The VMA is still correctly locked,
+				 * so this error can be ignored. */
+ if (err == -EFAULT)
+ err = 0;
+ if (err)
+ goto out;
+ goto check;
+ }
+ if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX)
+ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
+ (unsigned long long)vma->vm_page_prot.pgprot,
+ (unsigned long long)vmai->cpt_pgprot);
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+ if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) &&
+ (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE))
+ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
+ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot);
+#endif
+ if (vma->vm_flags != vmai->cpt_flags) {
+ unsigned long x = vma->vm_flags ^ vmai->cpt_flags;
+ if (x & VM_EXEC) {
+					/* On i386 this is OK: VM_EXEC cannot
+					 * be set via mmap/mprotect, and exec.c
+					 * clears VM_EXEC on the stack. */
+ vma->vm_flags &= ~VM_EXEC;
+ } else if ((x & VM_ACCOUNT) && !checked) {
+ checked = 1;
+ if (!(prot&PROT_WRITE)) {
+ up_read(&mm->mmap_sem);
+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
+ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
+ goto check;
+ }
+ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
+ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
+ } else {
+ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
+ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
+ }
+ }
+ } else {
+ wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos);
+ }
+ up_read(&mm->mmap_sem);
+ } while (0);
+
+out:
+ if (file)
+ fput(file);
+ return err;
+}
+
+#ifndef CONFIG_IA64
+#define TASK_UNMAP_START 0
+#else
+/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping
+ * used to accelerate speculative dereferences of a NULL pointer. */
+#define TASK_UNMAP_START PAGE_SIZE
+#endif
+
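+/* Restore the whole mm: unmap the current address space, restore the mm-wide
+ * fields and beancounter, then replay the CPT_OBJ_VMA, CPT_OBJ_BITS (LDT) and
+ * CPT_OBJ_AIO_CONTEXT objects nested in the CPT_OBJ_MM image. */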
+static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx)
+{
+ int err = 0;
+ unsigned int def_flags;
+ struct mm_struct *mm = current->mm;
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *bc;
+#endif
+
+ down_write(&mm->mmap_sem);
+ do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START);
+
+#ifdef CONFIG_BEANCOUNTERS
+	/*
+	 * The MM beancounter is usually correct from fork time,
+	 * but not for init, for example.
+	 * Luckily, mm_ub can be changed for a completely empty MM.
+	 */
+ bc = rst_lookup_ubc(vmi->cpt_mmub, ctx);
+ err = virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_RSTMM, bc);
+ if (err & NOTIFY_FAIL) {
+ up_write(&mm->mmap_sem);
+ return -ECHRNG;
+ }
+ if ((err & VIRTNOTIFY_CHANGE) && bc != mm->mm_ub) {
+ struct user_beancounter *old_bc;
+
+ old_bc = mm->mm_ub;
+ mm->mm_ub = bc;
+ bc = old_bc;
+ }
+ err = 0;
+ put_beancounter(bc);
+#endif
+
+ mm->start_code = vmi->cpt_start_code;
+ mm->end_code = vmi->cpt_end_code;
+ mm->start_data = vmi->cpt_start_data;
+ mm->end_data = vmi->cpt_end_data;
+ mm->start_brk = vmi->cpt_start_brk;
+ mm->brk = vmi->cpt_brk;
+ mm->start_stack = vmi->cpt_start_stack;
+ mm->arg_start = vmi->cpt_start_arg;
+ mm->arg_end = vmi->cpt_end_arg;
+ mm->env_start = vmi->cpt_start_env;
+ mm->env_end = vmi->cpt_end_env;
+ mm->def_flags = 0;
+ def_flags = vmi->cpt_def_flags;
+
+ mm->flags = vmi->cpt_dumpable;
+ if (ctx->image_version < CPT_VERSION_24)
+ mm->flags |= MMF_DUMP_FILTER_DEFAULT << MMF_DUMPABLE_BITS;
+
+ mm->vps_dumpable = vmi->cpt_vps_dumpable;
+#ifndef CONFIG_IA64
+ if (ctx->image_version >= CPT_VERSION_9) {
+ mm->context.vdso = cpt_ptr_import(vmi->cpt_vdso);
+ current_thread_info()->sysenter_return =
+ VDSO32_SYMBOL(mm->context.vdso, SYSENTER_RETURN);
+ }
+#endif
+
+#if 0 /* def CONFIG_HUGETLB_PAGE*/
+/* NB: ? */
+ int used_hugetlb;
+#endif
+ up_write(&mm->mmap_sem);
+
+ if (vmi->cpt_next > vmi->cpt_hdrlen) {
+ loff_t offset = pos + vmi->cpt_hdrlen;
+ do {
+ union {
+ struct cpt_vma_image vmai;
+ struct cpt_aio_ctx_image aioi;
+ struct cpt_obj_bits bits;
+ } u;
+ err = rst_get_object(-1, offset, &u, ctx);
+ if (err)
+ goto out;
+ if (u.vmai.cpt_object == CPT_OBJ_VMA) {
+#ifdef CONFIG_IA64
+ //// Later...
+ if (u.vmai.cpt_start)
+#endif
+ err = do_rst_vma(&u.vmai, offset, pos, ctx);
+ if (err)
+ goto out;
+#ifdef CONFIG_X86
+ } else if (u.bits.cpt_object == CPT_OBJ_BITS &&
+ u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) {
+ err = do_rst_ldt(&u.bits, offset, ctx);
+ if (err)
+ goto out;
+#endif
+ } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) {
+ err = do_rst_aio(&u.aioi, offset, ctx);
+ if (err)
+ goto out;
+ } else {
+ eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object);
+ err = -EINVAL;
+ goto out;
+ }
+ offset += u.vmai.cpt_next;
+ } while (offset < pos + vmi->cpt_next);
+ }
+
+ down_write(&mm->mmap_sem);
+ mm->def_flags = def_flags;
+ up_write(&mm->mmap_sem);
+
+
+out:
+ return err;
+}
+
+extern void exit_mm(struct task_struct * tsk);
+
+int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ int err = 0;
+ cpt_object_t *mobj;
+ void *tmp = (void*)__get_free_page(GFP_KERNEL);
+ struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp;
+
+ if (!tmp)
+ return -ENOMEM;
+
+ if (ti->cpt_mm == CPT_NULL) {
+ if (current->mm) {
+ virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT,
+ current);
+ exit_mm(current);
+ }
+ goto out;
+ }
+
+ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
+ if (mobj) {
+ if (current->mm != mobj->o_obj) BUG();
+ goto out;
+ }
+
+ if (current->mm == NULL) {
+ struct mm_struct *mm = mm_alloc();
+ if (mm == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ err = init_new_context(current, mm);
+ if (err) {
+ mmdrop(mm);
+ goto out;
+ }
+ current->mm = mm;
+ }
+
+ if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0)
+ goto out;
+ if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) {
+ eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm);
+ goto out;
+ }
+ err = -ENOMEM;
+ mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx);
+ if (mobj != NULL) {
+ err = 0;
+ cpt_obj_setpos(mobj, ti->cpt_mm, ctx);
+ }
+
+out:
+ if (tmp)
+ free_page((unsigned long)tmp);
+ return err;
+}
+
+/* This is the part of mm setup that is done in the parent context. Mostly,
+ * it is the place where we graft the mm of another process onto the child.
+ */
+
+int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ struct task_struct *tsk = obj->o_obj;
+ cpt_object_t *mobj;
+
+ /* Task without mm. Just get rid of this. */
+ if (ti->cpt_mm == CPT_NULL) {
+ if (tsk->mm) {
+ virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT,
+ tsk);
+ mmput(tsk->mm);
+ tsk->mm = NULL;
+ }
+ return 0;
+ }
+
+ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
+ if (mobj) {
+ struct mm_struct *newmm = mobj->o_obj;
+ /* Good, the MM is already created. */
+ if (newmm == tsk->mm) {
+ /* Already done by clone(). */
+ return 0;
+ }
+ mmput(tsk->mm);
+ atomic_inc(&newmm->mm_users);
+ tsk->mm = newmm;
+ tsk->active_mm = newmm;
+ }
+ return 0;
+}
+
+/* We use CLONE_VM when the child's mm is going to be shared with the parent.
+ * Otherwise the mm is copied.
+ */
+
+__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ if (ti->cpt_mm == CPT_NULL ||
+ lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx))
+ return CLONE_VM;
+ return 0;
+}
diff --git a/kernel/cpt/rst_net.c b/kernel/cpt/rst_net.c
new file mode 100644
index 0000000..5da7a8c
--- /dev/null
+++ b/kernel/cpt/rst_net.c
@@ -0,0 +1,745 @@
+/*
+ *
+ * kernel/cpt/rst_net.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/addrconf.h>
+#include <linux/if_tun.h>
+#include <linux/veth.h>
+#include <linux/venet.h>
+#include <linux/fdtable.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/cpt_export.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_kernel.h"
+#include "cpt_net.h"
+#include "cpt_files.h"
+
+#include "cpt_syscalls.h"
+
+extern struct in_ifaddr *inet_alloc_ifa(void);
+extern int inet_insert_ifa(struct in_ifaddr *ifa);
+extern struct in_device *inetdev_init(struct net_device *dev);
+
+int rst_restore_ifaddr(struct cpt_context *ctx)
+{
+ struct net *net = get_exec_env()->ve_netns;
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_ifaddr_image di;
+ struct net_device *dev;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ int cindex = -1;
+ int err;
+ err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx);
+ if (err)
+ return err;
+ cindex = di.cpt_index;
+ rtnl_lock();
+ dev = __dev_get_by_index(net, cindex);
+ if (dev && di.cpt_family == AF_INET) {
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
+ in_dev = inetdev_init(dev);
+ ifa = inet_alloc_ifa();
+ if (ifa) {
+ ifa->ifa_local = di.cpt_address[0];
+ ifa->ifa_address = di.cpt_peer[0];
+ ifa->ifa_broadcast = di.cpt_broadcast[0];
+ ifa->ifa_prefixlen = di.cpt_masklen;
+ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+ ifa->ifa_flags = di.cpt_flags;
+ ifa->ifa_scope = di.cpt_scope;
+ memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ err = inet_insert_ifa(ifa);
+ if (err && err != -EEXIST) {
+ rtnl_unlock();
+ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
+ return err;
+ }
+ }
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ } else if (dev && di.cpt_family == AF_INET6) {
+ __u32 prefered_lft;
+ __u32 valid_lft;
+ struct net *net = get_exec_env()->ve_ns->net_ns;
+ prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ?
+ 0 : di.cpt_prefered_lft;
+ valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ?
+ 0xFFFFFFFF : di.cpt_valid_lft;
+ err = inet6_addr_add(net, dev->ifindex,
+ (struct in6_addr *)di.cpt_address,
+ di.cpt_masklen, 0,
+ prefered_lft,
+ valid_lft);
+ if (err && err != -EEXIST) {
+ rtnl_unlock();
+ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
+ return err;
+ }
+#endif
+ } else {
+ rtnl_unlock();
+ eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index);
+ return -EINVAL;
+ }
+ rtnl_unlock();
+ sec += di.cpt_next;
+ }
+ return 0;
+}
+
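+/* Classify a saved route message: returns 2 if the route must be skipped
+ * (the kernel will re-create it itself), 1 if it is a kernel-generated route
+ * (EEXIST on replay is then not an error), and 0 for an ordinary route. */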
+static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx)
+{
+ int min_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ struct rtmsg *rtm = NLMSG_DATA(nlh);
+ __u32 prefix0 = 0;
+
+ if (nlh->nlmsg_len > min_len) {
+ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+ struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len);
+
+ while (RTA_OK(rta, attrlen)) {
+ if (rta->rta_type == RTA_DST) {
+ prefix0 = *(__u32*)RTA_DATA(rta);
+ }
+ rta = RTA_NEXT(rta, attrlen);
+ }
+ }
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ if (rtm->rtm_family == AF_INET6) {
+ if (rtm->rtm_type == RTN_LOCAL)
+ return 2;
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ return 2;
+ if (rtm->rtm_protocol == RTPROT_UNSPEC ||
+ rtm->rtm_protocol == RTPROT_RA ||
+ rtm->rtm_protocol == RTPROT_REDIRECT ||
+ rtm->rtm_protocol == RTPROT_KERNEL)
+ return 2;
+ if (rtm->rtm_protocol == RTPROT_BOOT &&
+ ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) ||
+ (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000))))
+ return 2;
+ }
+#endif
+ return rtm->rtm_protocol == RTPROT_KERNEL;
+}
+
+int rst_restore_route(struct cpt_context *ctx)
+{
+ int err;
+ struct socket *sock;
+ struct msghdr msg;
+ struct iovec iov;
+ struct sockaddr_nl nladdr;
+ mm_segment_t oldfs;
+ loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_object_hdr v;
+ char *pg;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ if (h.cpt_hdrlen >= h.cpt_next)
+ return 0;
+
+ sec += h.cpt_hdrlen;
+ err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx);
+ if (err < 0)
+ return err;
+
+ err = sock_create(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
+ if (err)
+ return err;
+
+ pg = (char*)__get_free_page(GFP_KERNEL);
+ if (pg == NULL) {
+ err = -ENOMEM;
+ goto out_sock;
+ }
+
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+
+ endsec = sec + v.cpt_next;
+ sec += v.cpt_hdrlen;
+
+ while (sec < endsec) {
+ struct nlmsghdr *n;
+ struct nlmsghdr nh;
+ int kernel_flag;
+
+ if (endsec - sec < sizeof(nh))
+ break;
+
+ err = ctx->pread(&nh, sizeof(nh), ctx, sec);
+ if (err)
+ goto out_sock_pg;
+ if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE ||
+ endsec - sec < nh.nlmsg_len) {
+ err = -EINVAL;
+ goto out_sock_pg;
+ }
+ err = ctx->pread(pg, nh.nlmsg_len, ctx, sec);
+ if (err)
+ goto out_sock_pg;
+
+ n = (struct nlmsghdr*)pg;
+ n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE;
+
+ err = rewrite_rtmsg(n, ctx);
+ if (err < 0)
+ goto out_sock_pg;
+ kernel_flag = err;
+
+ if (kernel_flag == 2)
+ goto do_next;
+
+ iov.iov_base=n;
+ iov.iov_len=nh.nlmsg_len;
+ msg.msg_name=&nladdr;
+ msg.msg_namelen=sizeof(nladdr);
+ msg.msg_iov=&iov;
+ msg.msg_iovlen=1;
+ msg.msg_control=NULL;
+ msg.msg_controllen=0;
+ msg.msg_flags=MSG_DONTWAIT;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = sock_sendmsg(sock, &msg, nh.nlmsg_len);
+ set_fs(oldfs);
+
+ if (err < 0)
+ goto out_sock_pg;
+ err = 0;
+
+ iov.iov_base=pg;
+ iov.iov_len=PAGE_SIZE;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
+ set_fs(oldfs);
+ if (err != -EAGAIN) {
+ if (n->nlmsg_type == NLMSG_ERROR) {
+ struct nlmsgerr *e = NLMSG_DATA(n);
+ if (e->error != -EEXIST || !kernel_flag)
+ eprintk_ctx("NLMERR: %d\n", e->error);
+ } else {
+ eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type);
+ }
+ }
+do_next:
+ err = 0;
+ sec += NLMSG_ALIGN(nh.nlmsg_len);
+ }
+
+out_sock_pg:
+ free_page((unsigned long)pg);
+out_sock:
+ sock_release(sock);
+ return err;
+}
+
+int rst_resume_network(struct cpt_context *ctx)
+{
+ struct ve_struct *env;
+
+ env = get_ve_by_id(ctx->ve_id);
+ if (!env)
+ return -ESRCH;
+ env->disable_net = 0;
+ put_ve(env);
+ return 0;
+}
+
+static int rst_restore_netstats(loff_t pos, struct net_device *dev,
+ struct cpt_context * ctx)
+{
+ struct cpt_netstats_image *n;
+ struct net_device_stats *stats = NULL;
+ int err;
+
+ if (!dev->netdev_ops->ndo_get_stats)
+ return 0;
+
+ n = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_NET_STATS, pos, n, ctx);
+ if (err)
+ goto out;
+ BUG_ON(sizeof(struct cpt_netstats_image) != n->cpt_hdrlen);
+ preempt_disable();
+
+ if (dev->netdev_ops->ndo_cpt == NULL) {
+ err = -ENODEV;
+ eprintk_ctx("Network device %s is not supported\n", dev->name);
+ goto out;
+ }
+
+ stats = dev->netdev_ops->ndo_get_stats(dev);
+
+ stats->rx_packets = n->cpt_rx_packets;
+ stats->tx_packets = n->cpt_tx_packets;
+ stats->rx_bytes = n->cpt_rx_bytes;
+ stats->tx_bytes = n->cpt_tx_bytes;
+ stats->rx_errors = n->cpt_rx_errors;
+ stats->tx_errors = n->cpt_tx_errors;
+ stats->rx_dropped = n->cpt_rx_dropped;
+ stats->tx_dropped = n->cpt_tx_dropped;
+ stats->multicast = n->cpt_multicast;
+ stats->collisions = n->cpt_collisions;
+ stats->rx_length_errors = n->cpt_rx_length_errors;
+ stats->rx_over_errors = n->cpt_rx_over_errors;
+ stats->rx_crc_errors = n->cpt_rx_crc_errors;
+ stats->rx_frame_errors = n->cpt_rx_frame_errors;
+ stats->rx_fifo_errors = n->cpt_rx_fifo_errors;
+ stats->rx_missed_errors = n->cpt_rx_missed_errors;
+ stats->tx_aborted_errors = n->cpt_tx_aborted_errors;
+ stats->tx_carrier_errors = n->cpt_tx_carrier_errors;
+ stats->tx_fifo_errors = n->cpt_tx_fifo_errors;
+ stats->tx_heartbeat_errors = n->cpt_tx_heartbeat_errors;
+ stats->tx_window_errors = n->cpt_tx_window_errors;
+ stats->rx_compressed = n->cpt_rx_compressed;
+ stats->tx_compressed = n->cpt_tx_compressed;
+
+out:
+ preempt_enable();
+ cpt_release_buf(ctx);
+ return err;
+}
+
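+/* Restore network devices: let device-specific netdev_rst handlers re-create
+ * the device if needed, move an existing device to the saved ifindex, and
+ * restore its flags, hardware address and statistics. */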
+int rst_restore_netdev(struct cpt_context *ctx)
+{
+ struct net *net = get_exec_env()->ve_netns;
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_netdev_image di;
+ struct net_device *dev;
+
+ get_exec_env()->disable_net = 1;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ loff_t pos;
+ struct net_device *dev_new;
+ struct netdev_rst *ops;
+
+ err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx);
+ if (err)
+ return err;
+
+ rtnl_lock();
+ pos = sec + di.cpt_hdrlen;
+ if (di.cpt_next > sizeof(di)) {
+ struct cpt_object_hdr hdr;
+ err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr),
+ ctx, sec + di.cpt_hdrlen);
+ if (err)
+ goto out;
+
+ ops = NULL;
+ while (1) {
+ ops = netdev_find_rst(hdr.cpt_object, ops);
+ if (ops == NULL)
+ break;
+
+ err = ops->ndo_rst(sec, &di, &rst_ops, ctx);
+ if (!err) {
+ pos += hdr.cpt_next;
+ break;
+ } else if (err < 0) {
+ eprintk_ctx("netdev %d rst failed %d\n",
+ hdr.cpt_object, err);
+ goto out;
+ }
+ }
+ }
+
+ dev = __dev_get_by_name(net, di.cpt_name);
+ if (dev) {
+ if (dev->ifindex != di.cpt_index) {
+ dev_new = __dev_get_by_index(net, di.cpt_index);
+ if (!dev_new) {
+ write_lock_bh(&dev_base_lock);
+ hlist_del(&dev->index_hlist);
+ if (dev->iflink == dev->ifindex)
+ dev->iflink = di.cpt_index;
+ dev->ifindex = di.cpt_index;
+ hlist_add_head(&dev->index_hlist,
+ dev_index_hash(net, dev->ifindex));
+ write_unlock_bh(&dev_base_lock);
+ } else {
+ write_lock_bh(&dev_base_lock);
+ hlist_del(&dev->index_hlist);
+ hlist_del(&dev_new->index_hlist);
+ if (dev_new->iflink == dev_new->ifindex)
+ dev_new->iflink = dev->ifindex;
+ dev_new->ifindex = dev->ifindex;
+ if (dev->iflink == dev->ifindex)
+ dev->iflink = di.cpt_index;
+ dev->ifindex = di.cpt_index;
+ hlist_add_head(&dev->index_hlist,
+ dev_index_hash(net, dev->ifindex));
+ hlist_add_head(&dev_new->index_hlist,
+ dev_index_hash(net, dev_new->ifindex));
+ write_unlock_bh(&dev_base_lock);
+ }
+ }
+ if (di.cpt_flags^dev->flags) {
+ err = dev_change_flags(dev, di.cpt_flags);
+ if (err)
+ eprintk_ctx("dev_change_flags err: %d\n", err);
+ }
+ while (pos < sec + di.cpt_next) {
+ struct cpt_object_hdr hdr;
+ err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr),
+ ctx, pos);
+ if (err)
+ goto out;
+ if (hdr.cpt_object == CPT_OBJ_NET_HWADDR) {
+ /* Restore hardware address */
+ struct cpt_hwaddr_image hw;
+ err = rst_get_object(CPT_OBJ_NET_HWADDR,
+ pos, &hw, ctx);
+ if (err)
+ goto out;
+ BUILD_BUG_ON(sizeof(hw.cpt_dev_addr) !=
+ MAX_ADDR_LEN);
+ memcpy(dev->dev_addr, hw.cpt_dev_addr,
+ sizeof(hw.cpt_dev_addr));
+ } else if (hdr.cpt_object == CPT_OBJ_NET_STATS) {
+ err = rst_restore_netstats(pos, dev, ctx);
+ if (err) {
+ eprintk_ctx("rst stats %s: %d\n",
+ di.cpt_name, err);
+ goto out;
+ }
+ }
+ pos += hdr.cpt_next;
+ }
+ } else {
+ eprintk_ctx("unknown interface 2 %s\n", di.cpt_name);
+ }
+ rtnl_unlock();
+ sec += di.cpt_next;
+ }
+ return 0;
+out:
+ rtnl_unlock();
+ return err;
+}
+
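+/* Child entry point for iptables restore: redirect stdin to the read end of
+ * the pipe, close all other descriptors and exec iptables-restore, which
+ * reads the rules that rst_restore_iptables() writes into the pipe. */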
+static int dumpfn(void *arg)
+{
+ int i;
+ int *pfd = arg;
+ char *argv[] = { "iptables-restore", "-c", NULL };
+
+ if (pfd[0] != 0)
+ sc_dup2(pfd[0], 0);
+
+ for (i=1; i<current->files->fdt->max_fds; i++)
+ sc_close(i);
+
+ module_put(THIS_MODULE);
+
+ set_fs(KERNEL_DS);
+ i = sc_execve("/sbin/iptables-restore", argv, NULL);
+ if (i == -ENOENT)
+ i = sc_execve("/usr/sbin/iptables-restore", argv, NULL);
+ eprintk("failed to exec iptables-restore: %d\n", i);
+ return 255 << 8;
+}
+
+static int rst_restore_iptables(struct cpt_context * ctx)
+{
+ int err;
+ int pfd[2];
+ struct file *f;
+ struct cpt_object_hdr v;
+ int n;
+ struct cpt_section_hdr h;
+ loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES];
+ loff_t end;
+ int pid;
+ int status;
+ mm_segment_t oldfs;
+ sigset_t ignore, blocked;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ if (h.cpt_hdrlen == h.cpt_next)
+ return 0;
+ if (h.cpt_hdrlen > h.cpt_next)
+ return -EINVAL;
+ sec += h.cpt_hdrlen;
+ err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx);
+ if (err < 0)
+ return err;
+
+ err = sc_pipe(pfd);
+ if (err < 0)
+ return err;
+ ignore.sig[0] = CPT_SIG_IGNORE_MASK;
+ sigprocmask(SIG_BLOCK, &ignore, &blocked);
+ pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0);
+ if (err < 0) {
+ eprintk_ctx("iptables local_kernel_thread: %d\n", err);
+ goto out;
+ }
+ f = fget(pfd[1]);
+ sc_close(pfd[1]);
+ sc_close(pfd[0]);
+
+ ctx->file->f_pos = sec + v.cpt_hdrlen;
+ end = sec + v.cpt_next;
+ do {
+ char *p;
+ char buf[16];
+
+ n = end - ctx->file->f_pos;
+ if (n > sizeof(buf))
+ n = sizeof(buf);
+
+ if (ctx->read(buf, n, ctx))
+ break;
+ if ((p = memchr(buf, 0, n)) != NULL)
+ n = p - buf;
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ f->f_op->write(f, buf, n, &f->f_pos);
+ set_fs(oldfs);
+ } while (ctx->file->f_pos < end);
+
+ fput(f);
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if ((err = sc_waitx(pid, 0, &status)) < 0)
+ eprintk_ctx("wait4: %d\n", err);
+ else if ((status & 0x7f) == 0) {
+ err = (status & 0xff00) >> 8;
+ if (err != 0) {
+ eprintk_ctx("iptables-restore exited with %d\n", err);
+ eprintk_ctx("Most probably some iptables modules are not loaded\n");
+ err = -EINVAL;
+ }
+ } else {
+ eprintk_ctx("iptables-restore terminated\n");
+ err = -EINVAL;
+ }
+ set_fs(oldfs);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+
+ return err;
+
+out:
+ if (pfd[1] >= 0)
+ sc_close(pfd[1]);
+ if (pfd[0] >= 0)
+ sc_close(pfd[0]);
+ sigprocmask(SIG_SETMASK, &blocked, NULL);
+ return err;
+}
+
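+/* Restore one SNMP MIB from a CPT_OBJ_BITS record into CPU 0 of the per-cpu
+ * mib array.  Returns 1 to continue with the next MIB, 0 when the section is
+ * exhausted, and a negative value on error. */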
+static int rst_restore_snmp_stat(struct cpt_context *ctx, void *mib[], int n,
+ loff_t *ppos, loff_t endpos)
+{
+ int err, in, i;
+ struct cpt_object_hdr o;
+ __u32 *stats;
+
+ err = rst_get_object(CPT_OBJ_BITS, *ppos, &o, ctx);
+ if (err)
+ return err;
+
+ in = o.cpt_next - o.cpt_hdrlen;
+ if (in >= PAGE_SIZE - 4) {
+ eprintk_ctx("Too long SNMP buf (%d)\n", in);
+ return -EINVAL;
+ }
+
+ if (o.cpt_content != CPT_CONTENT_DATA) {
+ if (o.cpt_content == CPT_CONTENT_VOID)
+ return 1;
+
+ eprintk_ctx("Corrupted SNMP stats\n");
+ return -EINVAL;
+ }
+
+ stats = cpt_get_buf(ctx);
+ err = ctx->pread(stats, in, ctx, (*ppos) + o.cpt_hdrlen);
+ if (err)
+ goto out;
+
+ in /= sizeof(*stats);
+ if (in > n)
+ wprintk_ctx("SNMP stats trimmed\n");
+ else
+ n = in;
+
+ for (i = 0; i < n; i++)
+ *((unsigned long *)(per_cpu_ptr(mib[0], 0)) + i) = stats[i];
+
+ *ppos += o.cpt_next;
+ if (*ppos < endpos)
+ err = 1; /* go on restoring */
+out:
+ cpt_release_buf(ctx);
+ return err;
+}
+
+static int rst_restore_snmp(struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_SNMP_STATS];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct ve_struct *ve;
+ struct net *net;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_SNMP_STATS || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ ve = get_exec_env();
+ net = ve->ve_netns;
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ if (sec >= endsec)
+ goto out;
+
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.net_statistics,
+ LINUX_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.ip_statistics,
+ IPSTATS_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.tcp_statistics,
+ TCP_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.udp_statistics,
+ UDP_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmp_statistics,
+ ICMP_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmpmsg_statistics,
+ ICMPMSG_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.ipv6_statistics,
+ IPSTATS_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.udp_stats_in6,
+ UDP_MIB_MAX, &sec, endsec);
+ if (err <= 0)
+ goto out;
+ err = rst_restore_snmp_stat(ctx, (void **)&net->mib.icmpv6_statistics,
+ ICMP6_MIB_MAX, &sec, endsec);
+#endif
+ if (err == 1)
+ err = 0;
+out:
+ return err;
+}
+
+int rst_restore_net(struct cpt_context *ctx)
+{
+ int err;
+
+ err = rst_restore_netdev(ctx);
+ if (!err)
+ err = rst_restore_ifaddr(ctx);
+ if (!err)
+ err = rst_restore_route(ctx);
+ if (!err)
+ err = rst_restore_iptables(ctx);
+ if (!err)
+ err = rst_restore_ip_conntrack(ctx);
+ if (!err)
+ err = rst_restore_snmp(ctx);
+ return err;
+}
diff --git a/kernel/cpt/rst_proc.c b/kernel/cpt/rst_proc.c
new file mode 100644
index 0000000..beaaa3f
--- /dev/null
+++ b/kernel/cpt/rst_proc.c
@@ -0,0 +1,582 @@
+/*
+ *
+ * kernel/cpt/rst_proc.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_ioctl.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_dump.h"
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_kernel.h"
+
+MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>");
+MODULE_LICENSE("GPL");
+
+/* List of contexts and lock protecting the list */
+static struct list_head cpt_context_list;
+static spinlock_t cpt_context_lock;
+
+static int proc_read(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ off_t pos = 0;
+ off_t begin = 0;
+ int len = 0;
+ cpt_context_t *ctx;
+
+ len += sprintf(buffer, "Ctx Id VE State\n");
+
+ spin_lock(&cpt_context_lock);
+
+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+ len += sprintf(buffer+len,"%p %08x %-8u %d",
+ ctx,
+ ctx->contextid,
+ ctx->ve_id,
+ ctx->ctx_state
+ );
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ len += pagein_info_printf(buffer+len, ctx);
+#endif
+
+ buffer[len++] = '\n';
+
+ pos = begin+len;
+ if (pos < offset) {
+ len = 0;
+ begin = pos;
+ }
+ if (pos > offset+length)
+ goto done;
+ }
+ *eof = 1;
+
+done:
+ spin_unlock(&cpt_context_lock);
+ *start = buffer + (offset - begin);
+ len -= (offset - begin);
+ if(len > length)
+ len = length;
+ if(len < 0)
+ len = 0;
+ return len;
+}
+
+void rst_context_release(cpt_context_t *ctx)
+{
+ list_del(&ctx->ctx_list);
+ spin_unlock(&cpt_context_lock);
+
+ if (ctx->ctx_state > 0)
+ rst_resume(ctx);
+ ctx->ctx_state = CPT_CTX_ERROR;
+
+ rst_close_dumpfile(ctx);
+
+ if (ctx->anonvmas) {
+ int h;
+ for (h = 0; h < CPT_ANONVMA_HSIZE; h++) {
+ while (!hlist_empty(&ctx->anonvmas[h])) {
+ struct hlist_node *elem = ctx->anonvmas[h].first;
+ hlist_del(elem);
+ kfree(elem);
+ }
+ }
+ free_page((unsigned long)ctx->anonvmas);
+ }
+ cpt_flush_error(ctx);
+ if (ctx->errorfile) {
+ fput(ctx->errorfile);
+ ctx->errorfile = NULL;
+ }
+ if (ctx->error_msg) {
+ free_page((unsigned long)ctx->error_msg);
+ ctx->error_msg = NULL;
+ }
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+ rst_drop_iter_dir(ctx);
+#endif
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ if (ctx->pagein_file_out)
+ fput(ctx->pagein_file_out);
+ if (ctx->pagein_file_in)
+ fput(ctx->pagein_file_in);
+ if (ctx->pgin_task)
+ put_task_struct(ctx->pgin_task);
+#endif
+ if (ctx->filejob_queue)
+ rst_flush_filejobs(ctx);
+ if (ctx->vdso)
+ free_page((unsigned long)ctx->vdso);
+ if (ctx->objcount)
+ eprintk_ctx("%d objects leaked\n", ctx->objcount);
+ kfree(ctx);
+
+ spin_lock(&cpt_context_lock);
+}
+
+static void __cpt_context_put(cpt_context_t *ctx)
+{
+ if (!--ctx->refcount)
+ rst_context_release(ctx);
+}
+
+static void cpt_context_put(cpt_context_t *ctx)
+{
+ spin_lock(&cpt_context_lock);
+ __cpt_context_put(ctx);
+ spin_unlock(&cpt_context_lock);
+}
+
+cpt_context_t * rst_context_open(void)
+{
+ cpt_context_t *ctx;
+
+ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
+ rst_context_init(ctx);
+ spin_lock(&cpt_context_lock);
+ list_add_tail(&ctx->ctx_list, &cpt_context_list);
+ spin_unlock(&cpt_context_lock);
+ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
+ if (ctx->error_msg != NULL)
+ ctx->error_msg[0] = 0;
+ }
+ return ctx;
+}
+
+void rst_report_error(int err, cpt_context_t *ctx)
+{
+ if (ctx->statusfile) {
+ mm_segment_t oldfs;
+ int status = 7 /* VZ_ENVCREATE_ERROR */;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (ctx->statusfile->f_op && ctx->statusfile->f_op->write)
+ ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos);
+ set_fs(oldfs);
+ fput(ctx->statusfile);
+ ctx->statusfile = NULL;
+ }
+}
+
+
+static cpt_context_t * cpt_context_lookup(unsigned int ctxid)
+{
+ cpt_context_t *ctx;
+
+ spin_lock(&cpt_context_lock);
+ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
+ if (ctx->contextid == ctxid) {
+ ctx->refcount++;
+ spin_unlock(&cpt_context_lock);
+ return ctx;
+ }
+ }
+ spin_unlock(&cpt_context_lock);
+ return NULL;
+}
+
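+/* Main restore control: each ioctl on /proc/rst operates on the cpt context
+ * attached to the file (created on first use, or joined/detached explicitly
+ * via CPT_JOIN_CONTEXT/CPT_PUT_CONTEXT).  CPT_SET_* ioctls configure file
+ * descriptors and the VE id; CPT_UNDUMP, CPT_RESUME and CPT_KILL drive the
+ * restore state machine. */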
+static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
+{
+ int err = 0;
+ cpt_context_t *ctx;
+ struct file *dfile = NULL;
+
+ unlock_kernel();
+
+ if (cmd == CPT_TEST_CAPS) {
+ err = test_cpu_caps_and_features();
+ goto out_lock;
+ }
+
+ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
+ cpt_context_t *old_ctx;
+
+ ctx = NULL;
+ if (cmd == CPT_JOIN_CONTEXT) {
+ err = -ENOENT;
+ ctx = cpt_context_lookup(arg);
+ if (!ctx)
+ goto out_lock;
+ }
+
+ spin_lock(&cpt_context_lock);
+ old_ctx = (cpt_context_t*)file->private_data;
+ file->private_data = ctx;
+
+ if (old_ctx) {
+ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
+ old_ctx->sticky = 0;
+ old_ctx->refcount--;
+ }
+ __cpt_context_put(old_ctx);
+ }
+ spin_unlock(&cpt_context_lock);
+ err = 0;
+ goto out_lock;
+ }
+
+ spin_lock(&cpt_context_lock);
+ ctx = (cpt_context_t*)file->private_data;
+ if (ctx)
+ ctx->refcount++;
+ spin_unlock(&cpt_context_lock);
+
+ if (!ctx) {
+ cpt_context_t *old_ctx;
+
+ err = -ENOMEM;
+ ctx = rst_context_open();
+ if (!ctx)
+ goto out_lock;
+
+ spin_lock(&cpt_context_lock);
+ old_ctx = (cpt_context_t*)file->private_data;
+ if (!old_ctx) {
+ ctx->refcount++;
+ file->private_data = ctx;
+ } else {
+ old_ctx->refcount++;
+ }
+ if (old_ctx) {
+ __cpt_context_put(ctx);
+ ctx = old_ctx;
+ }
+ spin_unlock(&cpt_context_lock);
+ }
+
+ if (cmd == CPT_GET_CONTEXT) {
+ unsigned int contextid = (unsigned int)arg;
+
+ err = -EINVAL;
+ if (ctx->contextid && ctx->contextid != contextid)
+ goto out_nosem;
+ if (!ctx->contextid) {
+ cpt_context_t *c1 = cpt_context_lookup(contextid);
+ if (c1) {
+ cpt_context_put(c1);
+ err = -EEXIST;
+ goto out_nosem;
+ }
+ ctx->contextid = contextid;
+ }
+ spin_lock(&cpt_context_lock);
+ if (!ctx->sticky) {
+ ctx->sticky = 1;
+ ctx->refcount++;
+ }
+ spin_unlock(&cpt_context_lock);
+ err = 0;
+ goto out_nosem;
+ }
+
+ down(&ctx->main_sem);
+
+ err = -EBUSY;
+ if (ctx->ctx_state < 0)
+ goto out;
+
+ err = 0;
+ switch (cmd) {
+ case CPT_SET_DUMPFD:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ if (arg >= 0) {
+ err = -EBADF;
+ dfile = fget(arg);
+ if (dfile == NULL)
+ break;
+ if (dfile->f_op == NULL ||
+ dfile->f_op->read == NULL) {
+ fput(dfile);
+ break;
+ }
+ err = 0;
+ }
+ if (ctx->file)
+ fput(ctx->file);
+ ctx->file = dfile;
+ break;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ case CPT_SET_PAGEINFDIN:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->pagein_file_in)
+ fput(ctx->pagein_file_in);
+ ctx->pagein_file_in = dfile;
+ break;
+ case CPT_SET_PAGEINFDOUT:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->pagein_file_out)
+ fput(ctx->pagein_file_out);
+ ctx->pagein_file_out = dfile;
+ break;
+ case CPT_PAGEIND:
+ err = rst_pageind(ctx);
+ break;
+#endif
+#ifdef CONFIG_VZ_CHECKPOINT_ITER
+ case CPT_ITER:
+ err = rst_iteration(ctx);
+ break;
+#endif
+ case CPT_SET_LOCKFD:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->lockfile)
+ fput(ctx->lockfile);
+ ctx->lockfile = dfile;
+ break;
+ case CPT_SET_STATUSFD:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->statusfile)
+ fput(ctx->statusfile);
+ ctx->statusfile = dfile;
+ break;
+ case CPT_SET_ERRORFD:
+ if (arg >= 0) {
+ dfile = fget(arg);
+ if (dfile == NULL) {
+ err = -EBADF;
+ break;
+ }
+ }
+ if (ctx->errorfile)
+ fput(ctx->errorfile);
+ ctx->errorfile = dfile;
+ break;
+ case CPT_HARDLNK_ON:
+ ctx->hardlinked_on = 1;
+ break;
+ case CPT_SET_VEID:
+ if (ctx->ctx_state > 0) {
+ err = -EBUSY;
+ break;
+ }
+ ctx->ve_id = arg;
+ break;
+ case CPT_UNDUMP:
+ if (ctx->ctx_state > 0) {
+ err = -ENOENT;
+ break;
+ }
+ ctx->ctx_state = CPT_CTX_UNDUMPING;
+ err = vps_rst_undump(ctx);
+ if (err) {
+ rst_report_error(err, ctx);
+ if (rst_kill(ctx) == 0)
+ ctx->ctx_state = CPT_CTX_IDLE;
+ } else {
+ ctx->ctx_state = CPT_CTX_UNDUMPED;
+ }
+ break;
+ case CPT_RESUME:
+ if (!ctx->ctx_state) {
+ err = -ENOENT;
+ break;
+ }
+ err = rst_resume(ctx);
+ if (!err)
+ ctx->ctx_state = CPT_CTX_IDLE;
+ break;
+ case CPT_KILL:
+ if (!ctx->ctx_state) {
+ err = -ENOENT;
+ break;
+ }
+ err = rst_kill(ctx);
+ if (!err)
+ ctx->ctx_state = CPT_CTX_IDLE;
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+out:
+ cpt_flush_error(ctx);
+ up(&ctx->main_sem);
+out_nosem:
+ cpt_context_put(ctx);
+out_lock:
+ lock_kernel();
+ if (err == -ERESTARTSYS || err == -ERESTARTNOINTR ||
+ err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK)
+ err = -EINTR;
+ return err;
+}
+
+static int rst_open(struct inode * inode, struct file * file)
+{
+ if (!try_module_get(THIS_MODULE))
+ return -EBUSY;
+
+ return 0;
+}
+
+static int rst_release(struct inode * inode, struct file * file)
+{
+ cpt_context_t *ctx;
+
+ spin_lock(&cpt_context_lock);
+ ctx = (cpt_context_t*)file->private_data;
+ file->private_data = NULL;
+ if (ctx)
+ __cpt_context_put(ctx);
+ spin_unlock(&cpt_context_lock);
+
+
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+static struct file_operations rst_fops =
+{
+ .owner = THIS_MODULE,
+ .ioctl = rst_ioctl,
+ .open = rst_open,
+ .release = rst_release,
+};
+
+
+static struct proc_dir_entry *proc_ent;
+extern void *schedule_tail_p;
+extern void schedule_tail_hook(void);
+
+static struct ctl_table_header *ctl_header;
+
+static ctl_table debug_table[] = {
+ {
+ .procname = "rst",
+ .data = &debug_level,
+ .maxlen = sizeof(debug_level),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+static ctl_table root_table[] = {
+ {
+ .ctl_name = CTL_DEBUG,
+ .procname = "debug",
+ .mode = 0555,
+ .child = debug_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static int __init init_rst(void)
+{
+ int err;
+
+ err = -ENOMEM;
+ ctl_header = register_sysctl_table(root_table);
+ if (!ctl_header)
+ goto err_mon;
+
+ spin_lock_init(&cpt_context_lock);
+ INIT_LIST_HEAD(&cpt_context_list);
+
+ err = -EINVAL;
+ proc_ent = proc_create("rst", 0600, NULL, NULL);
+ if (!proc_ent)
+ goto err_out;
+
+ rst_fops.read = proc_ent->proc_fops->read;
+ rst_fops.write = proc_ent->proc_fops->write;
+ rst_fops.llseek = proc_ent->proc_fops->llseek;
+ proc_ent->proc_fops = &rst_fops;
+
+ proc_ent->read_proc = proc_read;
+ proc_ent->data = NULL;
+ return 0;
+
+err_out:
+ unregister_sysctl_table(ctl_header);
+err_mon:
+ return err;
+}
+module_init(init_rst);
+
+static void __exit exit_rst(void)
+{
+ remove_proc_entry("rst", NULL);
+ unregister_sysctl_table(ctl_header);
+
+ spin_lock(&cpt_context_lock);
+ while (!list_empty(&cpt_context_list)) {
+ cpt_context_t *ctx;
+ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
+
+ if (!ctx->sticky)
+ ctx->refcount++;
+ ctx->sticky = 0;
+
+ BUG_ON(ctx->refcount != 1);
+
+ __cpt_context_put(ctx);
+ }
+ spin_unlock(&cpt_context_lock);
+}
+module_exit(exit_rst);
diff --git a/kernel/cpt/rst_process.c b/kernel/cpt/rst_process.c
new file mode 100644
index 0000000..ffed431
--- /dev/null
+++ b/kernel/cpt/rst_process.c
@@ -0,0 +1,1663 @@
+/*
+ *
+ * kernel/cpt/rst_process.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/posix-timers.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/virtinfo.h>
+#include <linux/virtinfoscp.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/ptrace.h>
+#include <linux/tty.h>
+#include <linux/nsproxy.h>
+#include <linux/securebits.h>
+#ifdef CONFIG_X86
+#include <asm/desc.h>
+#endif
+#include <asm/unistd.h>
+
+#include <bc/beancounter.h>
+#include <bc/misc.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_ubc.h"
+#include "cpt_process.h"
+#include "cpt_kernel.h"
+
+
+#define HOOK_RESERVE 256
+
+struct resume_info
+{
+ asmlinkage void (*hook)(struct resume_info *);
+ unsigned long hooks;
+#define HOOK_TID 0
+#define HOOK_CONT 1
+#define HOOK_LSI 2
+#define HOOK_RESTART 3
+ unsigned long tid_ptrs[2];
+ siginfo_t last_siginfo;
+};
+
+#ifdef CONFIG_X86_32
+
+#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0)
+#define IN_ERROR(regs) ((long)(regs)->ax < 0)
+#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax))
+#define SYSCALL_RETVAL(regs) ((regs)->ax)
+#define SYSCALL_NR(regs) ((regs)->orig_ax)
+
+#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0)
+
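+/*
+ * Re-arm an interrupted syscall: put the syscall number to run back into
+ * ax and step ip back over the two-byte syscall instruction so that it is
+ * re-executed when the task returns to user space.
+ */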
+#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \
+ (regs)->ip -= 2; } while (0)
+
+#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name)
+
+/* In new kernels task_pt_regs() is defined as something inappropriate */
+#undef task_pt_regs
+#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1)
+
+#elif defined(CONFIG_X86_64)
+
+#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0)
+#define IN_ERROR(regs) ((long)(regs)->ax < 0)
+#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax))
+#define SYSCALL_RETVAL(regs) ((regs)->ax)
+#define SYSCALL_NR(regs) ((regs)->orig_ax)
+
+#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0)
+
+#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \
+ (regs)->ip -= 2; } while (0)
+
+#define __NR32_restart_syscall 0
+#define __NR32_rt_sigtimedwait 177
+#define __NR32_pause 29
+#define __NR32_futex 240
+
+#define syscall_is(tsk,regs,name) ((!(task_thread_info(tsk)->flags&_TIF_IA32) && \
+ SYSCALL_NR(regs) == __NR_##name) || \
+ ((task_thread_info(tsk)->flags&_TIF_IA32) && \
+ SYSCALL_NR(regs) == __NR32_##name))
+
+#elif defined (CONFIG_IA64)
+
+#define IN_SYSCALL(regs) ((long)(regs)->cr_ifs >= 0)
+#define IN_ERROR(regs) ((long)(regs)->r10 == -1)
+#define SYSCALL_ERRNO(regs) ((regs)->r10 == -1 ? (long)((regs)->r8) : 0)
+#define SYSCALL_RETVAL(regs) ((regs)->r8)
+#define SYSCALL_NR(regs) ((regs)->cr_ifs >= 0 ? (regs)->r15 : -1)
+
+#define SYSCALL_SETRET(regs,val) do { (regs)->r8 = (val); } while (0)
+
+#define SYSCALL_RESTART2(regs,new) do { (regs)->r15 = (new); \
+ (regs)->r10 = 0; \
+ ia64_decrement_ip(regs); } while (0)
+
+#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name)
+
+#else
+
+#error This arch is not supported
+
+#endif
+
+#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs))
+
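+/*
+ * Translate a virtual (in-container) pid number into the corresponding
+ * global pid number, or -1 if no such pid exists.
+ */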
+pid_t vpid_to_pid(pid_t nr)
+{
+ pid_t vnr;
+ struct pid *pid;
+
+ rcu_read_lock();
+ pid = find_vpid(nr);
+ vnr = (pid == NULL ? -1 : pid->numbers[0].nr);
+ rcu_read_unlock();
+ return vnr;
+}
+
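+/*
+ * Convert a dumped cpt_siginfo_image back into a kernel siginfo_t,
+ * filling the union members that correspond to the saved si_code class.
+ */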
+static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si)
+{
+ memset(info, 0, sizeof(*info));
+ switch(si->cpt_code & __SI_MASK) {
+ case __SI_TIMER:
+ info->si_tid = si->cpt_pid;
+ info->si_overrun = si->cpt_uid;
+ info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval);
+ info->si_sys_private = si->cpt_utime;
+ break;
+ case __SI_POLL:
+ info->si_band = si->cpt_pid;
+ info->si_fd = si->cpt_uid;
+ break;
+ case __SI_FAULT:
+ info->si_addr = cpt_ptr_import(si->cpt_sigval);
+#ifdef __ARCH_SI_TRAPNO
+ info->si_trapno = si->cpt_pid;
+#endif
+ break;
+ case __SI_CHLD:
+ info->si_pid = si->cpt_pid;
+ info->si_uid = si->cpt_uid;
+ info->si_status = si->cpt_sigval;
+ info->si_stime = si->cpt_stime;
+ info->si_utime = si->cpt_utime;
+ break;
+ case __SI_KILL:
+ case __SI_RT:
+ case __SI_MESGQ:
+ default:
+ info->si_pid = si->cpt_pid;
+ info->si_uid = si->cpt_uid;
+ info->si_ptr = cpt_ptr_import(si->cpt_sigval);
+ break;
+ }
+ info->si_signo = si->cpt_signo;
+ info->si_errno = si->cpt_errno;
+ info->si_code = si->cpt_code;
+}
+
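+/*
+ * Rebuild a pending signal queue from the dumped siginfo images located
+ * between start and end, charging every entry to its original user and
+ * to the current beancounter.
+ */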
+static int restore_sigqueue(struct task_struct *tsk,
+ struct sigpending *queue, unsigned long start,
+ unsigned long end)
+{
+ while (start < end) {
+ struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start;
+ if (si->cpt_object == CPT_OBJ_SIGINFO) {
+ struct sigqueue *q = NULL;
+ struct user_struct *up;
+
+ up = alloc_uid(get_exec_env()->user_ns, si->cpt_user);
+ if (!up)
+ return -ENOMEM;
+ q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
+ if (!q) {
+ free_uid(up);
+ return -ENOMEM;
+ }
+ if (ub_siginfo_charge(q, get_exec_ub())) {
+ kmem_cache_free(sigqueue_cachep, q);
+ free_uid(up);
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&q->list);
+			/* Preallocated elements (posix timers) are not
+			 * supported yet. It is safe to replace them with
+			 * private ones. */
+ q->flags = 0;
+ q->user = up;
+ atomic_inc(&q->user->sigpending);
+
+ decode_siginfo(&q->info, si);
+ list_add_tail(&q->list, &queue->list);
+ }
+ start += si->cpt_next;
+ }
+ return 0;
+}
+
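+/*
+ * Walk all restored tasks and re-attach their process group, session and
+ * saved tty_old_pgrp pids so that they match the checkpointed image.
+ */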
+int rst_process_linkage(cpt_context_t *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ struct cpt_task_image *ti = obj->o_image;
+
+ if (tsk == NULL) {
+ eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm);
+ return -EINVAL;
+ }
+
+ if (task_pgrp_vnr(tsk) != ti->cpt_pgrp) {
+ struct pid *pid;
+
+ rcu_read_lock();
+ pid = alloc_vpid_safe(ti->cpt_pgrp);
+ if (!pid) {
+ eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk));
+ return -EINVAL;
+ }
+
+ write_lock_irq(&tasklist_lock);
+ detach_pid(tsk, PIDTYPE_PGID);
+ if (thread_group_leader(tsk))
+ attach_pid(tsk, PIDTYPE_PGID, pid);
+ write_unlock_irq(&tasklist_lock);
+
+ if (task_pgrp_vnr(tsk) != pid_vnr(pid)) {
+ eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk));
+ return -EINVAL;
+ }
+ rcu_read_unlock();
+ }
+ if (task_session_vnr(tsk) != ti->cpt_session) {
+ struct pid *pid;
+
+ rcu_read_lock();
+ pid = alloc_vpid_safe(ti->cpt_session);
+ if (!pid) {
+ eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk));
+ return -EINVAL;
+ }
+
+ write_lock_irq(&tasklist_lock);
+ detach_pid(tsk, PIDTYPE_SID);
+ if (thread_group_leader(tsk))
+ attach_pid(tsk, PIDTYPE_SID, pid);
+ write_unlock_irq(&tasklist_lock);
+
+ if (task_session_vnr(tsk) != pid_vnr(pid)) {
+ eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk));
+ return -EINVAL;
+ }
+ rcu_read_unlock();
+ }
+ if (ti->cpt_old_pgrp > 0 && !tsk->signal->tty_old_pgrp) {
+ struct pid *pid;
+
+ rcu_read_lock();
+ pid = get_pid(find_vpid(ti->cpt_old_pgrp));
+ if (!pid) {
+ eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk));
+ return -EINVAL;
+ }
+ tsk->signal->tty_old_pgrp = pid;
+ rcu_read_unlock();
+ }
+ }
+
+ return 0;
+}
+
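+/*
+ * Allocate a struct pid with the given virtual number in the current pid
+ * namespace; if that number is already taken, fall back to the existing
+ * pid instead of failing.
+ */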
+struct pid *alloc_vpid_safe(pid_t vnr)
+{
+ struct pid *pid;
+
+ pid = alloc_pid(current->nsproxy->pid_ns, vnr);
+ if (!pid)
+ pid = find_vpid(vnr);
+ return pid;
+}
+
+static int
+restore_one_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx)
+{
+ int err;
+ struct cpt_signal_image *si = cpt_get_buf(ctx);
+
+ current->signal->tty = NULL;
+
+ err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+
+#if 0 /* this should have been restored in rst_process_linkage */
+ if (task_pgrp_vnr(current) != si->cpt_pgrp) {
+ struct pid * pid = NULL, *free = NULL;
+
+ rcu_read_lock();
+ if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) {
+#if 0
+ if (!is_virtual_pid(si->cpt_pgrp)) {
+ eprintk_ctx("external process group " CPT_FID, CPT_TID(current));
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+#endif
+ pid = alloc_vpid_safe(si->cpt_pgrp);
+ free = pid;
+ }
+ write_lock_irq(&tasklist_lock);
+ if (pid != NULL) {
+ if (task_pgrp_nr(current) != pid_nr(pid)) {
+ detach_pid(current, PIDTYPE_PGID);
+ if (thread_group_leader(current)) {
+ attach_pid(current, PIDTYPE_PGID, pid);
+ free = NULL;
+ }
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+ if (free != NULL)
+ free_pid(free);
+ rcu_read_unlock();
+ }
+#endif
+
+ current->signal->tty_old_pgrp = NULL;
+ if ((int)si->cpt_old_pgrp > 0) {
+ if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) {
+ current->signal->tty_old_pgrp =
+ alloc_pid(current->nsproxy->pid_ns, 0);
+ if (!current->signal->tty_old_pgrp) {
+ eprintk_ctx("failed to allocate stray tty_old_pgrp\n");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ } else {
+ rcu_read_lock();
+ current->signal->tty_old_pgrp =
+ get_pid(alloc_vpid_safe(si->cpt_old_pgrp));
+ rcu_read_unlock();
+ if (!current->signal->tty_old_pgrp) {
+ dprintk_ctx("forward old tty PGID\n");
+ current->signal->tty_old_pgrp = NULL;
+ }
+ }
+ }
+
+#if 0 /* this should have been restored in rst_process_linkage */
+ if (task_session_vnr(current) != si->cpt_session) {
+ struct pid * pid = NULL, *free = NULL;
+
+ rcu_read_lock();
+ if (si->cpt_session_type == CPT_PGRP_ORPHAN) {
+#if 0
+ if (!is_virtual_pid(si->cpt_session)) {
+ eprintk_ctx("external process session " CPT_FID, CPT_TID(current));
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+#endif
+ pid = alloc_vpid_safe(si->cpt_session);
+ free = pid;
+ }
+ write_lock_irq(&tasklist_lock);
+ if (pid == NULL)
+ pid = find_vpid(si->cpt_session);
+ if (pid != NULL) {
+ if (task_session_nr(current) != pid_nr(pid)) {
+ detach_pid(current, PIDTYPE_SID);
+ set_task_session(current, pid_nr(pid));
+ if (thread_group_leader(current)) {
+ attach_pid(current, PIDTYPE_SID, pid);
+ free = NULL;
+ }
+ }
+ }
+ write_unlock_irq(&tasklist_lock);
+ if (free != NULL)
+ free_pid(free);
+ rcu_read_unlock();
+ }
+#endif
+
+ cpt_sigset_import(&current->signal->shared_pending.signal, si->cpt_sigpending);
+ current->signal->leader = si->cpt_leader;
+ if (si->cpt_ctty != CPT_NULL) {
+ cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx);
+ if (obj) {
+ struct tty_struct *tty = obj->o_obj;
+ if (!tty->session || tty->session ==
+ task_session(current)) {
+ put_pid(tty->session);
+ tty->session = get_pid(task_session(current));
+ tty_kref_put(current->signal->tty);
+ current->signal->tty = tty_kref_get(tty);
+ } else {
+ wprintk_ctx("tty session mismatch\n");
+ }
+ }
+ }
+
+ if (si->cpt_curr_target) {
+ current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target);
+ if (current->signal->curr_target == NULL) {
+ wprintk_ctx("oops, curr_target=NULL, pid=%u\n", si->cpt_curr_target);
+ current->signal->curr_target = current;
+ }
+ }
+ current->signal->flags = 0;
+ *exiting = si->cpt_group_exit;
+ current->signal->group_exit_code = si->cpt_group_exit_code;
+ if (si->cpt_group_exit_task) {
+ current->signal->group_exit_task = find_task_by_vpid(si->cpt_group_exit_task);
+ if (current->signal->group_exit_task == NULL) {
+ eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task);
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ }
+ current->signal->notify_count = si->cpt_notify_count;
+ current->signal->group_stop_count = si->cpt_group_stop_count;
+
+ if (si->cpt_next > si->cpt_hdrlen) {
+ char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL);
+ if (buf == NULL) {
+ cpt_release_buf(ctx);
+ return -ENOMEM;
+ }
+ err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx,
+ ti->cpt_signal + si->cpt_hdrlen);
+ if (err) {
+ kfree(buf);
+ cpt_release_buf(ctx);
+ return err;
+ }
+ restore_sigqueue(current,
+ &current->signal->shared_pending, (unsigned long)buf,
+ (unsigned long)buf + si->cpt_next - si->cpt_hdrlen);
+ kfree(buf);
+ }
+ cpt_release_buf(ctx);
+ return 0;
+}
+
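+/*
+ * Reset every signal action to SIG_DFL, then replay the dumped sighandler
+ * records on top of the defaults.
+ */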
+int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_sighand_image si;
+ int i;
+ loff_t pos, endpos;
+
+ err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx);
+ if (err)
+ return err;
+
+ for (i=0; i<_NSIG; i++) {
+ current->sighand->action[i].sa.sa_handler = SIG_DFL;
+#ifndef CONFIG_IA64
+ current->sighand->action[i].sa.sa_restorer = 0;
+#endif
+ current->sighand->action[i].sa.sa_flags = 0;
+ memset(&current->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t));
+ }
+
+ pos = ti->cpt_sighand + si.cpt_hdrlen;
+ endpos = ti->cpt_sighand + si.cpt_next;
+ while (pos < endpos) {
+ struct cpt_sighandler_image shi;
+
+ err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx);
+ if (err)
+ return err;
+ current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler;
+#ifndef CONFIG_IA64
+ current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer;
+#endif
+ current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags;
+ cpt_sigset_import(&current->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask);
+ pos += shi.cpt_next;
+ }
+
+ return 0;
+}
+
+
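+/*
+ * Work out which clone flags the restored task should use: share the
+ * signal and/or sighand structures if they have already been instantiated
+ * while restoring another task from the same image positions.
+ */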
+__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ __u32 flag = 0;
+
+ if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx))
+ flag |= CLONE_THREAD;
+ if (ti->cpt_sighand == CPT_NULL ||
+ lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx))
+ flag |= CLONE_SIGHAND;
+ return flag;
+}
+
+int
+rst_signal_complete(struct cpt_task_image *ti, int * exiting, cpt_context_t *ctx)
+{
+ int err;
+ cpt_object_t *obj;
+
+ if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) {
+ return -EINVAL;
+ }
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx);
+ if (obj) {
+ struct sighand_struct *sig = current->sighand;
+ if (obj->o_obj != sig) {
+ return -EINVAL;
+ }
+ } else {
+ obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx);
+ if (obj == NULL)
+ return -ENOMEM;
+ cpt_obj_setpos(obj, ti->cpt_sighand, ctx);
+ err = restore_one_sighand_struct(ti, ctx);
+ if (err)
+ return err;
+ }
+
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx);
+ if (obj) {
+ struct signal_struct *sig = current->signal;
+ if (obj->o_obj != sig) {
+ return -EINVAL;
+ }
+/* if (current->signal) {
+ pid_t session;
+
+ session = process_session(current);
+ set_process_vgroup(current, session);
+ set_signal_vsession(current->signal, session);
+ }*/
+ } else {
+ obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx);
+ if (obj == NULL)
+ return -ENOMEM;
+ cpt_obj_setpos(obj, ti->cpt_signal, ctx);
+ err = restore_one_signal_struct(ti, exiting, ctx);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_X86
+static u32 decode_segment(u32 segid)
+{
+ if (segid == CPT_SEG_ZERO)
+ return 0;
+
+ /* TLS descriptors */
+ if (segid <= CPT_SEG_TLS3)
+ return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3;
+
+ /* LDT descriptor, it is just an index to LDT array */
+ if (segid >= CPT_SEG_LDT)
+ return ((segid - CPT_SEG_LDT) << 3) | 7;
+
+ /* Check for one of standard descriptors */
+#ifdef CONFIG_X86_64
+ if (segid == CPT_SEG_USER32_DS)
+ return __USER32_DS;
+ if (segid == CPT_SEG_USER32_CS)
+ return __USER32_CS;
+ if (segid == CPT_SEG_USER64_DS)
+ return __USER_DS;
+ if (segid == CPT_SEG_USER64_CS)
+ return __USER_CS;
+#else
+ if (segid == CPT_SEG_USER32_DS)
+ return __USER_DS;
+ if (segid == CPT_SEG_USER32_CS)
+ return __USER_CS;
+#endif
+ wprintk("Invalid segment reg %d\n", segid);
+ return 0;
+}
+#endif
+
+#if defined (CONFIG_IA64)
+void ia64_decrement_ip (struct pt_regs *regs)
+{
+ unsigned long w0, ri = ia64_psr(regs)->ri - 1;
+
+ if (ia64_psr(regs)->ri == 0) {
+ regs->cr_iip -= 16;
+ ri = 2;
+ get_user(w0, (char __user *) regs->cr_iip + 0);
+ if (((w0 >> 1) & 0xf) == 2) {
+ /*
+ * rfi'ing to slot 2 of an MLX bundle causes
+ * an illegal operation fault. We don't want
+ * that to happen...
+ */
+ ri = 1;
+ }
+ }
+ ia64_psr(regs)->ri = ri;
+}
+#endif
+
+static void rst_child_tid(unsigned long *child_tids)
+{
+ dprintk("rct: " CPT_FID "\n", CPT_TID(current));
+ current->clear_child_tid = (void*)child_tids[0];
+ current->set_child_tid = (void*)child_tids[1];
+}
+
+static void rst_last_siginfo(void)
+{
+ int signr;
+ siginfo_t *info = current->last_siginfo;
+ struct pt_regs *regs = task_pt_regs(current);
+ struct k_sigaction *ka;
+ int ptrace_id;
+
+ dprintk("rlsi: " CPT_FID "\n", CPT_TID(current));
+
+ spin_lock_irq(&current->sighand->siglock);
+ current->last_siginfo = NULL;
+ recalc_sigpending();
+
+ ptrace_id = current->pn_state;
+ clear_pn_state(current);
+
+ switch (ptrace_id) {
+ case PN_STOP_TF:
+ case PN_STOP_TF_RT:
+ /* frame_*signal */
+ dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %u %lu\n",
+ task_pid_vnr(current), current->pid, current->comm,
+ info->si_signo, info->si_code,
+ current->exit_code, SYSCALL_NR(regs),
+ current->ptrace, current->ptrace_message);
+ goto out;
+ case PN_STOP_ENTRY:
+ case PN_STOP_LEAVE:
+ /* do_syscall_trace */
+ spin_unlock_irq(&current->sighand->siglock);
+ dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code);
+ if (current->exit_code) {
+ send_sig(current->exit_code, current, 1);
+ current->exit_code = 0;
+ }
+ if (IN_SYSCALL(regs)) {
+ if (ptrace_id == PN_STOP_ENTRY
+#ifdef CONFIG_X86
+ && SYSCALL_ERRNO(regs) == ENOSYS
+#endif
+ )
+ SYSCALL_RESTART(regs);
+ else if (IN_ERROR(regs) &&
+ syscall_is(current, regs, rt_sigtimedwait) &&
+ (SYSCALL_ERRNO(regs) == EAGAIN ||
+ SYSCALL_ERRNO(regs) == EINTR))
+ SYSCALL_RESTART(regs);
+ }
+ return;
+ case PN_STOP_FORK:
+ /* fork */
+ SYSCALL_SETRET(regs, current->ptrace_message);
+ dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs));
+ goto out;
+ case PN_STOP_VFORK:
+ /* after vfork */
+ SYSCALL_SETRET(regs, current->ptrace_message);
+ dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs));
+ goto out;
+ case PN_STOP_SIGNAL:
+ /* normal case : dequeue signal */
+ break;
+ case PN_STOP_EXIT:
+ dprintk("ptrace exit caught\n");
+ current->ptrace &= ~PT_TRACE_EXIT;
+ spin_unlock_irq(&current->sighand->siglock);
+ module_put(THIS_MODULE);
+ complete_and_exit(NULL, current->ptrace_message);
+ BUG();
+ case PN_STOP_EXEC:
+ eprintk("ptrace after exec caught: must not happen\n");
+ BUG();
+ default:
+ eprintk("ptrace with unknown identity %d\n", ptrace_id);
+ BUG();
+ }
+
+ signr = current->exit_code;
+ if (signr == 0) {
+ dprintk("rlsi: canceled signal %d\n", info->si_signo);
+ goto out;
+ }
+ current->exit_code = 0;
+
+ if (signr != info->si_signo) {
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+ info->si_pid = task_pid_vnr(current->parent);
+ info->si_uid = current->parent->cred->uid;
+ }
+
+ /* If the (new) signal is now blocked, requeue it. */
+ if (sigismember(&current->blocked, signr)) {
+ dprintk("going to requeue signal %d\n", signr);
+ goto out_resend_sig;
+ }
+
+ ka = &current->sighand->action[signr-1];
+ if (ka->sa.sa_handler == SIG_IGN) {
+ dprintk("going to resend signal %d (ignored)\n", signr);
+ goto out;
+ }
+ if (ka->sa.sa_handler != SIG_DFL) {
+ dprintk("going to resend signal %d (not SIG_DFL)\n", signr);
+ goto out_resend_sig;
+ }
+ if (signr == SIGCONT ||
+ signr == SIGCHLD ||
+ signr == SIGWINCH ||
+ signr == SIGURG ||
+ current->pid == 1)
+ goto out;
+
+ /* All the rest, which we cannot handle are requeued. */
+ dprintk("going to resend signal %d (sigh)\n", signr);
+out_resend_sig:
+ spin_unlock_irq(&current->sighand->siglock);
+ send_sig_info(signr, info, current);
+ return;
+
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+static void rst_finish_stop(void)
+{
+ /* ...
+ * do_signal() ->
+ * get_signal_to_deliver() ->
+ * do_signal_stop() ->
+ * finish_stop()
+ *
+	 * Normally after SIGCONT it will dequeue the next signal. If no signal
+	 * is found, do_signal() restarts the syscall unconditionally.
+	 * Otherwise the signal handler is pushed onto the user stack.
+ */
+
+ dprintk("rfs: " CPT_FID "\n", CPT_TID(current));
+
+ clear_stop_state(current);
+ current->exit_code = 0;
+}
+
+static void rst_restart_sys(void)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+
+	/* This hook is supposed to be executed when we have
+ * to complete some interrupted syscall.
+ */
+ dprintk("rrs: " CPT_FID "\n", CPT_TID(current));
+
+ if (!IN_SYSCALL(regs) || !IN_ERROR(regs))
+ return;
+
+#ifdef __NR_pause
+ if (syscall_is(current,regs,pause)) {
+ if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ }
+ } else
+#else
+ /* On this arch pause() is simulated with sigsuspend(). */
+ if (syscall_is(current,regs,rt_sigsuspend)) {
+ if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ }
+ } else
+#endif
+ if (syscall_is(current,regs,rt_sigtimedwait)) {
+ if (SYSCALL_ERRNO(regs) == EAGAIN ||
+ SYSCALL_ERRNO(regs) == EINTR) {
+ SYSCALL_RESTART(regs);
+ }
+ } else if (syscall_is(current,regs,futex)) {
+ if (SYSCALL_ERRNO(regs) == EINTR &&
+ !signal_pending(current)) {
+ SYSCALL_RESTART(regs);
+ }
+ }
+
+ if (!signal_pending(current)) {
+ if (SYSCALL_ERRNO(regs) == ERESTARTSYS ||
+ SYSCALL_ERRNO(regs) == ERESTARTNOINTR ||
+ SYSCALL_ERRNO(regs) == ERESTARTNOHAND) {
+ SYSCALL_RESTART(regs);
+ } else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) {
+ int new = __NR_restart_syscall;
+#ifdef CONFIG_X86_64
+ if (task_thread_info(current)->flags&_TIF_IA32)
+ new = __NR32_restart_syscall;
+#endif
+ SYSCALL_RESTART2(regs, new);
+ }
+ }
+}
+
+#ifdef CONFIG_X86_32
+
+static int restore_registers(struct task_struct *tsk, struct pt_regs *regs,
+ struct cpt_task_image *ti, struct cpt_x86_regs *b,
+ struct resume_info **rip, struct cpt_context *ctx)
+{
+ extern char i386_ret_from_resume;
+
+ if (b->cpt_object != CPT_OBJ_X86_REGS)
+ return -EINVAL;
+
+ tsk->thread.sp = (unsigned long) regs;
+ tsk->thread.sp0 = (unsigned long) (regs+1);
+ tsk->thread.ip = (unsigned long) &i386_ret_from_resume;
+
+ tsk->thread.gs = decode_segment(b->cpt_gs);
+ task_user_gs(tsk) = decode_segment(b->cpt_ugs);
+ tsk->thread.debugreg0 = b->cpt_debugreg[0];
+ tsk->thread.debugreg1 = b->cpt_debugreg[1];
+ tsk->thread.debugreg2 = b->cpt_debugreg[2];
+ tsk->thread.debugreg3 = b->cpt_debugreg[3];
+ tsk->thread.debugreg6 = b->cpt_debugreg[6];
+ tsk->thread.debugreg7 = b->cpt_debugreg[7];
+
+ regs->bx = b->cpt_ebx;
+ regs->cx = b->cpt_ecx;
+ regs->dx = b->cpt_edx;
+ regs->si = b->cpt_esi;
+ regs->di = b->cpt_edi;
+ regs->bp = b->cpt_ebp;
+ regs->ax = b->cpt_eax;
+ regs->ds = b->cpt_xds;
+ regs->es = b->cpt_xes;
+ regs->orig_ax = b->cpt_orig_eax;
+ regs->ip = b->cpt_eip;
+ regs->cs = b->cpt_xcs;
+ regs->flags = b->cpt_eflags;
+ regs->sp = b->cpt_esp;
+ regs->ss = b->cpt_xss;
+
+ regs->cs = decode_segment(b->cpt_xcs);
+ regs->ss = decode_segment(b->cpt_xss);
+ regs->ds = decode_segment(b->cpt_xds);
+ regs->es = decode_segment(b->cpt_xes);
+ regs->fs = decode_segment(b->cpt_fs);
+
+ tsk->thread.sp -= HOOK_RESERVE;
+ memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
+ *rip = (void*)tsk->thread.sp;
+
+ return 0;
+}
+
+#elif defined(CONFIG_X86_64)
+
+static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s)
+{
+ memset(d, 0, sizeof(struct pt_regs));
+ d->bp = s->cpt_ebp;
+ d->bx = s->cpt_ebx;
+ d->ax = (s32)s->cpt_eax;
+ d->cx = s->cpt_ecx;
+ d->dx = s->cpt_edx;
+ d->si = s->cpt_esi;
+ d->di = s->cpt_edi;
+ d->orig_ax = (s32)s->cpt_orig_eax;
+ d->ip = s->cpt_eip;
+ d->cs = s->cpt_xcs;
+ d->flags = s->cpt_eflags;
+ d->sp = s->cpt_esp;
+ d->ss = s->cpt_xss;
+}
+
+static int restore_registers(struct task_struct *tsk, struct pt_regs *regs,
+ struct cpt_task_image *ti, struct cpt_obj_bits *hdr,
+ struct resume_info **rip, struct cpt_context *ctx)
+{
+ if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) {
+ struct cpt_x86_64_regs *b = (void*)hdr;
+
+ tsk->thread.sp = (unsigned long) regs;
+ tsk->thread.sp0 = (unsigned long) (regs+1);
+
+ tsk->thread.fs = b->cpt_fsbase;
+ tsk->thread.gs = b->cpt_gsbase;
+ tsk->thread.fsindex = decode_segment(b->cpt_fsindex);
+ tsk->thread.gsindex = decode_segment(b->cpt_gsindex);
+ tsk->thread.ds = decode_segment(b->cpt_ds);
+ tsk->thread.es = decode_segment(b->cpt_es);
+ tsk->thread.debugreg0 = b->cpt_debugreg[0];
+ tsk->thread.debugreg1 = b->cpt_debugreg[1];
+ tsk->thread.debugreg2 = b->cpt_debugreg[2];
+ tsk->thread.debugreg3 = b->cpt_debugreg[3];
+ tsk->thread.debugreg6 = b->cpt_debugreg[6];
+ tsk->thread.debugreg7 = b->cpt_debugreg[7];
+
+ memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs));
+
+ tsk->thread.usersp = regs->sp;
+ regs->cs = decode_segment(b->cpt_cs);
+ regs->ss = decode_segment(b->cpt_ss);
+ } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) {
+ struct cpt_x86_regs *b = (void*)hdr;
+
+ tsk->thread.sp = (unsigned long) regs;
+ tsk->thread.sp0 = (unsigned long) (regs+1);
+
+ tsk->thread.fs = 0;
+ tsk->thread.gs = 0;
+ tsk->thread.fsindex = decode_segment(b->cpt_fs);
+ tsk->thread.gsindex = decode_segment(b->cpt_gs);
+ tsk->thread.debugreg0 = b->cpt_debugreg[0];
+ tsk->thread.debugreg1 = b->cpt_debugreg[1];
+ tsk->thread.debugreg2 = b->cpt_debugreg[2];
+ tsk->thread.debugreg3 = b->cpt_debugreg[3];
+ tsk->thread.debugreg6 = b->cpt_debugreg[6];
+ tsk->thread.debugreg7 = b->cpt_debugreg[7];
+
+ xlate_ptregs_32_to_64(regs, b);
+
+ tsk->thread.usersp = regs->sp;
+ regs->cs = decode_segment(b->cpt_xcs);
+ regs->ss = decode_segment(b->cpt_xss);
+ tsk->thread.ds = decode_segment(b->cpt_xds);
+ tsk->thread.es = decode_segment(b->cpt_xes);
+ } else {
+ return -EINVAL;
+ }
+
+ tsk->thread.sp -= HOOK_RESERVE;
+ memset((void*)tsk->thread.sp, 0, HOOK_RESERVE);
+ *rip = (void*)tsk->thread.sp;
+ return 0;
+}
+
+#elif defined(CONFIG_IA64)
+
+#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */
+
+#define PUT_BITS(first, last, nat) \
+ ({ \
+ unsigned long bit = ia64_unat_pos(&pt->r##first); \
+ unsigned long nbits = (last - first + 1); \
+ unsigned long mask = MASK(nbits) << first; \
+ long dist; \
+ if (bit < first) \
+ dist = 64 + bit - first; \
+ else \
+ dist = bit - first; \
+ ia64_rotl(nat & mask, dist); \
+ })
+
+unsigned long
+ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat)
+{
+ unsigned long scratch_unat;
+
+ /*
+ * Registers that are stored consecutively in struct pt_regs
+ * can be handled in parallel. If the register order in
+	 * struct pt_regs changes, this code MUST be updated.
+ */
+ scratch_unat = PUT_BITS( 1, 1, nat);
+ scratch_unat |= PUT_BITS( 2, 3, nat);
+ scratch_unat |= PUT_BITS(12, 13, nat);
+ scratch_unat |= PUT_BITS(14, 14, nat);
+ scratch_unat |= PUT_BITS(15, 15, nat);
+ scratch_unat |= PUT_BITS( 8, 11, nat);
+ scratch_unat |= PUT_BITS(16, 31, nat);
+
+ return scratch_unat;
+
+}
+
+static unsigned long
+ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat)
+{
+ unsigned long scratch_unat;
+
+ scratch_unat = PUT_BITS( 4, 7, nat);
+
+ return scratch_unat;
+
+}
+
+#undef PUT_BITS
+
+
+static int restore_registers(struct task_struct *tsk, struct pt_regs *pt,
+ struct cpt_task_image *ti,
+ struct cpt_ia64_regs *r,
+ struct resume_info **rip,
+ struct cpt_context *ctx)
+{
+ extern char ia64_ret_from_resume;
+ struct switch_stack *sw;
+ struct resume_info *ri;
+ struct ia64_psr *psr = ia64_psr(pt);
+ void *krbs = (void *)tsk + IA64_RBS_OFFSET;
+ unsigned long reg;
+
+ if (r->cpt_object != CPT_OBJ_IA64_REGS)
+ return -EINVAL;
+
+ if (r->num_regs > 96) {
+		eprintk(CPT_FID " too many RSE regs %lu\n",
+ CPT_TID(tsk), r->num_regs);
+ return -EINVAL;
+ }
+
+ *rip = ri = ((void*)pt) - HOOK_RESERVE;
+ sw = ((struct switch_stack *) ri) - 1;
+
+ memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack));
+ memset(ri, 0, HOOK_RESERVE);
+
+ /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */
+ memcpy(&pt->r1, &r->gr[1], 8*(2-1));
+ memcpy(&pt->r2, &r->gr[2], 8*(4-2));
+ memcpy(&pt->r8, &r->gr[8], 8*(12-8));
+ memcpy(&pt->r12, &r->gr[12], 8*(14-12));
+ memcpy(&pt->r14, &r->gr[14], 8*(15-14));
+ memcpy(&pt->r15, &r->gr[15], 8*(16-15));
+ memcpy(&pt->r16, &r->gr[16], 8*(32-16));
+
+ pt->b0 = r->br[0];
+ pt->b6 = r->br[6];
+ pt->b7 = r->br[7];
+
+ pt->ar_bspstore = r->ar_bspstore;
+ pt->ar_unat = r->ar_unat;
+ pt->ar_pfs = r->ar_pfs;
+ pt->ar_ccv = r->ar_ccv;
+ pt->ar_fpsr = r->ar_fpsr;
+ pt->ar_csd = r->ar_csd;
+ pt->ar_ssd = r->ar_ssd;
+ pt->ar_rsc = r->ar_rsc;
+
+ pt->cr_iip = r->cr_iip;
+ pt->cr_ipsr = r->cr_ipsr;
+
+ pt->pr = r->pr;
+
+ pt->cr_ifs = r->cfm;
+
+ /* fpregs 6..9,10..11 are in pt_regs */
+ memcpy(&pt->f6, &r->fr[2*6], 16*(10-6));
+ memcpy(&pt->f10, &r->fr[2*10], 16*(12-10));
+ /* fpreg 12..15 are on switch stack */
+ memcpy(&sw->f12, &r->fr[2*12], 16*(16-12));
+ /* fpregs 32...127 */
+ tsk->thread.flags |= IA64_THREAD_FPH_VALID;
+ memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32));
+ ia64_drop_fpu(tsk);
+ psr->dfh = 1;
+
+ memcpy(&sw->r4, &r->gr[4], 8*(8-4));
+ memcpy(&sw->b1, &r->br[1], 8*(6-1));
+ sw->ar_lc = r->ar_lc;
+
+ memcpy(&sw->f2, &r->fr[2*2], 16*(6-2));
+ memcpy(&sw->f16, &r->fr[2*16], 16*(32-16));
+
+ sw->caller_unat = 0;
+ sw->ar_fpsr = pt->ar_fpsr;
+ sw->ar_unat = 0;
+ if (r->nat[0] & 0xFFFFFF0FUL)
+ sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]);
+ if (r->nat[0] & 0xF0)
+ sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]);
+
+ sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs);
+ memset(krbs, 0, (void*)sw->ar_bspstore - krbs);
+ sw->ar_rnat = 0;
+ sw->ar_pfs = 0;
+
+	/* This is tricky. When we are in a syscall, we have a frame of
+	 * output registers (and sometimes one input register as well).
+	 * It is not so easy to restore such a frame: the RSE optimizes
+	 * and does not fetch those registers from the backing store.
+	 * So we restore the whole frame as local registers and then
+	 * repartition it in ia64_ret_from_resume().
+	 */
+ if ((long)pt->cr_ifs >= 0) {
+ unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F);
+ sw->ar_pfs = out | (out<<7);
+ }
+ if (r->ar_ec)
+ sw->ar_pfs |= (r->ar_ec & 0x3F) << 52;
+
+ for (reg = 0; reg < r->num_regs; reg++) {
+ unsigned long *ptr = ia64_rse_skip_regs(krbs, reg);
+ unsigned long *rnatp;
+ unsigned long set_rnat = 0;
+
+ *ptr = r->gr[32+reg];
+
+ if (reg < 32)
+ set_rnat = (r->nat[0] & (1UL<<(reg+32)));
+ else
+ set_rnat = (r->nat[1] & (1UL<<(reg-32)));
+
+ if (set_rnat) {
+ rnatp = ia64_rse_rnat_addr(ptr);
+ if ((unsigned long)rnatp >= sw->ar_bspstore)
+ rnatp = &sw->ar_rnat;
+ *rnatp |= (1UL<<ia64_rse_slot_num(ptr));
+ }
+ }
+
+ sw->b0 = (unsigned long) &ia64_ret_from_resume;
+ tsk->thread.ksp = (unsigned long) sw - 16;
+
+#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */
+#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */
+#define PRED_USER_STACK 3 /* returning to user-stacks? */
+#define PRED_SYSCALL 4 /* inside a system call? */
+#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */
+
+ pt->loadrs = r->loadrs;
+ sw->pr = 0;
+ sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL);
+ sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL));
+ sw->pr &= ~(1UL << PRED_KERNEL_STACK);
+ sw->pr |= (1UL << PRED_USER_STACK);
+ if ((long)pt->cr_ifs < 0) {
+ sw->pr |= (1UL << PRED_NON_SYSCALL);
+ } else {
+ sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL));
+ }
+
+ return 0;
+}
+#endif
+
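+/*
+ * Called from the arch-specific ret_from_resume path on the restored
+ * task's own stack: run the per-task fixups requested via the resume_info
+ * hooks, then drop the module reference taken when the hook was installed.
+ */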
+asmlinkage void rst_resume_work(struct resume_info *ri)
+{
+ if (ri->hooks & (1<<HOOK_TID))
+ rst_child_tid(ri->tid_ptrs);
+ if (ri->hooks & (1<<HOOK_CONT))
+ rst_finish_stop();
+ if (ri->hooks & (1<<HOOK_LSI))
+ rst_last_siginfo();
+ if (ri->hooks & (1<<HOOK_RESTART))
+ rst_restart_sys();
+ module_put(THIS_MODULE);
+}
+
+static void rst_apply_mxcsr_mask(struct task_struct *tsk)
+{
+#ifdef CONFIG_X86_32
+ unsigned int flags;
+
+ flags = test_cpu_caps_and_features();
+
+	/* If the CPU does not support SSE2, mask bit 6 (the DAZ flag) and
+	   bits 16-31 in MXCSR to avoid a general protection fault */
+ if (!(flags & (1 << CPT_CPU_X86_SSE2)))
+ tsk->thread.xstate->fxsave.mxcsr &= 0x0000ffbf;
+#endif
+}
+
+#ifdef CONFIG_X86
+#include <asm/i387.h>
+#endif
+
+#define RLIM_INFINITY32 0xffffffff
+#define RLIM_INFINITY64 (~0ULL)
+
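+/*
+ * When the image was dumped on an architecture with a different word size,
+ * RLIM_INFINITY has a different value and every rlimit must be translated.
+ * A finite 64-bit limit that does not fit into 32 bits means the dump file
+ * is corrupted.
+ */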
+#ifdef CONFIG_X86_64
+#define rst_rlim_32_to_64(a, i, t, im) \
+do { \
+ if (im->cpt_rlim_##a[i] == RLIM_INFINITY32) \
+ t->signal->rlim[i].rlim_##a = RLIM_INFINITY64; \
+ else \
+ t->signal->rlim[i].rlim_##a = im->cpt_rlim_##a[i]; \
+} while (0)
+#elif defined(CONFIG_X86_32)
+#define rst_rlim_64_to_32(a, i, t, im) \
+do { \
+ if (im->cpt_rlim_##a[i] == RLIM_INFINITY64) \
+ t->signal->rlim[i].rlim_##a = RLIM_INFINITY32; \
+ else if (im->cpt_rlim_##a[i] > RLIM_INFINITY32) { \
+ eprintk_ctx("rlimit %Lu is too high for 32-bit task, " \
+ "dump file is corrupted\n", \
+ im->cpt_rlim_##a[i]); \
+ return -EINVAL; \
+ } else \
+ t->signal->rlim[i].rlim_##a = im->cpt_rlim_##a[i]; \
+} while (0)
+#endif
+
+int rst_restore_process(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ struct cpt_task_image *ti = obj->o_image;
+ struct pt_regs * regs;
+ struct cpt_object_hdr *b;
+ struct cpt_siginfo_image *lsi = NULL;
+ struct resume_info *ri = NULL;
+ int i;
+ int err = 0;
+#ifdef CONFIG_BEANCOUNTERS
+ struct task_beancounter *tbc;
+ struct user_beancounter *new_bc, *old_bc;
+#endif
+
+ if (tsk == NULL) {
+ eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm);
+ return -EFAULT;
+ }
+
+ wait_task_inactive(tsk, 0);
+#ifdef CONFIG_BEANCOUNTERS
+ tbc = &tsk->task_bc;
+ new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx);
+ err = virtinfo_notifier_call(VITYPE_SCP,
+ VIRTINFO_SCP_RSTTSK, new_bc);
+ if (err & NOTIFY_FAIL) {
+ put_beancounter(new_bc);
+ return -ECHRNG;
+ }
+ old_bc = tbc->exec_ub;
+ if ((err & VIRTNOTIFY_CHANGE) && old_bc != new_bc) {
+ dprintk(" *** replacing ub %p by %p for %p (%d %s)\n",
+ old_bc, new_bc, tsk,
+ tsk->pid, tsk->comm);
+ tbc->exec_ub = new_bc;
+ new_bc = old_bc;
+ }
+ put_beancounter(new_bc);
+#endif
+ regs = task_pt_regs(tsk);
+
+ if (!tsk->exit_state) {
+ tsk->lock_depth = -1;
+#ifdef CONFIG_PREEMPT
+ task_thread_info(tsk)->preempt_count--;
+#endif
+ }
+
+ if (tsk->static_prio != ti->cpt_static_prio)
+ set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio));
+
+ cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked);
+ cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked);
+ cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked);
+ cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending);
+
+#ifdef CONFIG_IA64
+ SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac);
+ SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu);
+#endif
+ tsk->did_exec = (ti->cpt_did_exec != 0);
+ tsk->utime = ti->cpt_utime;
+ tsk->stime = ti->cpt_stime;
+ if (ctx->image_version == CPT_VERSION_8)
+ tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC);
+ else
+ cpt_timespec_import(&tsk->start_time, ti->cpt_starttime);
+ _set_normalized_timespec(&tsk->start_time,
+ tsk->start_time.tv_sec +
+ VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec,
+ tsk->start_time.tv_nsec +
+ VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec);
+
+ tsk->nvcsw = ti->cpt_nvcsw;
+ tsk->nivcsw = ti->cpt_nivcsw;
+ tsk->min_flt = ti->cpt_min_flt;
+ tsk->maj_flt = ti->cpt_maj_flt;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
+ tsk->cutime = ti->cpt_cutime;
+ tsk->cstime = ti->cpt_cstime;
+ tsk->cnvcsw = ti->cpt_cnvcsw;
+ tsk->cnivcsw = ti->cpt_cnivcsw;
+ tsk->cmin_flt = ti->cpt_cmin_flt;
+ tsk->cmaj_flt = ti->cpt_cmaj_flt;
+
+ BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS);
+
+ for (i=0; i<RLIM_NLIMITS; i++) {
+ tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
+ tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i];
+ }
+#else
+ if (thread_group_leader(tsk) && tsk->signal) {
+ tsk->signal->utime = ti->cpt_utime;
+ tsk->signal->stime = ti->cpt_stime;
+ tsk->signal->cutime = ti->cpt_cutime;
+ tsk->signal->cstime = ti->cpt_cstime;
+ tsk->signal->nvcsw = ti->cpt_nvcsw;
+ tsk->signal->nivcsw = ti->cpt_nivcsw;
+ tsk->signal->cnvcsw = ti->cpt_cnvcsw;
+ tsk->signal->cnivcsw = ti->cpt_cnivcsw;
+ tsk->signal->min_flt = ti->cpt_min_flt;
+ tsk->signal->maj_flt = ti->cpt_maj_flt;
+ tsk->signal->cmin_flt = ti->cpt_cmin_flt;
+ tsk->signal->cmaj_flt = ti->cpt_cmaj_flt;
+
+ for (i=0; i<RLIM_NLIMITS; i++) {
+#ifdef CONFIG_X86_64
+ if (ctx->image_arch == CPT_OS_ARCH_I386) {
+ rst_rlim_32_to_64(cur, i, tsk, ti);
+ rst_rlim_32_to_64(max, i, tsk, ti);
+ } else
+#elif defined(CONFIG_X86_32)
+ if (ctx->image_arch == CPT_OS_ARCH_EMT64) {
+ rst_rlim_64_to_32(cur, i, tsk, ti);
+ rst_rlim_64_to_32(max, i, tsk, ti);
+ } else
+#endif
+ {
+ tsk->signal->rlim[i].rlim_cur =
+ ti->cpt_rlim_cur[i];
+ tsk->signal->rlim[i].rlim_max =
+ ti->cpt_rlim_max[i];
+ }
+ }
+ }
+#endif
+
+#ifdef CONFIG_X86
+ for (i=0; i<3; i++) {
+ if (i >= GDT_ENTRY_TLS_ENTRIES) {
+ eprintk_ctx("too many tls descs\n");
+ } else {
+ tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF;
+ tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32;
+ }
+ }
+#endif
+
+ clear_stopped_child_used_math(tsk);
+
+ b = (void *)(ti+1);
+ while ((void*)b < ((void*)ti) + ti->cpt_next) {
+ /* Siginfo objects are at the end of obj array */
+ if (b->cpt_object == CPT_OBJ_SIGINFO) {
+ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
+ restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next);
+ set_exec_env(env);
+ break;
+ }
+
+ switch (b->cpt_object) {
+#ifdef CONFIG_X86
+ case CPT_OBJ_BITS:
+ if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE &&
+ cpu_has_fxsr) {
+ if (init_fpu(tsk))
+ return -ENOMEM;
+ memcpy(tsk->thread.xstate,
+ (void*)b + b->cpt_hdrlen,
+ sizeof(struct i387_fxsave_struct));
+ rst_apply_mxcsr_mask(tsk);
+ if (ti->cpt_used_math)
+ set_stopped_child_used_math(tsk);
+ }
+#ifndef CONFIG_X86_64
+ else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
+ !cpu_has_fxsr) {
+ if (init_fpu(tsk))
+ return -ENOMEM;
+ memcpy(tsk->thread.xstate,
+ (void*)b + b->cpt_hdrlen,
+ sizeof(struct i387_fsave_struct));
+ if (ti->cpt_used_math)
+ set_stopped_child_used_math(tsk);
+ }
+#endif
+ break;
+#endif
+ case CPT_OBJ_LASTSIGINFO:
+ lsi = (void*)b;
+ break;
+ case CPT_OBJ_X86_REGS:
+ case CPT_OBJ_X86_64_REGS:
+ case CPT_OBJ_IA64_REGS:
+ if (restore_registers(tsk, regs, ti, (void*)b, &ri, ctx)) {
+ eprintk_ctx("cannot restore registers: image is corrupted\n");
+ return -EINVAL;
+ }
+ break;
+ case CPT_OBJ_SIGALTSTACK: {
+ struct cpt_sigaltstack_image *sas;
+ sas = (struct cpt_sigaltstack_image *)b;
+ tsk->sas_ss_sp = sas->cpt_stack;
+ tsk->sas_ss_size = sas->cpt_stacksize;
+ break;
+ }
+ case CPT_OBJ_TASK_AUX: {
+ struct cpt_task_aux_image *ai;
+ ai = (struct cpt_task_aux_image *)b;
+ tsk->robust_list = cpt_ptr_import(ai->cpt_robust_list);
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_COMPAT
+ if (task_thread_info(tsk)->flags&_TIF_IA32) {
+ tsk->robust_list = (void __user *)NULL;
+ tsk->compat_robust_list = cpt_ptr_import(ai->cpt_robust_list);
+ }
+#endif
+#endif
+ break;
+ }
+ }
+ b = ((void*)b) + b->cpt_next;
+ }
+
+ if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+ eprintk_ctx("missing register info\n");
+ return -EINVAL;
+ }
+
+ if (ti->cpt_ppid != ti->cpt_rppid) {
+ struct task_struct *parent;
+ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
+ write_lock_irq(&tasklist_lock);
+ parent = find_task_by_vpid(ti->cpt_ppid);
+ if (parent && parent != tsk->parent) {
+ list_add(&tsk->ptrace_entry, &tsk->parent->ptraced);
+ /*
+			 * Ptraced kids are no longer on the parent's children list:
+ * remove_parent(tsk);
+ * tsk->parent = parent;
+ * add_parent(tsk);
+ */
+ }
+ write_unlock_irq(&tasklist_lock);
+ set_exec_env(env);
+ }
+
+ tsk->ptrace_message = ti->cpt_ptrace_message;
+ tsk->pn_state = ti->cpt_pn_state;
+ tsk->stopped_state = ti->cpt_stopped_state;
+ task_thread_info(tsk)->flags = ti->cpt_thrflags;
+
+		/* The image was created with a kernel < 2.6.16 while the
+		 * task hung in sigsuspend -> do_signal.
+		 *
+		 * FIXME! This needs more thought...
+		 */
+ if (ti->cpt_sigsuspend_state) {
+ set_restore_sigmask();
+ }
+
+#ifdef CONFIG_X86_64
+ task_thread_info(tsk)->flags |= _TIF_FORK | _TIF_RESUME;
+ if (!ti->cpt_64bit)
+ task_thread_info(tsk)->flags |= _TIF_IA32;
+#endif
+
+#ifdef CONFIG_X86_32
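+	/*
+	 * If the task was dumped while blocked in select()/_newselect(), the
+	 * remaining timeout lives in its user memory.  Subtract the wall-clock
+	 * time that passed between dump and undump (ctx->delta_time) from it,
+	 * clamping at zero.
+	 */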
+ do {
+ if (regs->orig_ax == __NR__newselect && regs->di) {
+ struct timeval tv;
+ if (access_process_vm(tsk, regs->di, &tv,
+ sizeof(tv), 0) != sizeof(tv)) {
+ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n",
+ task_pid_vnr(tsk), tsk->pid, tsk->comm,
+ regs->di);
+ break;
+ }
+ dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n",
+ task_pid_vnr(tsk), tsk->pid, tsk->comm,
+ tv.tv_sec, tv.tv_usec);
+ tv.tv_sec -= ctx->delta_time.tv_sec;
+ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
+ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
+ tv.tv_sec--;
+ } else {
+ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
+ }
+ if (tv.tv_sec < 0) {
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ }
+ dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n",
+ task_pid_vnr(tsk), tsk->pid, tsk->comm,
+ tv.tv_sec, tv.tv_usec);
+ if (access_process_vm(tsk, regs->di, &tv,
+ sizeof(tv), 1) != sizeof(tv)) {
+ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n",
+ task_pid_vnr(tsk), tsk->pid, tsk->comm, regs->di);
+ }
+
+ } else if (regs->orig_ax == __NR_select && regs->di) {
+ struct {
+ unsigned long n;
+ fd_set __user *inp, *outp, *exp;
+ struct timeval __user *tvp;
+ } a;
+ struct timeval tv;
+ if (access_process_vm(tsk, regs->bx, &a,
+ sizeof(a), 0) != sizeof(a)) {
+ wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid);
+ break;
+ }
+ if (access_process_vm(tsk, (unsigned long)a.tvp,
+ &tv, sizeof(tv), 0) != sizeof(tv)) {
+ wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid);
+ break;
+ }
+ dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n",
+ tsk->pid, tv.tv_sec, tv.tv_usec);
+ tv.tv_sec -= ctx->delta_time.tv_sec;
+ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
+ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
+ tv.tv_sec--;
+ } else {
+ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
+ }
+ if (tv.tv_sec < 0) {
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ }
+ dprintk_ctx("task %d: New timeval in select: %ld.%ld\n",
+ tsk->pid, tv.tv_sec, tv.tv_usec);
+ if (access_process_vm(tsk, (unsigned long)a.tvp,
+ &tv, sizeof(tv), 1) != sizeof(tv)) {
+ wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid);
+ }
+ }
+ } while (0);
+#endif
+
+ if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) {
+ switch (SYSCALL_ERRNO(regs)) {
+ case ERESTARTSYS:
+ case ERESTARTNOINTR:
+ case ERESTARTNOHAND:
+ case ERESTART_RESTARTBLOCK:
+ case EAGAIN:
+ case EINTR:
+ ri->hooks |= (1<<HOOK_RESTART);
+ }
+ }
+
+ if (ri && (lsi || tsk->pn_state)) {
+ /* ... -> ptrace_notify()
+ * or
+ * ... -> do_signal() -> get_signal_to_deliver() ->
+ * ptrace stop
+ */
+ tsk->last_siginfo = &ri->last_siginfo;
+ ri->hooks |= (1<<HOOK_LSI);
+ if (lsi)
+ decode_siginfo(tsk->last_siginfo, lsi);
+ }
+
+ tsk->ptrace = ti->cpt_ptrace;
+ tsk->flags = (tsk->flags & PF_USED_MATH) |
+ (ti->cpt_flags & CPT_TASK_FLAGS_MASK);
+ clear_tsk_thread_flag(tsk, TIF_FREEZE);
+ tsk->exit_signal = ti->cpt_exit_signal;
+
+ if (ri && tsk->stopped_state) {
+ dprintk_ctx("finish_stop\n");
+ if (ti->cpt_state != TASK_STOPPED)
+ eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state);
+ ri->hooks |= (1<<HOOK_CONT);
+ }
+
+ if (ri && (ti->cpt_set_tid || ti->cpt_clear_tid)) {
+ ri->hooks |= (1<<HOOK_TID);
+ ri->tid_ptrs[0] = ti->cpt_clear_tid;
+ ri->tid_ptrs[1] = ti->cpt_set_tid;
+ dprintk_ctx("settids\n");
+ }
+
+ if (ri && ri->hooks &&
+ !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+ if (try_module_get(THIS_MODULE))
+ ri->hook = rst_resume_work;
+ }
+
+ if (ti->cpt_state == TASK_TRACED)
+ tsk->state = TASK_TRACED;
+ else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) {
+ tsk->signal->it[CPUCLOCK_VIRT].expires = 0;
+ tsk->signal->it[CPUCLOCK_PROF].expires = 0;
+ if (tsk->state != TASK_DEAD)
+ eprintk_ctx("oops, schedule() did not make us dead\n");
+ }
+
+ if (thread_group_leader(tsk) &&
+ ti->cpt_it_real_value &&
+ !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+ ktime_t val;
+ s64 nsec;
+
+ nsec = ti->cpt_it_real_value;
+ val.tv64 = 0;
+
+ if (ctx->image_version < CPT_VERSION_9)
+ nsec *= TICK_NSEC;
+
+ val = ktime_add_ns(val, nsec - ctx->delta_nsec);
+ if (val.tv64 <= 0)
+ val.tv64 = NSEC_PER_USEC;
+ dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk),
+ (long long)val.tv64,
+ (unsigned long long)ti->cpt_it_real_value);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) {
+ /* FIXME. Check!!!! */
+ hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_MODE_REL);
+ } else {
+ wprintk_ctx("Timer clash. Impossible?\n");
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk),
+ (unsigned long long)val.tv64);
+ }
+
+ module_put(THIS_MODULE);
+ }
+ return 0;
+}
diff --git a/kernel/cpt/rst_socket.c b/kernel/cpt/rst_socket.c
new file mode 100644
index 0000000..78cc4ff
--- /dev/null
+++ b/kernel/cpt/rst_socket.c
@@ -0,0 +1,993 @@
+/*
+ *
+ * kernel/cpt/rst_socket.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/socket.h>
+#include <linux/un.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/scm.h>
+#include <net/af_unix.h>
+
+#include <bc/kmem.h>
+#include <bc/sock_orphan.h>
+#include <bc/net.h>
+#include <bc/tcp.h>
+
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_files.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+
+#include "cpt_syscalls.h"
+
+
+static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si,
+ loff_t pos, struct cpt_context *ctx)
+{
+ struct timeval tmptv;
+
+ if (sk->sk_socket) {
+ sk->sk_socket->flags = si->cpt_ssflags;
+ sk->sk_socket->state = si->cpt_sstate;
+ }
+ sk->sk_reuse = si->cpt_reuse;
+ sk->sk_shutdown = si->cpt_shutdown;
+ sk->sk_userlocks = si->cpt_userlocks;
+ sk->sk_no_check = si->cpt_no_check;
+ sock_reset_flag(sk, SOCK_DBG);
+ if (si->cpt_debug)
+ sock_set_flag(sk, SOCK_DBG);
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+ if (si->cpt_rcvtstamp)
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ sock_reset_flag(sk, SOCK_LOCALROUTE);
+ if (si->cpt_localroute)
+ sock_set_flag(sk, SOCK_LOCALROUTE);
+ sk->sk_protocol = si->cpt_protocol;
+ sk->sk_err = si->cpt_err;
+ sk->sk_err_soft = si->cpt_err_soft;
+ sk->sk_priority = si->cpt_priority;
+ sk->sk_rcvlowat = si->cpt_rcvlowat;
+ sk->sk_rcvtimeo = si->cpt_rcvtimeo;
+ if (si->cpt_rcvtimeo == CPT_NULL)
+ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ sk->sk_sndtimeo = si->cpt_sndtimeo;
+ if (si->cpt_sndtimeo == CPT_NULL)
+ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ sk->sk_rcvbuf = si->cpt_rcvbuf;
+ sk->sk_sndbuf = si->cpt_sndbuf;
+ sk->sk_bound_dev_if = si->cpt_bound_dev_if;
+ sk->sk_flags = si->cpt_flags;
+ sk->sk_lingertime = si->cpt_lingertime;
+ if (si->cpt_lingertime == CPT_NULL)
+ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ sk->sk_peercred.pid = si->cpt_peer_pid;
+ sk->sk_peercred.uid = si->cpt_peer_uid;
+ sk->sk_peercred.gid = si->cpt_peer_gid;
+ cpt_timeval_import(&tmptv, si->cpt_stamp);
+ sk->sk_stamp = timeval_to_ktime(tmptv);
+ return 0;
+}
+
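+/*
+ * Obtain a struct file for the socket: map it to a temporary fd, take an
+ * extra reference on the file and close the fd again, so the caller gets
+ * a file that is not installed in any descriptor table.
+ */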
+static struct file *sock_mapfile(struct socket *sock)
+{
+ int fd = sock_map_fd(sock, 0);
+
+ if (fd >= 0) {
+ struct file *file = sock->file;
+ get_file(file);
+ sc_close(fd);
+ return file;
+ }
+ return ERR_PTR(fd);
+}
+
+/* The assumption is that /tmp exists and is writable.
+ * In previous versions we assumed that listen() would autobind
+ * the socket. It does not do this for AF_UNIX for an obvious reason:
+ * a socket in the abstract namespace is accessible, unlike a socket
+ * bound to a deleted filesystem object.
+ */
+
+static int
+select_deleted_name(char * name, cpt_context_t *ctx)
+{
+ int i;
+
+ for (i=0; i<100; i++) {
+ struct nameidata nd;
+ unsigned int rnd = net_random();
+
+ sprintf(name, "/tmp/SOCK.%08x", rnd);
+
+ if (path_lookup(name, 0, &nd) != 0)
+ return 0;
+
+ path_put(&nd.path);
+ }
+
+ eprintk_ctx("failed to allocate deleted socket inode\n");
+ return -ELOOP;
+}
+
+static int
+bind_unix_socket(struct socket *sock, struct cpt_sock_image *si,
+ cpt_context_t *ctx)
+{
+ int err;
+ char *name;
+ struct sockaddr* addr;
+ int addrlen;
+ struct sockaddr_un sun;
+ struct nameidata nd;
+
+ if ((addrlen = si->cpt_laddrlen) <= 2)
+ return 0;
+
+ nd.path.dentry = NULL;
+ name = ((char*)si->cpt_laddr) + 2;
+ addr = (struct sockaddr *)si->cpt_laddr;
+
+ if (name[0]) {
+ if (path_lookup(name, 0, &nd))
+ nd.path.dentry = NULL;
+
+ if (si->cpt_deleted) {
+ if (nd.path.dentry == NULL &&
+ sock->ops->bind(sock, addr, addrlen) == 0) {
+ sc_unlink(name);
+ return 0;
+ }
+
+ addr = (struct sockaddr*)&sun;
+ addr->sa_family = AF_UNIX;
+ name = ((char*)addr) + 2;
+ err = select_deleted_name(name, ctx);
+ if (err)
+ goto out;
+ addrlen = 2 + strlen(name);
+ } else if (nd.path.dentry) {
+ if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) {
+ eprintk_ctx("bind_unix_socket: not a socket dentry\n");
+ err = -EINVAL;
+ goto out;
+ }
+ sc_unlink(name);
+ }
+ }
+
+ err = sock->ops->bind(sock, addr, addrlen);
+
+ if (!err && name[0]) {
+ if (nd.path.dentry) {
+ sc_chown(name, nd.path.dentry->d_inode->i_uid,
+ nd.path.dentry->d_inode->i_gid);
+ sc_chmod(name, nd.path.dentry->d_inode->i_mode);
+ }
+ if (si->cpt_deleted)
+ sc_unlink(name);
+ }
+
+out:
+ if (nd.path.dentry)
+ path_put(&nd.path);
+ return err;
+}
+
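+/*
+ * For a non-listening AF_UNIX socket, either re-bind it to its on-disk name
+ * (when no parent is recorded in the image) or make it share the unix
+ * address of the already-restored parent socket.
+ */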
+static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si,
+ struct cpt_context *ctx)
+{
+ struct sock *sk = sock->sk;
+ cpt_object_t *obj;
+ struct sock *parent;
+
+ if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN)
+ return 0;
+
+ if (si->cpt_parent == -1)
+ return bind_unix_socket(sock, si, ctx);
+
+ obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
+ if (!obj)
+ return 0;
+
+ parent = obj->o_obj;
+ if (unix_sk(parent)->addr) {
+ if (unix_sk(sk)->addr &&
+ atomic_dec_and_test(&unix_sk(sk)->addr->refcnt))
+ kfree(unix_sk(sk)->addr);
+ atomic_inc(&unix_sk(parent)->addr->refcnt);
+ unix_sk(sk)->addr = unix_sk(parent)->addr;
+ }
+ return 0;
+}
+
+static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si,
+ loff_t pos, struct cpt_context *ctx)
+{
+ loff_t endpos;
+
+ pos = pos + si->cpt_hdrlen;
+ endpos = pos + si->cpt_next;
+ while (pos < endpos) {
+ struct sk_buff *skb;
+ __u32 type;
+
+ skb = rst_skb(sk, &pos, NULL, &type, ctx);
+ if (IS_ERR(skb)) {
+ if (PTR_ERR(skb) == -EINVAL) {
+ int err;
+
+ err = rst_sock_attr(&pos, sk, ctx);
+ if (err)
+ return err;
+ }
+ return PTR_ERR(skb);
+ }
+
+ if (type == CPT_SKB_RQ) {
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ } else {
+ wprintk_ctx("strange socket queue type %u\n", type);
+ kfree_skb(skb);
+ }
+ }
+ return 0;
+}
+
+static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si,
+ struct cpt_context *ctx)
+{
+ int err;
+ struct socket *sock;
+ struct socket *sock2 = NULL;
+ struct file *file;
+ cpt_object_t *fobj;
+ cpt_object_t *pobj = NULL;
+
+ err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol,
+ &sock);
+ if (err)
+ return err;
+
+ if (si->cpt_socketpair) {
+ err = sock_create(si->cpt_family, si->cpt_type,
+ si->cpt_protocol, &sock2);
+ if (err)
+ goto err_out;
+
+ err = sock->ops->socketpair(sock, sock2);
+ if (err < 0)
+ goto err_out;
+
+		/* Socketpair with a peer outside our environment.
+		 * So we create a real half-open pipe and do not worry
+		 * about the dead end anymore. */
+ if (si->cpt_peer == -1) {
+ sock_release(sock2);
+ sock2 = NULL;
+ }
+ }
+
+ cpt_obj_setobj(obj, sock->sk, ctx);
+
+ if (si->cpt_file != CPT_NULL) {
+ file = sock_mapfile(sock);
+ err = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto err_out;
+
+ err = -ENOMEM;
+
+ obj->o_parent = file;
+
+ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
+ goto err_out;
+ cpt_obj_setpos(fobj, si->cpt_file, ctx);
+ cpt_obj_setindex(fobj, si->cpt_index, ctx);
+ }
+
+ if (sock2) {
+ struct file *file2;
+
+ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx);
+ if (!pobj) BUG();
+ if (pobj->o_obj) BUG();
+ cpt_obj_setobj(pobj, sock2->sk, ctx);
+
+ if (pobj->o_ppos != CPT_NULL) {
+ file2 = sock_mapfile(sock2);
+ err = PTR_ERR(file2);
+ if (IS_ERR(file2))
+ goto err_out;
+
+ err = -ENOMEM;
+ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL)
+ goto err_out;
+ cpt_obj_setpos(fobj, pobj->o_ppos, ctx);
+ cpt_obj_setindex(fobj, si->cpt_peer, ctx);
+
+ pobj->o_parent = file2;
+ }
+ }
+
+ setup_sock_common(sock->sk, si, obj->o_pos, ctx);
+ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) {
+ int saved_reuse = sock->sk->sk_reuse;
+
+ inet_sk(sock->sk)->freebind = 1;
+ sock->sk->sk_reuse = 2;
+ if (si->cpt_laddrlen) {
+ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
+ if (err) {
+ dprintk_ctx("binding failed: %d, do not worry\n", err);
+ }
+ }
+ sock->sk->sk_reuse = saved_reuse;
+ rst_socket_in(si, obj->o_pos, sock->sk, ctx);
+ } else if (sock->sk->sk_family == AF_NETLINK) {
+ struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr;
+ if (nl->nl_pid) {
+ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
+ if (err) {
+ eprintk_ctx("AF_NETLINK binding failed: %d\n", err);
+ }
+ }
+ if (si->cpt_raddrlen && nl->nl_pid) {
+ err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK);
+ if (err) {
+ eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err);
+ }
+ }
+ generic_restore_queues(sock->sk, si, obj->o_pos, ctx);
+ } else if (sock->sk->sk_family == PF_PACKET) {
+ struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr;
+ if (ll->sll_protocol || ll->sll_ifindex) {
+ int alen = si->cpt_laddrlen;
+ if (alen < sizeof(struct sockaddr_ll))
+ alen = sizeof(struct sockaddr_ll);
+ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen);
+ if (err) {
+ eprintk_ctx("AF_PACKET binding failed: %d\n", err);
+ }
+ }
+ generic_restore_queues(sock->sk, si, obj->o_pos, ctx);
+ }
+ fixup_unix_address(sock, si, ctx);
+
+ if (sock2) {
+ err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx);
+ if (err)
+ return err;
+ setup_sock_common(sock2->sk, si, pobj->o_pos, ctx);
+ fixup_unix_address(sock2, si, ctx);
+ }
+
+ if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
+ && (int)si->cpt_parent != -1) {
+ cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
+ if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0)
+ sock->sk = NULL;
+ }
+
+
+ if (si->cpt_file == CPT_NULL && sock->sk &&
+ sock->sk->sk_family == AF_INET) {
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ sock->sk = NULL;
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ eprintk_ctx("oops, sock is locked by user\n");
+
+ sock_hold(sk);
+ sock_orphan(sk);
+ ub_inc_orphan_count(sk);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ sock_put(sk);
+ dprintk_ctx("orphaning socket %p\n", sk);
+ }
+ }
+
+ if (si->cpt_file == CPT_NULL && sock->sk == NULL)
+ sock_release(sock);
+
+ return 0;
+
+err_out:
+ if (sock2)
+ sock_release(sock2);
+ sock_release(sock);
+ return err;
+}
+
+static int open_listening_socket(loff_t pos, struct cpt_sock_image *si,
+ struct cpt_context *ctx)
+{
+ int err;
+ struct socket *sock;
+ struct file *file;
+ cpt_object_t *obj, *fobj;
+
+ err = sock_create(si->cpt_family, si->cpt_type, si->cpt_protocol,
+ &sock);
+ if (err) {
+ eprintk_ctx("open_listening_socket: sock_create: %d\n", err);
+ return err;
+ }
+
+ sock->sk->sk_reuse = 2;
+ sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if;
+
+ if (sock->sk->sk_family == AF_UNIX) {
+ err = bind_unix_socket(sock, si, ctx);
+ } else if (si->cpt_laddrlen) {
+ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
+ inet_sk(sock->sk)->freebind = 1;
+
+ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
+
+ if (err) {
+ eprintk_ctx("open_listening_socket: bind: %d\n", err);
+ goto err_out;
+ }
+ }
+
+ err = sock->ops->listen(sock, si->cpt_max_ack_backlog);
+ if (err) {
+ eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted);
+ goto err_out;
+ }
+
+	/* Now we may access the socket body directly and fix everything up. */
+
+ file = sock_mapfile(sock);
+ err = PTR_ERR(file);
+ if (IS_ERR(file)) {
+ eprintk_ctx("open_listening_socket: map: %d\n", err);
+ goto err_out;
+ }
+
+ err = -ENOMEM;
+ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
+ goto err_out;
+ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL)
+ goto err_out;
+ cpt_obj_setpos(obj, pos, ctx);
+ cpt_obj_setindex(obj, si->cpt_index, ctx);
+ obj->o_parent = file;
+ cpt_obj_setpos(fobj, si->cpt_file, ctx);
+ cpt_obj_setindex(fobj, si->cpt_index, ctx);
+
+ setup_sock_common(sock->sk, si, pos, ctx);
+
+ if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) {
+ rst_listen_socket_in(sock->sk, si, pos, ctx);
+ rst_restore_synwait_queue(sock->sk, si, pos, ctx);
+ }
+
+ return 0;
+
+err_out:
+ sock_release(sock);
+ return err;
+}
+
+static int
+rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
+{
+ int err;
+ loff_t pos = *pos_p;
+ struct cpt_sockmc_image v;
+
+ err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx);
+ if (err)
+ return err;
+
+ *pos_p += v.cpt_next;
+
+ if (v.cpt_family == AF_INET)
+ return rst_sk_mcfilter_in(sk, &v, pos, ctx);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ else if (v.cpt_family == AF_INET6)
+ return rst_sk_mcfilter_in6(sk, &v, pos, ctx);
+#endif
+ else
+ return -EAFNOSUPPORT;
+}
+
+
+static int
+rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
+{
+ int err;
+ struct sk_filter *fp, *old_fp;
+ loff_t pos = *pos_p;
+ struct cpt_obj_bits v;
+
+ err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx);
+ if (err)
+ return err;
+
+ *pos_p += v.cpt_next;
+
+ if (v.cpt_size % sizeof(struct sock_filter))
+ return -EINVAL;
+
+ fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC);
+ if (fp == NULL)
+ return -ENOMEM;
+ atomic_set(&fp->refcnt, 1);
+ fp->len = v.cpt_size/sizeof(struct sock_filter);
+
+ err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen);
+ if (err) {
+ sk_filter_uncharge(sk, fp);
+ return err;
+ }
+
+ old_fp = sk->sk_filter;
+ sk->sk_filter = fp;
+ if (old_fp)
+ sk_filter_uncharge(sk, old_fp);
+ return 0;
+}
+
+
+int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
+{
+ int err;
+ loff_t pos = *pos_p;
+
+ err = rst_sock_attr_skfilter(pos_p, sk, ctx);
+ if (err && pos == *pos_p)
+ err = rst_sock_attr_mcfilter(pos_p, sk, ctx);
+ return err;
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static void rst_tcp_cb_ipv4_to_ipv6(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+ BUG_ON(sizeof(skb->cb) - sizeof(struct inet6_skb_parm) <
+ sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm));
+ memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm));
+ memcpy(skb->cb + sizeof(struct inet6_skb_parm),
+ (void *)v->cpt_cb + sizeof(struct inet_skb_parm),
+ sizeof(struct tcp_skb_cb) - sizeof(struct inet6_skb_parm));
+}
+#else
+static void rst_tcp_cb_ipv6_to_ipv4(struct cpt_skb_image *v, struct sk_buff *skb)
+{
+ BUG_ON(sizeof(v->cpt_cb) - sizeof(struct inet6_skb_parm) <
+ sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm));
+ memcpy(skb->cb, v->cpt_cb, sizeof(struct inet_skb_parm));
+ memcpy(skb->cb + sizeof(struct inet_skb_parm),
+ (void *)v->cpt_cb + sizeof(struct inet6_skb_parm),
+ sizeof(struct tcp_skb_cb) - sizeof(struct inet_skb_parm));
+}
+#endif
+
+struct tcp_skb_cb_ipv6 {
+ union {
+ struct inet_skb_parm h4;
+ struct inet6_skb_parm h6;
+ } header;
+ __u32 seq;
+ __u32 end_seq;
+ __u32 when;
+ __u8 flags;
+ __u8 sacked;
+ __u16 urg_ptr;
+ __u32 ack_seq;
+};
+
+#define check_tcp_cb_conv(op1, op2) do { \
+ if (!ctx->tcp_cb_convert) \
+ ctx->tcp_cb_convert = CPT_TCP_CB_##op1; \
+ else if (ctx->tcp_cb_convert == CPT_TCP_CB_##op2) { \
+ kfree_skb(skb); \
+ return ERR_PTR(-EINVAL); \
+ } \
+} while (0)
+
+struct sk_buff * rst_skb(struct sock *sk, loff_t *pos_p, __u32 *owner,
+ __u32 *queue, struct cpt_context *ctx)
+{
+ int err;
+ struct sk_buff *skb;
+ struct cpt_skb_image v;
+ loff_t pos = *pos_p;
+ struct scm_fp_list *fpl = NULL;
+ struct timeval tmptv;
+
+ err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx);
+ if (err)
+ return ERR_PTR(err);
+ *pos_p = pos + v.cpt_next;
+
+ if (owner)
+ *owner = v.cpt_owner;
+ if (queue)
+ *queue = v.cpt_queue;
+
+ skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL);
+ if (skb == NULL)
+ return ERR_PTR(-ENOMEM);
+ skb_reserve(skb, v.cpt_hspace);
+ skb_put(skb, v.cpt_len);
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->transport_header = v.cpt_h;
+ skb->network_header = v.cpt_nh;
+ skb->mac_header = v.cpt_mac;
+#else
+ skb->transport_header = skb->head + v.cpt_h;
+ skb->network_header = skb->head + v.cpt_nh;
+ skb->mac_header = skb->head + v.cpt_mac;
+#endif
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb));
+ if (sk->sk_protocol == IPPROTO_TCP) {
+		/*
+		 * According to Alexey, all packets in the queue have non-zero
+		 * flags, since at least TCPCB_FLAG_ACK is set on them.
+		 * Luckily for us, the offset of the flags field in tcp_skb_cb
+		 * with IPv6 is higher than the total size of tcp_skb_cb
+		 * without IPv6.
+		 */
+ if (ctx->image_version >= CPT_VERSION_18_2 ||
+ ((struct tcp_skb_cb_ipv6 *)&v.cpt_cb)->flags) {
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ check_tcp_cb_conv(NOT_CONV, CONV);
+ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb));
+#else
+ check_tcp_cb_conv(CONV, NOT_CONV);
+ rst_tcp_cb_ipv6_to_ipv4(&v, skb);
+#endif
+ } else {
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ check_tcp_cb_conv(CONV, NOT_CONV);
+ rst_tcp_cb_ipv4_to_ipv6(&v, skb);
+#else
+ check_tcp_cb_conv(NOT_CONV, CONV);
+ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb));
+#endif
+ }
+ } else
+ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb));
+ skb->mac_len = v.cpt_mac_len;
+
+ skb->csum = v.cpt_csum;
+ skb->local_df = v.cpt_local_df;
+ skb->pkt_type = v.cpt_pkt_type;
+ skb->ip_summed = v.cpt_ip_summed;
+ skb->priority = v.cpt_priority;
+ skb->protocol = v.cpt_protocol;
+ cpt_timeval_import(&tmptv, v.cpt_stamp);
+ skb->tstamp = timeval_to_ktime(tmptv);
+
+ skb_shinfo(skb)->gso_segs = v.cpt_gso_segs;
+ skb_shinfo(skb)->gso_size = v.cpt_gso_size;
+ if (ctx->image_version == 0) {
+ skb_shinfo(skb)->gso_segs = 1;
+ skb_shinfo(skb)->gso_size = 0;
+ }
+
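+	/* An skb image may be followed by its raw data (CPT_OBJ_BITS) and, for
+	 * AF_UNIX sockets, by descriptors passed via SCM_RIGHTS
+	 * (CPT_OBJ_FILEDESC). */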
+ if (v.cpt_next > v.cpt_hdrlen) {
+ pos = pos + v.cpt_hdrlen;
+ while (pos < *pos_p) {
+ union {
+ struct cpt_obj_bits b;
+ struct cpt_fd_image f;
+ } u;
+
+ err = rst_get_object(-1, pos, &u, ctx);
+ if (err) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ if (u.b.cpt_object == CPT_OBJ_BITS) {
+ if (u.b.cpt_size != v.cpt_hspace + skb->len) {
+ eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len);
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen);
+ if (err) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) {
+ if (!fpl) {
+ fpl = kmalloc(sizeof(struct scm_fp_list),
+ GFP_KERNEL_UBC);
+ if (!fpl) {
+ kfree_skb(skb);
+ return ERR_PTR(-ENOMEM);
+ }
+ fpl->count = 0;
+ UNIXCB(skb).fp = fpl;
+ }
+ fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx);
+ if (!IS_ERR(fpl->fp[fpl->count]))
+ fpl->count++;
+ }
+ pos += u.b.cpt_next;
+ }
+ }
+
+ return skb;
+}
+
+static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si,
+ loff_t pos, struct cpt_context *ctx)
+{
+ loff_t endpos;
+
+ pos = pos + si->cpt_hdrlen;
+ endpos = pos + si->cpt_next;
+ while (pos < endpos) {
+ struct sk_buff *skb;
+ struct sock *owner_sk;
+ __u32 owner;
+
+ skb = rst_skb(sk, &pos, &owner, NULL, ctx);
+ if (IS_ERR(skb)) {
+ if (PTR_ERR(skb) == -EINVAL) {
+ int err;
+
+ err = rst_sock_attr(&pos, sk, ctx);
+ if (err)
+ return err;
+ }
+ return PTR_ERR(skb);
+ }
+
+ owner_sk = unix_peer(sk);
+ if (owner != -1) {
+ cpt_object_t *pobj;
+ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx);
+ if (pobj == NULL) {
+ eprintk_ctx("orphan af_unix skb?\n");
+ kfree_skb(skb);
+ continue;
+ }
+ owner_sk = pobj->o_obj;
+ }
+ if (owner_sk == NULL) {
+ dprintk_ctx("orphan af_unix skb 2?\n");
+ kfree_skb(skb);
+ continue;
+ }
+ skb_set_owner_w(skb, owner_sk);
+ if (UNIXCB(skb).fp)
+ skb->destructor = unix_destruct_fds;
+ skb_queue_tail(&sk->sk_receive_queue, skb);
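+		/* Queued skbs on a listening socket carry embryonic
+		 * connections; detach and drop the temporary struct socket
+		 * created for them during restore, they will be picked up
+		 * by accept() later. */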
+ if (sk->sk_state == TCP_LISTEN) {
+ struct socket *sock = skb->sk->sk_socket;
+			BUG_ON(sock == NULL);
+			BUG_ON(sock->file);
+ skb->sk->sk_socket = NULL;
+ skb->sk->sk_sleep = NULL;
+ sock->sk = NULL;
+ sock_release(sock);
+ }
+ }
+ return 0;
+}
+
+
+/* All the sockets are created before we start to open files */
+
+int rst_sockets(struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_SOCKET];
+ loff_t endsec;
+ cpt_object_t *obj;
+ struct cpt_section_hdr h;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err) {
+ eprintk_ctx("rst_sockets: ctx->pread: %d\n", err);
+ return err;
+ }
+ if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) {
+ eprintk_ctx("rst_sockets: hdr err\n");
+ return -EINVAL;
+ }
+
+	/* The first pass: we create the socket index and open listening sockets. */
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
+ if (err) {
+ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
+ cpt_release_buf(ctx);
+ return err;
+ }
+ if (sbuf->cpt_state == TCP_LISTEN) {
+ err = open_listening_socket(sec, sbuf, ctx);
+ cpt_release_buf(ctx);
+ if (err) {
+ eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err);
+ return err;
+ }
+ } else {
+ cpt_release_buf(ctx);
+ obj = alloc_cpt_object(GFP_KERNEL, ctx);
+ if (obj == NULL)
+ return -ENOMEM;
+ cpt_obj_setindex(obj, sbuf->cpt_index, ctx);
+ cpt_obj_setpos(obj, sec, ctx);
+ obj->o_ppos = sbuf->cpt_file;
+ intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx);
+ }
+ sec += sbuf->cpt_next;
+ }
+
+ /* Pass 2: really restore sockets */
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ struct cpt_sock_image *sbuf;
+ if (obj->o_obj != NULL)
+ continue;
+ sbuf = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
+ if (err) {
+ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
+ cpt_release_buf(ctx);
+ return err;
+ }
+		BUG_ON(sbuf->cpt_state == TCP_LISTEN);
+ err = open_socket(obj, sbuf, ctx);
+ cpt_release_buf(ctx);
+ if (err) {
+ eprintk_ctx("rst_sockets: open_socket: %d\n", err);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+int rst_orphans(struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_ORPHANS];
+ loff_t endsec;
+ cpt_object_t *obj;
+ struct cpt_section_hdr h;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+ obj = alloc_cpt_object(GFP_KERNEL, ctx);
+ if (obj == NULL) {
+ cpt_release_buf(ctx);
+ return -ENOMEM;
+ }
+ obj->o_pos = sec;
+ obj->o_ppos = sbuf->cpt_file;
+ err = open_socket(obj, sbuf, ctx);
+ dprintk_ctx("Restoring orphan: %d\n", err);
+ free_cpt_object(obj, ctx);
+ cpt_release_buf(ctx);
+ if (err)
+ return err;
+ sec += sbuf->cpt_next;
+ }
+
+ return 0;
+}
+
+
+/* Pass 3: admittedly this is not funny any more :-),
+ * but we have to do one more pass to establish links between
+ * unpaired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX
+ * skb queues with proper skb->sk links.
+ *
+ * This could be done at the end of rst_sockets(), but we defer
+ * restoring AF_UNIX queues until all files have been restored,
+ * which makes restoring passed FDs cleaner.
+ */
+
+int rst_sockets_complete(struct cpt_context *ctx)
+{
+ int err;
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ struct cpt_sock_image *sbuf;
+ struct sock *sk = obj->o_obj;
+ struct sock *peer;
+
+		BUG_ON(!sk);
+
+ if (sk->sk_family != AF_UNIX)
+ continue;
+
+ sbuf = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+
+ if (sbuf->cpt_next > sbuf->cpt_hdrlen)
+ restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx);
+
+ cpt_release_buf(ctx);
+
+ if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) {
+ cpt_object_t *pobj;
+
+ sbuf = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+
+ if (sbuf->cpt_peer != -1) {
+ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx);
+ if (pobj) {
+ peer = pobj->o_obj;
+ sock_hold(peer);
+ unix_peer(sk) = peer;
+ }
+ }
+ cpt_release_buf(ctx);
+ }
+ }
+
+ rst_orphans(ctx);
+
+ return 0;
+}
+
diff --git a/kernel/cpt/rst_socket_in.c b/kernel/cpt/rst_socket_in.c
new file mode 100644
index 0000000..08bf907
--- /dev/null
+++ b/kernel/cpt/rst_socket_in.c
@@ -0,0 +1,578 @@
+/*
+ *
+ * kernel/cpt/rst_socket_in.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/jhash.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/ipv6.h>
+#include <linux/igmp.h>
+#include <net/addrconf.h>
+#include <net/inet6_connection_sock.h>
+#include <linux/nsproxy.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_socket.h"
+#include "cpt_kernel.h"
+
+static inline unsigned long jiffies_import(__u32 tmo)
+{
+ __s32 delta = tmo;
+ return jiffies + (long)delta;
+}
+
+static inline __u32 tcp_jiffies_import(__u32 tmo)
+{
+ return ((__u32)jiffies) + tmo;
+}
+
+
+static int restore_queues(struct sock *sk, struct cpt_sock_image *si,
+ loff_t pos, struct cpt_context *ctx)
+{
+ loff_t endpos;
+
+ pos = pos + si->cpt_hdrlen;
+ endpos = pos + si->cpt_next;
+ while (pos < endpos) {
+ struct sk_buff *skb;
+ __u32 type;
+
+ skb = rst_skb(sk, &pos, NULL, &type, ctx);
+ if (IS_ERR(skb)) {
+ if (PTR_ERR(skb) == -EINVAL) {
+ int err;
+
+ err = rst_sock_attr(&pos, sk, ctx);
+ if (err)
+ return err;
+ }
+ return PTR_ERR(skb);
+ }
+
+ if (sk->sk_type == SOCK_STREAM) {
+ if (type == CPT_SKB_RQ) {
+ skb_set_owner_r(skb, sk);
+ ub_tcprcvbuf_charge_forced(sk, skb);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ } else if (type == CPT_SKB_OFOQ) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ skb_set_owner_r(skb, sk);
+ ub_tcprcvbuf_charge_forced(sk, skb);
+ skb_queue_tail(&tp->out_of_order_queue, skb);
+ } else if (type == CPT_SKB_WQ) {
+ sk->sk_wmem_queued += skb->truesize;
+ sk->sk_forward_alloc -= skb->truesize;
+ ub_tcpsndbuf_charge_forced(sk, skb);
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ } else {
+ wprintk_ctx("strange stream queue type %u\n", type);
+ kfree_skb(skb);
+ }
+ } else {
+ if (type == CPT_SKB_RQ) {
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ } else if (type == CPT_SKB_WQ) {
+ struct inet_sock *inet = inet_sk(sk);
+ if (inet->cork.fragsize) {
+ skb_set_owner_w(skb, sk);
+ skb_queue_tail(&sk->sk_write_queue, skb);
+ } else {
+ eprintk_ctx("cork skb is dropped\n");
+ kfree_skb(skb);
+ }
+ } else {
+ wprintk_ctx("strange dgram queue type %u\n", type);
+ kfree_skb(skb);
+ }
+ }
+ }
+ return 0;
+}
+
+static struct sock *find_parent(__u16 sport, cpt_context_t *ctx)
+{
+ cpt_object_t *obj;
+ for_each_object(obj, CPT_OBJ_SOCKET) {
+ struct sock *sk = obj->o_obj;
+ if (sk &&
+ sk->sk_state == TCP_LISTEN &&
+ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
+ inet_sk(sk)->sport == sport)
+ return sk;
+ }
+ return NULL;
+}
+
+static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
+ struct cpt_context *ctx)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ tp->pred_flags = si->cpt_pred_flags;
+ tp->rcv_nxt = si->cpt_rcv_nxt;
+ tp->snd_nxt = si->cpt_snd_nxt;
+ tp->snd_una = si->cpt_snd_una;
+ tp->snd_sml = si->cpt_snd_sml;
+ tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp);
+ tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime);
+ tp->tcp_header_len = si->cpt_tcp_header_len;
+ inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending;
+ inet_csk(sk)->icsk_ack.quick = si->cpt_quick;
+ inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong;
+ inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked;
+ inet_csk(sk)->icsk_ack.ato = si->cpt_ato;
+ inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout);
+ inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime);
+ inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size;
+ inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss;
+ tp->snd_wl1 = si->cpt_snd_wl1;
+ tp->snd_wnd = si->cpt_snd_wnd;
+ tp->max_window = si->cpt_max_window;
+ inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie;
+ tp->mss_cache = si->cpt_mss_cache;
+ tp->rx_opt.mss_clamp = si->cpt_mss_clamp;
+ inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len;
+ inet_csk(sk)->icsk_ca_state = si->cpt_ca_state;
+ inet_csk(sk)->icsk_retransmits = si->cpt_retransmits;
+ tp->reordering = si->cpt_reordering;
+ tp->frto_counter = si->cpt_frto_counter;
+ tp->frto_highmark = si->cpt_frto_highmark;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+	/* tp->adv_cong = si->cpt_adv_cong; */
+#endif
+ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept;
+ inet_csk(sk)->icsk_backoff = si->cpt_backoff;
+ tp->srtt = si->cpt_srtt;
+ tp->mdev = si->cpt_mdev;
+ tp->mdev_max = si->cpt_mdev_max;
+ tp->rttvar = si->cpt_rttvar;
+ tp->rtt_seq = si->cpt_rtt_seq;
+ inet_csk(sk)->icsk_rto = si->cpt_rto;
+ tp->packets_out = si->cpt_packets_out;
+ tp->retrans_out = si->cpt_retrans_out;
+ tp->lost_out = si->cpt_lost_out;
+ tp->sacked_out = si->cpt_sacked_out;
+ tp->fackets_out = si->cpt_fackets_out;
+ tp->snd_ssthresh = si->cpt_snd_ssthresh;
+ tp->snd_cwnd = si->cpt_snd_cwnd;
+ tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt;
+ tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp;
+ tp->snd_cwnd_used = si->cpt_snd_cwnd_used;
+ tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp);
+ inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout);
+ tp->rcv_wnd = si->cpt_rcv_wnd;
+ tp->rcv_wup = si->cpt_rcv_wup;
+ tp->write_seq = si->cpt_write_seq;
+ tp->pushed_seq = si->cpt_pushed_seq;
+ tp->copied_seq = si->cpt_copied_seq;
+ tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok;
+ tp->rx_opt.wscale_ok = si->cpt_wscale_ok;
+ tp->rx_opt.sack_ok = si->cpt_sack_ok;
+ tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp;
+ tp->rx_opt.snd_wscale = si->cpt_snd_wscale;
+ tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale;
+ tp->nonagle = si->cpt_nonagle;
+ tp->keepalive_probes = si->cpt_keepalive_probes;
+ tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval;
+ tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr;
+ tp->rx_opt.ts_recent = si->cpt_ts_recent;
+ tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp;
+ tp->rx_opt.user_mss = si->cpt_user_mss;
+ tp->rx_opt.dsack = si->cpt_dsack;
+ tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0];
+ tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1];
+ tp->selective_acks[0].start_seq = si->cpt_sack_array[2];
+ tp->selective_acks[0].end_seq = si->cpt_sack_array[3];
+ tp->selective_acks[1].start_seq = si->cpt_sack_array[4];
+ tp->selective_acks[1].end_seq = si->cpt_sack_array[5];
+ tp->selective_acks[2].start_seq = si->cpt_sack_array[6];
+ tp->selective_acks[2].end_seq = si->cpt_sack_array[7];
+ tp->selective_acks[3].start_seq = si->cpt_sack_array[8];
+ tp->selective_acks[3].end_seq = si->cpt_sack_array[9];
+
+ tp->window_clamp = si->cpt_window_clamp;
+ tp->rcv_ssthresh = si->cpt_rcv_ssthresh;
+ inet_csk(sk)->icsk_probes_out = si->cpt_probes_out;
+ tp->rx_opt.num_sacks = si->cpt_num_sacks;
+ tp->advmss = si->cpt_advmss;
+ inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries;
+ tp->ecn_flags = si->cpt_ecn_flags;
+ tp->prior_ssthresh = si->cpt_prior_ssthresh;
+ tp->high_seq = si->cpt_high_seq;
+ tp->retrans_stamp = si->cpt_retrans_stamp;
+ tp->undo_marker = si->cpt_undo_marker;
+ tp->undo_retrans = si->cpt_undo_retrans;
+ tp->urg_seq = si->cpt_urg_seq;
+ tp->urg_data = si->cpt_urg_data;
+ inet_csk(sk)->icsk_pending = si->cpt_pending;
+ tp->snd_up = si->cpt_snd_up;
+ tp->keepalive_time = si->cpt_keepalive_time;
+ tp->keepalive_intvl = si->cpt_keepalive_intvl;
+ tp->linger2 = si->cpt_linger2;
+
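+	/* Recompute sk_send_head: the first skb in the write queue that has
+	 * not been transmitted yet (its seq is not before snd_nxt). */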
+ sk->sk_send_head = NULL;
+ for (skb = skb_peek(&sk->sk_write_queue);
+ skb && skb != (struct sk_buff*)&sk->sk_write_queue;
+ skb = skb->next) {
+ if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) {
+ sk->sk_send_head = skb;
+ break;
+ }
+ }
+
+ if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) {
+ struct inet_sock *inet = inet_sk(sk);
+ if (inet->num == 0) {
+ cpt_object_t *lobj = NULL;
+
+ if ((int)si->cpt_parent != -1)
+ lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
+
+ if (lobj && lobj->o_obj) {
+ inet->num = ntohs(inet->sport);
+ local_bh_disable();
+ __inet_inherit_port(lobj->o_obj, sk);
+ local_bh_enable();
+ dprintk_ctx("port inherited from parent\n");
+ } else {
+ struct sock *lsk = find_parent(inet->sport, ctx);
+ if (lsk) {
+ inet->num = ntohs(inet->sport);
+ local_bh_disable();
+ __inet_inherit_port(lsk, sk);
+ local_bh_enable();
+ dprintk_ctx("port inherited\n");
+ } else {
+ eprintk_ctx("we are kinda lost...\n");
+ }
+ }
+ }
+
+ sk->sk_prot->hash(sk);
+
+ if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER)
+ sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout);
+ if (inet_csk(sk)->icsk_pending)
+ sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer,
+ inet_csk(sk)->icsk_timeout);
+ if (sock_flag(sk, SOCK_KEEPOPEN)) {
+ unsigned long expires = jiffies_import(si->cpt_ka_timeout);
+ if (time_after(jiffies, expires))
+ expires = jiffies + HZ;
+ sk_reset_timer(sk, &sk->sk_timer, expires);
+ }
+ }
+
+ if (sk->sk_family == AF_INET6)
+ sk->sk_gso_type = SKB_GSO_TCPV6;
+ else
+ sk->sk_gso_type = SKB_GSO_TCPV4;
+
+ return 0;
+}
+
+static void rst_listen_socket_tcp(struct cpt_sock_image *si, struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp);
+ tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime);
+ tp->tcp_header_len = si->cpt_tcp_header_len;
+ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept;
+
+	/* The following options are inherited by child sockets */
+ tp->mss_cache = si->cpt_mss_cache;
+ inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len;
+ tp->reordering = si->cpt_reordering;
+ tp->nonagle = si->cpt_nonagle;
+ tp->keepalive_probes = si->cpt_keepalive_probes;
+ tp->rx_opt.user_mss = si->cpt_user_mss;
+ inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries;
+ tp->keepalive_time = si->cpt_keepalive_time;
+ tp->keepalive_intvl = si->cpt_keepalive_intvl;
+ tp->linger2 = si->cpt_linger2;
+}
+
+int rst_listen_socket_in(struct sock *sk, struct cpt_sock_image *si,
+ loff_t pos, struct cpt_context *ctx)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ lock_sock(sk);
+
+ inet->uc_ttl = si->cpt_uc_ttl;
+ inet->tos = si->cpt_tos;
+ inet->cmsg_flags = si->cpt_cmsg_flags;
+ inet->pmtudisc = si->cpt_pmtudisc;
+ inet->recverr = si->cpt_recverr;
+ inet->freebind = si->cpt_freebind;
+ inet->id = si->cpt_idcounter;
+
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ np->frag_size = si->cpt_frag_size6;
+ np->hop_limit = si->cpt_hop_limit6;
+
+ np->rxopt.all = si->cpt_rxopt6;
+ np->mc_loop = si->cpt_mc_loop6;
+ np->recverr = si->cpt_recverr6;
+ np->pmtudisc = si->cpt_pmtudisc6;
+ np->ipv6only = si->cpt_ipv6only6;
+ }
+
+ if (sk->sk_protocol == IPPROTO_TCP)
+ rst_listen_socket_tcp(si, sk);
+
+ release_sock(sk);
+ return 0;
+}
+
+int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
+ struct cpt_context *ctx)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = get_exec_env()->ve_ns->net_ns;
+
+ lock_sock(sk);
+
+ sk->sk_state = si->cpt_state;
+
+ inet->daddr = si->cpt_daddr;
+ inet->dport = si->cpt_dport;
+ inet->saddr = si->cpt_saddr;
+ inet->rcv_saddr = si->cpt_rcv_saddr;
+ inet->sport = si->cpt_sport;
+ inet->uc_ttl = si->cpt_uc_ttl;
+ inet->tos = si->cpt_tos;
+ inet->cmsg_flags = si->cpt_cmsg_flags;
+ inet->mc_index = si->cpt_mc_index;
+ inet->mc_addr = si->cpt_mc_addr;
+ inet->hdrincl = si->cpt_hdrincl;
+ inet->mc_ttl = si->cpt_mc_ttl;
+ inet->mc_loop = si->cpt_mc_loop;
+ inet->pmtudisc = si->cpt_pmtudisc;
+ inet->recverr = si->cpt_recverr;
+ inet->freebind = si->cpt_freebind;
+ inet->id = si->cpt_idcounter;
+
+ inet->cork.flags = si->cpt_cork_flags;
+ inet->cork.fragsize = si->cpt_cork_fragsize;
+ inet->cork.length = si->cpt_cork_length;
+ inet->cork.addr = si->cpt_cork_addr;
+ inet->cork.fl.fl4_src = si->cpt_cork_saddr;
+ inet->cork.fl.fl4_dst = si->cpt_cork_daddr;
+ inet->cork.fl.oif = si->cpt_cork_oif;
+ if (inet->cork.fragsize) {
+ if (ip_route_output_key(net, (struct rtable **)&inet->cork.dst, &inet->cork.fl)) {
+ eprintk_ctx("failed to restore cork route\n");
+ inet->cork.fragsize = 0;
+ }
+ }
+
+ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
+ struct udp_sock *up = udp_sk(sk);
+ up->pending = si->cpt_udp_pending;
+ up->corkflag = si->cpt_udp_corkflag;
+ up->encap_type = si->cpt_udp_encap;
+ up->len = si->cpt_udp_len;
+ }
+
+ if (sk->sk_family == AF_INET6) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ memcpy(&np->saddr, si->cpt_saddr6, 16);
+ memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16);
+ memcpy(&np->daddr, si->cpt_daddr6, 16);
+ np->flow_label = si->cpt_flow_label6;
+ np->frag_size = si->cpt_frag_size6;
+ np->hop_limit = si->cpt_hop_limit6;
+ np->mcast_hops = si->cpt_mcast_hops6;
+ np->mcast_oif = si->cpt_mcast_oif6;
+ np->rxopt.all = si->cpt_rxopt6;
+ np->mc_loop = si->cpt_mc_loop6;
+ np->recverr = si->cpt_recverr6;
+ np->sndflow = si->cpt_sndflow6;
+ np->pmtudisc = si->cpt_pmtudisc6;
+ np->ipv6only = si->cpt_ipv6only6;
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ if (si->cpt_mapped) {
+ extern struct inet_connection_sock_af_ops ipv6_mapped;
+ if (sk->sk_type == SOCK_STREAM &&
+ sk->sk_protocol == IPPROTO_TCP) {
+ inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
+ sk->sk_backlog_rcv = tcp_v4_do_rcv;
+ }
+ }
+#endif
+ }
+
+ restore_queues(sk, si, pos, ctx);
+
+ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
+ rst_socket_tcp(si, pos, sk, ctx);
+
+ release_sock(sk);
+ return 0;
+}
+
+int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx)
+{
+ struct request_sock *req;
+
+ if (lsk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ req = reqsk_alloc(&tcp_request_sock_ops);
+ if (!req)
+ return -ENOMEM;
+
+ sk->sk_socket = NULL;
+ sk->sk_sleep = NULL;
+ inet_csk_reqsk_queue_add(lsk, req, sk);
+ return 0;
+}
+
+int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si,
+ loff_t pos, struct cpt_context *ctx)
+{
+ int err;
+ loff_t end = pos + si->cpt_next;
+
+ pos += si->cpt_hdrlen;
+
+ lock_sock(sk);
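+	/* Re-create the pending connection requests (the syn-wait queue) of
+	 * the listening socket from the saved CPT_OBJ_OPENREQ records. */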
+ while (pos < end) {
+ struct cpt_openreq_image oi;
+
+ err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx);
+ if (err) {
+ err = rst_sock_attr(&pos, sk, ctx);
+ if (err) {
+ release_sock(sk);
+ return err;
+ }
+
+ continue;
+ }
+
+ if (oi.cpt_object == CPT_OBJ_OPENREQ) {
+ struct request_sock *req;
+
+ if (oi.cpt_family == AF_INET6 &&
+ sk->sk_family != AF_INET6)
+				/* related to the uninitialized cpt_family bug */
+ goto next;
+
+ if (oi.cpt_family == AF_INET6) {
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ req = reqsk_alloc(&tcp6_request_sock_ops);
+#else
+ release_sock(sk);
+ return -EINVAL;
+#endif
+ } else {
+ req = reqsk_alloc(&tcp_request_sock_ops);
+ }
+
+ if (req == NULL) {
+ release_sock(sk);
+ return -ENOMEM;
+ }
+
+ tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn;
+ tcp_rsk(req)->snt_isn = oi.cpt_snt_isn;
+ inet_rsk(req)->rmt_port = oi.cpt_rmt_port;
+ req->mss = oi.cpt_mss;
+ req->retrans = oi.cpt_retrans;
+ inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale;
+ inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale;
+ inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok;
+ inet_rsk(req)->sack_ok = oi.cpt_sack_ok;
+ inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok;
+ inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok;
+ inet_rsk(req)->acked = oi.cpt_acked;
+ inet_rsk(req)->opt = NULL;
+ req->window_clamp = oi.cpt_window_clamp;
+ req->rcv_wnd = oi.cpt_rcv_wnd;
+ req->ts_recent = oi.cpt_ts_recent;
+ req->expires = jiffies_import(oi.cpt_expires);
+ req->sk = NULL;
+ req->secid = 0;
+ req->peer_secid = 0;
+
+ if (oi.cpt_family == AF_INET6) {
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+ inet6_rsk(req)->pktopts = NULL;
+ memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16);
+ memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16);
+ inet6_rsk(req)->iif = oi.cpt_iif;
+ inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+#endif
+ } else {
+ memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4);
+ memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4);
+ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+ }
+ }
+next:
+ pos += oi.cpt_next;
+ }
+ release_sock(sk);
+ return 0;
+}
+
+int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
+ loff_t pos, cpt_context_t *ctx)
+{
+ struct ip_mreqn imr;
+
+ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
+ eprintk_ctx("IGMPv3 is still not supported\n");
+ return -EINVAL;
+ }
+
+ memset(&imr, 0, sizeof(imr));
+ imr.imr_ifindex = v->cpt_ifindex;
+ imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0];
+ return ip_mc_join_group(sk, &imr);
+}
+
+#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
+int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
+ loff_t pos, cpt_context_t *ctx)
+{
+
+ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
+ eprintk_ctx("IGMPv3 is still not supported\n");
+ return -EINVAL;
+ }
+
+ return ipv6_sock_mc_join(sk, v->cpt_ifindex,
+ (struct in6_addr*)v->cpt_mcaddr);
+}
+#endif
diff --git a/kernel/cpt/rst_sysvipc.c b/kernel/cpt/rst_sysvipc.c
new file mode 100644
index 0000000..b5e62a7
--- /dev/null
+++ b/kernel/cpt/rst_sysvipc.c
@@ -0,0 +1,639 @@
+/*
+ *
+ * kernel/cpt/rst_sysvipc.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/nsproxy.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/shm.h>
+#include <linux/msg.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <bc/kmem.h>
+#include <linux/cpt_image.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_kernel.h"
+
+struct _warg {
+ struct file *file;
+ struct cpt_sysvshm_image *v;
+};
+
+static int fixup_one_shm(struct shmid_kernel *shp, void *arg)
+{
+ struct _warg *warg = arg;
+
+ if (shp->shm_file != warg->file)
+ return 0;
+ if (shp->shm_nattch)
+ return -EEXIST;
+
+ shp->shm_perm.uid = warg->v->cpt_uid;
+ shp->shm_perm.gid = warg->v->cpt_gid;
+ shp->shm_perm.cuid = warg->v->cpt_cuid;
+ shp->shm_perm.cgid = warg->v->cpt_cgid;
+ shp->shm_perm.mode = warg->v->cpt_mode;
+
+ shp->shm_atim = warg->v->cpt_atime;
+ shp->shm_dtim = warg->v->cpt_dtime;
+ shp->shm_ctim = warg->v->cpt_ctime;
+ shp->shm_cprid = warg->v->cpt_creator;
+ shp->shm_lprid = warg->v->cpt_last;
+
+ /* TODO: fix shp->mlock_user? */
+ return 1;
+}
+
+static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v)
+{
+ struct _warg warg;
+
+ warg.file = file;
+ warg.v = v;
+
+ return sysvipc_walk_shm(fixup_one_shm, &warg);
+}
+
+static int fixup_shm_data(struct file *file, loff_t pos, loff_t end,
+ struct cpt_context *ctx)
+{
+ struct cpt_page_block pgb;
+ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos);
+
+ do_write = file->f_dentry->d_inode->i_fop->write;
+ if (do_write == NULL) {
+ eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n");
+ return -EINVAL;
+ }
+
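+	/* Copy the saved segment contents from the image into the tmpfs file
+	 * page by page through its write() method, using a kernel buffer. */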
+ while (pos < end) {
+ loff_t opos;
+ loff_t ipos;
+ int count;
+ int err;
+
+ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx);
+ if (err)
+ return err;
+ dprintk_ctx("restoring SHM block: %08x-%08x\n",
+ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
+ ipos = pos + pgb.cpt_hdrlen;
+ opos = pgb.cpt_start;
+ count = pgb.cpt_end-pgb.cpt_start;
+ while (count > 0) {
+ mm_segment_t oldfs;
+ int copy = count;
+
+ if (copy > PAGE_SIZE)
+ copy = PAGE_SIZE;
+ (void)cpt_get_buf(ctx);
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
+ set_fs(oldfs);
+ if (err) {
+ __cpt_release_buf(ctx);
+ return err;
+ }
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ ipos += copy;
+ err = do_write(file, ctx->tmpbuf, copy, &opos);
+ set_fs(oldfs);
+ __cpt_release_buf(ctx);
+ if (err != copy) {
+ eprintk_ctx("write() failure\n");
+ if (err >= 0)
+ err = -EIO;
+ return err;
+ }
+ count -= copy;
+ }
+ pos += pgb.cpt_next;
+ }
+ return 0;
+}
+
+struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx)
+{
+ struct file *file;
+ int err;
+ loff_t dpos, epos;
+ union {
+ struct cpt_file_image fi;
+ struct cpt_sysvshm_image shmi;
+ struct cpt_inode_image ii;
+ } u;
+
+ err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
+ if (err < 0)
+ goto err_out;
+ pos = u.fi.cpt_inode;
+ err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
+ if (err < 0)
+ goto err_out;
+ dpos = pos + u.ii.cpt_hdrlen;
+ epos = pos + u.ii.cpt_next;
+ err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
+ if (err < 0)
+ goto err_out;
+ dpos += u.shmi.cpt_next;
+
+ file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id,
+ u.shmi.cpt_segsz, u.shmi.cpt_mode);
+ if (!IS_ERR(file)) {
+ err = fixup_shm(file, &u.shmi);
+ if (err != -EEXIST && dpos < epos) {
+ err = fixup_shm_data(file, dpos, epos, ctx);
+ if (err)
+ goto err_put;
+ }
+ } else if (IS_ERR(file) && PTR_ERR(file) == -EEXIST) {
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ struct shmid_kernel *shp;
+
+ shp = shm_lock(ipc_ns, u.shmi.cpt_id);
+ BUG_ON(IS_ERR(shp));
+ get_file(shp->shm_file);
+ file = shp->shm_file;
+ shm_unlock(shp);
+ }
+ return file;
+
+err_put:
+ fput(file);
+err_out:
+ return ERR_PTR(err);
+}
+
+struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx)
+{
+ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
+ struct file *file;
+ union {
+ struct cpt_file_image fi;
+ struct cpt_inode_image ii;
+ struct cpt_sysvshm_image shmi;
+ } u;
+ struct shmid_kernel *shp;
+ struct shm_file_data *sfd;
+ struct path path;
+ mode_t f_mode;
+ loff_t pos;
+ int err;
+
+ pos = vmai->cpt_file;
+ file = rst_sysv_shm_itself(pos, ctx);
+ if (IS_ERR(file) && PTR_ERR(file) != -EEXIST)
+ return file;
+ fput(file);
+
+ err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
+ if (err < 0)
+ goto err_out;
+ pos = u.fi.cpt_inode;
+ err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
+ if (err < 0)
+ goto err_out;
+ err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
+ if (err < 0)
+ goto err_out;
+
+ shp = shm_lock(ipc_ns, u.shmi.cpt_id);
+ BUG_ON(IS_ERR(shp));
+ path.dentry = dget(shp->shm_file->f_path.dentry);
+ path.mnt = shp->shm_file->f_path.mnt;
+ shm_unlock(shp);
+
+ err = -ENOMEM;
+ sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
+ if (!sfd)
+ goto out_put_dentry;
+
+ f_mode = 0;
+ if (vmai->cpt_flags & VM_READ)
+ f_mode |= FMODE_READ;
+ if (vmai->cpt_flags & VM_WRITE)
+ f_mode |= FMODE_WRITE;
+ if (vmai->cpt_flags & VM_EXEC)
+ f_mode |= FMODE_EXEC;
+
+ err = -ENOMEM;
+ file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
+ if (!file)
+ goto out_free;
+
+ file->private_data = sfd;
+ file->f_mapping = shp->shm_file->f_mapping;
+ sfd->id = shp->shm_perm.id;
+ sfd->ns = get_ipc_ns(ipc_ns);
+ sfd->file = shp->shm_file;
+ sfd->vm_ops = NULL;
+
+ return file;
+
+out_free:
+ kfree(sfd);
+out_put_dentry:
+ dput(path.dentry);
+err_out:
+ return ERR_PTR(err);
+}
+
+static int attach_one_undo(int semid, struct sem_array *sma, void *arg)
+{
+ struct sem_undo *su = arg;
+ struct sem_undo_list *undo_list = current->sysvsem.undo_list;
+
+ if (semid != su->semid)
+ return 0;
+
+ list_add(&su->list_proc, &undo_list->list_proc);
+ list_add(&su->list_id, &sma->list_id);
+
+ return 1;
+}
+
+static int attach_undo(struct sem_undo *su)
+{
+ return sysvipc_walk_sem(attach_one_undo, su);
+}
+
+static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx)
+{
+ int err;
+ struct sem_undo_list *undo_list;
+
+ if (current->sysvsem.undo_list) {
+ eprintk_ctx("Funny undo_list\n");
+ return 0;
+ }
+
+ undo_list = kzalloc(sizeof(struct sem_undo_list), GFP_KERNEL_UBC);
+ if (undo_list == NULL)
+ return -ENOMEM;
+
+ atomic_set(&undo_list->refcnt, 1);
+ spin_lock_init(&undo_list->lock);
+ INIT_LIST_HEAD(&undo_list->list_proc);
+ current->sysvsem.undo_list = undo_list;
+
+ if (sui->cpt_next > sui->cpt_hdrlen) {
+ loff_t offset = pos + sui->cpt_hdrlen;
+ do {
+ struct sem_undo *new;
+ struct cpt_sysvsem_undo_image spi;
+ err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx);
+ if (err)
+ goto out;
+ new = kmalloc(sizeof(struct sem_undo) +
+ sizeof(short)*spi.cpt_nsem,
+ GFP_KERNEL_UBC);
+ if (!new) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem);
+ new->semadj = (short *) &new[1];
+ new->semid = spi.cpt_id;
+ err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen);
+ if (err) {
+ kfree(new);
+ goto out;
+ }
+ err = attach_undo(new);
+ if (err <= 0) {
+ if (err == 0)
+ err = -ENOENT;
+ kfree(new);
+ goto out;
+ }
+ offset += spi.cpt_next;
+ } while (offset < pos + sui->cpt_next);
+ }
+ err = 0;
+
+out:
+ return err;
+}
+
+__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ __u32 flag = 0;
+
+#if 0
+ if (ti->cpt_sysvsem_undo == CPT_NULL ||
+ lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo))
+ flag |= CLONE_SYSVSEM;
+#endif
+ return flag;
+}
+
+int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ int err;
+ struct sem_undo_list *f = current->sysvsem.undo_list;
+ cpt_object_t *obj;
+ struct cpt_object_hdr sui;
+
+ if (ti->cpt_sysvsem_undo == CPT_NULL) {
+ exit_sem(current);
+ return 0;
+ }
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx);
+ if (obj) {
+ if (obj->o_obj != f) {
+ exit_sem(current);
+ f = obj->o_obj;
+ atomic_inc(&f->refcnt);
+ current->sysvsem.undo_list = f;
+ }
+ return 0;
+ }
+
+ if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0)
+ goto out;
+
+ if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0)
+ goto out;
+
+ err = -ENOMEM;
+ obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx);
+ if (obj) {
+ err = 0;
+ cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx);
+ }
+
+ return 0;
+
+out:
+ return err;
+}
+
+struct _sarg {
+ int semid;
+ struct cpt_sysvsem_image *v;
+ __u32 *arr;
+};
+
+static int fixup_one_sem(int semid, struct sem_array *sma, void *arg)
+{
+ struct _sarg *warg = arg;
+
+ if (semid != warg->semid)
+ return 0;
+
+ sma->sem_perm.uid = warg->v->cpt_uid;
+ sma->sem_perm.gid = warg->v->cpt_gid;
+ sma->sem_perm.cuid = warg->v->cpt_cuid;
+ sma->sem_perm.cgid = warg->v->cpt_cgid;
+ sma->sem_perm.mode = warg->v->cpt_mode;
+ sma->sem_perm.seq = warg->v->cpt_seq;
+
+ sma->sem_ctime = warg->v->cpt_ctime;
+ sma->sem_otime = warg->v->cpt_otime;
+ memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8);
+ return 1;
+}
+
+static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr)
+{
+ struct _sarg warg;
+
+ warg.semid = semid;
+ warg.v = v;
+ warg.arr = arr;
+
+ return sysvipc_walk_sem(fixup_one_sem, &warg);
+}
+
+
+static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si,
+ struct cpt_context *ctx)
+{
+ int err;
+ __u32 *arr;
+ int nsems = (si->cpt_next - si->cpt_hdrlen)/8;
+
+ arr = kmalloc(nsems*8, GFP_KERNEL);
+ if (!arr)
+ return -ENOMEM;
+
+ err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen);
+ if (err)
+ goto out;
+ err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode);
+ if (err < 0) {
+ eprintk_ctx("SEM 3\n");
+ goto out;
+ }
+ err = fixup_sem(si->cpt_id, si, arr);
+ if (err == 0)
+ err = -ESRCH;
+ if (err > 0)
+ err = 0;
+out:
+ kfree(arr);
+ return err;
+}
+
+static int rst_sysv_sem(struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_sysvsem_image sbuf;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ int err;
+ err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx);
+ if (err)
+ return err;
+ err = restore_sem(sec, &sbuf, ctx);
+ if (err)
+ return err;
+ sec += sbuf.cpt_next;
+ }
+ return 0;
+}
+
+struct _marg {
+ int msqid;
+ struct cpt_sysvmsg_image *v;
+ struct msg_queue *m;
+};
+
+static int fixup_one_msg(int msqid, struct msg_queue *msq, void *arg)
+{
+ struct _marg *warg = arg;
+
+ if (msqid != warg->msqid)
+ return 0;
+
+ msq->q_perm.uid = warg->v->cpt_uid;
+ msq->q_perm.gid = warg->v->cpt_gid;
+ msq->q_perm.cuid = warg->v->cpt_cuid;
+ msq->q_perm.cgid = warg->v->cpt_cgid;
+ msq->q_perm.mode = warg->v->cpt_mode;
+ msq->q_perm.seq = warg->v->cpt_seq;
+
+ msq->q_stime = warg->v->cpt_stime;
+ msq->q_rtime = warg->v->cpt_rtime;
+ msq->q_ctime = warg->v->cpt_ctime;
+ msq->q_lspid = warg->v->cpt_last_sender;
+ msq->q_lrpid = warg->v->cpt_last_receiver;
+ msq->q_qbytes = warg->v->cpt_qbytes;
+
+ warg->m = msq;
+ return 1;
+}
+
+struct _larg
+{
+ cpt_context_t * ctx;
+ loff_t pos;
+};
+
+static int do_load_msg(void * dst, int len, int offset, void * data)
+{
+ struct _larg * arg = data;
+ return arg->ctx->pread(dst, len, arg->ctx, arg->pos + offset);
+}
+
+static int fixup_msg(int msqid, struct cpt_sysvmsg_image *v, loff_t pos,
+ cpt_context_t * ctx)
+{
+ int err;
+ struct _marg warg;
+ loff_t endpos = pos + v->cpt_next;
+ struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+
+ pos += v->cpt_hdrlen;
+
+ warg.msqid = msqid;
+ warg.v = v;
+
+ err = sysvipc_walk_msg(fixup_one_msg, &warg);
+ if (err <= 0)
+ return err;
+
+ while (pos < endpos) {
+ struct cpt_sysvmsg_msg_image mi;
+ struct msg_msg *m;
+ struct _larg data = {
+ .ctx = ctx
+ };
+
+ err = rst_get_object(CPT_OBJ_SYSVMSG_MSG, pos, &mi, ctx);
+ if (err)
+ return err;
+ data.pos = pos + mi.cpt_hdrlen;
+ m = sysv_msg_load(do_load_msg, mi.cpt_size, &data);
+ if (IS_ERR(m))
+ return PTR_ERR(m);
+ m->m_type = mi.cpt_type;
+ m->m_ts = mi.cpt_size;
+ list_add_tail(&m->m_list, &warg.m->q_messages);
+ warg.m->q_cbytes += m->m_ts;
+ warg.m->q_qnum++;
+ atomic_add(m->m_ts, &ns->msg_bytes);
+ atomic_inc(&ns->msg_hdrs);
+
+ pos += mi.cpt_next;
+ }
+ return 1;
+}
+
+static int restore_msg(loff_t pos, struct cpt_sysvmsg_image *si,
+ struct cpt_context *ctx)
+{
+ int err;
+
+ err = sysvipc_setup_msg(si->cpt_key, si->cpt_id, si->cpt_mode);
+ if (err < 0) {
+ eprintk_ctx("MSG 3\n");
+ goto out;
+ }
+ err = fixup_msg(si->cpt_id, si, pos, ctx);
+ if (err == 0)
+ err = -ESRCH;
+ if (err > 0)
+ err = 0;
+out:
+ return err;
+}
+
+static int rst_sysv_msg(struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_SYSV_MSG];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_sysvmsg_image sbuf;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_SYSV_MSG || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ int err;
+ err = rst_get_object(CPT_OBJ_SYSVMSG, sec, &sbuf, ctx);
+ if (err)
+ return err;
+ err = restore_msg(sec, &sbuf, ctx);
+ if (err)
+ return err;
+ sec += sbuf.cpt_next;
+ }
+ return 0;
+}
+
+
+int rst_sysv_ipc(struct cpt_context *ctx)
+{
+ int err;
+
+ err = rst_sysv_sem(ctx);
+ if (!err)
+ err = rst_sysv_msg(ctx);
+
+ return err;
+}
diff --git a/kernel/cpt/rst_tty.c b/kernel/cpt/rst_tty.c
new file mode 100644
index 0000000..929ca26
--- /dev/null
+++ b/kernel/cpt/rst_tty.c
@@ -0,0 +1,384 @@
+/*
+ *
+ * kernel/cpt/rst_tty.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/mman.h>
+#include <linux/mount.h>
+#include <linux/tty.h>
+#include <linux/vmalloc.h>
+#include <linux/nsproxy.h>
+#include <asm/unistd.h>
+#include <asm/uaccess.h>
+#include <linux/cpt_image.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_mm.h"
+#include "cpt_process.h"
+#include "cpt_files.h"
+#include "cpt_kernel.h"
+
+static int pty_setup(struct tty_struct *stty, loff_t pos,
+ struct cpt_tty_image *pi, struct cpt_context *ctx)
+{
+ unsigned long flags;
+
+ stty->pgrp = NULL;
+ stty->session = NULL;
+ stty->packet = pi->cpt_packet;
+ stty->stopped = pi->cpt_stopped;
+ stty->hw_stopped = pi->cpt_hw_stopped;
+ stty->flow_stopped = pi->cpt_flow_stopped;
+#define DONOT_CHANGE ((1<<TTY_CHARGED)|(1<<TTY_CLOSING)|(1<<TTY_LDISC))
+ flags = stty->flags & DONOT_CHANGE;
+ stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE);
+ stty->ctrl_status = pi->cpt_ctrl_status;
+ stty->winsize.ws_row = pi->cpt_ws_row;
+ stty->winsize.ws_col = pi->cpt_ws_col;
+ stty->winsize.ws_ypixel = pi->cpt_ws_prow;
+ stty->winsize.ws_xpixel = pi->cpt_ws_pcol;
+ stty->canon_column = pi->cpt_canon_column;
+ stty->column = pi->cpt_column;
+ stty->raw = pi->cpt_raw;
+ stty->real_raw = pi->cpt_real_raw;
+ stty->erasing = pi->cpt_erasing;
+ stty->lnext = pi->cpt_lnext;
+ stty->icanon = pi->cpt_icanon;
+ stty->closing = pi->cpt_closing;
+ stty->minimum_to_wake = pi->cpt_minimum_to_wake;
+
+ stty->termios->c_iflag = pi->cpt_c_iflag;
+ stty->termios->c_oflag = pi->cpt_c_oflag;
+ stty->termios->c_lflag = pi->cpt_c_lflag;
+ stty->termios->c_cflag = pi->cpt_c_cflag;
+ memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS);
+ memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags));
+
+ if (pi->cpt_next > pi->cpt_hdrlen) {
+ int err;
+ struct cpt_obj_bits b;
+ err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx);
+ if (err)
+ return err;
+ if (b.cpt_size == 0)
+ return 0;
+ err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen);
+ if (err)
+ return err;
+
+ spin_lock_irq(&stty->read_lock);
+ stty->read_tail = 0;
+ stty->read_cnt = b.cpt_size;
+ stty->read_head = b.cpt_size;
+ stty->canon_head = stty->read_tail + pi->cpt_canon_head;
+ stty->canon_data = pi->cpt_canon_data;
+ spin_unlock_irq(&stty->read_lock);
+ }
+
+ return 0;
+}
+
+/* Find the slave/master tty in the image when we already know its
+ * master/slave peer. This could be optimized, of course. */
+static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_TTY];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_tty_image *pibuf;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return CPT_NULL;
+ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
+ return CPT_NULL;
+ pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL);
+ if (pibuf == NULL) {
+ eprintk_ctx("cannot allocate buffer\n");
+ return CPT_NULL;
+ }
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+		if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
+			/* Do not leak the lookup buffer on error */
+			kfree(pibuf);
+			return CPT_NULL;
+		}
+		if (pibuf->cpt_index == pi->cpt_index &&
+		    !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) &&
+		    pos != sec) {
+			pty_setup(stty, sec, pibuf, ctx);
+			kfree(pibuf);
+			return sec;
+		}
+ sec += pibuf->cpt_next;
+ }
+ kfree(pibuf);
+ return CPT_NULL;
+}
+
+static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master,
+ struct cpt_context *ctx)
+{
+ int err;
+ struct iattr newattrs;
+ struct dentry *d = master->f_dentry;
+
+ newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE;
+ newattrs.ia_uid = ii->cpt_uid;
+ newattrs.ia_gid = ii->cpt_gid;
+ newattrs.ia_mode = ii->cpt_mode;
+
+ mutex_lock(&d->d_inode->i_mutex);
+ err = notify_change(d, &newattrs);
+ mutex_unlock(&d->d_inode->i_mutex);
+
+ return err;
+}
+
+/* NOTE: a "portable" but ugly trick. To allocate /dev/pts/N, we keep
+ * opening /dev/ptmx until we get a pty with the desired index.
+ */
+
+struct file *ptmx_open(int index, unsigned int flags)
+{
+ struct file *file;
+ struct file **stack = NULL;
+ int depth = 0;
+
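+	/* Keep opening /dev/ptmx, stashing unwanted masters on a temporary
+	 * stack, until we get the pty with the desired index; the stashed
+	 * files are released below. */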
+ for (;;) {
+ struct tty_struct *tty;
+
+ file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+ if (IS_ERR(file))
+ break;
+ tty = file->private_data;
+ if (tty->index == index)
+ break;
+
+ if (depth == PAGE_SIZE/sizeof(struct file *)) {
+ fput(file);
+ file = ERR_PTR(-EBUSY);
+ break;
+ }
+ if (stack == NULL) {
+ stack = (struct file **)__get_free_page(GFP_KERNEL);
+ if (!stack) {
+ fput(file);
+ file = ERR_PTR(-ENOMEM);
+ break;
+ }
+ }
+ stack[depth] = file;
+ depth++;
+ }
+ while (depth > 0) {
+ depth--;
+ fput(stack[depth]);
+ }
+ if (stack)
+ free_page((unsigned long)stack);
+ return file;
+}
+
+
+struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii,
+ unsigned flags, struct cpt_context *ctx)
+{
+ int err;
+ cpt_object_t *obj;
+ struct file *master, *slave;
+ struct tty_struct *stty;
+ struct cpt_tty_image *pi;
+ static char *a = "pqrstuvwxyzabcde";
+ static char *b = "0123456789abcdef";
+ char pairname[16];
+ unsigned master_flags, slave_flags;
+
+ if (fi->cpt_priv == CPT_NULL)
+ return ERR_PTR(-EINVAL);
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx);
+ if (obj && obj->o_parent) {
+ dprintk_ctx("obtained pty as pair to existing\n");
+ master = obj->o_parent;
+ stty = master->private_data;
+
+ if (stty->driver->subtype == PTY_TYPE_MASTER &&
+ (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) {
+ wprintk_ctx("cloning ptmx\n");
+ get_file(master);
+ return master;
+ }
+
+ master = dentry_open(dget(master->f_dentry),
+ mntget(master->f_vfsmnt), flags, NULL);
+ if (!IS_ERR(master)) {
+ stty = master->private_data;
+ if (stty->driver->subtype != PTY_TYPE_MASTER)
+ fixup_tty_attrs(ii, master, ctx);
+ }
+ return master;
+ }
+
+ pi = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return ERR_PTR(err);
+ }
+
+ master_flags = slave_flags = 0;
+ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER)
+ master_flags = flags;
+ else
+ slave_flags = flags;
+
+ /*
+ * Open pair master/slave.
+ */
+ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) {
+ master = ptmx_open(pi->cpt_index, master_flags);
+ } else {
+ sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]);
+ master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+ }
+ if (IS_ERR(master)) {
+ eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master));
+ cpt_release_buf(ctx);
+ return master;
+ }
+ stty = master->private_data;
+ clear_bit(TTY_PTY_LOCK, &stty->flags);
+ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM)
+ sprintf(pairname, "/dev/pts/%d", stty->index);
+ else
+ sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]);
+ slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
+ if (IS_ERR(slave)) {
+ eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave));
+ fput(master);
+ cpt_release_buf(ctx);
+ return slave;
+ }
+
+ if (pi->cpt_drv_subtype != PTY_TYPE_MASTER)
+ fixup_tty_attrs(ii, slave, ctx);
+
+ cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx);
+ cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx);
+ cpt_object_add(CPT_OBJ_FILE, master, ctx);
+ cpt_object_add(CPT_OBJ_FILE, slave, ctx);
+
+ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) {
+ loff_t pos;
+ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
+ obj->o_parent = master;
+ cpt_obj_setpos(obj, fi->cpt_priv, ctx);
+ pty_setup(stty, fi->cpt_priv, pi, ctx);
+
+ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
+ obj->o_parent = slave;
+ pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx);
+ cpt_obj_setpos(obj, pos, ctx);
+
+ obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx);
+ cpt_obj_setpos(obj, CPT_NULL, ctx);
+ get_file(master);
+ cpt_release_buf(ctx);
+ return master;
+ } else {
+ loff_t pos;
+ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
+ obj->o_parent = slave;
+ cpt_obj_setpos(obj, fi->cpt_priv, ctx);
+ pty_setup(stty->link, fi->cpt_priv, pi, ctx);
+
+ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
+ obj->o_parent = master;
+ pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx);
+ cpt_obj_setpos(obj, pos, ctx);
+
+ obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx);
+ cpt_obj_setpos(obj, CPT_NULL, ctx);
+ get_file(slave);
+ cpt_release_buf(ctx);
+ return slave;
+ }
+}
+
+int rst_tty_jobcontrol(struct cpt_context *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_TTY];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ cpt_object_t *obj;
+ struct cpt_tty_image *pibuf = cpt_get_buf(ctx);
+
+ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx);
+ if (obj) {
+ struct tty_struct *stty = obj->o_obj;
+ if ((int)pibuf->cpt_pgrp > 0) {
+ rcu_read_lock();
+ stty->pgrp = get_pid(alloc_vpid_safe(pibuf->cpt_pgrp));
+ rcu_read_unlock();
+ if (!stty->pgrp)
+ dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp);
+ } else if (pibuf->cpt_pgrp) {
+ stty->pgrp = alloc_pid(current->nsproxy->pid_ns,
+ 0);
+ if (!stty->pgrp) {
+ eprintk_ctx("cannot allocate stray tty->pgrp");
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+ }
+ if ((int)pibuf->cpt_session > 0) {
+ struct pid *sess;
+
+ rcu_read_lock();
+ sess = get_pid(alloc_vpid_safe(pibuf->cpt_session));
+ rcu_read_unlock();
+ if (!sess) {
+ dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session);
+ } else if (!stty->session) {
+ stty->session = sess;
+ }
+ }
+ }
+ sec += pibuf->cpt_next;
+ cpt_release_buf(ctx);
+ }
+ return 0;
+}
diff --git a/kernel/cpt/rst_ubc.c b/kernel/cpt/rst_ubc.c
new file mode 100644
index 0000000..db1f982
--- /dev/null
+++ b/kernel/cpt/rst_ubc.c
@@ -0,0 +1,144 @@
+/*
+ *
+ * kernel/cpt/rst_ubc.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <bc/beancounter.h>
+#include <asm/signal.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+
+struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx);
+ if (obj == NULL) {
+ eprintk("RST: unknown ub @%Ld\n", (long long)pos);
+ return get_beancounter(get_exec_ub());
+ }
+ return get_beancounter(obj->o_obj);
+}
+
+void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id)
+{
+ to[bc_parm_id].barrier = from[bc_parm_id].barrier;
+ to[bc_parm_id].limit = from[bc_parm_id].limit;
+}
+
+void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id)
+{
+ ubprm[bc_parm_id].barrier = UB_MAXVALUE;
+ ubprm[bc_parm_id].limit = UB_MAXVALUE;
+}
+
+static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm,
+ int held)
+{
+ prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier);
+ prm->limit = (dmp->limit == CPT_NULL ? UB_MAXVALUE : dmp->limit);
+ if (held)
+ prm->held = dmp->held;
+ prm->maxheld = dmp->maxheld;
+ prm->minheld = dmp->minheld;
+ prm->failcnt = dmp->failcnt;
+}
+
+static int restore_one_bc(struct cpt_beancounter_image *v,
+ cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct user_beancounter *bc;
+ cpt_object_t *pobj;
+ int resources, i;
+
+ if (v->cpt_parent != CPT_NULL) {
+ pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx);
+ if (pobj == NULL)
+ return -ESRCH;
+ bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1);
+ } else {
+ bc = get_exec_ub();
+ while (bc->parent)
+ bc = bc->parent;
+ get_beancounter(bc);
+ }
+ if (bc == NULL)
+ return -ENOMEM;
+ obj->o_obj = bc;
+
+ if (ctx->image_version < CPT_VERSION_18 &&
+ CPT_VERSION_MINOR(ctx->image_version) < 1)
+ goto out;
+
+ if (v->cpt_content == CPT_CONTENT_ARRAY)
+ resources = v->cpt_ub_resources;
+ else
+ resources = UB_RESOURCES_COMPAT;
+
+ if (resources > UB_RESOURCES)
+ return -EINVAL;
+
+ for (i = 0; i < resources; i++) {
+ restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0);
+ restore_one_bc_parm(v->cpt_parms + i * 2 + 1,
+ bc->ub_store + i, 1);
+ }
+
+out:
+ if (!bc->parent)
+ for (i = 0; i < UB_RESOURCES; i++)
+ copy_one_ubparm(bc->ub_parms, ctx->saved_ubc, i);
+
+ return 0;
+}
+
+int rst_undump_ubc(struct cpt_context *ctx)
+{
+ loff_t start, end;
+ struct cpt_beancounter_image *v;
+ cpt_object_t *obj;
+ int err;
+
+ err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end);
+ if (err)
+ return err;
+
+ while (start < end) {
+ v = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_UBC, start, v, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+
+ obj = alloc_cpt_object(GFP_KERNEL, ctx);
+ cpt_obj_setpos(obj, start, ctx);
+ intern_cpt_object(CPT_OBJ_UBC, obj, ctx);
+
+ err = restore_one_bc(v, obj, ctx);
+
+ cpt_release_buf(ctx);
+ if (err)
+ return err;
+
+ start += v->cpt_next;
+ }
+ return 0;
+}
+
+void rst_finish_ubc(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+
+ for_each_object(obj, CPT_OBJ_UBC)
+ put_beancounter(obj->o_obj);
+}
diff --git a/kernel/cpt/rst_undump.c b/kernel/cpt/rst_undump.c
new file mode 100644
index 0000000..68cc6c2
--- /dev/null
+++ b/kernel/cpt/rst_undump.c
@@ -0,0 +1,1077 @@
+/*
+ *
+ * kernel/cpt/rst_undump.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fs_struct.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/mnt_namespace.h>
+#include <linux/posix-timers.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/smp_lock.h>
+#include <linux/ve_proto.h>
+#include <linux/virtinfo.h>
+#include <linux/virtinfoscp.h>
+#include <linux/compat.h>
+#include <linux/vzcalluser.h>
+#include <linux/securebits.h>
+#include <bc/beancounter.h>
+#ifdef CONFIG_X86
+#include <asm/desc.h>
+#endif
+#include <asm/unistd.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
+#include <linux/utsname.h>
+#include <linux/futex.h>
+#include <linux/shm.h>
+
+#include "cpt_obj.h"
+#include "cpt_context.h"
+#include "cpt_files.h"
+#include "cpt_mm.h"
+#include "cpt_process.h"
+#include "cpt_socket.h"
+#include "cpt_net.h"
+#include "cpt_ubc.h"
+#include "cpt_kernel.h"
+
+static int rst_utsname(cpt_context_t *ctx);
+
+
+struct thr_context {
+ struct completion init_complete;
+ struct completion task_done;
+ int error;
+ struct cpt_context *ctx;
+ cpt_object_t *tobj;
+};
+
+static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx);
+
+static int vps_rst_veinfo(struct cpt_context *ctx)
+{
+ int err;
+ struct cpt_veinfo_image *i;
+ struct ve_struct *ve;
+ struct timespec delta;
+ loff_t start, end;
+ struct ipc_namespace *ns;
+
+ err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end);
+ if (err)
+ goto out;
+
+ i = cpt_get_buf(ctx);
+ memset(i, 0, sizeof(*i));
+ err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx);
+ if (err)
+ goto out_rel;
+
+ ve = get_exec_env();
+ ns = ve->ve_ns->ipc_ns;
+
+ /* Damn. Fatal mistake, these two values are size_t! */
+ ns->shm_ctlall = i->shm_ctl_all ? : 0xFFFFFFFFU;
+ ns->shm_ctlmax = i->shm_ctl_max ? : 0xFFFFFFFFU;
+ ns->shm_ctlmni = i->shm_ctl_mni;
+
+ ns->msg_ctlmax = i->msg_ctl_max;
+ ns->msg_ctlmni = i->msg_ctl_mni;
+ ns->msg_ctlmnb = i->msg_ctl_mnb;
+
+ BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr));
+ ns->sem_ctls[0] = i->sem_ctl_arr[0];
+ ns->sem_ctls[1] = i->sem_ctl_arr[1];
+ ns->sem_ctls[2] = i->sem_ctl_arr[2];
+ ns->sem_ctls[3] = i->sem_ctl_arr[3];
+
+ cpt_timespec_import(&delta, i->start_timespec_delta);
+ _set_normalized_timespec(&ve->start_timespec,
+ ve->start_timespec.tv_sec - delta.tv_sec,
+ ve->start_timespec.tv_nsec - delta.tv_nsec);
+ ve->start_jiffies -= i->start_jiffies_delta;
+ // // FIXME: what???
+ // // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy;
+
+ ctx->last_vpid = i->last_pid;
+ if (i->rnd_va_space)
+ ve->_randomize_va_space = i->rnd_va_space - 1;
+
+ err = 0;
+out_rel:
+ cpt_release_buf(ctx);
+out:
+ return err;
+}
+
+static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ int err;
+ struct env_create_param3 param;
+
+ do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time);
+ do_gettimespec(&ctx->delta_time);
+
+ _set_normalized_timespec(&ctx->delta_time,
+ ctx->delta_time.tv_sec - ctx->start_time.tv_sec,
+ ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec);
+ ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec;
+ if (ctx->delta_nsec < 0) {
+ wprintk_ctx("Wall time is behind source by %Ld ns, "
+ "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec);
+ }
+
+ _set_normalized_timespec(&ctx->cpt_monotonic_time,
+ ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec,
+ ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec);
+
+ memset(&param, 0, sizeof(param));
+ param.iptables_mask = ctx->iptables_mask;
+ param.feature_mask = ctx->features;
+
+ /* feature_mask is set as required - pretend we know everything */
+ param.known_features = (ctx->image_version < CPT_VERSION_18) ?
+ VE_FEATURES_OLD : ~(__u64)0;
+
+ err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK|VE_EXCLUSIVE, 2,
+ &param, sizeof(param));
+ if (err < 0)
+ eprintk_ctx("real_env_create: %d\n", err);
+
+ get_exec_env()->jiffies_fixup =
+ (ctx->delta_time.tv_sec < 0 ?
+ 0 : timespec_to_jiffies(&ctx->delta_time)) -
+ (unsigned long)(get_jiffies_64() - ctx->virt_jiffies64);
+ dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup,
+ (long long)ctx->delta_nsec);
+ return err < 0 ? err : 0;
+}
+
+
+static int rst_creds(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ struct cred *cred;
+ struct user_struct *user;
+ struct group_info *gids;
+ int i;
+
+ cred = prepare_creds();
+ if (cred == NULL)
+ goto err_cred;
+
+ user = alloc_uid(get_exec_env()->user_ns, ti->cpt_user);
+ if (user == NULL)
+ goto err_uid;
+
+ gids = groups_alloc(ti->cpt_ngids);
+ if (gids == NULL)
+ goto err_gids;
+
+ free_uid(cred->user);
+ cred->user = user;
+
+ for (i=0; i<32; i++)
+ gids->small_block[i] = ti->cpt_gids[i];
+
+ put_group_info(cred->group_info);
+ cred->group_info = gids;
+
+ cred->uid = ti->cpt_uid;
+ cred->euid = ti->cpt_euid;
+ cred->suid = ti->cpt_suid;
+ cred->fsuid = ti->cpt_fsuid;
+ cred->gid = ti->cpt_gid;
+ cred->egid = ti->cpt_egid;
+ cred->sgid = ti->cpt_sgid;
+ cred->fsgid = ti->cpt_fsgid;
+
+ memcpy(&cred->cap_effective, &ti->cpt_ecap,
+ sizeof(cred->cap_effective));
+ memcpy(&cred->cap_inheritable, &ti->cpt_icap,
+ sizeof(cred->cap_inheritable));
+ memcpy(&cred->cap_permitted, &ti->cpt_pcap,
+ sizeof(cred->cap_permitted));
+
+ if (ctx->image_version < CPT_VERSION_26)
+ cred->securebits = (ti->cpt_keepcap != 0) ?
+ issecure_mask(SECURE_KEEP_CAPS) : 0;
+ else
+ cred->securebits = ti->cpt_keepcap;
+
+ commit_creds(cred);
+ return 0;
+
+err_gids:
+ free_uid(user);
+err_uid:
+ abort_creds(cred);
+err_cred:
+ return -ENOMEM;
+}
+
+static int hook(void *arg)
+{
+ struct thr_context *thr_ctx = arg;
+ struct cpt_context *ctx;
+ cpt_object_t *tobj;
+ struct cpt_task_image *ti;
+ int err = 0;
+ int exiting = 0;
+
+ current->state = TASK_UNINTERRUPTIBLE;
+ complete(&thr_ctx->init_complete);
+ schedule();
+
+ ctx = thr_ctx->ctx;
+ tobj = thr_ctx->tobj;
+ ti = tobj->o_image;
+
+ current->fs->umask = 0;
+
+ if (ti->cpt_pid == 1) {
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *bc;
+#endif
+
+ err = vps_rst_reparent_root(tobj, ctx);
+
+ if (err) {
+ rst_report_error(err, ctx);
+ goto out;
+ }
+
+ memcpy(&get_exec_env()->ve_cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t));
+
+ if (ctx->statusfile) {
+ fput(ctx->statusfile);
+ ctx->statusfile = NULL;
+ }
+
+ if (ctx->lockfile) {
+ char b;
+ mm_segment_t oldfs;
+ err = -EINVAL;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ if (ctx->lockfile->f_op && ctx->lockfile->f_op->read)
+ err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos);
+ set_fs(oldfs);
+ fput(ctx->lockfile);
+ ctx->lockfile = NULL;
+ }
+
+ if (err) {
+ eprintk_ctx("CPT: lock fd is closed incorrectly: %d\n", err);
+ goto out;
+ }
+ err = vps_rst_veinfo(ctx);
+ if (err) {
+ eprintk_ctx("rst_veinfo: %d\n", err);
+ goto out;
+ }
+
+ err = rst_utsname(ctx);
+ if (err) {
+ eprintk_ctx("rst_utsname: %d\n", err);
+ goto out;
+ }
+
+ err = rst_files_std(ti, ctx);
+ if (err) {
+ eprintk_ctx("rst_root_stds: %d\n", err);
+ goto out;
+ }
+
+ err = rst_root_namespace(ctx);
+ if (err) {
+ eprintk_ctx("rst_namespace: %d\n", err);
+ goto out;
+ }
+
+ if ((err = rst_restore_net(ctx)) != 0) {
+ eprintk_ctx("rst_restore_net: %d\n", err);
+ goto out;
+ }
+
+ err = rst_sockets(ctx);
+ if (err) {
+ eprintk_ctx("rst_sockets: %d\n", err);
+ goto out;
+ }
+ err = rst_sysv_ipc(ctx);
+ if (err) {
+ eprintk_ctx("rst_sysv_ipc: %d\n", err);
+ goto out;
+ }
+#ifdef CONFIG_BEANCOUNTERS
+ bc = get_exec_ub();
+ set_one_ubparm_to_max(bc->ub_parms, UB_KMEMSIZE);
+ set_one_ubparm_to_max(bc->ub_parms, UB_NUMPROC);
+ set_one_ubparm_to_max(bc->ub_parms, UB_NUMFILE);
+ set_one_ubparm_to_max(bc->ub_parms, UB_DCACHESIZE);
+#endif
+ }
+
+ if ((err = rst_creds(ti, ctx)) != 0) {
+ eprintk_ctx("rst_creds: %d\n", err);
+ goto out;
+ }
+
+ if ((err = rst_mm_complete(ti, ctx)) != 0) {
+ eprintk_ctx("rst_mm: %d\n", err);
+ goto out;
+ }
+
+ if ((err = rst_files_complete(ti, ctx)) != 0) {
+ eprintk_ctx("rst_files: %d\n", err);
+ goto out;
+ }
+
+ if ((err = rst_fs_complete(ti, ctx)) != 0) {
+ eprintk_ctx("rst_fs: %d\n", err);
+ goto out;
+ }
+
+ if ((err = rst_semundo_complete(ti, ctx)) != 0) {
+ eprintk_ctx("rst_semundo: %d\n", err);
+ goto out;
+ }
+
+ if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) {
+ eprintk_ctx("rst_signal: %d\n", err);
+ goto out;
+ }
+
+ if (ti->cpt_personality != 0)
+ __set_personality(ti->cpt_personality);
+
+#ifdef CONFIG_X86_64
+ /* 32bit app from 32bit OS, won't have PER_LINUX32 set... :/ */
+ if (!ti->cpt_64bit)
+ __set_personality(PER_LINUX32);
+#endif
+
+ current->set_child_tid = NULL;
+ current->clear_child_tid = NULL;
+ current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV);
+ current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV);
+ current->exit_code = ti->cpt_exit_code;
+ current->pdeath_signal = ti->cpt_pdeath_signal;
+
+ if (ti->cpt_restart.fn != CPT_RBL_0) {
+ if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP
+#ifdef CONFIG_COMPAT
+ || ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP
+#endif
+ ) {
+ struct restart_block *rb;
+ ktime_t e;
+
+ e.tv64 = 0;
+
+ if (ctx->image_version >= CPT_VERSION_20)
+ e = ktime_add_ns(e, ti->cpt_restart.arg2);
+ else if (ctx->image_version >= CPT_VERSION_9)
+ e = ktime_add_ns(e, ti->cpt_restart.arg0);
+ else
+ e = ktime_add_ns(e, ti->cpt_restart.arg0*TICK_NSEC);
+ if (e.tv64 < 0)
+ e.tv64 = TICK_NSEC;
+ e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+
+ rb = &task_thread_info(current)->restart_block;
+ if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP)
+ rb->fn = hrtimer_nanosleep_restart;
+#ifdef CONFIG_COMPAT
+ else
+ rb->fn = compat_nanosleep_restart;
+#endif
+ if (ctx->image_version >= CPT_VERSION_20) {
+ rb->arg0 = ti->cpt_restart.arg0;
+ rb->arg1 = ti->cpt_restart.arg1;
+ rb->arg2 = e.tv64 & 0xFFFFFFFF;
+ rb->arg3 = e.tv64 >> 32;
+ } else if (ctx->image_version >= CPT_VERSION_9) {
+ rb->arg0 = ti->cpt_restart.arg2;
+ rb->arg1 = ti->cpt_restart.arg3;
+ rb->arg2 = e.tv64 & 0xFFFFFFFF;
+ rb->arg3 = e.tv64 >> 32;
+ } else {
+ rb->arg0 = ti->cpt_restart.arg1;
+ rb->arg1 = CLOCK_MONOTONIC;
+ rb->arg2 = e.tv64 & 0xFFFFFFFF;
+ rb->arg3 = e.tv64 >> 32;
+ }
+ } else if (ti->cpt_restart.fn == CPT_RBL_POLL) {
+ struct restart_block *rb;
+ ktime_t e;
+ struct timespec ts;
+ unsigned long timeout_jiffies;
+
+ e.tv64 = 0;
+ e = ktime_add_ns(e, ti->cpt_restart.arg2);
+ e = ktime_sub(e, timespec_to_ktime(ctx->delta_time));
+ ts = ns_to_timespec(ktime_to_ns(e));
+ timeout_jiffies = timespec_to_jiffies(&ts);
+
+ rb = &task_thread_info(current)->restart_block;
+ rb->fn = do_restart_poll;
+ rb->arg0 = ti->cpt_restart.arg0;
+ rb->arg1 = ti->cpt_restart.arg1;
+ rb->arg2 = timeout_jiffies & 0xFFFFFFFF;
+ rb->arg3 = (u64)timeout_jiffies >> 32;
+ } else if (ti->cpt_restart.fn == CPT_RBL_FUTEX_WAIT) {
+ struct restart_block *rb;
+ ktime_t e;
+
+ e.tv64 = 0;
+ e = ktime_add_ns(e, ti->cpt_restart.arg2);
+ e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time));
+
+ rb = &task_thread_info(current)->restart_block;
+ rb->fn = futex_wait_restart;
+ rb->futex.uaddr = (void *)(unsigned long)ti->cpt_restart.arg0;
+ rb->futex.val = ti->cpt_restart.arg1;
+ rb->futex.time = e.tv64;
+ rb->futex.flags = ti->cpt_restart.arg3;
+ } else
+ eprintk_ctx("unknown restart block\n");
+ }
+
+ if (thread_group_leader(current)) {
+ current->signal->it_real_incr.tv64 = 0;
+ if (ctx->image_version >= CPT_VERSION_9) {
+ current->signal->it_real_incr =
+ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr);
+ } else {
+ current->signal->it_real_incr =
+ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC);
+ }
+ current->signal->it[CPUCLOCK_PROF].incr = ti->cpt_it_prof_incr;
+ current->signal->it[CPUCLOCK_VIRT].incr = ti->cpt_it_virt_incr;
+ current->signal->it[CPUCLOCK_PROF].expires = ti->cpt_it_prof_value;
+ current->signal->it[CPUCLOCK_VIRT].expires = ti->cpt_it_virt_value;
+ }
+
+ err = rst_clone_children(tobj, ctx);
+ if (err) {
+ eprintk_ctx("rst_clone_children\n");
+ goto out;
+ }
+
+ if (exiting)
+ current->signal->flags |= SIGNAL_GROUP_EXIT;
+
+ if (ti->cpt_pid == 1) {
+ if ((err = rst_process_linkage(ctx)) != 0) {
+ eprintk_ctx("rst_process_linkage: %d\n", err);
+ goto out;
+ }
+ if ((err = rst_do_filejobs(ctx)) != 0) {
+ eprintk_ctx("rst_do_filejobs: %d\n", err);
+ goto out;
+ }
+ if ((err = rst_eventpoll(ctx)) != 0) {
+ eprintk_ctx("rst_eventpoll: %d\n", err);
+ goto out;
+ }
+#ifdef CONFIG_INOTIFY_USER
+ if ((err = rst_inotify(ctx)) != 0) {
+ eprintk_ctx("rst_inotify: %d\n", err);
+ goto out;
+ }
+#endif
+ if ((err = rst_sockets_complete(ctx)) != 0) {
+ eprintk_ctx("rst_sockets_complete: %d\n", err);
+ goto out;
+ }
+ if ((err = rst_stray_files(ctx)) != 0) {
+ eprintk_ctx("rst_stray_files: %d\n", err);
+ goto out;
+ }
+ if ((err = rst_posix_locks(ctx)) != 0) {
+ eprintk_ctx("rst_posix_locks: %d\n", err);
+ goto out;
+ }
+ if ((err = rst_tty_jobcontrol(ctx)) != 0) {
+ eprintk_ctx("rst_tty_jobcontrol: %d\n", err);
+ goto out;
+ }
+ if ((err = rst_restore_fs(ctx)) != 0) {
+ eprintk_ctx("rst_restore_fs: %d\n", err);
+ goto out;
+ }
+ if (virtinfo_notifier_call(VITYPE_SCP,
+ VIRTINFO_SCP_RESTORE, ctx) & NOTIFY_FAIL) {
+ err = -ECHRNG;
+ eprintk_ctx("scp_restore failed\n");
+ goto out;
+ }
+ if (ctx->last_vpid)
+ get_exec_env()->ve_ns->pid_ns->last_pid =
+ ctx->last_vpid;
+ }
+
+out:
+ thr_ctx->error = err;
+ complete(&thr_ctx->task_done);
+
+ if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
+ current->flags |= PF_EXIT_RESTART;
+ do_exit(ti->cpt_exit_code);
+ } else {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+
+ schedule();
+
+ dprintk_ctx("leaked through %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm);
+
+ module_put(THIS_MODULE);
+ complete_and_exit(NULL, 0);
+ return 0;
+}
+
+#if 0
+static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx)
+{
+ struct task_beancounter *tbc;
+
+ tbc = task_bc(current);
+
+ put_beancounter(tbc->fork_sub);
+ tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx);
+ if (ti->cpt_mm_ub != CPT_NULL) {
+ put_beancounter(tbc->exec_ub);
+ tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx);
+ }
+}
+#endif
+
+static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx,
+ struct thr_context *thr_ctx)
+{
+ struct task_struct *tsk;
+ int pid;
+
+ thr_ctx->ctx = ctx;
+ thr_ctx->error = 0;
+ init_completion(&thr_ctx->init_complete);
+ init_completion(&thr_ctx->task_done);
+#if 0
+ set_task_ubs(obj->o_image, ctx);
+#endif
+
+ pid = local_kernel_thread(hook, thr_ctx, 0, 0);
+ if (pid < 0)
+ return pid;
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (tsk == NULL)
+ return -ESRCH;
+ cpt_obj_setobj(obj, tsk, ctx);
+ thr_ctx->tobj = obj;
+ return 0;
+}
+
+static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ struct task_struct *tsk = obj->o_obj;
+ struct cpt_task_image *ti = obj->o_image;
+
+ memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm));
+ rst_mm_basic(obj, ti, ctx);
+ return 0;
+}
+
+static int make_baby(cpt_object_t *cobj,
+ struct cpt_task_image *pi,
+ struct cpt_context *ctx)
+{
+ unsigned long flags;
+ struct cpt_task_image *ci = cobj->o_image;
+ struct thr_context thr_ctx;
+ struct task_struct *tsk;
+ pid_t pid;
+ struct fs_struct *tfs = NULL;
+
+ flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx)
+ | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx);
+ if (ci->cpt_rppid != pi->cpt_pid) {
+ flags |= CLONE_THREAD|CLONE_PARENT;
+ if (ci->cpt_signal != pi->cpt_signal ||
+ !(flags&CLONE_SIGHAND) ||
+ (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) {
+ eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n",
+ (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid,
+ (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags
+ );
+ return -EINVAL;
+ }
+ }
+
+ thr_ctx.ctx = ctx;
+ thr_ctx.error = 0;
+ init_completion(&thr_ctx.init_complete);
+ init_completion(&thr_ctx.task_done);
+ thr_ctx.tobj = cobj;
+
+#if 0
+ set_task_ubs(ci, ctx);
+#endif
+
+ if (current->fs == NULL) {
+ tfs = get_exec_env()->ve_ns->pid_ns->child_reaper->fs;
+ if (tfs == NULL)
+ return -EINVAL;
+ write_lock(&tfs->lock);
+ tfs->users++;
+ write_unlock(&tfs->lock);
+ current->fs = tfs;
+ }
+ pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid);
+ if (tfs) {
+ current->fs = NULL;
+ write_lock(&tfs->lock);
+ tfs->users--;
+ WARN_ON(tfs->users == 0);
+ write_unlock(&tfs->lock);
+ }
+ if (pid < 0)
+ return pid;
+
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ if (tsk == NULL)
+ return -ESRCH;
+ cpt_obj_setobj(cobj, tsk, ctx);
+ thr_ctx.tobj = cobj;
+ wait_for_completion(&thr_ctx.init_complete);
+ wait_task_inactive(cobj->o_obj, 0);
+ rst_basic_init_task(cobj, ctx);
+
+ /* clone() increases group_stop_count if it was not zero and
+ * CLONE_THREAD was requested. Undo.
+ */
+ if (current->signal->group_stop_count && (flags & CLONE_THREAD)) {
+ BUG_ON(tsk->signal != current->signal);
+ current->signal->group_stop_count--;
+ }
+
+ wake_up_process(tsk);
+ wait_for_completion(&thr_ctx.task_done);
+ wait_task_inactive(tsk, 0);
+
+ return thr_ctx.error;
+}
+
+static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx)
+{
+ int err = 0;
+ struct cpt_task_image *ti = obj->o_image;
+ cpt_object_t *cobj;
+
+ for_each_object(cobj, CPT_OBJ_TASK) {
+ struct cpt_task_image *ci = cobj->o_image;
+ if (cobj == obj)
+ continue;
+ if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) ||
+ (ci->cpt_leader == ti->cpt_pid &&
+ ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) {
+ err = make_baby(cobj, ti, ctx);
+ if (err) {
+ eprintk_ctx("make_baby: %d\n", err);
+ return err;
+ }
+ }
+ }
+ return 0;
+}
+
+static int read_task_images(struct cpt_context *ctx)
+{
+ int err;
+ loff_t start, end;
+
+ err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end);
+ if (err)
+ return err;
+
+ while (start < end) {
+ cpt_object_t *obj;
+ struct cpt_task_image *ti = cpt_get_buf(ctx);
+
+ err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx);
+ if (err) {
+ cpt_release_buf(ctx);
+ return err;
+ }
+#if 0
+ if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) {
+ eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid);
+ cpt_release_buf(ctx);
+ return -EINVAL;
+ }
+#endif
+ obj = alloc_cpt_object(GFP_KERNEL, ctx);
+ cpt_obj_setpos(obj, start, ctx);
+ intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
+ obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL);
+ if (obj->o_image == NULL) {
+ cpt_release_buf(ctx);
+ return -ENOMEM;
+ }
+ memcpy(obj->o_image, ti, sizeof(*ti));
+ err = ctx->pread(obj->o_image + sizeof(*ti),
+ ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti));
+ cpt_release_buf(ctx);
+ if (err)
+ return err;
+ start += ti->cpt_next;
+ }
+ return 0;
+}
+
+
+static int vps_rst_restore_tree(struct cpt_context *ctx)
+{
+ int err;
+ cpt_object_t *obj;
+ struct thr_context thr_ctx_root;
+
+ err = read_task_images(ctx);
+ if (err)
+ return err;
+
+ err = rst_undump_ubc(ctx);
+ if (err)
+ return err;
+
+ if (virtinfo_notifier_call(VITYPE_SCP,
+ VIRTINFO_SCP_RSTCHECK, ctx) & NOTIFY_FAIL)
+ return -ECHRNG;
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ err = rst_setup_pagein(ctx);
+ if (err)
+ return err;
+#endif
+ for_each_object(obj, CPT_OBJ_TASK) {
+ err = create_root_task(obj, ctx, &thr_ctx_root);
+ if (err)
+ return err;
+
+ wait_for_completion(&thr_ctx_root.init_complete);
+ wait_task_inactive(obj->o_obj, 0);
+ rst_basic_init_task(obj, ctx);
+
+ wake_up_process(obj->o_obj);
+ wait_for_completion(&thr_ctx_root.task_done);
+ wait_task_inactive(obj->o_obj, 0);
+ err = thr_ctx_root.error;
+ if (err)
+ return err;
+ break;
+ }
+
+ return err;
+}
+
+#ifndef CONFIG_IA64
+int rst_read_vdso(struct cpt_context *ctx)
+{
+ int err;
+ loff_t start, end;
+ struct cpt_page_block *pgb;
+
+ ctx->vdso = NULL;
+ err = rst_get_section(CPT_SECT_VSYSCALL, ctx, &start, &end);
+ if (err)
+ return err;
+ if (start == CPT_NULL)
+ return 0;
+ if (end < start + sizeof(*pgb) + PAGE_SIZE)
+ return -EINVAL;
+
+ pgb = cpt_get_buf(ctx);
+ err = rst_get_object(CPT_OBJ_VSYSCALL, start, pgb, ctx);
+ if (err) {
+ goto err_buf;
+ }
+ ctx->vdso = (char*)__get_free_page(GFP_KERNEL);
+ if (ctx->vdso == NULL) {
+ err = -ENOMEM;
+ goto err_buf;
+ }
+ err = ctx->pread(ctx->vdso, PAGE_SIZE, ctx, start + sizeof(*pgb));
+ if (err)
+ goto err_page;
+ if (!memcmp(ctx->vdso, vsyscall_addr, PAGE_SIZE)) {
+ free_page((unsigned long)ctx->vdso);
+ ctx->vdso = NULL;
+ }
+
+ cpt_release_buf(ctx);
+ return 0;
+err_page:
+ free_page((unsigned long)ctx->vdso);
+ ctx->vdso = NULL;
+err_buf:
+ cpt_release_buf(ctx);
+ return err;
+}
+#endif
+
+int vps_rst_undump(struct cpt_context *ctx)
+{
+ int err;
+ unsigned long umask;
+
+ err = rst_open_dumpfile(ctx);
+ if (err)
+ return err;
+
+ if (ctx->tasks64) {
+#if defined(CONFIG_IA64)
+ if (ctx->image_arch != CPT_OS_ARCH_IA64)
+#elif defined(CONFIG_X86_64)
+ if (ctx->image_arch != CPT_OS_ARCH_EMT64)
+#else
+ if (1)
+#endif
+ {
+ eprintk_ctx("Cannot restore 64 bit container on this architecture\n");
+ return -EINVAL;
+ }
+ }
+
+ umask = current->fs->umask;
+ current->fs->umask = 0;
+
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ err = rst_setup_pagein(ctx);
+#endif
+#ifndef CONFIG_IA64
+ if (err == 0)
+ err = rst_read_vdso(ctx);
+#endif
+ if (err == 0)
+ err = vps_rst_restore_tree(ctx);
+
+ if (err == 0)
+ err = rst_restore_process(ctx);
+
+ if (err)
+ virtinfo_notifier_call(VITYPE_SCP,
+ VIRTINFO_SCP_RSTFAIL, ctx);
+
+ current->fs->umask = umask;
+
+ return err;
+}
+
+static int rst_unlock_ve(struct cpt_context *ctx)
+{
+ struct ve_struct *env;
+
+ env = get_ve_by_id(ctx->ve_id);
+ if (!env)
+ return -ESRCH;
+ down_write(&env->op_sem);
+ env->is_locked = 0;
+ up_write(&env->op_sem);
+ put_ve(env);
+ return 0;
+}
+
+int recalc_sigpending_tsk(struct task_struct *t);
+
+int rst_resume(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+ int err = 0;
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *bc;
+#endif
+
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+
+ fput(file);
+ }
+
+#ifdef CONFIG_BEANCOUNTERS
+ bc = get_beancounter_byuid(ctx->ve_id, 0);
+ BUG_ON(!bc);
+ copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_KMEMSIZE);
+ copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMPROC);
+ copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMFILE);
+ copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_DCACHESIZE);
+ put_beancounter(bc);
+#endif
+
+ rst_resume_network(ctx);
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+ struct cpt_task_image *ti = obj->o_image;
+
+ if (!tsk)
+ continue;
+
+ if (ti->cpt_state == TASK_UNINTERRUPTIBLE) {
+ dprintk_ctx("task %d/%d(%s) is started\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
+
+ /* Weird... If a signal is sent to a stopped task,
+ * nobody calls recalc_sigpending(). We have to do
+ * this by hand after wake_up_process().
+ * If we did this earlier, a signal could arrive before
+ * wake_up_process() and the task would stall.
+ */
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!signal_pending(tsk))
+ recalc_sigpending_tsk(tsk);
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ wake_up_process(tsk);
+ } else {
+ if (ti->cpt_state == TASK_STOPPED ||
+ ti->cpt_state == TASK_TRACED) {
+ set_task_state(tsk, ti->cpt_state);
+ }
+ }
+ put_task_struct(tsk);
+ }
+
+ rst_unlock_ve(ctx);
+
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ rst_complete_pagein(ctx, 0);
+#endif
+
+ rst_finish_ubc(ctx);
+ cpt_object_destroy(ctx);
+
+ return err;
+}
+
+int rst_kill(struct cpt_context *ctx)
+{
+ cpt_object_t *obj;
+ int err = 0;
+
+ for_each_object(obj, CPT_OBJ_FILE) {
+ struct file *file = obj->o_obj;
+
+ fput(file);
+ }
+
+ for_each_object(obj, CPT_OBJ_TASK) {
+ struct task_struct *tsk = obj->o_obj;
+
+ if (tsk == NULL)
+ continue;
+
+ if (tsk->exit_state == 0) {
+ send_sig(SIGKILL, tsk, 1);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ sigfillset(&tsk->blocked);
+ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
+ set_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ clear_tsk_thread_flag(tsk, TIF_FREEZE);
+ if (tsk->flags & PF_FROZEN)
+ tsk->flags &= ~PF_FROZEN;
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ wake_up_process(tsk);
+ }
+
+ put_task_struct(tsk);
+ }
+
+#ifdef CONFIG_VZ_CHECKPOINT_LAZY
+ rst_complete_pagein(ctx, 1);
+#endif
+
+ rst_finish_ubc(ctx);
+ cpt_object_destroy(ctx);
+
+ return err;
+}
+
+static int rst_utsname(cpt_context_t *ctx)
+{
+ int err;
+ loff_t sec = ctx->sections[CPT_SECT_UTSNAME];
+ loff_t endsec;
+ struct cpt_section_hdr h;
+ struct cpt_object_hdr o;
+ struct ve_struct *ve;
+ struct uts_namespace *ns;
+ int i;
+
+ if (sec == CPT_NULL)
+ return 0;
+
+ err = ctx->pread(&h, sizeof(h), ctx, sec);
+ if (err)
+ return err;
+ if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h))
+ return -EINVAL;
+
+ ve = get_exec_env();
+ ns = ve->ve_ns->uts_ns;
+
+ i = 0;
+ endsec = sec + h.cpt_next;
+ sec += h.cpt_hdrlen;
+ while (sec < endsec) {
+ int len;
+ char *ptr;
+ err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx);
+ if (err)
+ return err;
+ len = o.cpt_next - o.cpt_hdrlen;
+ if (len > __NEW_UTS_LEN + 1)
+ return -ENAMETOOLONG;
+ switch (i) {
+ case 0:
+ ptr = ns->name.nodename; break;
+ case 1:
+ ptr = ns->name.domainname; break;
+ default:
+ return -EINVAL;
+ }
+ err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen);
+ if (err)
+ return err;
+ i++;
+ sec += o.cpt_next;
+ }
+
+ return 0;
+}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 291ac58..63381db 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -150,7 +150,7 @@ static inline void check_for_tasks(int cpu)
struct task_struct *p;
write_lock_irq(&tasklist_lock);
- for_each_process(p) {
+ for_each_process_all(p) {
if (task_cpu(p) == cpu &&
(!cputime_eq(p->utime, cputime_zero) ||
!cputime_eq(p->stime, cputime_zero)))
diff --git a/kernel/exit.c b/kernel/exit.c
index f7864ac..38b3e22 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -22,6 +22,9 @@
#include <linux/fdtable.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
+#include <linux/virtinfo.h>
+#include <linux/ve.h>
+#include <linux/fairsched.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
@@ -50,13 +53,16 @@
#include <linux/perf_event.h>
#include <trace/events/sched.h>
+#include <bc/misc.h>
+#include <bc/oom_kill.h>
+
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include "cred-internals.h"
-static void exit_mm(struct task_struct * tsk);
+void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p)
{
@@ -67,6 +73,9 @@ static void __unhash_process(struct task_struct *p)
detach_pid(p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
+#ifdef CONFIG_VE
+ list_del_rcu(&p->ve_task_info.vetask_list);
+#endif
__get_cpu_var(process_counts)--;
}
list_del_rcu(&p->thread_group);
@@ -177,6 +186,8 @@ repeat:
write_lock_irq(&tasklist_lock);
tracehook_finish_release_task(p);
__exit_signal(p);
+ nr_zombie--;
+ atomic_inc(&nr_dead);
/*
* If we are the last non-leader member of the thread
@@ -205,9 +216,12 @@ repeat:
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
+ put_task_fairsched_node(p);
write_unlock_irq(&tasklist_lock);
release_thread(p);
+ ub_task_uncharge(p);
+ pput_ve(p->ve_task_info.owner_env);
call_rcu(&p->rcu, delayed_put_task_struct);
p = leader;
@@ -422,6 +436,8 @@ void daemonize(const char *name, ...)
va_list args;
sigset_t blocked;
+ (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL);
+
va_start(args, name);
vsnprintf(current->comm, sizeof(current->comm), name, args);
va_end(args);
@@ -526,6 +542,7 @@ void put_files_struct(struct files_struct *files)
free_fdtable(fdt);
}
}
+EXPORT_SYMBOL_GPL(put_files_struct);
void reset_files_struct(struct files_struct *files)
{
@@ -598,10 +615,10 @@ retry:
* Search through everything else. We should not get
* here often
*/
- do_each_thread(g, c) {
+ do_each_thread_all(g, c) {
if (c->mm == mm)
goto assign_new_owner;
- } while_each_thread(g, c);
+ } while_each_thread_all(g, c);
read_unlock(&tasklist_lock);
/*
@@ -640,7 +657,7 @@ assign_new_owner:
* Turn us into a lazy TLB process if we
* aren't already..
*/
-static void exit_mm(struct task_struct * tsk)
+void exit_mm(struct task_struct * tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
@@ -648,6 +665,10 @@ static void exit_mm(struct task_struct * tsk)
mm_release(tsk, mm);
if (!mm)
return;
+
+ if (test_tsk_thread_flag(tsk, TIF_MEMDIE))
+ mm->oom_killed = 1;
+
/*
* Serialize with any possible pending coredump.
* We must hold mmap_sem around checking core_state
@@ -692,6 +713,7 @@ static void exit_mm(struct task_struct * tsk)
mm_update_next_owner(mm);
mmput(mm);
}
+EXPORT_SYMBOL_GPL(exit_mm);
/*
* When we die, we re-parent all our children.
@@ -706,7 +728,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
struct task_struct *thread;
thread = father;
- while_each_thread(father, thread) {
+ while_each_thread_ve(father, thread) {
if (thread->flags & PF_EXITING)
continue;
if (unlikely(pid_ns->child_reaper == father))
@@ -839,11 +861,16 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->self_exec_id != tsk->parent_exec_id))
tsk->exit_signal = SIGCHLD;
+ if (tsk->exit_signal != -1 && tsk == init_pid_ns.child_reaper)
+ /* We don't want people slaying init. */
+ tsk->exit_signal = SIGCHLD;
+
signal = tracehook_notify_death(tsk, &cookie, group_dead);
if (signal >= 0)
signal = do_notify_parent(tsk, signal);
tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+ nr_zombie++;
/* mt-exec, de_thread() is waiting for us */
if (thread_group_leader(tsk) &&
@@ -900,6 +927,7 @@ NORET_TYPE void do_exit(long code)
panic("Attempted to kill the idle task!");
tracehook_report_exit(&code);
+ (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL);
validate_creds_for_do_exit(tsk);
@@ -983,7 +1011,15 @@ NORET_TYPE void do_exit(long code)
*/
perf_event_exit_task(tsk);
- exit_notify(tsk, group_dead);
+ if (!(tsk->flags & PF_EXIT_RESTART))
+ exit_notify(tsk, group_dead);
+ else {
+ write_lock_irq(&tasklist_lock);
+ tsk->exit_state = EXIT_ZOMBIE;
+ nr_zombie++;
+ write_unlock_irq(&tasklist_lock);
+ exit_task_namespaces(tsk);
+ }
#ifdef CONFIG_NUMA
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
@@ -1626,7 +1662,7 @@ repeat:
if (wo->wo_flags & __WNOTHREAD)
break;
- } while_each_thread(current, tsk);
+ } while_each_thread_ve(current, tsk);
read_unlock(&tasklist_lock);
notask:
@@ -1753,6 +1789,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
+EXPORT_SYMBOL_GPL(sys_wait4);
#ifdef __ARCH_WANT_SYS_WAITPID
diff --git a/kernel/fairsched.c b/kernel/fairsched.c
new file mode 100644
index 0000000..7cbd309
--- /dev/null
+++ b/kernel/fairsched.c
@@ -0,0 +1,683 @@
+/*
+ * Fair Scheduler
+ *
+ * Copyright (C) 2000-2008 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fairsched.h>
+#include <linux/err.h>
+#include <linux/module.h>
+
+struct fairsched_node fairsched_init_node = {
+ .id = FAIRSCHED_INIT_NODE_ID,
+ .tg = &init_task_group,
+#ifdef CONFIG_VE
+ .owner_env = get_ve0(),
+#endif
+ .weight = 1,
+};
+
+static DEFINE_MUTEX(fairsched_mutex);
+
+/* list protected with fairsched_mutex */
+static LIST_HEAD(fairsched_node_head);
+static int fairsched_nr_nodes;
+
+void __init fairsched_init_early(void)
+{
+ list_add(&fairsched_init_node.nodelist, &fairsched_node_head);
+ fairsched_nr_nodes++;
+}
+
+#define FSCHWEIGHT_BASE 512000
+
+/******************************************************************************
+ * cfs group shares = FSCHWEIGHT_BASE / fairsched weight
+ *
+ * vzctl cpuunits default 1000
+ * cfs shares default value is 1024 (see init_task_group_load in sched.c)
+ * cpuunits = 1000 --> weight = 500000 / cpuunits = 500 --> shares = 1024
+ * ^--- from vzctl
+ * weight in 1..65535 --> shares in 7..512000
+ * shares should be >1 (see comment in sched_group_set_shares function)
+ *****************************************************************************/
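+/*
+ * Worked example (illustrative only, derived from the formula above):
+ * vzctl cpuunits = 2000
+ * --> weight = 500000 / 2000 = 250
+ * --> shares = FSCHWEIGHT_BASE / 250 = 512000 / 250 = 2048,
+ * i.e. twice the 1024 shares given by the default 1000 cpuunits.
+ */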
+
+static struct fairsched_node *fairsched_find(unsigned int id)
+{
+ struct fairsched_node *p;
+ list_for_each_entry(p, &fairsched_node_head, nodelist) {
+ if (p->id == id)
+ return p;
+ }
+ return NULL;
+}
+
+/******************************************************************************
+ * System calls
+ *
+ * All do_xxx functions are called under fairsched mutex and after
+ * capability check.
+ *
+ * The binary interfaces follow some other Fair Scheduler implementations
+ * (although some system call arguments are not needed for our implementation).
+ *****************************************************************************/
+
+static int do_fairsched_mknod(unsigned int parent, unsigned int weight,
+ unsigned int newid)
+{
+ struct fairsched_node *node;
+ int retval;
+
+ retval = -EINVAL;
+ if (weight < 1 || weight > FSCHWEIGHT_MAX)
+ goto out;
+ if (newid < 0 || newid > INT_MAX)
+ goto out;
+
+ retval = -EBUSY;
+ if (fairsched_find(newid) != NULL)
+ goto out;
+
+ retval = -ENOMEM;
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (node == NULL)
+ goto out;
+
+ node->tg = sched_create_group(&init_task_group);
+ if (IS_ERR(node->tg))
+ goto out_free;
+
+ node->id = newid;
+ node->weight = weight;
+ sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight);
+#ifdef CONFIG_VE
+ node->owner_env = get_exec_env();
+#endif
+ list_add(&node->nodelist, &fairsched_node_head);
+ fairsched_nr_nodes++;
+
+ retval = newid;
+out:
+ return retval;
+
+out_free:
+ kfree(node);
+ return retval;
+}
+
+asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight,
+ unsigned int newid)
+{
+ int retval;
+
+ if (!capable_setveid())
+ return -EPERM;
+
+ mutex_lock(&fairsched_mutex);
+ retval = do_fairsched_mknod(parent, weight, newid);
+ mutex_unlock(&fairsched_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL(sys_fairsched_mknod);
+
+static int do_fairsched_rmnod(unsigned int id)
+{
+ struct fairsched_node *node;
+ int retval;
+
+ retval = -EINVAL;
+ node = fairsched_find(id);
+ if (node == NULL)
+ goto out;
+ if (node == &fairsched_init_node)
+ goto out;
+
+ retval = -EBUSY;
+ if (node->refcnt)
+ goto out;
+
+ list_del(&node->nodelist);
+ fairsched_nr_nodes--;
+
+ sched_destroy_group(node->tg);
+ kfree(node);
+ retval = 0;
+out:
+ return retval;
+}
+
+asmlinkage int sys_fairsched_rmnod(unsigned int id)
+{
+ int retval;
+
+ if (!capable_setveid())
+ return -EPERM;
+
+ mutex_lock(&fairsched_mutex);
+ retval = do_fairsched_rmnod(id);
+ mutex_unlock(&fairsched_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL(sys_fairsched_rmnod);
+
+static int do_fairsched_chwt(unsigned int id, unsigned weight)
+{
+ struct fairsched_node *node;
+
+ if (id == 0)
+ return -EINVAL;
+ if (weight < 1 || weight > FSCHWEIGHT_MAX)
+ return -EINVAL;
+
+ node = fairsched_find(id);
+ if (node == NULL)
+ return -ENOENT;
+
+ node->weight = weight;
+ sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight);
+
+ return 0;
+}
+
+asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight)
+{
+ int retval;
+
+ if (!capable_setveid())
+ return -EPERM;
+
+ mutex_lock(&fairsched_mutex);
+ retval = do_fairsched_chwt(id, weight);
+ mutex_unlock(&fairsched_mutex);
+
+ return retval;
+}
+
+static int do_fairsched_vcpus(unsigned int id, unsigned int vcpus)
+{
+ struct fairsched_node *node;
+
+ if (id == 0)
+ return -EINVAL;
+
+ node = fairsched_find(id);
+ if (node == NULL)
+ return -ENOENT;
+
+ return 0;
+}
+
+asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus)
+{
+ int retval;
+
+ if (!capable_setveid())
+ return -EPERM;
+
+ mutex_lock(&fairsched_mutex);
+ retval = do_fairsched_vcpus(id, vcpus);
+ mutex_unlock(&fairsched_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL(sys_fairsched_vcpus);
+
+static int do_fairsched_rate(unsigned int id, int op, unsigned rate)
+{
+ struct fairsched_node *node;
+ int retval;
+
+ if (id == 0)
+ return -EINVAL;
+ if (op == FAIRSCHED_SET_RATE && (rate < 1 || rate >= (1UL << 31)))
+ return -EINVAL;
+
+ node = fairsched_find(id);
+ if (node == NULL)
+ return -ENOENT;
+
+ retval = -EINVAL;
+ switch (op) {
+ case FAIRSCHED_SET_RATE:
+ node->rate = rate;
+ node->rate_limited = 1;
+ retval = rate;
+ break;
+ case FAIRSCHED_DROP_RATE:
+ node->rate = 0;
+ node->rate_limited = 0;
+ retval = 0;
+ break;
+ case FAIRSCHED_GET_RATE:
+ if (node->rate_limited)
+ retval = node->rate;
+ else
+ retval = -ENODATA;
+ break;
+ }
+ return retval;
+}
+
+asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate)
+{
+ int retval;
+
+ if (!capable_setveid())
+ return -EPERM;
+
+ mutex_lock(&fairsched_mutex);
+ retval = do_fairsched_rate(id, op, rate);
+ mutex_unlock(&fairsched_mutex);
+
+ return retval;
+}
+
+static int do_fairsched_mvpr(pid_t pid, unsigned int nodeid)
+{
+ struct task_struct *p;
+ struct fairsched_node *node;
+ int retval;
+
+ retval = -ENOENT;
+ node = fairsched_find(nodeid);
+ if (node == NULL)
+ goto out;
+
+ write_lock_irq(&tasklist_lock);
+ retval = -ESRCH;
+ p = find_task_by_vpid(pid);
+ if (p == NULL)
+ goto out_unlock;
+
+ get_task_struct(p);
+ put_task_fairsched_node(p);
+ p->fsched_node = node;
+ get_task_fairsched_node(p);
+ write_unlock_irq(&tasklist_lock);
+
+ smp_wmb();
+ sched_move_task(p);
+ put_task_struct(p);
+ return 0;
+
+out_unlock:
+ write_unlock_irq(&tasklist_lock);
+out:
+ return retval;
+}
+
+asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid)
+{
+ int retval;
+
+ if (!capable_setveid())
+ return -EPERM;
+
+ mutex_lock(&fairsched_mutex);
+ retval = do_fairsched_mvpr(pid, nodeid);
+ mutex_unlock(&fairsched_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL(sys_fairsched_mvpr);
+
+int fairsched_new_node(int id, unsigned int vcpus)
+{
+ int err;
+
+ mutex_lock(&fairsched_mutex);
+ /*
+ * We refuse to switch to an already existing node since nodes
+ * keep a pointer to their ve_struct...
+ */
+ err = do_fairsched_mknod(0, 1, id);
+ if (err < 0) {
+ printk(KERN_WARNING "Can't create fairsched node %d\n", id);
+ goto out;
+ }
+#if 0
+ err = do_fairsched_vcpus(id, vcpus);
+ if (err) {
+ printk(KERN_WARNING "Can't set sched vcpus on node %d\n", id);
+ goto cleanup;
+ }
+#endif
+ err = do_fairsched_mvpr(current->pid, id);
+ if (err) {
+ printk(KERN_WARNING "Can't switch to fairsched node %d\n", id);
+ goto cleanup;
+ }
+ mutex_unlock(&fairsched_mutex);
+ return 0;
+
+cleanup:
+ if (do_fairsched_rmnod(id))
+ printk(KERN_ERR "Can't clean fairsched node %d\n", id);
+out:
+ mutex_unlock(&fairsched_mutex);
+ return err;
+}
+EXPORT_SYMBOL(fairsched_new_node);
+
+void fairsched_drop_node(int id)
+{
+ mutex_lock(&fairsched_mutex);
+ if (task_fairsched_node_id(current) == id)
+ if (do_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID))
+ printk(KERN_WARNING "Can't leave sched node %d\n", id);
+ if (do_fairsched_rmnod(id))
+ printk(KERN_ERR "Can't remove fairsched node %d\n", id);
+ mutex_unlock(&fairsched_mutex);
+}
+EXPORT_SYMBOL(fairsched_drop_node);
+
+#ifdef CONFIG_PROC_FS
+
+/*********************************************************************/
+/*
+ * proc interface
+ */
+/*********************************************************************/
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
+
+struct fairsched_node_dump {
+ int id;
+ unsigned weight;
+ unsigned rate;
+ int rate_limited;
+ int nr_pcpu;
+ int nr_tasks, nr_runtasks;
+};
+
+struct fairsched_dump {
+ int len;
+ struct fairsched_node_dump nodes[0];
+};
+
+static struct fairsched_dump *fairsched_do_dump(int compat)
+{
+ int nr_nodes;
+ int len;
+ struct fairsched_dump *dump;
+ struct fairsched_node *node;
+ struct fairsched_node_dump *p;
+
+ mutex_lock(&fairsched_mutex);
+ nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1);
+ len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]);
+ dump = ub_vmalloc(len);
+ if (dump == NULL)
+ goto out;
+
+ p = dump->nodes;
+ list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) {
+ if ((char *)p - (char *)dump >= len)
+ break;
+ p->nr_tasks = 0;
+ p->nr_runtasks = 0;
+#ifdef CONFIG_VE
+ if (!ve_accessible(node->owner_env, get_exec_env()))
+ continue;
+ p->nr_tasks = atomic_read(&node->owner_env->pcounter);
+ p->nr_runtasks = nr_running_ve(node->owner_env);
+#endif
+ p->id = node->id;
+ p->weight = node->weight;
+ p->rate = node->rate;
+ p->rate_limited = node->rate_limited;
+ p->nr_pcpu = num_online_cpus();
+ p++;
+ }
+ dump->len = p - dump->nodes;
+out:
+ mutex_unlock(&fairsched_mutex);
+ return dump;
+}
+
+#define FAIRSCHED_PROC_HEADLINES 2
+
+#define FAIRSHED_DEBUG " debug"
+
+#ifdef CONFIG_VE
+/*
+ * File format is dictated by compatibility reasons.
+ */
+static int fairsched_seq_show(struct seq_file *m, void *v)
+{
+ struct fairsched_dump *dump;
+ struct fairsched_node_dump *p;
+ unsigned vid, nid, pid, r;
+
+ dump = m->private;
+ p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL);
+ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
+ if (p == dump->nodes)
+ seq_printf(m, "Version: 2.6 debug\n");
+ else if (p == dump->nodes + 1)
+ seq_printf(m,
+ " veid "
+ " id "
+ " parent "
+ "weight "
+ " rate "
+ "tasks "
+ " run "
+ "cpus"
+ " "
+ "flg "
+ "ready "
+ " start_tag "
+ " value "
+ " delay"
+ "\n");
+ } else {
+ p -= FAIRSCHED_PROC_HEADLINES;
+ vid = nid = pid = 0;
+ r = (unsigned long)v & 3;
+ if (p == dump->nodes) {
+ if (r == 2)
+ nid = p->id;
+ } else {
+ if (!r)
+ nid = p->id;
+ else if (r == 1)
+ vid = pid = p->id;
+ else
+ vid = p->id, nid = 1;
+ }
+ seq_printf(m,
+ "%10u "
+ "%10u %10u %6u %5u %5u %5u %4u"
+ " "
+ " %c%c %5u %20Lu %20Lu %20Lu"
+ "\n",
+ vid,
+ nid,
+ pid,
+ p->weight,
+ p->rate,
+ p->nr_tasks,
+ p->nr_runtasks,
+ p->nr_pcpu,
+ p->rate_limited ? 'L' : '.',
+ '.',
+ p->nr_runtasks,
+ 0ll, 0ll, 0ll);
+ }
+
+ return 0;
+}
+
+static void *fairsched_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct fairsched_dump *dump;
+ unsigned long l;
+
+ dump = m->private;
+ if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES)
+ return NULL;
+ if (*pos < FAIRSCHED_PROC_HEADLINES)
+ return dump->nodes + *pos;
+ /* guess why... */
+ l = (unsigned long)(dump->nodes +
+ ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3);
+ l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3;
+ return (void *)l;
+}
+static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ ++*pos;
+ return fairsched_seq_start(m, pos);
+}
+#endif /* CONFIG_VE */
+
+static int fairsched2_seq_show(struct seq_file *m, void *v)
+{
+ struct fairsched_dump *dump;
+ struct fairsched_node_dump *p;
+
+ dump = m->private;
+ p = v;
+ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
+ if (p == dump->nodes)
+ seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n");
+ else if (p == dump->nodes + 1)
+ seq_printf(m,
+ " id "
+ "weight "
+ " rate "
+ " run "
+ "cpus"
+#ifdef FAIRSHED_DEBUG
+ " "
+ "flg "
+ "ready "
+ " start_tag "
+ " value "
+ " delay"
+#endif
+ "\n");
+ } else {
+ p -= FAIRSCHED_PROC_HEADLINES;
+ seq_printf(m,
+ "%10u %6u %5u %5u %4u"
+#ifdef FAIRSHED_DEBUG
+ " "
+ " %c%c %5u %20Lu %20Lu %20Lu"
+#endif
+ "\n",
+ p->id,
+ p->weight,
+ p->rate,
+ p->nr_runtasks,
+ p->nr_pcpu
+#ifdef FAIRSHED_DEBUG
+ ,
+ p->rate_limited ? 'L' : '.',
+ '.',
+ p->nr_runtasks,
+ 0ll, 0ll, 0ll
+#endif
+ );
+ }
+
+ return 0;
+}
+
+static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct fairsched_dump *dump;
+
+ dump = m->private;
+ if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES)
+ return NULL;
+ return dump->nodes + *pos;
+}
+static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ ++*pos;
+ return fairsched2_seq_start(m, pos);
+}
+static void fairsched2_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+#ifdef CONFIG_VE
+static struct seq_operations fairsched_seq_op = {
+ .start = fairsched_seq_start,
+ .next = fairsched_seq_next,
+ .stop = fairsched2_seq_stop,
+ .show = fairsched_seq_show
+};
+#endif
+static struct seq_operations fairsched2_seq_op = {
+ .start = fairsched2_seq_start,
+ .next = fairsched2_seq_next,
+ .stop = fairsched2_seq_stop,
+ .show = fairsched2_seq_show
+};
+static int fairsched_seq_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ struct seq_file *m;
+ int compat;
+
+#ifdef CONFIG_VE
+ compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1);
+ ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op);
+#else
+ compat = 0;
+ ret = seq_open(file, &fairsched2_seq_op);
+#endif
+ if (ret)
+ return ret;
+ m = file->private_data;
+ m->private = fairsched_do_dump(compat);
+ if (m->private == NULL) {
+ seq_release(inode, file);
+ ret = -ENOMEM;
+ }
+ return ret;
+}
+static int fairsched_seq_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m;
+ struct fairsched_dump *dump;
+
+ m = file->private_data;
+ dump = m->private;
+ m->private = NULL;
+ vfree(dump);
+ seq_release(inode, file);
+ return 0;
+}
+static struct file_operations proc_fairsched_operations = {
+ .open = fairsched_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = fairsched_seq_release
+};
+
+void __init fairsched_init_late(void)
+{
+ proc_create("fairsched", S_IRUGO, &glob_proc_root,
+ &proc_fairsched_operations);
+ proc_create("fairsched2", S_IRUGO, &glob_proc_root,
+ &proc_fairsched_operations);
+}
+
+#else
+
+void __init fairsched_init_late(void) { }
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/fork.c b/kernel/fork.c
index 28b4874..be960e6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -64,6 +64,8 @@
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
+#include <linux/virtinfo.h>
+#include <linux/ve.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -72,6 +74,10 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <bc/vmpages.h>
+#include <bc/misc.h>
+#include <bc/oom_kill.h>
+
#include <trace/events/sched.h>
/*
@@ -79,12 +85,14 @@
*/
unsigned long total_forks; /* Handle normal Linux uptimes. */
int nr_threads; /* The idle threads do not count.. */
+EXPORT_SYMBOL_GPL(nr_threads);
int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+EXPORT_SYMBOL(tasklist_lock);
int nr_processes(void)
{
@@ -162,12 +170,18 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ ub_task_put(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
+#ifdef CONFIG_VE
+ put_ve(VE_TASK_INFO(tsk)->owner_env);
+ atomic_dec(&nr_dead);
+#endif
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+EXPORT_SYMBOL_GPL(__put_task_struct);
/*
* macro override instead of weak attribute alias, to workaround
@@ -186,7 +200,7 @@ void __init fork_init(unsigned long mempages)
/* create a slab on which task_structs can be allocated */
task_struct_cachep =
kmem_cache_create("task_struct", sizeof(struct task_struct),
- ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK | SLAB_UBC, NULL);
#endif
/* do the arch specific task caches init */
@@ -316,6 +330,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
continue;
}
charge = 0;
+ if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start,
+ mpnt->vm_flags & ~VM_LOCKED,
+ mpnt->vm_file, UB_HARD))
+ goto fail_noch;
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
if (security_vm_enough_memory(len))
@@ -373,7 +391,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
rb_parent = &tmp->vm_rb;
mm->map_count++;
- retval = copy_page_range(mm, oldmm, mpnt);
+ retval = copy_page_range(mm, oldmm, tmp, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -392,6 +410,9 @@ out:
fail_nomem_policy:
kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
+ ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start,
+ mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file);
+fail_noch:
retval = -ENOMEM;
vm_unacct_memory(charge);
goto out;
@@ -459,6 +480,15 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
+ /*
+ * This looks ugly, but when we come from
+ * sys_execve -> mm_alloc -> here
+ * we need to get exec_ub, not task_ub. But when
+ * we're here like this
+ * sys_fork() -> dup_mm -> here
+ * we need task_ub, not the exec one... xemul
+ */
+ set_mm_ub(mm, p);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -466,6 +496,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
return mm;
}
+ put_mm_ub(mm);
free_mm(mm);
return NULL;
}
@@ -484,6 +515,7 @@ struct mm_struct * mm_alloc(void)
}
return mm;
}
+EXPORT_SYMBOL_GPL(mm_alloc);
/*
* Called when the last reference to the mm
@@ -496,6 +528,7 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+ put_mm_ub(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -520,6 +553,9 @@ void mmput(struct mm_struct *mm)
put_swap_token(mm);
if (mm->binfmt)
module_put(mm->binfmt->module);
+ (void) virtinfo_gencall(VIRTINFO_EXITMMAP, mm);
+ if (mm->oom_killed)
+ ub_oom_task_dead(current);
mmdrop(mm);
}
}
@@ -570,18 +606,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
/* Get rid of any futexes when releasing the mm */
#ifdef CONFIG_FUTEX
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
+ if (!(tsk->flags & PF_EXIT_RESTART)) {
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
#endif
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+ }
#endif
/* Get rid of any cached register state */
@@ -670,6 +708,7 @@ fail_nocontext:
* because it calls destroy_context()
*/
mm_free_pgd(mm);
+ put_mm_ub(mm);
free_mm(mm);
return NULL;
}
@@ -975,6 +1014,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
+ pid_t vpid,
int trace)
{
int retval;
@@ -1022,6 +1062,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
rt_mutex_init_task(p);
+ if (ub_task_charge(current, p))
+ goto bad_fork_charge;
+
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
@@ -1145,7 +1188,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_sighand;
if ((retval = copy_mm(clone_flags, p)))
goto bad_fork_cleanup_signal;
- if ((retval = copy_namespaces(clone_flags, p)))
+ if ((retval = copy_namespaces(clone_flags, p, 0)))
goto bad_fork_cleanup_mm;
if ((retval = copy_io(clone_flags, p)))
goto bad_fork_cleanup_namespaces;
@@ -1155,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (pid != &init_struct_pid) {
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns, vpid);
if (!pid)
goto bad_fork_cleanup_io;
@@ -1163,6 +1206,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
if (retval < 0)
goto bad_fork_free_pid;
+ if (task_active_pid_ns(current)->flags & PID_NS_HIDE_CHILD)
+ task_active_pid_ns(p)->flags |= PID_NS_HIDDEN;
}
}
@@ -1262,7 +1307,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* thread can't slip out of an OOM kill (or normal SIGKILL).
*/
recalc_sigpending();
- if (signal_pending(current)) {
+ if (signal_pending(current) && !vpid) {
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
@@ -1290,14 +1335,24 @@ static struct task_struct *copy_process(unsigned long clone_flags,
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
+#ifdef CONFIG_VE
+ list_add_tail_rcu(&p->ve_task_info.vetask_list,
+ &p->ve_task_info.owner_env->vetask_lh);
+#endif
__get_cpu_var(process_counts)++;
}
attach_pid(p, PIDTYPE_PID, pid);
nr_threads++;
}
+ (void)get_ve(p->ve_task_info.owner_env);
+ pget_ve(p->ve_task_info.owner_env);
+#ifdef CONFIG_VE
+ seqcount_init(&p->ve_task_info.wakeup_lock);
+#endif
total_forks++;
spin_unlock(&current->sighand->siglock);
+ get_task_fairsched_node(p);
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
@@ -1340,6 +1395,9 @@ bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
exit_creds(p);
bad_fork_free:
+ ub_task_uncharge(p);
+ ub_task_put(p);
+bad_fork_charge:
free_task(p);
fork_out:
return ERR_PTR(retval);
@@ -1357,7 +1415,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
struct pt_regs regs;
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
- &init_struct_pid, 0);
+ &init_struct_pid, 0, 0);
if (!IS_ERR(task))
init_idle(task, cpu);
@@ -1370,12 +1428,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long do_fork(unsigned long clone_flags,
+long do_fork_pid(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
- int __user *child_tidptr)
+ int __user *child_tidptr,
+ long vpid)
{
struct task_struct *p;
int trace = 0;
@@ -1413,6 +1472,10 @@ long do_fork(unsigned long clone_flags,
}
}
+ nr = virtinfo_gencall(VIRTINFO_DOFORK, (void *)clone_flags);
+ if (nr)
+ return nr;
+
/*
* When called from kernel_thread, don't do user tracing stuff.
*/
@@ -1420,7 +1483,7 @@ long do_fork(unsigned long clone_flags,
trace = tracehook_prepare_clone(clone_flags);
p = copy_process(clone_flags, stack_start, regs, stack_size,
- child_tidptr, NULL, trace);
+ child_tidptr, NULL, vpid, trace);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -1451,6 +1514,8 @@ long do_fork(unsigned long clone_flags,
*/
p->flags &= ~PF_STARTING;
+ (void)virtinfo_gencall(VIRTINFO_DOFORKRET, p);
+
if (unlikely(clone_flags & CLONE_STOPPED)) {
/*
* We'll start up with an immediate SIGSTOP.
@@ -1474,6 +1539,8 @@ long do_fork(unsigned long clone_flags,
} else {
nr = PTR_ERR(p);
}
+
+ (void)virtinfo_gencall(VIRTINFO_DOFORKPOST, (void *)(long)nr);
return nr;
}
@@ -1489,25 +1556,38 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
+EXPORT_SYMBOL(do_fork_pid);
+
+long do_fork(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr)
+{
+ return do_fork_pid(clone_flags, stack_start, regs, stack_size,
+ parent_tidptr, child_tidptr, 0);
+}
+
void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
- SLAB_NOTRACK, sighand_ctor);
+ SLAB_NOTRACK|SLAB_UBC, sighand_ctor);
signal_cachep = kmem_cache_create("signal_cache",
sizeof(struct signal_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL);
files_cachep = kmem_cache_create("files_cache",
sizeof(struct files_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL);
fs_cachep = kmem_cache_create("fs_cache",
sizeof(struct fs_struct), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL);
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_UBC, NULL);
+ vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_UBC);
mmap_init();
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b..90bbf45 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -29,6 +29,28 @@ void refrigerator(void)
processes around? */
long save;
+#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE)
+ save = current->state;
+ current->state = TASK_UNINTERRUPTIBLE;
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (test_and_clear_thread_flag(TIF_FREEZE)) {
+ recalc_sigpending(); /* We sent fake signal, clean it up */
+ if (atomic_read(&global_suspend) ||
+ atomic_read(&get_exec_env()->suspend))
+ current->flags |= PF_FROZEN;
+ else
+ current->state = save;
+ } else {
+ /* Freeze request could be canceled before we entered
+ * refrigerator(). In this case we do nothing. */
+ current->state = save;
+ }
+ spin_unlock_irq(&current->sighand->siglock);
+
+ while (current->flags & PF_FROZEN)
+ schedule();
+#else
task_lock(current);
if (freezing(current)) {
frozen_process();
@@ -57,6 +79,7 @@ void refrigerator(void)
/* Remove the accounting blocker */
current->flags &= ~PF_FREEZING;
+#endif
pr_debug("%s left refrigerator\n", current->comm);
__set_current_state(save);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 1ad4fa6..b65727e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1601,8 +1601,6 @@ handle_fault:
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
-static long futex_wait_restart(struct restart_block *restart);
-
/**
* fixup_owner() - Post lock pi_state and corner case management
* @uaddr: user address of the futex
@@ -1876,7 +1874,7 @@ out:
}
-static long futex_wait_restart(struct restart_block *restart)
+long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
int fshared = 0;
@@ -1893,6 +1891,7 @@ static long futex_wait_restart(struct restart_block *restart)
restart->futex.bitset,
restart->futex.flags & FLAGS_CLOCKRT);
}
+EXPORT_SYMBOL_GPL(futex_wait_restart);
/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 931a4d9..b34a0b9 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1545,6 +1545,7 @@ out:
destroy_hrtimer_on_stack(&t.timer);
return ret;
}
+EXPORT_SYMBOL_GPL(hrtimer_nanosleep_restart);
long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
const enum hrtimer_mode mode, const clockid_t clockid)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d4e8417..102a8df 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -143,7 +143,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
return;
rcu_read_lock();
- do_each_thread(g, t) {
+ do_each_thread_all(g, t) {
if (!--max_count)
goto unlock;
if (!--batch_count) {
@@ -156,7 +156,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, timeout);
- } while_each_thread(g, t);
+ } while_each_thread_all(g, t);
unlock:
rcu_read_unlock();
}
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 9147a31..64cef00 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1021,7 +1021,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
}
}
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
if (i >= ks->thr_query && !finished) {
int_to_threadref(thref, p->pid);
pack_threadid(ptr, thref);
@@ -1032,7 +1032,7 @@ static void gdb_cmd_query(struct kgdb_state *ks)
finished = 1;
}
i++;
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
*(--ptr) = '\0';
break;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9fcb53a..43c8f01 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...)
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
+ /* Don't allow request_module() inside VE. */
+ if (!ve_is_super(get_exec_env()))
+ return -EPERM;
+
ret = security_kernel_module_request();
if (ret)
return ret;
@@ -469,6 +473,9 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
+ if (!ve_is_super(get_exec_env()))
+ return -EPERM;
+
BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
validate_creds(sub_info->cred);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75..064a191 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -128,14 +128,14 @@ static int __kprobes check_safety(void)
ret = freeze_processes();
if (ret == 0) {
struct task_struct *p, *q;
- do_each_thread(p, q) {
+ do_each_thread_all(p, q) {
if (p != current && p->state == TASK_RUNNING &&
p->pid != 0) {
printk("Check failed: %s is running\n",p->comm);
ret = -1;
goto loop_end;
}
- } while_each_thread(p, q);
+ } while_each_thread_all(p, q);
}
loop_end:
thaw_processes();
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 84027cf..d3151a1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -14,6 +14,7 @@
#include <linux/file.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/nsproxy.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -25,6 +26,7 @@ struct kthread_create_info
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
+ struct ve_struct *ve;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
@@ -67,6 +69,16 @@ static int kthread(void *_create)
init_completion(&self.exited);
current->vfork_done = &self.exited;
+ if (do_ve_enter_hook && create->ve != get_ve0()) {
+ ret = do_ve_enter_hook(create->ve, 0);
+ if (ret < 0) {
+ create->result = ERR_PTR(ret);
+ complete(&create->done);
+ goto out;
+ }
+ } else if (create->ve != get_ve0())
+ BUG();
+
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
@@ -76,7 +88,7 @@ static int kthread(void *_create)
ret = -EINTR;
if (!self.should_stop)
ret = threadfn(data);
-
+out:
/* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
@@ -94,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create)
}
/**
- * kthread_create - create a kthread.
+ * kthread_create_ve - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
* @namefmt: printf-style name for the thread.
@@ -112,7 +124,8 @@ static void create_kthread(struct kthread_create_info *create)
*
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
-struct task_struct *kthread_create(int (*threadfn)(void *data),
+struct task_struct *kthread_create_ve(struct ve_struct *ve,
+ int (*threadfn)(void *data),
void *data,
const char namefmt[],
...)
@@ -121,6 +134,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
create.threadfn = threadfn;
create.data = data;
+ create.ve = ve;
init_completion(&create.done);
spin_lock(&kthread_create_lock);
@@ -147,7 +161,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
}
return create.result;
}
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_ve);
/**
* kthread_stop - stop a thread created by kthread_create().
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af5672..99c3c9b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3742,7 +3742,7 @@ retry:
printk(KERN_CONT " locked it.\n");
}
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
/*
* It's not reliable to print a task's held locks
* if it's not sleeping (or if it's not the current
@@ -3755,7 +3755,7 @@ retry:
if (!unlock)
if (read_trylock(&tasklist_lock))
unlock = 1;
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
printk("\n");
printk("=============================================\n\n");
diff --git a/kernel/module.c b/kernel/module.c
index dfa33e8..48a2edc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2915,6 +2915,8 @@ static char *module_flags(struct module *mod, char *buf)
static void *m_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&module_mutex);
+ if (!ve_is_super(get_exec_env()))
+ return NULL;
return seq_list_start(&modules, *pos);
}
@@ -2979,7 +2981,7 @@ static const struct file_operations proc_modules_operations = {
static int __init proc_modules_init(void)
{
- proc_create("modules", 0, NULL, &proc_modules_operations);
+ proc_create("modules", 0, &glob_proc_root, &proc_modules_operations);
return 0;
}
module_init(proc_modules_init);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9..73524d0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+void get_task_namespaces(struct task_struct *tsk)
+{
+ struct nsproxy *ns = tsk->nsproxy;
+ if (ns) {
+ get_nsproxy(ns);
+ }
+}
+
static inline struct nsproxy *create_nsproxy(void)
{
struct nsproxy *nsproxy;
@@ -69,7 +77,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_ipc;
}
- new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
+ new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
if (IS_ERR(new_nsp->pid_ns)) {
err = PTR_ERR(new_nsp->pid_ns);
goto out_pid;
@@ -104,7 +112,8 @@ out_ns:
* called from clone. This now handles copy for nsproxy and all
* namespaces therein.
*/
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+ int force_admin)
{
struct nsproxy *old_ns = tsk->nsproxy;
struct nsproxy *new_ns;
@@ -119,9 +128,20 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
CLONE_NEWPID | CLONE_NEWNET)))
return 0;
- if (!capable(CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out;
+ if (!force_admin) {
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto out;
+ }
+
+ /*
+ * netns-vs-sysfs is deadly broken, thus new namespace
+ * (even in ve0) can bring the node down
+ */
+ if (flags & CLONE_NEWNET) {
+ err = -EINVAL;
+ goto out;
+ }
}
/*
@@ -148,6 +168,7 @@ out:
put_nsproxy(old_ns);
return err;
}
+EXPORT_SYMBOL(copy_namespaces);
void free_nsproxy(struct nsproxy *ns)
{
@@ -162,6 +183,22 @@ void free_nsproxy(struct nsproxy *ns)
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
+EXPORT_SYMBOL(free_nsproxy);
+
+struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk)
+{
+ struct mnt_namespace *mnt_ns = NULL;
+
+ task_lock(tsk);
+ if (tsk->nsproxy)
+ mnt_ns = tsk->nsproxy->mnt_ns;
+ if (mnt_ns)
+ get_mnt_ns(mnt_ns);
+ task_unlock(tsk);
+
+ return mnt_ns;
+}
+EXPORT_SYMBOL(get_task_mnt_ns);
/*
* Called from unshare. Unshare all the namespaces part of nsproxy.
@@ -179,6 +216,9 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (unshare_flags & CLONE_NEWNET)
+ return -EINVAL;
+
*new_nsp = create_new_namespaces(unshare_flags, current,
new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d..be987e7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -33,6 +33,7 @@
#include <linux/rculist.h>
#include <linux/bootmem.h>
#include <linux/hash.h>
+#include <bc/kmem.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
@@ -110,7 +111,7 @@ EXPORT_SYMBOL(is_container_init);
* For now it is easier to be safe than to prove it can't happen.
*/
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
static void free_pidmap(struct upid *upid)
{
@@ -121,8 +122,9 @@ static void free_pidmap(struct upid *upid)
clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
}
+EXPORT_SYMBOL_GPL(free_pidmap);
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+int alloc_pidmap(struct pid_namespace *pid_ns)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
@@ -182,6 +184,36 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
return -1;
}
+int set_pidmap(struct pid_namespace *pid_ns, pid_t pid)
+{
+ int offset;
+ struct pidmap *map;
+
+ offset = pid & BITS_PER_PAGE_MASK;
+ map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+ if (unlikely(!map->page)) {
+ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ /*
+ * Free the page if someone raced with us
+ * installing it:
+ */
+ spin_lock_irq(&pidmap_lock);
+ if (map->page)
+ kfree(page);
+ else
+ map->page = page;
+ spin_unlock_irq(&pidmap_lock);
+ if (unlikely(!map->page))
+ return -ENOMEM;
+ }
+
+ if (test_and_set_bit(offset, map->page))
+ return -EBUSY;
+
+ atomic_dec(&map->nr_free);
+ return pid;
+}
+
int next_pidmap(struct pid_namespace *pid_ns, int last)
{
int offset;
@@ -227,25 +259,34 @@ void free_pid(struct pid *pid)
/* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
unsigned long flags;
+ struct upid *upid;
spin_lock_irqsave(&pidmap_lock, flags);
- for (i = 0; i <= pid->level; i++)
- hlist_del_rcu(&pid->numbers[i].pid_chain);
- spin_unlock_irqrestore(&pidmap_lock, flags);
+ for (i = 0; i <= pid->level; i++) {
+ upid = &pid->numbers[i];
+ if (!hlist_unhashed(&upid->pid_chain))
+ hlist_del_rcu(&upid->pid_chain);
+ }
+ spin_unlock(&pidmap_lock);
+ ub_kmemsize_uncharge(pid->ub,
+ kmem_cache_objuse(pid->numbers[pid->level].ns->pid_cachep));
+ local_irq_restore(flags);
for (i = 0; i <= pid->level; i++)
free_pidmap(pid->numbers + i);
-
+ put_beancounter(pid->ub);
call_rcu(&pid->rcu, delayed_put_pid);
}
+EXPORT_SYMBOL_GPL(free_pid);
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid)
{
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
+ struct user_beancounter *ub;
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
@@ -253,7 +294,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ if (vpid != 0 && i == ns->level)
+ nr = set_pidmap(tmp, vpid);
+ else
+ nr = alloc_pidmap(tmp);
if (nr < 0)
goto out_free;
@@ -268,17 +312,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
+#ifdef CONFIG_BEANCOUNTERS
+ ub = get_exec_ub();
+ local_irq_disable();
+ if (ub_kmemsize_charge(ub, kmem_cache_objuse(ns->pid_cachep), UB_HARD))
+ goto out_enable;
+ pid->ub = get_beancounter(ub);
+ spin_lock(&pidmap_lock);
+#else
spin_lock_irq(&pidmap_lock);
+#endif
for (i = ns->level; i >= 0; i--) {
upid = &pid->numbers[i];
hlist_add_head_rcu(&upid->pid_chain,
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ if (upid->ns->flags & PID_NS_HIDDEN)
+ while (i--)
+ INIT_HLIST_NODE(&pid->numbers[i].pid_chain);
}
spin_unlock_irq(&pidmap_lock);
out:
return pid;
+out_enable:
+ local_irq_enable();
+ put_pid_ns(ns);
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
@@ -287,6 +346,7 @@ out_free:
pid = NULL;
goto out;
}
+EXPORT_SYMBOL_GPL(alloc_pid);
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
@@ -309,6 +369,45 @@ struct pid *find_vpid(int nr)
}
EXPORT_SYMBOL_GPL(find_vpid);
+void reattach_pid(struct task_struct *tsk, enum pid_type type,
+ struct pid *pid)
+{
+ int i;
+ struct pid *old_pid;
+ struct pid_link *link;
+ struct upid *upid;
+
+ link = &tsk->pids[type];
+ old_pid = link->pid;
+
+ hlist_del_rcu(&link->node);
+ link->pid = pid;
+ hlist_add_head_rcu(&link->node, &pid->tasks[type]);
+
+ if (type != PIDTYPE_PID) {
+ for (i = PIDTYPE_MAX; --i >= 0; )
+ if (!hlist_empty(&old_pid->tasks[i]))
+ return;
+
+ for (i = 0; i < pid->level; i++)
+ hlist_del_rcu(&old_pid->numbers[i].pid_chain);
+ } else {
+ for (i = PIDTYPE_MAX; --i >= 0; )
+ if (!hlist_empty(&old_pid->tasks[i]))
+ BUG();
+
+ for (i = 0; i < pid->level; i++)
+ hlist_replace_rcu(&old_pid->numbers[i].pid_chain,
+ &pid->numbers[i].pid_chain);
+
+ upid = &pid->numbers[pid->level];
+ hlist_add_head_rcu(&upid->pid_chain,
+ &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+ }
+
+ call_rcu(&old_pid->rcu, delayed_put_pid);
+}
+
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
@@ -321,6 +420,7 @@ void attach_pid(struct task_struct *task, enum pid_type type,
link->pid = pid;
hlist_add_head_rcu(&link->node, &pid->tasks[type]);
}
+EXPORT_SYMBOL_GPL(attach_pid);
static void __change_pid(struct task_struct *task, enum pid_type type,
struct pid *new)
@@ -341,6 +441,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
free_pid(pid);
}
+EXPORT_SYMBOL_GPL(detach_pid);
void detach_pid(struct task_struct *task, enum pid_type type)
{
@@ -387,6 +488,7 @@ struct task_struct *find_task_by_vpid(pid_t vnr)
{
return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
}
+EXPORT_SYMBOL(find_task_by_vpid);
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
@@ -422,6 +524,17 @@ struct pid *find_get_pid(pid_t nr)
}
EXPORT_SYMBOL_GPL(find_get_pid);
+pid_t pid_to_vpid(pid_t nr)
+{
+ struct pid *pid;
+
+ pid = find_pid_ns(nr, &init_pid_ns);
+ if (pid)
+ return pid->numbers[pid->level].nr;
+ return -1;
+}
+EXPORT_SYMBOL_GPL(pid_to_vpid);
+
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
struct upid *upid;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796..6d3f029 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,9 @@
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
+#include <linux/module.h>
+
+#include <bc/kmem.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -136,6 +139,156 @@ void free_pid_ns(struct kref *kref)
put_pid_ns(parent);
}
+/*
+ * this is a dirty ugly hack.
+ */
+
+static int __pid_ns_attach_task(struct pid_namespace *ns,
+ struct task_struct *tsk, pid_t nr)
+{
+ struct pid *pid;
+ enum pid_type type;
+ unsigned long old_size, new_size;
+
+ pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
+ if (!pid)
+ goto out;
+
+ if (nr == 0)
+ nr = alloc_pidmap(ns);
+ else
+ nr = set_pidmap(ns, nr);
+
+ if (nr < 0)
+ goto out_free;
+
+ memcpy(pid, task_pid(tsk),
+ sizeof(struct pid) + (ns->level - 1) * sizeof(struct upid));
+ get_pid_ns(ns);
+ pid->level++;
+ BUG_ON(pid->level != ns->level);
+ pid->numbers[pid->level].nr = nr;
+ pid->numbers[pid->level].ns = ns;
+ atomic_set(&pid->count, 1);
+ for (type = 0; type < PIDTYPE_MAX; ++type)
+ INIT_HLIST_HEAD(&pid->tasks[type]);
+
+ old_size = kmem_cache_objuse(pid->numbers[pid->level - 1].ns->pid_cachep);
+ new_size = kmem_cache_objuse(pid->numbers[pid->level].ns->pid_cachep);
+ local_irq_disable();
+ /*
+ * Depending on sizeof(struct foo), cache flags (redzoning, etc)
+ * and actual CPU (cacheline_size() jump from 64 to 128 bytes after
+ * CPU detection) new size can very well be smaller than old size.
+ */
+ if (new_size > old_size) {
+ if (ub_kmemsize_charge(pid->ub, new_size - old_size, UB_HARD) < 0)
+ goto out_enable;
+ } else
+ ub_kmemsize_uncharge(pid->ub, old_size - new_size);
+
+ write_lock(&tasklist_lock);
+
+ spin_lock(&pidmap_lock);
+ reattach_pid(tsk, PIDTYPE_SID, pid);
+ reattach_pid(tsk, PIDTYPE_PGID, pid);
+ tsk->signal->leader_pid = pid;
+ current->signal->tty_old_pgrp = NULL;
+
+ reattach_pid(tsk, PIDTYPE_PID, pid);
+ spin_unlock(&pidmap_lock);
+
+ write_unlock_irq(&tasklist_lock);
+
+ return 0;
+
+out_enable:
+ local_irq_enable();
+ put_pid_ns(ns);
+out_free:
+ kmem_cache_free(ns->pid_cachep, pid);
+out:
+ return -ENOMEM;
+}
+
+int pid_ns_attach_task(struct pid_namespace *ns, struct task_struct *tsk)
+{
+ return __pid_ns_attach_task(ns, tsk, 0);
+}
+EXPORT_SYMBOL_GPL(pid_ns_attach_task);
+
+int pid_ns_attach_init(struct pid_namespace *ns, struct task_struct *tsk)
+{
+ int err;
+
+ err = __pid_ns_attach_task(ns, tsk, 1);
+ if (err < 0)
+ return err;
+
+ ns->child_reaper = tsk;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pid_ns_attach_init);
+
+#ifdef CONFIG_VE
+static noinline void show_lost_task(struct task_struct *p)
+{
+ printk("Lost task: %d/%s/%p blocked: %lx pending: %lx\n",
+ p->pid, p->comm, p,
+ p->blocked.sig[0],
+ p->pending.signal.sig[0]);
+}
+
+static void zap_ve_processes(struct ve_struct *env)
+{
+ /*
+ * Here the VE changes its state into "not running".
+ * op_sem taken for write is a barrier to all VE manipulations from
+ * ioctl: it waits for operations currently in progress and blocks all
+ * subsequent operations until is_running is set to 0 and op_sem is
+ * released.
+ */
+ down_write(&env->op_sem);
+ env->is_running = 0;
+ up_write(&env->op_sem);
+
+	/* wait for all init children to exit */
+ while (atomic_read(&env->pcounter) > 1) {
+ struct task_struct *g, *p;
+ long delay = 1;
+
+ if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0)
+ continue;
+		/* it was ECHILD or no more children somehow */
+ if (atomic_read(&env->pcounter) == 1)
+ break;
+
+ /* clear all signals to avoid wakeups */
+ if (signal_pending(current))
+ flush_signals(current);
+ /* we have child without signal sent */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(delay);
+ delay = (delay < HZ) ? (delay << 1) : HZ;
+ read_lock(&tasklist_lock);
+ do_each_thread_ve(g, p) {
+ if (p != current) {
+ /*
+				 * by that time no processes other than the entered
+				 * ones may exist in the VE. If some were missed by
+				 * zap_pid_ns_processes() it is a BUG
+ */
+ if (!p->did_ve_enter)
+ show_lost_task(p);
+
+ force_sig_specific(SIGKILL, p);
+ }
+ } while_each_thread_ve(g, p);
+ read_unlock(&tasklist_lock);
+ }
+}
+#endif
+
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
int nr;
@@ -181,6 +334,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
} while (rc != -ECHILD);
acct_exit_ns(pid_ns);
+
+#ifdef CONFIG_VE
+ if (get_exec_env()->ve_ns->pid_ns == pid_ns)
+ zap_ve_processes(get_exec_env());
+#endif
return;
}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4954407..da76c51 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -31,6 +31,8 @@
* POSIX clocks & timers
*/
#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
@@ -46,6 +48,9 @@
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/module.h>
+#include <linux/pid_namespace.h>
+
+#include <bc/beancounter.h>
/*
* Management arrays for POSIX timers. Timers are kept in slab memory
@@ -303,8 +308,8 @@ static __init int init_posix_timers(void)
register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof (struct k_itimer), 0, SLAB_PANIC,
- NULL);
+ sizeof (struct k_itimer), 0,
+ SLAB_PANIC|SLAB_UBC, NULL);
idr_init(&posix_timers_id);
return 0;
}
@@ -363,6 +368,7 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
{
struct task_struct *task;
int shared, ret = -1;
+
/*
* FIXME: if ->sigq is queued we can race with
* dequeue_signal()->do_schedule_next_timer().
@@ -379,8 +385,17 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
rcu_read_lock();
task = pid_task(timr->it_pid, PIDTYPE_PID);
if (task) {
+ struct ve_struct *ve;
+ struct user_beancounter *ub;
+
+ ve = set_exec_env(task->ve_task_info.owner_env);
+ ub = set_exec_ub(task->task_bc.task_ub);
+
shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
ret = send_sigqueue(timr->sigq, task, shared);
+
+ (void)set_exec_ub(ub);
+ (void)set_exec_env(ve);
}
rcu_read_unlock();
/* If we failed to send the signal the timer stops. */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e7cd671..732f532 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -15,6 +15,8 @@
#include <linux/syscalls.h>
#include <linux/freezer.h>
+atomic_t global_suspend = ATOMIC_INIT(0);
+
/*
* Timeout for stopping processes
*/
@@ -24,7 +26,9 @@ static inline int freezeable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
- (p->exit_state != 0))
+ (p->exit_state != 0) ||
+ (p->state == TASK_STOPPED) ||
+ (p->state == TASK_TRACED))
return 0;
return 1;
}
@@ -44,7 +48,7 @@ static int try_to_freeze_tasks(bool sig_only)
do {
todo = 0;
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
if (frozen(p) || !freezeable(p))
continue;
@@ -60,7 +64,7 @@ static int try_to_freeze_tasks(bool sig_only)
if (!task_is_stopped_or_traced(p) &&
!freezer_should_skip(p))
todo++;
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
read_unlock(&tasklist_lock);
yield(); /* Yield is okay here */
if (time_after(jiffies, end_time))
@@ -84,13 +88,13 @@ static int try_to_freeze_tasks(bool sig_only)
elapsed_csecs / 100, elapsed_csecs % 100, todo);
show_state();
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
task_lock(p);
if (freezing(p) && !freezer_should_skip(p))
printk(KERN_ERR " %s\n", p->comm);
cancel_freezing(p);
task_unlock(p);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
read_unlock(&tasklist_lock);
} else {
printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
@@ -107,6 +111,7 @@ int freeze_processes(void)
{
int error;
+ atomic_inc(&global_suspend);
printk("Freezing user space processes ... ");
error = try_to_freeze_tasks(true);
if (error)
@@ -123,6 +128,7 @@ int freeze_processes(void)
Exit:
BUG_ON(in_atomic());
printk("\n");
+ atomic_dec(&global_suspend);
return error;
}
@@ -132,7 +138,7 @@ static void thaw_tasks(bool nosig_only)
struct task_struct *g, *p;
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
if (!freezeable(p))
continue;
@@ -142,8 +148,10 @@ static void thaw_tasks(bool nosig_only)
if (cgroup_freezing_or_frozen(p))
continue;
- thaw_process(p);
- } while_each_thread(g, p);
+ if (!thaw_process(p))
+ printk(KERN_WARNING " Strange, %s not stopped\n",
+ p->comm );
+ } while_each_thread_all(g, p);
read_unlock(&tasklist_lock);
}
diff --git a/kernel/printk.c b/kernel/printk.c
index f38b07f..1041e53 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,7 +31,9 @@
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
+#include <linux/vzratelimit.h>
#include <linux/syscalls.h>
+#include <linux/veprintk.h>
#include <linux/kexec.h>
#include <asm/uaccess.h>
@@ -100,7 +102,7 @@ static int console_locked, console_suspended;
* It is also used in interesting ways to provide interlocking in
* release_console_sem().
*/
-static DEFINE_SPINLOCK(logbuf_lock);
+DEFINE_SPINLOCK(logbuf_lock);
#define LOG_BUF_MASK (log_buf_len-1)
#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -136,6 +138,7 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+int console_silence_loglevel;
#ifdef CONFIG_PRINTK
@@ -162,6 +165,19 @@ void log_buf_kexec_setup(void)
}
#endif
+static int __init setup_console_silencelevel(char *str)
+{
+ int level;
+
+ if (get_option(&str, &level) != 1)
+ return 0;
+
+ console_silence_loglevel = level;
+ return 1;
+}
+
+__setup("silencelevel=", setup_console_silencelevel);
+
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
@@ -278,6 +294,9 @@ int do_syslog(int type, char __user *buf, int len)
char c;
int error = 0;
+ if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7))
+ goto out;
+
error = security_syslog(type);
if (error)
return error;
@@ -298,15 +317,15 @@ int do_syslog(int type, char __user *buf, int len)
error = -EFAULT;
goto out;
}
- error = wait_event_interruptible(log_wait,
- (log_start - log_end));
+ error = wait_event_interruptible(ve_log_wait,
+ (ve_log_start - ve_log_end));
if (error)
goto out;
i = 0;
spin_lock_irq(&logbuf_lock);
- while (!error && (log_start != log_end) && i < len) {
- c = LOG_BUF(log_start);
- log_start++;
+ while (!error && (ve_log_start != ve_log_end) && i < len) {
+ c = VE_LOG_BUF(ve_log_start);
+ ve_log_start++;
spin_unlock_irq(&logbuf_lock);
error = __put_user(c,buf);
buf++;
@@ -332,15 +351,17 @@ int do_syslog(int type, char __user *buf, int len)
error = -EFAULT;
goto out;
}
+ if (ve_log_buf == NULL)
+ goto out;
count = len;
- if (count > log_buf_len)
- count = log_buf_len;
+ if (count > ve_log_buf_len)
+ count = ve_log_buf_len;
spin_lock_irq(&logbuf_lock);
- if (count > logged_chars)
- count = logged_chars;
+ if (count > ve_logged_chars)
+ count = ve_logged_chars;
if (do_clear)
- logged_chars = 0;
- limit = log_end;
+ ve_logged_chars = 0;
+ limit = ve_log_end;
/*
* __put_user() could sleep, and while we sleep
* printk() could overwrite the messages
@@ -349,9 +370,9 @@ int do_syslog(int type, char __user *buf, int len)
*/
for (i = 0; i < count && !error; i++) {
j = limit-1-i;
- if (j + log_buf_len < log_end)
+ if (j + ve_log_buf_len < ve_log_end)
break;
- c = LOG_BUF(j);
+ c = VE_LOG_BUF(j);
spin_unlock_irq(&logbuf_lock);
error = __put_user(c,&buf[count-1-i]);
cond_resched();
@@ -375,7 +396,7 @@ int do_syslog(int type, char __user *buf, int len)
}
break;
case 5: /* Clear ring buffer */
- logged_chars = 0;
+ ve_logged_chars = 0;
break;
case 6: /* Disable logging to console */
if (saved_console_loglevel == -1)
@@ -392,18 +413,21 @@ int do_syslog(int type, char __user *buf, int len)
error = -EINVAL;
if (len < 1 || len > 8)
goto out;
+ error = 0;
+ /* VE has no console, so return success */
+ if (!ve_is_super(get_exec_env()))
+ goto out;
if (len < minimum_console_loglevel)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
saved_console_loglevel = -1;
- error = 0;
break;
case 9: /* Number of chars in the log buffer */
- error = log_end - log_start;
+ error = ve_log_end - ve_log_start;
break;
case 10: /* Size of the log buffer */
- error = log_buf_len;
+ error = ve_log_buf_len;
break;
default:
error = -EINVAL;
@@ -514,14 +538,14 @@ static void call_console_drivers(unsigned start, unsigned end)
static void emit_log_char(char c)
{
- LOG_BUF(log_end) = c;
- log_end++;
- if (log_end - log_start > log_buf_len)
- log_start = log_end - log_buf_len;
- if (log_end - con_start > log_buf_len)
- con_start = log_end - log_buf_len;
- if (logged_chars < log_buf_len)
- logged_chars++;
+ VE_LOG_BUF(ve_log_end) = c;
+ ve_log_end++;
+ if (ve_log_end - ve_log_start > ve_log_buf_len)
+ ve_log_start = ve_log_end - ve_log_buf_len;
+ if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len)
+ con_start = ve_log_end - ve_log_buf_len;
+ if (ve_logged_chars < ve_log_buf_len)
+ ve_logged_chars++;
}
/*
@@ -586,6 +610,30 @@ static int have_callable_console(void)
* See the vsnprintf() documentation for format string extensions over C99.
*/
+static inline int ve_log_init(void)
+{
+#ifdef CONFIG_VE
+ if (ve_log_buf != NULL)
+ return 0;
+
+ if (ve_is_super(get_exec_env())) {
+ ve0._log_wait = &log_wait;
+ ve0._log_start = &log_start;
+ ve0._log_end = &log_end;
+ ve0._logged_chars = &logged_chars;
+ ve0.log_buf = log_buf;
+ return 0;
+ }
+
+ ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC);
+ if (!ve_log_buf)
+ return -ENOMEM;
+
+ memset(ve_log_buf, 0, ve_log_buf_len);
+#endif
+ return 0;
+}
+
asmlinkage int printk(const char *fmt, ...)
{
va_list args;
@@ -667,13 +715,14 @@ static inline void printk_delay(void)
}
}
-asmlinkage int vprintk(const char *fmt, va_list args)
+asmlinkage int __vprintk(const char *fmt, va_list args)
{
int printed_len = 0;
int current_log_level = default_message_loglevel;
unsigned long flags;
int this_cpu;
char *p;
+ int err, need_wake;
boot_delay_msec();
printk_delay();
@@ -705,6 +754,13 @@ asmlinkage int vprintk(const char *fmt, va_list args)
spin_lock(&logbuf_lock);
printk_cpu = this_cpu;
+ err = ve_log_init();
+ if (err) {
+ spin_unlock(&logbuf_lock);
+ printed_len = err;
+ goto out_lockdep;
+ }
+
if (recursion_bug) {
recursion_bug = 0;
strcpy(printk_buf, recursion_bug_msg);
@@ -788,19 +844,67 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* will release 'logbuf_lock' regardless of whether it
* actually gets the semaphore or not.
*/
- if (acquire_console_semaphore_for_printk(this_cpu))
+ if (!ve_is_super(get_exec_env())) {
+ need_wake = (ve_log_start != ve_log_end);
+ printk_cpu = UINT_MAX;
+ spin_unlock(&logbuf_lock);
+ lockdep_on();
+ raw_local_irq_restore(flags);
+ if (!oops_in_progress && need_wake)
+ wake_up_interruptible(&ve_log_wait);
+ goto out_preempt;
+ } else if (acquire_console_semaphore_for_printk(this_cpu))
release_console_sem();
+out_lockdep:
lockdep_on();
out_restore_irqs:
raw_local_irq_restore(flags);
+out_preempt:
preempt_enable();
return printed_len;
}
EXPORT_SYMBOL(printk);
EXPORT_SYMBOL(vprintk);
+asmlinkage int vprintk(const char *fmt, va_list args)
+{
+ int i;
+ struct ve_struct *env;
+
+ env = set_exec_env(get_ve0());
+ i = __vprintk(fmt, args);
+ (void)set_exec_env(env);
+ return i;
+}
+
+asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
+{
+ int printed_len;
+ va_list args2;
+
+ printed_len = 0;
+ va_copy(args2, args);
+ if (ve_is_super(get_exec_env()) || (dst & VE0_LOG))
+ printed_len = vprintk(fmt, args);
+ if (!ve_is_super(get_exec_env()) && (dst & VE_LOG))
+ printed_len = __vprintk(fmt, args2);
+ return printed_len;
+}
+
+asmlinkage int ve_printk(int dst, const char *fmt, ...)
+{
+ va_list args;
+ int printed_len;
+
+ va_start(args, fmt);
+ printed_len = ve_vprintk(dst, fmt, args);
+ va_end(args);
+ return printed_len;
+}
+EXPORT_SYMBOL(ve_printk);
+
#else
static void call_console_drivers(unsigned start, unsigned end)
@@ -1058,6 +1162,7 @@ void release_console_sem(void)
_con_start = con_start;
_log_end = log_end;
con_start = log_end; /* Flush */
+ printk_cpu = UINT_MAX;
spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
call_console_drivers(_con_start, _log_end);
@@ -1066,6 +1171,7 @@ void release_console_sem(void)
}
console_locked = 0;
up(&console_sem);
+ printk_cpu = UINT_MAX;
spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd)
wake_up_klogd();
@@ -1382,6 +1488,36 @@ int printk_ratelimit(void)
}
EXPORT_SYMBOL(printk_ratelimit);
+/*
+ * Rate limiting stuff.
+ */
+int vz_ratelimit(struct vz_rate_info *p)
+{
+ unsigned long cjif, djif;
+ unsigned long flags;
+ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED;
+ long new_bucket;
+
+ spin_lock_irqsave(&ratelimit_lock, flags);
+ cjif = jiffies;
+ djif = cjif - p->last;
+ if (djif < p->interval) {
+ if (p->bucket >= p->burst) {
+ spin_unlock_irqrestore(&ratelimit_lock, flags);
+ return 0;
+ }
+ p->bucket++;
+ } else {
+ new_bucket = p->bucket - (djif / (unsigned)p->interval);
+ if (new_bucket < 0)
+ new_bucket = 0;
+ p->bucket = new_bucket + 1;
+ }
+ p->last = cjif;
+ spin_unlock_irqrestore(&ratelimit_lock, flags);
+ return 1;
+}
+
/**
* printk_timed_ratelimit - caller-controlled printk ratelimiting
* @caller_jiffies: pointer to caller's state
@@ -1405,3 +1541,65 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
}
EXPORT_SYMBOL(printk_timed_ratelimit);
#endif
+
+static cpumask_t nmi_show_regs_cpus = CPU_MASK_NONE;
+static unsigned long nmi_show_regs_timeout;
+
+void __attribute__((weak)) send_nmi_ipi_allbutself(void)
+{
+ cpus_clear(nmi_show_regs_cpus);
+}
+
+static void busted_show_regs(struct pt_regs *regs, int in_nmi)
+{
+ if (!regs || (in_nmi && spin_is_locked(&logbuf_lock)))
+ return;
+
+ bust_spinlocks(1);
+ printk("----------- IPI show regs -----------\n");
+ show_regs(regs);
+ bust_spinlocks(0);
+}
+
+void nmi_show_regs(struct pt_regs *regs, int in_nmi)
+{
+ if (cpus_empty(nmi_show_regs_cpus))
+ goto doit;
+
+ /* Previous request still in progress */
+ if (time_before(jiffies, nmi_show_regs_timeout))
+ return;
+
+ if (!in_nmi || !spin_is_locked(&logbuf_lock)) {
+ int cpu;
+
+ bust_spinlocks(1);
+ printk("previous show regs lost IPI to: ");
+ for_each_cpu_mask(cpu, nmi_show_regs_cpus)
+ printk("%d ", cpu);
+ printk("\n");
+ bust_spinlocks(0);
+ }
+
+doit:
+ nmi_show_regs_timeout = jiffies + HZ/10;
+ nmi_show_regs_cpus = cpu_online_map;
+ cpu_clear(raw_smp_processor_id(), nmi_show_regs_cpus);
+ busted_show_regs(regs, in_nmi);
+ send_nmi_ipi_allbutself();
+}
+
+/* call only from nmi handler */
+int do_nmi_show_regs(struct pt_regs *regs, int cpu)
+{
+ static DEFINE_SPINLOCK(nmi_show_regs_lock);
+
+ if (!cpu_isset(cpu, nmi_show_regs_cpus))
+ return 0;
+
+ spin_lock(&nmi_show_regs_lock);
+ busted_show_regs(regs, 1);
+ cpu_clear(cpu, nmi_show_regs_cpus);
+ spin_unlock(&nmi_show_regs_lock);
+ return 1;
+}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09c..8967db7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -130,6 +130,8 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
* or halting the specified task is impossible.
*/
int dumpable = 0;
+ int vps_dumpable = 0;
+
/* Don't let security modules deny introspection */
if (task == current)
return 0;
@@ -147,11 +149,17 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
}
rcu_read_unlock();
smp_rmb();
- if (task->mm)
+ if (task->mm) {
dumpable = get_dumpable(task->mm);
+ vps_dumpable = (task->mm->vps_dumpable == 1);
+ }
+
if (!dumpable && !capable(CAP_SYS_PTRACE))
return -EPERM;
-
+ if (!vps_dumpable && !ve_is_super(get_exec_env()))
+ return -EPERM;
+ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env()))
+ return -EPERM;
return security_ptrace_access_check(task, mode);
}
@@ -190,6 +198,9 @@ int ptrace_attach(struct task_struct *task)
task_unlock(task);
if (retval)
goto unlock_creds;
+ retval = -EACCES;
+ if (task->mm->vps_dumpable == 2)
+ goto unlock_creds;
write_lock_irq(&tasklist_lock);
retval = -EPERM;
@@ -396,6 +407,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
}
return copied;
}
+EXPORT_SYMBOL_GPL(access_process_vm);
static int ptrace_setoptions(struct task_struct *child, long data)
{
@@ -584,6 +596,10 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
{
struct task_struct *child;
+ /* ptracing of init from inside CT is dangerous */
+ if (pid == 1 && !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
rcu_read_lock();
child = find_task_by_vpid(pid);
if (child)
diff --git a/kernel/sched.c b/kernel/sched.c
index 34d924e..bf1165c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,8 @@
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+#include <linux/fairsched.h>
+#include <linux/ve_proto.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -355,6 +357,8 @@ static inline struct task_group *task_group(struct task_struct *p)
#elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
+#elif defined(CONFIG_VZ_FAIRSCHED)
+ tg = p->fsched_node->tg;
#else
tg = &init_task_group;
#endif
@@ -563,6 +567,9 @@ struct rq {
*/
unsigned long nr_uninterruptible;
+ unsigned long nr_sleeping;
+ unsigned long nr_stopped;
+
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
@@ -647,6 +654,12 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+struct kernel_stat_glob kstat_glob;
+DEFINE_SPINLOCK(kstat_glb_lock);
+EXPORT_SYMBOL(kstat_glob);
+EXPORT_SYMBOL(kstat_glb_lock);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_lat);
+
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
@@ -998,6 +1011,220 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
spin_unlock_irqrestore(&rq->lock, *flags);
}
+#ifdef CONFIG_VE
+struct ve_cpu_stats static_ve_cpu_stats;
+EXPORT_SYMBOL(static_ve_cpu_stats);
+
+static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu)
+{
+ VE_CPU_STATS(ve, cpu)->nr_iowait++;
+}
+
+static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu)
+{
+ VE_CPU_STATS(ve, cpu)->nr_iowait--;
+}
+
+static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu)
+{
+ VE_CPU_STATS(ve, cpu)->nr_unint++;
+}
+
+static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu)
+{
+ VE_CPU_STATS(ve, cpu)->nr_unint--;
+}
+
+#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0)
+
+cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu)
+{
+ struct ve_cpu_stats *ve_stat;
+ unsigned v;
+ cycles_t strt, ret, cycles;
+
+ ve_stat = VE_CPU_STATS(ve, cpu);
+ do {
+ v = read_seqcount_begin(&ve_stat->stat_lock);
+ ret = ve_stat->idle_time;
+ strt = ve_stat->strt_idle_time;
+ if (strt && nr_iowait_ve(ve) == 0) {
+ cycles = get_cycles();
+ if (cycles_after(cycles, strt))
+ ret += cycles - strt;
+ }
+ } while (read_seqcount_retry(&ve_stat->stat_lock, v));
+ return ret;
+}
+EXPORT_SYMBOL(ve_sched_get_idle_time);
+
+cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu)
+{
+ struct ve_cpu_stats *ve_stat;
+ unsigned v;
+ cycles_t strt, ret, cycles;
+
+ ve_stat = VE_CPU_STATS(ve, cpu);
+ do {
+ v = read_seqcount_begin(&ve_stat->stat_lock);
+ ret = ve_stat->iowait_time;
+ strt = ve_stat->strt_idle_time;
+ if (strt && nr_iowait_ve(ve) > 0) {
+ cycles = get_cycles();
+ if (cycles_after(cycles, strt))
+ ret += cycles - strt;
+ }
+ } while (read_seqcount_retry(&ve_stat->stat_lock, v));
+ return ret;
+}
+EXPORT_SYMBOL(ve_sched_get_iowait_time);
+
+static void ve_stop_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles)
+{
+ struct ve_cpu_stats *ve_stat;
+
+ ve_stat = VE_CPU_STATS(ve, cpu);
+
+ write_seqcount_begin(&ve_stat->stat_lock);
+ if (ve_stat->strt_idle_time) {
+ if (cycles_after(cycles, ve_stat->strt_idle_time)) {
+ if (nr_iowait_ve(ve) == 0)
+ ve_stat->idle_time +=
+ cycles - ve_stat->strt_idle_time;
+ else
+ ve_stat->iowait_time +=
+ cycles - ve_stat->strt_idle_time;
+ }
+ ve_stat->strt_idle_time = 0;
+ }
+ write_seqcount_end(&ve_stat->stat_lock);
+}
+
+static void ve_strt_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles)
+{
+ struct ve_cpu_stats *ve_stat;
+
+ ve_stat = VE_CPU_STATS(ve, cpu);
+
+ write_seqcount_begin(&ve_stat->stat_lock);
+ ve_stat->strt_idle_time = cycles;
+ write_seqcount_end(&ve_stat->stat_lock);
+}
+
+static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles)
+{
+ if (++VE_CPU_STATS(ve, cpu)->nr_running == 1)
+ ve_stop_idle(ve, cpu, cycles);
+}
+
+static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles)
+{
+ if (--VE_CPU_STATS(ve, cpu)->nr_running == 0)
+ ve_strt_idle(ve, cpu, cycles);
+}
+
+void ve_sched_attach(struct ve_struct *target_ve)
+{
+ struct task_struct *tsk;
+ unsigned int cpu;
+ cycles_t cycles;
+
+ tsk = current;
+ preempt_disable();
+ cycles = get_cycles();
+ cpu = task_cpu(tsk);
+ ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles);
+ ve_nr_running_inc(target_ve, cpu, cycles);
+ preempt_enable();
+}
+EXPORT_SYMBOL(ve_sched_attach);
+
+static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc)
+{
+ struct ve_task_info *ti;
+
+ ti = VE_TASK_INFO(p);
+ write_seqcount_begin(&ti->wakeup_lock);
+ ti->wakeup_stamp = cyc;
+ write_seqcount_end(&ti->wakeup_lock);
+}
+
+static inline void update_sched_lat(struct task_struct *t, cycles_t cycles)
+{
+ int cpu;
+ cycles_t ve_wstamp;
+
+ /* safe due to runqueue lock */
+ cpu = smp_processor_id();
+ ve_wstamp = t->ve_task_info.wakeup_stamp;
+
+ if (ve_wstamp && cycles > ve_wstamp) {
+ KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat,
+ cpu, cycles - ve_wstamp);
+ KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve,
+ cpu, cycles - ve_wstamp);
+ }
+}
+
+static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles)
+{
+#ifdef CONFIG_FAIRSCHED
+ if (prev != this_pcpu()->idle) {
+#else
+ if (prev != this_rq()->idle) {
+#endif
+ VE_CPU_STATS(prev->ve_task_info.owner_env,
+ smp_processor_id())->used_time +=
+ cycles - prev->ve_task_info.sched_time;
+
+ prev->ve_task_info.sched_time = cycles;
+ }
+}
+#else
+static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles)
+{
+}
+
+static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles)
+{
+}
+
+static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu)
+{
+}
+
+static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu)
+{
+}
+
+static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu)
+{
+}
+
+static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu)
+{
+}
+
+static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles)
+{
+}
+#endif
+
+struct task_nrs_struct {
+ long nr_running;
+ long nr_unint;
+ long nr_stopped;
+ long nr_sleeping;
+ long nr_iowait;
+ long long nr_switches;
+} ____cacheline_aligned_in_smp;
+
+unsigned long nr_zombie = 0; /* protected by tasklist_lock */
+EXPORT_SYMBOL(nr_zombie);
+
+atomic_t nr_dead = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_dead);
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -1943,11 +2170,21 @@ static int effective_prio(struct task_struct *p)
*/
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
{
- if (task_contributes_to_load(p))
+ cycles_t cycles;
+
+#ifdef CONFIG_VE
+ cycles = get_cycles();
+ write_wakeup_stamp(p, cycles);
+ p->ve_task_info.sleep_time += cycles;
+#endif
+ if (task_contributes_to_load(p)) {
rq->nr_uninterruptible--;
+ ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, task_cpu(p));
+ }
enqueue_task(rq, p, wakeup);
inc_nr_running(rq);
+ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles);
}
/*
@@ -1955,11 +2192,31 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
*/
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
- if (task_contributes_to_load(p))
+ cycles_t cycles;
+ unsigned int cpu;
+
+ cycles = get_cycles();
+ cpu = task_cpu(p);
+
+ p->ve_task_info.sleep_time -= cycles;
+
+#if 0 /* this is broken */
+ if (p->state == TASK_INTERRUPTIBLE) {
+ rq->nr_sleeping++;
+ }
+ if (p->state == TASK_STOPPED) {
+ rq->nr_stopped++;
+ }
+#endif
+
+ if (task_contributes_to_load(p)) {
rq->nr_uninterruptible++;
+ ve_nr_unint_inc(VE_TASK_INFO(p)->owner_env, cpu);
+ }
dequeue_task(rq, p, sleep);
dec_nr_running(rq);
+ ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles);
}
/**
@@ -2276,6 +2533,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
return ncsw;
}
+EXPORT_SYMBOL_GPL(wait_task_inactive);
/***
* kick_process - kick a running thread to enter/exit the kernel
@@ -2372,8 +2630,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
*
* First fix up the nr_uninterruptible count:
*/
- if (task_contributes_to_load(p))
+ if (task_contributes_to_load(p)) {
rq->nr_uninterruptible--;
+ ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, cpu);
+ }
+
p->state = TASK_WAKING;
task_rq_unlock(rq, &flags);
@@ -2607,6 +2868,10 @@ void sched_fork(struct task_struct *p, int clone_flags)
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+#ifdef CONFIG_VE
+ /* cosmetic: sleep till wakeup below */
+ p->ve_task_info.sleep_time -= get_cycles();
+#endif
plist_node_init(&p->pushable_tasks, MAX_PRIO);
put_cpu();
@@ -2637,6 +2902,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
*/
p->sched_class->task_new(rq, p);
inc_nr_running(rq);
+ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p),
+ get_cycles());
}
trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, WF_FORK);
@@ -2839,6 +3106,7 @@ asmlinkage void schedule_tail(struct task_struct *prev)
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
}
+EXPORT_SYMBOL_GPL(schedule_tail);
/*
* context_switch - switch to the new MM and the new
@@ -2910,6 +3178,7 @@ unsigned long nr_running(void)
return sum;
}
+EXPORT_SYMBOL_GPL(nr_running);
unsigned long nr_uninterruptible(void)
{
@@ -2927,6 +3196,7 @@ unsigned long nr_uninterruptible(void)
return sum;
}
+EXPORT_SYMBOL_GPL(nr_uninterruptible);
unsigned long long nr_context_switches(void)
{
@@ -2962,6 +3232,72 @@ unsigned long this_cpu_load(void)
}
+unsigned long nr_stopped(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_stopped;
+ if (unlikely((long)sum < 0))
+ sum = 0;
+ return sum;
+}
+EXPORT_SYMBOL(nr_stopped);
+
+unsigned long nr_sleeping(void)
+{
+ unsigned long i, sum = 0;
+
+ for_each_online_cpu(i)
+ sum += cpu_rq(i)->nr_sleeping;
+ if (unlikely((long)sum < 0))
+ sum = 0;
+ return sum;
+}
+EXPORT_SYMBOL(nr_sleeping);
+
+#ifdef CONFIG_VE
+unsigned long nr_running_ve(struct ve_struct *ve)
+{
+ int i;
+ long sum = 0;
+ cpumask_t ve_cpus;
+
+ ve_cpu_online_map(ve, &ve_cpus);
+ for_each_cpu_mask(i, ve_cpus)
+ sum += VE_CPU_STATS(ve, i)->nr_running;
+ return (unsigned long)(sum < 0 ? 0 : sum);
+}
+EXPORT_SYMBOL(nr_running_ve);
+
+unsigned long nr_uninterruptible_ve(struct ve_struct *ve)
+{
+ int i;
+ long sum = 0;
+ cpumask_t ve_cpus;
+
+ sum = 0;
+ ve_cpu_online_map(ve, &ve_cpus);
+ for_each_cpu_mask(i, ve_cpus)
+ sum += VE_CPU_STATS(ve, i)->nr_unint;
+ return (unsigned long)(sum < 0 ? 0 : sum);
+}
+EXPORT_SYMBOL(nr_uninterruptible_ve);
+
+unsigned long nr_iowait_ve(struct ve_struct *ve)
+{
+ int i;
+ long sum = 0;
+ cpumask_t ve_cpus;
+
+ ve_cpu_online_map(ve, &ve_cpus);
+ for_each_cpu_mask(i, ve_cpus)
+ sum += VE_CPU_STATS(ve, i)->nr_iowait;
+ return (unsigned long)(sum < 0 ? 0 : sum);
+}
+EXPORT_SYMBOL(nr_iowait_ve);
+#endif
+
/* Variables and functions for calc_load */
static atomic_long_t calc_load_tasks;
static unsigned long calc_load_update;
@@ -2983,6 +3319,16 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
+void get_avenrun_ve(struct ve_struct *ve,
+ unsigned long *loads, unsigned long offset, int shift)
+{
+ loads[0] = (ve->avenrun[0] + offset) << shift;
+ loads[1] = (ve->avenrun[1] + offset) << shift;
+ loads[2] = (ve->avenrun[2] + offset) << shift;
+}
+
+
+
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
@@ -2991,6 +3337,35 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
return load >> FSHIFT;
}
+#ifdef CONFIG_VE
+static void calc_load_ve(void)
+{
+ unsigned long flags, nr_unint, nr_active;
+ struct ve_struct *ve;
+
+ read_lock(&ve_list_lock);
+ for_each_ve(ve) {
+ nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve);
+ nr_active *= FIXED_1;
+
+ ve->avenrun[0] = calc_load(ve->avenrun[0], EXP_1, nr_active);
+ ve->avenrun[1] = calc_load(ve->avenrun[1], EXP_5, nr_active);
+ ve->avenrun[2] = calc_load(ve->avenrun[2], EXP_15, nr_active);
+ }
+ read_unlock(&ve_list_lock);
+
+ nr_unint = nr_uninterruptible() * FIXED_1;
+ spin_lock_irqsave(&kstat_glb_lock, flags);
+ CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
+ CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
+ CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
+ spin_unlock_irqrestore(&kstat_glb_lock, flags);
+
+}
+#else
+#define calc_load_ve() do { } while (0)
+#endif
+
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
@@ -3010,6 +3385,8 @@ void calc_global_load(void)
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+ calc_load_ve();
+
calc_load_update += LOAD_FREQ;
}
@@ -3074,6 +3451,16 @@ static void update_cpu_load(struct rq *this_rq)
}
}
+#ifdef CONFIG_VE
+#define update_ve_cpu_time(p, time, tick) \
+ do { \
+ VE_CPU_STATS((p)->ve_task_info.owner_env, \
+ task_cpu(p))->time += tick; \
+ } while (0)
+#else
+#define update_ve_cpu_time(p, time, tick) do { } while (0)
+#endif
+
#ifdef CONFIG_SMP
/*
@@ -3174,8 +3561,15 @@ void sched_exec(void)
static void pull_task(struct rq *src_rq, struct task_struct *p,
struct rq *this_rq, int this_cpu)
{
+ struct ve_struct *ve;
+ cycles_t cycles = get_cycles();
+
+ ve = VE_TASK_INFO(p)->owner_env;
+
deactivate_task(src_rq, p, 0);
+ ve_nr_running_dec(ve, task_cpu(p), cycles);
set_task_cpu(p, this_cpu);
+ ve_nr_running_inc(ve, task_cpu(p), cycles);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
}
@@ -5052,10 +5446,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
/* Add user time to cpustat. */
tmp = cputime_to_cputime64(cputime);
- if (TASK_NICE(p) > 0)
+ if (TASK_NICE(p) > 0) {
cpustat->nice = cputime64_add(cpustat->nice, tmp);
- else
+ update_ve_cpu_time(p, nice, tmp);
+ } else {
cpustat->user = cputime64_add(cpustat->user, tmp);
+ update_ve_cpu_time(p, user, tmp);
+ }
cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
@@ -5112,6 +5509,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
/* Add system time to cpustat. */
tmp = cputime_to_cputime64(cputime);
+ update_ve_cpu_time(p, system, tmp);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
else if (softirq_count())
@@ -5490,6 +5888,8 @@ need_resched_nonpreemptible:
next = pick_next_task(rq);
if (likely(prev != next)) {
+ cycles_t cycles = get_cycles();
+
sched_info_switch(prev, next);
perf_event_task_sched_out(prev, next, cpu);
@@ -5497,6 +5897,22 @@ need_resched_nonpreemptible:
rq->curr = next;
++*switch_count;
+#ifdef CONFIG_VE
+ prev->ve_task_info.sleep_stamp = cycles;
+ if (prev->state == TASK_RUNNING && prev != this_rq()->idle)
+ write_wakeup_stamp(prev, cycles);
+ update_sched_lat(next, cycles);
+
+ /* because next & prev are protected with
+ * runqueue lock we may not worry about
+ * wakeup_stamp and sched_time protection
+ * (same thing in 'else' branch below)
+ */
+ update_ve_task_info(prev, cycles);
+ next->ve_task_info.sched_time = cycles;
+ write_wakeup_stamp(next, 0);
+#endif
+
context_switch(rq, prev, next); /* unlocks the rq */
/*
* the context switch might have flipped the stack from under
@@ -5504,8 +5920,10 @@ need_resched_nonpreemptible:
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- } else
+ } else {
+ update_ve_task_info(prev, get_cycles());
spin_unlock_irq(&rq->lock);
+ }
post_schedule(rq);
@@ -6289,7 +6707,7 @@ recheck:
/*
* Allow unprivileged RT tasks to decrease priority:
*/
- if (user && !capable(CAP_SYS_NICE)) {
+ if (user && !capable(CAP_SYS_ADMIN)) {
if (rt_policy(policy)) {
unsigned long rlim_rtprio;
@@ -6800,11 +7218,16 @@ EXPORT_SYMBOL(yield);
void __sched io_schedule(void)
{
struct rq *rq = raw_rq();
+#ifdef CONFIG_VE
+ struct ve_struct *ve = current->ve_task_info.owner_env;
+#endif
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
current->in_iowait = 1;
+ ve_nr_iowait_inc(ve, task_cpu(current));
schedule();
+ ve_nr_iowait_dec(ve, task_cpu(current));
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
@@ -6815,11 +7238,16 @@ long __sched io_schedule_timeout(long timeout)
{
struct rq *rq = raw_rq();
long ret;
+#ifdef CONFIG_VE
+ struct ve_struct *ve = current->ve_task_info.owner_env;
+#endif
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
current->in_iowait = 1;
+ ve_nr_iowait_inc(ve, task_cpu(current));
ret = schedule_timeout(timeout);
+ ve_nr_iowait_dec(ve, task_cpu(current));
current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
@@ -6926,17 +7354,7 @@ void sched_show_task(struct task_struct *p)
state = p->state ? __ffs(p->state) + 1 : 0;
printk(KERN_INFO "%-13.13s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running ");
- else
- printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
- if (state == TASK_RUNNING)
- printk(KERN_CONT " running task ");
- else
- printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
+ printk(KERN_CONT " %p ", p);
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
@@ -6953,13 +7371,13 @@ void show_state_filter(unsigned long state_filter)
#if BITS_PER_LONG == 32
printk(KERN_INFO
- " task PC stack pid father\n");
+ " task taskaddr stack pid father\n");
#else
printk(KERN_INFO
- " task PC stack pid father\n");
+ " task taskaddr stack pid father\n");
#endif
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
* console might take alot of time:
@@ -6967,7 +7385,7 @@ void show_state_filter(unsigned long state_filter)
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
touch_all_softlockup_watchdogs();
@@ -7336,13 +7754,13 @@ static void migrate_live_tasks(int src_cpu)
read_lock(&tasklist_lock);
- do_each_thread(t, p) {
+ do_each_thread_all(t, p) {
if (p == current)
continue;
if (task_cpu(p) == src_cpu)
move_task_off_dead_cpu(src_cpu, p);
- } while_each_thread(t, p);
+ } while_each_thread_all(t, p);
read_unlock(&tasklist_lock);
}
@@ -9490,6 +9908,7 @@ void __init sched_init(void)
update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
__alignof__(unsigned long));
#endif
+ kstat_glob.sched_lat.cur = &per_cpu__glob_kstat_lat;
for_each_possible_cpu(i) {
struct rq *rq;
@@ -9503,7 +9922,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.shares = init_task_group_load;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
+#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED)
/*
* How much cpu bandwidth does init_task_group get?
*
@@ -9549,7 +9968,7 @@ void __init sched_init(void)
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
+#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED)
init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
@@ -9615,6 +10034,7 @@ void __init sched_init(void)
* During early bootup we pretend to be a normal task:
*/
current->sched_class = &fair_sched_class;
+ fairsched_init_early();
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
@@ -9693,7 +10113,7 @@ void normalize_rt_tasks(void)
struct rq *rq;
read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
/*
* Only normalize user tasks:
*/
@@ -9724,7 +10144,7 @@ void normalize_rt_tasks(void)
__task_rq_unlock(rq);
spin_unlock(&p->pi_lock);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
read_unlock_irqrestore(&tasklist_lock, flags);
}
@@ -10170,10 +10590,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
- do_each_thread(g, p) {
+ do_each_thread_ve(g, p) {
if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
return 1;
- } while_each_thread(g, p);
+ } while_each_thread_ve(g, p);
return 0;
}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf0..95930c1 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -135,12 +135,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
if (!p->se.on_rq || task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
read_unlock_irqrestore(&tasklist_lock, flags);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 4d0658d..fcb5698 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -33,13 +33,32 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
+#include <bc/misc.h>
#include "audit.h" /* audit_signal_info() */
/*
* SLAB caches for signal bits.
*/
-static struct kmem_cache *sigqueue_cachep;
+struct kmem_cache *sigqueue_cachep;
+EXPORT_SYMBOL(sigqueue_cachep);
+
+static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t)
+{
+ struct ve_struct *ve;
+
+ /* always allow signals from the kernel */
+ if (info == SEND_SIG_FORCED ||
+ (!is_si_special(info) && SI_FROMKERNEL(info)))
+ return 0;
+
+ ve = current->ve_task_info.owner_env;
+ if (ve->ve_ns->pid_ns->child_reaper != t)
+ return 0;
+ if (ve_is_super(get_exec_env()))
+ return 0;
+ return !sig_user_defined(t, sig) || sig_kernel_only(sig);
+}
static void __user *sig_handler(struct task_struct *t, int sig)
{
@@ -118,7 +137,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
#define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
-static int recalc_sigpending_tsk(struct task_struct *t)
+int recalc_sigpending_tsk(struct task_struct *t)
{
if (t->signal->group_stop_count > 0 ||
PENDING(&t->pending, &t->blocked) ||
@@ -143,6 +162,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
if (recalc_sigpending_tsk(t))
signal_wake_up(t, 0);
}
+EXPORT_SYMBOL_GPL(recalc_sigpending_tsk);
void recalc_sigpending(void)
{
@@ -209,8 +229,13 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
atomic_inc(&user->sigpending);
if (override_rlimit ||
atomic_read(&user->sigpending) <=
- t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
+ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
q = kmem_cache_alloc(sigqueue_cachep, flags);
+ if (q && ub_siginfo_charge(q, get_task_ub(t))) {
+ kmem_cache_free(sigqueue_cachep, q);
+ q = NULL;
+ }
+ }
if (unlikely(q == NULL)) {
atomic_dec(&user->sigpending);
free_uid(user);
@@ -229,6 +254,7 @@ static void __sigqueue_free(struct sigqueue *q)
return;
atomic_dec(&q->user->sigpending);
free_uid(q->user);
+ ub_siginfo_uncharge(q);
kmem_cache_free(sigqueue_cachep, q);
}
@@ -409,7 +435,18 @@ still_pending:
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
siginfo_t *info)
{
- int sig = next_signal(pending, mask);
+ int sig = 0;
+
+ /* SIGKILL must have priority, otherwise it is quite easy
+ * to create an unkillable process by sending sig < SIGKILL
+ * to self */
+ if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+ if (!sigismember(mask, SIGKILL))
+ sig = SIGKILL;
+ }
+
+ if (likely(!sig))
+ sig = next_signal(pending, mask);
if (sig) {
if (current->notifier) {
@@ -532,6 +569,7 @@ void signal_wake_up(struct task_struct *t, int resume)
if (!wake_up_state(t, mask))
kick_process(t);
}
+EXPORT_SYMBOL_GPL(signal_wake_up);
/*
* Remove signals in mask from the pending set and queue.
@@ -655,7 +693,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
t = p;
do {
rm_from_queue(sigmask(SIGCONT), &t->pending);
- } while_each_thread(p, t);
+ } while_each_thread_all(p, t);
} else if (sig == SIGCONT) {
unsigned int why;
/*
@@ -687,7 +725,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
state |= TASK_INTERRUPTIBLE;
}
wake_up_state(t, state);
- } while_each_thread(p, t);
+ } while_each_thread_all(p, t);
/*
* Notify the parent with CLD_CONTINUED if we were stopped.
@@ -809,7 +847,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
do {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
- } while_each_thread(p, t);
+ } while_each_thread_all(p, t);
return;
}
}
@@ -1080,7 +1118,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
int ret = check_kill_permission(sig, info, p);
if (!ret && sig)
- ret = do_send_sig_info(sig, info, p, true);
+ ret = sig_ve_ignored(sig, info, p) ? 0 :
+ do_send_sig_info(sig, info, p, true);
return ret;
}
@@ -1205,7 +1244,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
int retval = 0, count = 0;
struct task_struct * p;
- for_each_process(p) {
+ for_each_process_ve(p) {
if (task_pid_vnr(p) > 1 &&
!same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p);
@@ -1396,6 +1435,14 @@ int do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(!task_ptrace(tsk) &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
+#ifdef CONFIG_VE
+ /* Allow to send only SIGCHLD from VE */
+ if (sig != SIGCHLD &&
+ tsk->ve_task_info.owner_env !=
+ tsk->parent->ve_task_info.owner_env)
+ sig = SIGCHLD;
+#endif
+
info.si_signo = sig;
info.si_errno = 0;
/*
@@ -1720,7 +1767,9 @@ static int do_signal_stop(int signr)
/* Now we don't run again until woken by SIGCONT or SIGKILL */
do {
+ set_stop_state(current);
schedule();
+ clear_stop_state(current);
} while (try_to_freeze());
tracehook_finish_jctl();
@@ -1782,8 +1831,6 @@ relock:
* Now that we woke up, it's crucial if we're supposed to be
* frozen that we freeze now before running anything substantial.
*/
- try_to_freeze();
-
spin_lock_irq(&sighand->siglock);
/*
* Every stopped thread goes here after wakeup. Check to see if
@@ -2281,7 +2328,8 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
* probe. No signal is actually delivered.
*/
if (!error && sig) {
- error = do_send_sig_info(sig, info, p, false);
+ if (!sig_ve_ignored(sig, info, p))
+ error = do_send_sig_info(sig, info, p, false);
/*
* If lock_task_sighand() failed we pretend the task
* dies after receiving the signal. The window is tiny,
@@ -2678,5 +2726,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
void __init signals_init(void)
{
- sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
+ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC|SLAB_UBC);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f8749e5..c9aeeb5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -25,6 +25,8 @@
#include <linux/smp.h>
#include <linux/tick.h>
+#include <bc/beancounter.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -190,10 +192,14 @@ EXPORT_SYMBOL(local_bh_enable_ip);
asmlinkage void __do_softirq(void)
{
+ struct user_beancounter *ub;
struct softirq_action *h;
__u32 pending;
int max_restart = MAX_SOFTIRQ_RESTART;
int cpu;
+ struct ve_struct *envid;
+
+ envid = set_exec_env(get_ve0());
pending = local_softirq_pending();
account_system_vtime(current);
@@ -210,6 +216,7 @@ restart:
h = softirq_vec;
+ ub = set_exec_ub(get_ub0());
do {
if (pending & 1) {
int prev_count = preempt_count();
@@ -232,6 +239,7 @@ restart:
h++;
pending >>= 1;
} while (pending);
+ (void)set_exec_ub(ub);
local_irq_disable();
@@ -245,6 +253,7 @@ restart:
lockdep_softirq_exit();
account_system_vtime(current);
+ (void)set_exec_env(envid);
_local_bh_enable();
}
@@ -298,6 +307,7 @@ void irq_exit(void)
{
account_system_vtime(current);
trace_hardirq_exit();
+ restore_context();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
diff --git a/kernel/sys.c b/kernel/sys.c
index ce17760..3073c3e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -10,6 +10,8 @@
#include <linux/mman.h>
#include <linux/smp_lock.h>
#include <linux/notifier.h>
+#include <linux/virtinfo.h>
+#include <linux/pid_namespace.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
#include <linux/highuid.h>
@@ -115,6 +117,102 @@ EXPORT_SYMBOL(cad_pid);
void (*pm_power_off_prepare)(void);
+DECLARE_MUTEX(virtinfo_sem);
+EXPORT_SYMBOL(virtinfo_sem);
+static struct vnotifier_block *virtinfo_chain[VIRT_TYPES];
+
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+ struct vnotifier_block **p;
+
+ for (p = &virtinfo_chain[type];
+ *p != NULL && nb->priority < (*p)->priority;
+ p = &(*p)->next);
+ nb->next = *p;
+ smp_wmb();
+ *p = nb;
+}
+
+EXPORT_SYMBOL(__virtinfo_notifier_register);
+
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+ down(&virtinfo_sem);
+ __virtinfo_notifier_register(type, nb);
+ up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_register);
+
+struct virtinfo_cnt_struct {
+ volatile unsigned long exit[NR_CPUS];
+ volatile unsigned long entry;
+};
+static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt);
+
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb)
+{
+ struct vnotifier_block **p;
+ int entry_cpu, exit_cpu;
+ unsigned long cnt, ent;
+
+ down(&virtinfo_sem);
+ for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next);
+ *p = nb->next;
+ smp_mb();
+
+ for_each_cpu_mask(entry_cpu, cpu_possible_map) {
+ while (1) {
+ cnt = 0;
+ for_each_cpu_mask(exit_cpu, cpu_possible_map)
+ cnt +=
+ per_cpu(virtcnt, entry_cpu).exit[exit_cpu];
+ smp_rmb();
+ ent = per_cpu(virtcnt, entry_cpu).entry;
+ if (cnt == ent)
+ break;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ / 100);
+ }
+ }
+ up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_unregister);
+
+int virtinfo_notifier_call(int type, unsigned long n, void *data)
+{
+ int ret;
+ int entry_cpu, exit_cpu;
+ struct vnotifier_block *nb;
+
+ entry_cpu = get_cpu();
+ per_cpu(virtcnt, entry_cpu).entry++;
+ smp_wmb();
+ put_cpu();
+
+ nb = virtinfo_chain[type];
+ ret = NOTIFY_DONE;
+ while (nb)
+ {
+ ret = nb->notifier_call(nb, n, data, ret);
+ if(ret & NOTIFY_STOP_MASK) {
+ ret &= ~NOTIFY_STOP_MASK;
+ break;
+ }
+ nb = nb->next;
+ }
+
+ exit_cpu = get_cpu();
+ smp_wmb();
+ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++;
+ put_cpu();
+
+ return ret;
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_call);
+
/*
* set the priority of a task
* - the caller must hold the RCU read lock
@@ -190,10 +288,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
!(user = find_user(who)))
goto out_unlock; /* No processes for this user */
- do_each_thread(g, p)
+ do_each_thread_ve(g, p) {
if (__task_cred(p)->uid == who)
error = set_one_prio(p, niceval, error);
- while_each_thread(g, p);
+ } while_each_thread_ve(g, p);
if (who != cred->uid)
free_uid(user); /* For find_user() */
break;
@@ -253,13 +351,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
!(user = find_user(who)))
goto out_unlock; /* No processes for this user */
- do_each_thread(g, p)
+ do_each_thread_ve(g, p)
if (__task_cred(p)->uid == who) {
niceval = 20 - task_nice(p);
if (niceval > retval)
retval = niceval;
}
- while_each_thread(g, p);
+ while_each_thread_ve(g, p);
if (who != cred->uid)
free_uid(user); /* for find_user() */
break;
@@ -375,6 +473,25 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
magic2 != LINUX_REBOOT_MAGIC2C))
return -EINVAL;
+#ifdef CONFIG_VE
+ if (!ve_is_super(get_exec_env()))
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART:
+ case LINUX_REBOOT_CMD_HALT:
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ case LINUX_REBOOT_CMD_RESTART2:
+ force_sig(SIGKILL,
+ get_exec_env()->ve_ns->pid_ns->child_reaper);
+
+ case LINUX_REBOOT_CMD_CAD_ON:
+ case LINUX_REBOOT_CMD_CAD_OFF:
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+#endif
+
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
@@ -925,8 +1042,27 @@ void do_sys_times(struct tms *tms)
tms->tms_cstime = cputime_to_clock_t(cstime);
}
+#ifdef CONFIG_VE
+unsigned long long ve_relative_clock(struct timespec * ts)
+{
+ unsigned long long offset = 0;
+
+ if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec ||
+ (ts->tv_sec == get_exec_env()->start_timespec.tv_sec &&
+ ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec))
+ offset = (unsigned long long)(ts->tv_sec -
+ get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC
+ + ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec;
+ return nsec_to_clock_t(offset);
+}
+#endif
+
SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
{
+#ifdef CONFIG_VE
+ struct timespec now;
+#endif
+
if (tbuf) {
struct tms tmp;
@@ -934,8 +1070,15 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
return -EFAULT;
}
+#ifndef CONFIG_VE
force_successful_syscall_return();
return (long) jiffies_64_to_clock_t(get_jiffies_64());
+#else
+ /* Compare to calculation in fs/proc/array.c */
+ do_posix_clock_monotonic_gettime(&now);
+ force_successful_syscall_return();
+ return ve_relative_clock(&now);
+#endif
}
/*
@@ -1133,7 +1276,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
return -EPERM;
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
@@ -1182,7 +1325,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_VE_SYS_ADMIN))
return -EPERM;
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e06d0b8..7216e06 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -179,3 +179,17 @@ cond_syscall(sys_eventfd2);
/* performance counters: */
cond_syscall(sys_perf_event_open);
+cond_syscall(sys_getluid);
+cond_syscall(sys_setluid);
+cond_syscall(sys_setublimit);
+cond_syscall(compat_sys_setublimit);
+cond_syscall(sys_ubstat);
+cond_syscall(compat_sys_lutime);
+
+/* fairsched compat */
+cond_syscall(sys_fairsched_mknod);
+cond_syscall(sys_fairsched_rmnod);
+cond_syscall(sys_fairsched_mvpr);
+cond_syscall(sys_fairsched_vcpus);
+cond_syscall(sys_fairsched_chwt);
+cond_syscall(sys_fairsched_rate);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b8bd058..5b754e4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,7 @@
#include <linux/ftrace.h>
#include <linux/slow-work.h>
#include <linux/perf_event.h>
+#include <linux/ve_task.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -83,6 +84,21 @@ extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
+extern int ve_area_access_check; /* fs/namei.c */
+int ve_allow_kthreads = 1;
+EXPORT_SYMBOL(ve_allow_kthreads);
+
+#ifdef CONFIG_MAGIC_SYSRQ
+extern int sysrq_key_scancode;
+#endif
+
+extern int alloc_fail_warn;
+int decode_call_traces = 1;
+
+#ifdef CONFIG_VE
+int glob_ve_meminfo = 0;
+EXPORT_SYMBOL(glob_ve_meminfo);
+#endif
extern int latencytop_enabled;
extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifndef CONFIG_MMU
@@ -169,6 +185,12 @@ static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+static int proc_dointvec_ve(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+static int sysctl_data_ve(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen);
+
static struct ctl_table root_table[];
static struct ctl_table_root sysctl_table_root;
static struct ctl_table_header root_table_header = {
@@ -178,9 +200,31 @@ static struct ctl_table_header root_table_header = {
.root = &sysctl_table_root,
.set = &sysctl_table_root.default_set,
};
-static struct ctl_table_root sysctl_table_root = {
+
+#ifdef CONFIG_VE
+static int sysctl_root_perms(struct ctl_table_root *root,
+ struct nsproxy *namespaces, struct ctl_table *table)
+{
+ if (ve_is_super(get_exec_env()))
+ return table->mode;
+ else
+ return table->mode & ~0222;
+}
+
+static struct ctl_table_root sysctl_table_groot = {
.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
+ .default_set.list = LIST_HEAD_INIT(sysctl_table_groot.default_set.list),
+ .default_set.parent = &sysctl_table_root.default_set,
+};
+#else
+#define sysctl_root_perms NULL
+#define sysctl_table_groot sysctl_table_root
+#endif
+
+static struct ctl_table_root sysctl_table_root = {
+ .root_list = LIST_HEAD_INIT(sysctl_table_groot.root_list),
.default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+ .permissions = sysctl_root_perms,
};
static struct ctl_table kern_table[];
@@ -504,6 +548,20 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+ {
+ .procname = "silence-level",
+ .data = &console_silence_loglevel,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "alloc_fail_warn",
+ .data = &alloc_fail_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef __hppa__
{
.ctl_name = KERN_HPPA_PWRSW,
@@ -699,6 +757,24 @@ static struct ctl_table kern_table[] = {
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
+#ifdef CONFIG_VE
+ {
+ .procname = "ve_meminfo",
+ .data = &glob_ve_meminfo,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+#ifdef CONFIG_MAGIC_SYSRQ
+ {
+ .procname = "sysrq-key",
+ .data = &sysrq_key_scancode,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
{
.ctl_name = KERN_PANIC_ON_OOPS,
.procname = "panic_on_oops",
@@ -824,10 +900,13 @@ static struct ctl_table kern_table[] = {
{
.ctl_name = KERN_RANDOMIZE,
.procname = "randomize_va_space",
- .data = &randomize_va_space,
+ .data = &_randomize_va_space,
+ .extra1 = (void *)offsetof(struct ve_struct,
+ _randomize_va_space),
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_ve,
+ .strategy = &sysctl_data_ve,
},
#endif
#if defined(CONFIG_S390) && defined(CONFIG_SMP)
@@ -1424,6 +1503,21 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "vsyscall",
+ .data = &sysctl_at_vsyscall,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "odirect_enable",
+ .data = &odirect_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
@@ -1600,6 +1694,13 @@ static struct ctl_table fs_table[] = {
};
static struct ctl_table debug_table[] = {
+ {
+ .procname = "decode_call_traces",
+ .data = &decode_call_traces,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#if defined(CONFIG_X86) || defined(CONFIG_PPC)
{
.ctl_name = CTL_UNNUMBERED,
@@ -2150,10 +2251,27 @@ struct ctl_table_header *__register_sysctl_paths(
struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
struct ctl_table *table)
{
+ if (!ve_is_super(get_exec_env())) {
+ WARN_ON(1);
+ return NULL;
+ }
+
return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
path, table);
}
+struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path,
+ struct ctl_table *table, int virtual_handler)
+{
+ if (!ve_is_super(get_exec_env())) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ return __register_sysctl_paths(&sysctl_table_groot, current->nsproxy,
+ path, table);
+}
+
/**
* register_sysctl_table - register a sysctl table hierarchy
* @table: the top-level table structure
@@ -2170,6 +2288,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
return register_sysctl_paths(null_path, table);
}
+struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table,
+ int virtual_handler)
+{
+ static const struct ctl_path null_path[] = { {} };
+
+ return register_sysctl_glob_paths(null_path, table, virtual_handler);
+}
+
/**
* unregister_sysctl_table - unregister a sysctl table hierarchy
* @header: the header returned from register_sysctl_table
@@ -2231,6 +2357,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
return NULL;
}
+struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table,
+ int vh)
+{
+ return NULL;
+}
+
+struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path,
+ struct ctl_table *table, int vh)
+{
+ return NULL;
+}
+
void unregister_sysctl_table(struct ctl_table_header * table)
{
}
@@ -2902,6 +3040,25 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
return 0;
}
+#ifdef CONFIG_VE
+static int proc_dointvec_ve(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp_table;
+
+ tmp_table = *table;
+ tmp_table.data = (char *)get_exec_env() + (unsigned long)table->extra1;
+
+ return proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+}
+#else
+static int proc_dointvec_ve(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif /* CONFIG_VE */
+
#else /* CONFIG_PROC_FS */
int proc_dostring(struct ctl_table *table, int write,
@@ -2996,6 +3153,27 @@ int sysctl_data(struct ctl_table *table,
return 1;
}
+#ifdef CONFIG_VE
+static int sysctl_data_ve(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ struct ctl_table tmp_table;
+
+ tmp_table = *table;
+ tmp_table.data = (char *)get_exec_env() + (unsigned long)table->extra1;
+
+ return sysctl_data(&tmp_table, oldval, oldlenp, newval, newlen);
+}
+#else
+static int sysctl_data_ve(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return sysctl_data(table, oldval, oldlenp, newval, newlen);
+}
+#endif
+
/* The generic string strategy routine: */
int sysctl_string(struct ctl_table *table,
void __user *oldval, size_t __user *oldlenp,
@@ -3175,6 +3353,13 @@ int sysctl_data(struct ctl_table *table,
return -ENOSYS;
}
+static int sysctl_data_ve(struct ctl_table *table,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen)
+{
+ return -ENOSYS;
+}
+
int sysctl_string(struct ctl_table *table,
void __user *oldval, size_t __user *oldlenp,
void __user *newval, size_t newlen)
@@ -3236,6 +3421,56 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args)
return 0;
}
+#ifdef CONFIG_PID_NS
+#include <linux/pid_namespace.h>
+
+static int proc_pid_ns_hide_child(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int tmp, res;
+
+ tmp = (current->nsproxy->pid_ns->flags & PID_NS_HIDE_CHILD) ? 1 : 0;
+
+ res = __do_proc_dointvec(&tmp, table, write, buffer,
+ lenp, ppos, NULL, NULL);
+ if (res || !write)
+ return res;
+
+ if (tmp)
+ current->nsproxy->pid_ns->flags |= PID_NS_HIDE_CHILD;
+ else
+ current->nsproxy->pid_ns->flags &= ~PID_NS_HIDE_CHILD;
+ return 0;
+}
+
+static struct ctl_table pid_ns_kern_table[] = {
+ {
+ .procname = "pid_ns_hide_child",
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_pid_ns_hide_child,
+ },
+ {}
+};
+
+static struct ctl_table pid_ns_root_table[] = {
+ {
+ .ctl_name = CTL_KERN,
+ .procname = "kernel",
+ .mode = 0555,
+ .child = pid_ns_kern_table,
+ },
+ {}
+};
+
+static __init int pid_ns_sysctl_init(void)
+{
+ register_sysctl_table(pid_ns_root_table);
+ return 0;
+}
+postcore_initcall(pid_ns_sysctl_init);
+#endif /* CONFIG_PID_NS */
+
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
@@ -3249,7 +3484,9 @@ EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
EXPORT_SYMBOL(register_sysctl_table);
+EXPORT_SYMBOL(register_sysctl_glob_table);
EXPORT_SYMBOL(register_sysctl_paths);
+EXPORT_SYMBOL(register_sysctl_glob_paths);
EXPORT_SYMBOL(sysctl_intvec);
EXPORT_SYMBOL(sysctl_jiffies);
EXPORT_SYMBOL(sysctl_ms_jiffies);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea8384d..d1ff8ff 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -254,7 +254,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
stats->nvcsw += tsk->nvcsw;
stats->nivcsw += tsk->nivcsw;
- } while_each_thread(first, tsk);
+ } while_each_thread_all(first, tsk);
unlock_task_sighand(first, &flags);
rc = 0;
diff --git a/kernel/time.c b/kernel/time.c
index 2e2e469..c0cce6d 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -610,10 +610,12 @@ EXPORT_SYMBOL(jiffies_to_clock_t);
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ)==0
+ WARN_ON((long)x < 0);
if (x >= ~0UL / (HZ / USER_HZ))
return ~0UL;
return x * (HZ / USER_HZ);
#else
+ WARN_ON((long)x < 0);
/* Don't worry about loss of precision here .. */
if (x >= ~0UL / HZ * USER_HZ)
return ~0UL;
@@ -626,6 +628,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
u64 jiffies_64_to_clock_t(u64 x)
{
+ WARN_ON((s64)x < 0);
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
x = div_u64(x * USER_HZ, HZ);
@@ -648,6 +651,7 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t);
u64 nsec_to_clock_t(u64 x)
{
+ WARN_ON((s64)x < 0);
#if (NSEC_PER_SEC % USER_HZ) == 0
return div_u64(x, NSEC_PER_SEC / USER_HZ);
#elif (USER_HZ % 512) == 0
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8b709de..0af7669 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -154,6 +154,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
* used instead.
*/
struct timespec xtime __attribute__ ((aligned (16)));
+EXPORT_SYMBOL_GPL(xtime);
struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static struct timespec total_sleep_time;
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d..0ba4a86 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
#include <linux/kallsyms.h>
#include <linux/perf_event.h>
#include <linux/sched.h>
+#include <linux/virtinfo.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -1000,6 +1001,7 @@ static inline void __run_timers(struct tvec_base *base)
spin_unlock_irq(&base->lock);
{
int preempt_count = preempt_count();
+ struct ve_struct *ve;
#ifdef CONFIG_LOCKDEP
/*
@@ -1023,7 +1025,9 @@ static inline void __run_timers(struct tvec_base *base)
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer);
+ ve = set_exec_env(get_ve0());
fn(data);
+ (void)set_exec_env(ve);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
@@ -1441,20 +1445,35 @@ int do_sysinfo(struct sysinfo *info)
unsigned long mem_total, sav_total;
unsigned int mem_unit, bitcount;
struct timespec tp;
+ struct ve_struct *ve;
memset(info, 0, sizeof(struct sysinfo));
+ ve = get_exec_env();
ktime_get_ts(&tp);
monotonic_to_bootbased(&tp);
info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
- get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+ if (ve_is_super(ve)) {
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
- info->procs = nr_threads;
+ info->procs = nr_threads;
+ } else {
+ info->uptime -= ve->start_timespec.tv_sec;
+
+ info->procs = atomic_read(&ve->pcounter);
+
+ get_avenrun_ve(ve, info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+ }
si_meminfo(info);
si_swapinfo(info);
+#ifdef CONFIG_BEANCOUNTERS
+ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info)
+ & NOTIFY_FAIL)
+ return -ENOMSG;
+#endif
/*
* If the sum of all the available memory (i.e. ram + swap)
* is less than can be stored in a 32 bit unsigned long then
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0cccb6c..03d83f5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3091,7 +3091,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
}
read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
+ do_each_thread_all(g, t) {
if (start == end) {
ret = -EAGAIN;
goto unlock;
@@ -3105,7 +3105,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
smp_wmb();
t->ret_stack = ret_stack_list[start++];
}
- } while_each_thread(g, t);
+ } while_each_thread_all(g, t);
unlock:
read_unlock_irqrestore(&tasklist_lock, flags);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5..b328e97 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -596,11 +596,11 @@ void syscall_regfunc(void)
if (!sys_tracepoint_refcount) {
read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
+ do_each_thread_ve(g, t) {
/* Skip kernel threads. */
if (t->mm)
set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
+ } while_each_thread_ve(g, t);
read_unlock_irqrestore(&tasklist_lock, flags);
}
sys_tracepoint_refcount++;
@@ -614,9 +614,9 @@ void syscall_unregfunc(void)
sys_tracepoint_refcount--;
if (!sys_tracepoint_refcount) {
read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
+ do_each_thread_ve(g, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
+ } while_each_thread_ve(g, t);
read_unlock_irqrestore(&tasklist_lock, flags);
}
}
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165..23f4e4a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -323,6 +323,7 @@ static void cleanup_user_struct(struct work_struct *w)
done:
uids_mutex_unlock();
}
+EXPORT_SYMBOL_GPL(free_uid);
/* IRQs are disabled and uidhash_lock is held upon function entry.
* IRQ state (as stored in flags) is restored and uidhash_lock released
@@ -422,6 +423,7 @@ void free_uid(struct user_struct *up)
else
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(free_uid);
struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
{
@@ -488,13 +490,14 @@ out_unlock:
uids_mutex_unlock();
return NULL;
}
+EXPORT_SYMBOL_GPL(alloc_uid);
static int __init uid_cache_init(void)
{
int n;
uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
for(n = 0; n < UIDHASH_SZ; ++n)
INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8..3c24b38 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -59,6 +59,7 @@ int create_user_ns(struct cred *new)
return 0;
}
+EXPORT_SYMBOL(create_user_ns);
/*
* Deferred destructor for a user namespace. This is required because
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 69eae35..678dd61 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -26,6 +26,10 @@ static void *get_uts(ctl_table *table, int write)
down_read(&uts_sem);
else
down_write(&uts_sem);
+
+ if (strcmp(table->procname, "virt_osrelease") == 0)
+ return virt_utsname.release;
+
return which;
}
@@ -126,19 +130,27 @@ static struct ctl_table uts_kern_table[] = {
{}
};
-static struct ctl_table uts_root_table[] = {
+static struct ctl_table uts_virt_osrelease_table[] = {
{
- .ctl_name = CTL_KERN,
- .procname = "kernel",
- .mode = 0555,
- .child = uts_kern_table,
+ .procname = "virt_osrelease",
+ .data = virt_utsname.release,
+ .maxlen = sizeof(virt_utsname.release),
+ .mode = 0644,
+ .proc_handler = &proc_do_uts_string,
+ .strategy = sysctl_uts_string,
},
{}
};
+static struct ctl_path uts_path[] = {
+ { .ctl_name = CTL_KERN, .procname = "kernel", },
+ { }
+};
+
static int __init utsname_sysctl_init(void)
{
- register_sysctl_table(uts_root_table);
+ register_sysctl_glob_paths(uts_path, uts_kern_table, 1);
+ register_sysctl_paths(uts_path, uts_virt_osrelease_table);
return 0;
}
diff --git a/kernel/ve/Makefile b/kernel/ve/Makefile
new file mode 100644
index 0000000..9d60161
--- /dev/null
+++ b/kernel/ve/Makefile
@@ -0,0 +1,16 @@
+#
+#
+# kernel/ve/Makefile
+#
+# Copyright (C) 2000-2005 SWsoft
+# All rights reserved.
+#
+# Licensing governed by "linux/COPYING.SWsoft" file.
+
+obj-$(CONFIG_VE) = ve.o veowner.o hooks.o
+obj-$(CONFIG_VZ_WDOG) += vzwdog.o
+obj-$(CONFIG_VE_CALLS) += vzmon.o
+
+vzmon-objs = vecalls.o
+
+obj-$(CONFIG_VZ_DEV) += vzdev.o
diff --git a/kernel/ve/hooks.c b/kernel/ve/hooks.c
new file mode 100644
index 0000000..1b82c35
--- /dev/null
+++ b/kernel/ve/hooks.c
@@ -0,0 +1,114 @@
+/*
+ * linux/kernel/ve/hooks.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/ve.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ve_proto.h>
+#include <linux/module.h>
+
+static struct list_head ve_hooks[VE_MAX_CHAINS];
+static DECLARE_RWSEM(ve_hook_sem);
+
+void ve_hook_register(int chain, struct ve_hook *vh)
+{
+ struct list_head *lh;
+ struct ve_hook *tmp;
+
+ BUG_ON(chain > VE_MAX_CHAINS);
+
+ down_write(&ve_hook_sem);
+ list_for_each(lh, &ve_hooks[chain]) {
+ tmp = list_entry(lh, struct ve_hook, list);
+ if (vh->priority < tmp->priority)
+ break;
+ }
+
+ list_add_tail(&vh->list, lh);
+ up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_register);
+
+void ve_hook_unregister(struct ve_hook *vh)
+{
+ down_write(&ve_hook_sem);
+ list_del(&vh->list);
+ up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_unregister);
+
+static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve)
+{
+ int err;
+
+ err = 0;
+ if (try_module_get(vh->owner)) {
+ err = vh->init(ve);
+ module_put(vh->owner);
+ }
+ return err;
+}
+
+static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve)
+{
+ if (vh->fini != NULL && try_module_get(vh->owner)) {
+ vh->fini(ve);
+ module_put(vh->owner);
+ }
+}
+
+int ve_hook_iterate_init(int chain, void *ve)
+{
+ struct ve_hook *vh;
+ int err;
+
+ err = 0;
+
+ down_read(&ve_hook_sem);
+ list_for_each_entry(vh, &ve_hooks[chain], list)
+ if ((err = ve_hook_init(vh, ve)) < 0)
+ break;
+
+ if (err)
+ list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list)
+ ve_hook_fini(vh, ve);
+
+ up_read(&ve_hook_sem);
+ return err;
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_init);
+
+void ve_hook_iterate_fini(int chain, void *ve)
+{
+ struct ve_hook *vh;
+
+ down_read(&ve_hook_sem);
+ list_for_each_entry_reverse(vh, &ve_hooks[chain], list)
+ ve_hook_fini(vh, ve);
+ up_read(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_fini);
+
+static int __init ve_hooks_init(void)
+{
+ int i;
+
+ for (i = 0; i < VE_MAX_CHAINS; i++)
+ INIT_LIST_HEAD(&ve_hooks[i]);
+ return 0;
+}
+
+core_initcall(ve_hooks_init);
+
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
new file mode 100644
index 0000000..907d944
--- /dev/null
+++ b/kernel/ve/ve.c
@@ -0,0 +1,161 @@
+/*
+ * linux/kernel/ve/ve.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+/*
+ * 've.c' is a helper file performing VE sub-system initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/sys.h>
+#include <linux/kdev_t.h>
+#include <linux/termios.h>
+#include <linux/tty_driver.h>
+#include <linux/netdevice.h>
+#include <linux/utsname.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/ve_proto.h>
+#include <linux/devpts_fs.h>
+#include <linux/user_namespace.h>
+
+#include <linux/vzcalluser.h>
+
+unsigned long vz_rstamp = 0x37e0f59d;
+
+#ifdef CONFIG_MODULES
+struct module no_module = { .state = MODULE_STATE_GOING };
+EXPORT_SYMBOL(no_module);
+#endif
+
+#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS)
+void (*do_env_free_hook)(struct ve_struct *ve);
+EXPORT_SYMBOL(do_env_free_hook);
+
+void do_env_free(struct ve_struct *env)
+{
+ BUG_ON(atomic_read(&env->pcounter) > 0);
+ BUG_ON(env->is_running);
+
+ preempt_disable();
+ do_env_free_hook(env);
+ preempt_enable();
+}
+EXPORT_SYMBOL(do_env_free);
+#endif
+
+int (*do_ve_enter_hook)(struct ve_struct *ve, unsigned int flags);
+EXPORT_SYMBOL(do_ve_enter_hook);
+
+struct ve_struct ve0 = {
+ .counter = ATOMIC_INIT(1),
+ .pcounter = ATOMIC_INIT(1),
+ .ve_list = LIST_HEAD_INIT(ve0.ve_list),
+ .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh),
+ .start_jiffies = INITIAL_JIFFIES,
+ .ve_ns = &init_nsproxy,
+ .ve_netns = &init_net,
+ .user_ns = &init_user_ns,
+ .is_running = 1,
+ .op_sem = __RWSEM_INITIALIZER(ve0.op_sem),
+#ifdef CONFIG_VE_IPTABLES
+ .ipt_mask = VE_IP_ALL,
+ ._iptables_modules = VE_IP_ALL,
+#endif
+ .features = VE_FEATURE_SIT | VE_FEATURE_IPIP |
+ VE_FEATURE_PPP,
+ ._randomize_va_space =
+#ifdef CONFIG_COMPAT_BRK
+ 1,
+#else
+ 2,
+#endif
+};
+
+EXPORT_SYMBOL(ve0);
+
+LIST_HEAD(ve_list_head);
+rwlock_t ve_list_lock = RW_LOCK_UNLOCKED;
+
+LIST_HEAD(ve_cleanup_list);
+DEFINE_SPINLOCK(ve_cleanup_lock);
+struct task_struct *ve_cleanup_thread;
+
+EXPORT_SYMBOL(ve_list_lock);
+EXPORT_SYMBOL(ve_list_head);
+EXPORT_SYMBOL(ve_cleanup_lock);
+EXPORT_SYMBOL(ve_cleanup_list);
+EXPORT_SYMBOL(ve_cleanup_thread);
+
+static DEFINE_PER_CPU(struct ve_cpu_stats, ve0_cpustats);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, ve0_lat_stats);
+
+void init_ve0(void)
+{
+ struct ve_struct *ve;
+
+ ve = get_ve0();
+ ve->cpu_stats = &per_cpu__ve0_cpustats;
+ ve->sched_lat_ve.cur = &per_cpu__ve0_lat_stats;
+ list_add(&ve->ve_list, &ve_list_head);
+}
+
+void ve_cleanup_schedule(struct ve_struct *ve)
+{
+ BUG_ON(ve_cleanup_thread == NULL);
+
+ spin_lock(&ve_cleanup_lock);
+ list_add_tail(&ve->cleanup_list, &ve_cleanup_list);
+ spin_unlock(&ve_cleanup_lock);
+
+ wake_up_process(ve_cleanup_thread);
+}
+
+#ifdef CONFIG_BLK_CGROUP
+extern int blkiocg_set_weight(struct cgroup *cgroup, u64 val);
+
+static u64 ioprio_weight[VE_IOPRIO_MAX] = {200, 275, 350, 425, 500, 575, 650, 725};
+
+int ve_set_ioprio(int veid, int ioprio)
+{
+ struct ve_struct *ve;
+ int ret;
+
+ if (ioprio < VE_IOPRIO_MIN || ioprio >= VE_IOPRIO_MAX)
+ return -ERANGE;
+
+ ret = -ESRCH;
+ read_lock(&ve_list_lock);
+ for_each_ve(ve) {
+ if (ve->veid != veid)
+ continue;
+ ret = blkiocg_set_weight(ve->ve_cgroup, ioprio_weight[ioprio]);
+ break;
+ }
+ read_unlock(&ve_list_lock);
+
+ return ret;
+}
+#else
+int ve_set_ioprio(int veid, int ioprio)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_BLK_CGROUP */
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
new file mode 100644
index 0000000..9947b57
--- /dev/null
+++ b/kernel/ve/vecalls.c
@@ -0,0 +1,2335 @@
+/*
+ * linux/kernel/ve/vecalls.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ */
+
+/*
+ * 'vecalls.c' is a file with basic VE support. It provides basic primitives
+ * along with the initialization script.
+ */
+
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sys.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/mnt_namespace.h>
+#include <linux/termios.h>
+#include <linux/tty_driver.h>
+#include <linux/netdevice.h>
+#include <linux/wait.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/utsname.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/devpts_fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/user_namespace.h>
+#include <linux/sysfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/suspend.h>
+#include <linux/rcupdate.h>
+#include <linux/in.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+#include <linux/pid.h>
+#include <net/pkt_sched.h>
+#include <bc/beancounter.h>
+#include <linux/nsproxy.h>
+#include <linux/kobject.h>
+#include <linux/freezer.h>
+#include <linux/pid_namespace.h>
+#include <linux/tty.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/oom.h>
+
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
+#include <net/arp.h>
+#include <net/ipv6.h>
+
+#include <linux/ve_proto.h>
+#include <linux/venet.h>
+#include <linux/vzctl.h>
+#include <linux/vzcalluser.h>
+#ifdef CONFIG_VZ_FAIRSCHED
+#include <linux/fairsched.h>
+#endif
+
+#include <linux/virtinfo.h>
+#include <linux/utsrelease.h>
+#include <linux/major.h>
+
+int nr_ve = 1; /* One VE always exists. Compatibility with vestat */
+EXPORT_SYMBOL(nr_ve);
+
+static int do_env_enter(struct ve_struct *ve, unsigned int flags);
+static int alloc_ve_tty_drivers(struct ve_struct* ve);
+static void free_ve_tty_drivers(struct ve_struct* ve);
+static int register_ve_tty_drivers(struct ve_struct* ve);
+static void unregister_ve_tty_drivers(struct ve_struct* ve);
+static int init_ve_tty_drivers(struct ve_struct *);
+static void fini_ve_tty_drivers(struct ve_struct *);
+static void clear_termios(struct tty_driver* driver );
+
+static void vecalls_exit(void);
+
+struct ve_struct *__find_ve_by_id(envid_t veid)
+{
+ struct ve_struct *ve;
+
+ for_each_ve(ve) {
+ if (ve->veid == veid)
+ return ve;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(__find_ve_by_id);
+
+struct ve_struct *get_ve_by_id(envid_t veid)
+{
+ struct ve_struct *ve;
+ read_lock(&ve_list_lock);
+ ve = __find_ve_by_id(veid);
+ get_ve(ve);
+ read_unlock(&ve_list_lock);
+ return ve;
+}
+EXPORT_SYMBOL(get_ve_by_id);
+
+/*
+ * real_put_ve() MUST be used instead of put_ve() inside vecalls.
+ */
+static void real_do_env_free(struct ve_struct *ve);
+static inline void real_put_ve(struct ve_struct *ve)
+{
+ if (ve && atomic_dec_and_test(&ve->counter)) {
+ BUG_ON(atomic_read(&ve->pcounter) > 0);
+ BUG_ON(ve->is_running);
+ real_do_env_free(ve);
+ }
+}
+
+static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf)
+{
+ struct ve_struct *ve;
+ struct vz_cpu_stat *vstat;
+ int retval;
+ int i, cpu;
+ unsigned long tmp;
+
+ if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid))
+ return -EPERM;
+ if (veid == 0)
+ return -ESRCH;
+
+ vstat = kzalloc(sizeof(*vstat), GFP_KERNEL);
+ if (!vstat)
+ return -ENOMEM;
+
+ retval = -ESRCH;
+ read_lock(&ve_list_lock);
+ ve = __find_ve_by_id(veid);
+ if (ve == NULL)
+ goto out_unlock;
+ for_each_online_cpu(cpu) {
+ struct ve_cpu_stats *st;
+
+ st = VE_CPU_STATS(ve, cpu);
+ vstat->user_jif += (unsigned long)cputime64_to_clock_t(st->user);
+ vstat->nice_jif += (unsigned long)cputime64_to_clock_t(st->nice);
+ vstat->system_jif += (unsigned long)cputime64_to_clock_t(st->system);
+ vstat->idle_clk += ve_sched_get_idle_time(ve, cpu);
+ }
+ vstat->uptime_clk = get_cycles() - ve->start_cycles;
+ vstat->uptime_jif = (unsigned long)cputime64_to_clock_t(
+ get_jiffies_64() - ve->start_jiffies);
+ for (i = 0; i < 3; i++) {
+ tmp = ve->avenrun[i] + (FIXED_1/200);
+ vstat->avenrun[i].val_int = LOAD_INT(tmp);
+ vstat->avenrun[i].val_frac = LOAD_FRAC(tmp);
+ }
+ read_unlock(&ve_list_lock);
+
+ retval = 0;
+ if (copy_to_user(buf, vstat, sizeof(*vstat)))
+ retval = -EFAULT;
+out_free:
+ kfree(vstat);
+ return retval;
+
+out_unlock:
+ read_unlock(&ve_list_lock);
+ goto out_free;
+}
+
+static int real_setdevperms(envid_t veid, unsigned type,
+ dev_t dev, unsigned mask)
+{
+ struct ve_struct *ve;
+ int err;
+
+ if (!capable_setveid() || veid == 0)
+ return -EPERM;
+
+ if ((ve = get_ve_by_id(veid)) == NULL)
+ return -ESRCH;
+
+ down_read(&ve->op_sem);
+ err = -ESRCH;
+ if (ve->is_running)
+ err = set_device_perms_ve(ve, type, dev, mask);
+ up_read(&ve->op_sem);
+ real_put_ve(ve);
+ return err;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE start: subsystems
+ *
+ **********************************************************************
+ **********************************************************************/
+
+static int prepare_proc_root(struct ve_struct *ve)
+{
+ struct proc_dir_entry *de;
+
+ de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL);
+ if (de == NULL)
+ return -ENOMEM;
+
+ memcpy(de + 1, "/proc", 6);
+ de->name = (char *)(de + 1);
+ de->namelen = 5;
+ de->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ de->nlink = 2;
+ atomic_set(&de->count, 1);
+
+ ve->proc_root = de;
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static int init_ve_proc(struct ve_struct *ve)
+{
+ int err;
+
+ err = prepare_proc_root(ve);
+ if (err)
+ goto out_root;
+
+ err = register_ve_fs_type(ve, &proc_fs_type,
+ &ve->proc_fstype, &ve->proc_mnt);
+ if (err)
+ goto out_reg;
+
+#ifdef CONFIG_PRINTK
+ proc_create("kmsg", S_IRUSR, ve->proc_root, &proc_kmsg_operations);
+#endif
+ proc_mkdir("vz", ve->proc_root);
+
+ ve->ve_ns->pid_ns->proc_mnt = mntget(ve->proc_mnt);
+ return 0;
+
+out_reg:
+ /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */
+ ;
+out_root:
+ return err;
+}
+
+static void fini_ve_proc(struct ve_struct *ve)
+{
+ remove_proc_entry("vz", ve->proc_root);
+ remove_proc_entry("kmsg", ve->proc_root);
+ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt);
+ ve->proc_mnt = NULL;
+}
+
+static void free_ve_proc(struct ve_struct *ve)
+{
+ /* The proc filesystem frees proc_dir_entries only on remove_proc_entry(),
+ so we check that everything was removed and nothing was lost */
+ if (ve->proc_root && ve->proc_root->subdir) {
+ struct proc_dir_entry *p = ve->proc_root;
+ printk(KERN_WARNING "CT: %d: proc entry /proc", ve->veid);
+ while ((p = p->subdir) != NULL)
+ printk("/%s", p->name);
+ printk(" is not removed!\n");
+ }
+
+ kfree(ve->proc_root);
+ kfree(ve->proc_fstype);
+
+ ve->proc_fstype = NULL;
+ ve->proc_root = NULL;
+}
+#else
+#define init_ve_proc(ve) (0)
+#define fini_ve_proc(ve) do { } while (0)
+#define free_ve_proc(ve) do { } while (0)
+#endif
+
+#ifdef CONFIG_UNIX98_PTYS
+#include <linux/devpts_fs.h>
+
+/*
+ * DEVPTS needs virtualization: each environment should see its own list of
+ * pseudo-terminals.
+ * To implement it we need to have separate devpts superblocks for each
+ * VE, and each VE should mount its own one.
+ * Thus, separate vfsmount structures are required.
+ * To minimize intrusion into vfsmount lookup code, separate file_system_type
+ * structures are created.
+ *
+ * In addition to this, a patch for the character device itself is required, as
+ * the filesystem itself is used only for MINOR/MAJOR lookup.
+ */
+
+static int init_ve_devpts(struct ve_struct *ve)
+{
+ return register_ve_fs_type(ve, &devpts_fs_type,
+ &ve->devpts_fstype, &ve->devpts_mnt);
+}
+
+static void fini_ve_devpts(struct ve_struct *ve)
+{
+ unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt);
+}
+#else
+#define init_ve_devpts(ve) (0)
+#define fini_ve_devpts(ve) do { } while (0)
+#endif
+
+static int init_ve_shmem(struct ve_struct *ve)
+{
+ return register_ve_fs_type(ve,
+ &tmpfs_fs_type,
+ &ve->shmem_fstype,
+ &ve->shmem_mnt);
+}
+
+static void fini_ve_shmem(struct ve_struct *ve)
+{
+ unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt);
+ /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */
+ ve->shmem_mnt = NULL;
+}
+
+#ifdef CONFIG_SYSFS
+static int init_ve_sysfs_root(struct ve_struct *ve)
+{
+ struct sysfs_dirent *sysfs_root;
+
+ sysfs_root = kzalloc(sizeof(struct sysfs_dirent), GFP_KERNEL);
+ if (sysfs_root == NULL)
+ return -ENOMEM;
+ sysfs_root->s_name = "";
+ atomic_set(&sysfs_root->s_count, 1);
+ sysfs_root->s_flags = SYSFS_DIR;
+ sysfs_root->s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ sysfs_root->s_ino = 1;
+
+ ve->_sysfs_root = sysfs_root;
+ return 0;
+}
+#endif
+
+#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
+extern struct device_attribute ve_net_class_attributes[];
+static inline int init_ve_netclass(void)
+{
+ struct class *nc;
+ int err;
+
+ nc = kzalloc(sizeof(*nc), GFP_KERNEL);
+ if (!nc)
+ return -ENOMEM;
+
+ nc->name = net_class.name;
+ nc->dev_release = net_class.dev_release;
+ nc->dev_uevent = net_class.dev_uevent;
+ nc->dev_attrs = ve_net_class_attributes;
+
+ err = class_register(nc);
+ if (!err) {
+ get_exec_env()->net_class = nc;
+ return 0;
+ }
+ kfree(nc);
+ return err;
+}
+
+static inline void fini_ve_netclass(void)
+{
+ struct ve_struct *ve = get_exec_env();
+
+ class_unregister(ve->net_class);
+ kfree(ve->net_class);
+ ve->net_class = NULL;
+}
+#else
+static inline int init_ve_netclass(void) { return 0; }
+static inline void fini_ve_netclass(void) { ; }
+#endif
+
+static const struct {
+ unsigned minor;
+ char *name;
+} mem_class_devices [] = {
+ {3, "null"},
+ {5, "zero"},
+ {7, "full"},
+ {8, "random"},
+ {9, "urandom"},
+ {0, NULL},
+};
+
+static int init_ve_mem_class(void)
+{
+ int i;
+ struct class *ve_mem_class;
+
+ ve_mem_class = class_create(THIS_MODULE, "mem");
+ if (IS_ERR(ve_mem_class))
+ return -ENOMEM;
+
+ for (i = 0; mem_class_devices[i].name; i++)
+ device_create(ve_mem_class, NULL,
+ MKDEV(MEM_MAJOR, mem_class_devices[i].minor),
+ NULL, mem_class_devices[i].name);
+
+ get_exec_env()->mem_class = ve_mem_class;
+ return 0;
+}
+
+
+void fini_ve_mem_class(void)
+{
+ int i;
+ struct class *ve_mem_class = get_exec_env()->mem_class;
+
+ for (i = 0; mem_class_devices[i].name; i++)
+ device_destroy(ve_mem_class,
+ MKDEV(MEM_MAJOR, mem_class_devices[i].minor));
+ class_destroy(ve_mem_class);
+}
+
+static int init_ve_sysfs(struct ve_struct *ve)
+{
+ int err;
+
+#ifdef CONFIG_SYSFS
+ err = 0;
+ if (ve->features & VE_FEATURE_SYSFS) {
+ err = init_ve_sysfs_root(ve);
+ if (err != 0)
+ goto out;
+ err = register_ve_fs_type(ve,
+ &sysfs_fs_type,
+ &ve->sysfs_fstype,
+ &ve->sysfs_mnt);
+ if (err != 0)
+ goto out_fs_type;
+ }
+#endif
+
+ err = classes_init();
+ if (err != 0)
+ goto err_classes;
+
+ err = devices_init();
+ if (err != 0)
+ goto err_devices;
+
+ err = init_ve_netclass();
+ if (err != 0)
+ goto err_net;
+
+ err = init_ve_tty_class();
+ if (err != 0)
+ goto err_tty;
+
+ err = init_ve_mem_class();
+ if (err != 0)
+ goto err_mem;
+
+ return 0;
+
+err_mem:
+ fini_ve_tty_class();
+err_tty:
+ fini_ve_netclass();
+err_net:
+ devices_fini();
+err_devices:
+ classes_fini();
+err_classes:
+#ifdef CONFIG_SYSFS
+ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
+ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
+out_fs_type:
+ kfree(ve->_sysfs_root);
+ ve->_sysfs_root = NULL;
+out:
+#endif
+ return err;
+}
+
+static void fini_ve_sysfs(struct ve_struct *ve)
+{
+ fini_ve_mem_class();
+ fini_ve_tty_class();
+ fini_ve_netclass();
+ devices_fini();
+ classes_fini();
+#ifdef CONFIG_SYSFS
+ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
+ ve->sysfs_mnt = NULL;
+ kfree(ve->_sysfs_root);
+ ve->_sysfs_root = NULL;
+ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
+#endif
+}
+
+static void free_ve_filesystems(struct ve_struct *ve)
+{
+#ifdef CONFIG_SYSFS
+ kfree(ve->sysfs_fstype);
+ ve->sysfs_fstype = NULL;
+#endif
+ kfree(ve->shmem_fstype);
+ ve->shmem_fstype = NULL;
+
+ kfree(ve->devpts_fstype);
+ ve->devpts_fstype = NULL;
+
+ free_ve_proc(ve);
+}
+
+static int init_printk(struct ve_struct *ve)
+{
+ struct ve_prep_printk {
+ wait_queue_head_t log_wait;
+ unsigned log_start;
+ unsigned log_end;
+ unsigned logged_chars;
+ } *tmp;
+
+ tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ init_waitqueue_head(&tmp->log_wait);
+ ve->_log_wait = &tmp->log_wait;
+ ve->_log_start = &tmp->log_start;
+ ve->_log_end = &tmp->log_end;
+ ve->_logged_chars = &tmp->logged_chars;
+ /* ve->log_buf will be initialized later by ve_log_init() */
+ return 0;
+}
+
+static void fini_printk(struct ve_struct *ve)
+{
+ /*
+ * there is no spinlock protection here because nobody can use
+ * log_buf at the moment when this code is called.
+ */
+ kfree(ve->log_buf);
+ kfree(ve->_log_wait);
+}
+
+static void fini_venet(struct ve_struct *ve)
+{
+#ifdef CONFIG_INET
+ tcp_v4_kill_ve_sockets(ve);
+ synchronize_net();
+#endif
+}
+
+static int init_ve_sched(struct ve_struct *ve)
+{
+ int err;
+
+ err = fairsched_new_node(ve->veid, 0);
+ if (err == 0)
+ ve_sched_attach(ve);
+
+ return err;
+}
+
+static void fini_ve_sched(struct ve_struct *ve)
+{
+ fairsched_drop_node(ve->veid);
+}
+
+/*
+ * Namespaces
+ */
+
+static inline int init_ve_namespaces(struct ve_struct *ve,
+ struct nsproxy **old)
+{
+ int err;
+ struct task_struct *tsk;
+ struct nsproxy *cur;
+
+ tsk = current;
+ cur = tsk->nsproxy;
+
+ err = copy_namespaces(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID,
+ tsk, 1);
+ if (err < 0)
+ return err;
+
+ ve->ve_ns = get_nsproxy(tsk->nsproxy);
+ memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release,
+ sizeof(virt_utsname.release));
+
+ if (cur->pid_ns->flags & PID_NS_HIDE_CHILD)
+ ve->ve_ns->pid_ns->flags |= PID_NS_HIDDEN;
+
+ *old = cur;
+ return 0;
+}
+
+static inline void fini_ve_namespaces(struct ve_struct *ve,
+ struct nsproxy *old)
+{
+ struct task_struct *tsk = current;
+ struct nsproxy *tmp;
+
+ if (old) {
+ tmp = tsk->nsproxy;
+ tsk->nsproxy = get_nsproxy(old);
+ put_nsproxy(tmp);
+ tmp = ve->ve_ns;
+ ve->ve_ns = get_nsproxy(old);
+ put_nsproxy(tmp);
+ } else {
+ put_user_ns(ve->user_ns);
+ put_nsproxy(ve->ve_ns);
+ ve->ve_ns = NULL;
+ }
+}
+
+static int init_ve_netns(struct ve_struct *ve, struct nsproxy **old)
+{
+ int err;
+ struct task_struct *tsk;
+ struct nsproxy *cur;
+
+ tsk = current;
+ cur = tsk->nsproxy;
+
+ err = copy_namespaces(CLONE_NEWNET, tsk, 1);
+ if (err < 0)
+ return err;
+
+ put_nsproxy(ve->ve_ns);
+ ve->ve_ns = get_nsproxy(tsk->nsproxy);
+ ve->ve_netns = get_net(ve->ve_ns->net_ns);
+ *old = cur;
+ return 0;
+}
+
+static inline void switch_ve_namespaces(struct ve_struct *ve,
+ struct task_struct *tsk)
+{
+ struct nsproxy *old_ns;
+ struct nsproxy *new_ns;
+
+ BUG_ON(tsk != current);
+ old_ns = tsk->nsproxy;
+ new_ns = ve->ve_ns;
+
+ if (old_ns != new_ns) {
+ tsk->nsproxy = get_nsproxy(new_ns);
+ put_nsproxy(old_ns);
+ }
+}
+
+static __u64 get_ve_features(env_create_param_t *data, int datalen)
+{
+ __u64 known_features;
+
+ if (datalen < sizeof(struct env_create_param3))
+ /* this version of vzctl is aware of VE_FEATURES_OLD only */
+ known_features = VE_FEATURES_OLD;
+ else
+ known_features = data->known_features;
+
+ /*
+	 * Features known to the calling vzctl are set as requested;
+	 * features it does not yet know about default to VE_FEATURES_DEF.
+ */
+ return (data->feature_mask & known_features) |
+ (VE_FEATURES_DEF & ~known_features);
+}
+
+static int init_ve_struct(struct ve_struct *ve, envid_t veid,
+ u32 class_id, env_create_param_t *data, int datalen)
+{
+ (void)get_ve(ve);
+ ve->veid = veid;
+ ve->class_id = class_id;
+ ve->features = get_ve_features(data, datalen);
+ INIT_LIST_HEAD(&ve->vetask_lh);
+ init_rwsem(&ve->op_sem);
+
+ ve->start_timespec = current->start_time;
+ /* The value is wrong, but it is never compared to process
+ * start times */
+ ve->start_jiffies = get_jiffies_64();
+ ve->start_cycles = get_cycles();
+
+ ve->_randomize_va_space = ve0._randomize_va_space;
+
+ return 0;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * /proc/meminfo virtualization
+ *
+ **********************************************************************
+ **********************************************************************/
+static int ve_set_meminfo(envid_t veid, unsigned long val)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ struct ve_struct *ve;
+
+ ve = get_ve_by_id(veid);
+ if (!ve)
+ return -EINVAL;
+
+ if (val == 0)
+ val = VE_MEMINFO_SYSTEM;
+ else if (val == 1)
+ val = VE_MEMINFO_DEFAULT;
+
+ ve->meminfo_val = val;
+ real_put_ve(ve);
+ return 0;
+#else
+ return -ENOTTY;
+#endif
+}
+
+static int init_ve_meminfo(struct ve_struct *ve)
+{
+ ve->meminfo_val = VE_MEMINFO_DEFAULT;
+ return 0;
+}
+
+static inline void fini_ve_meminfo(struct ve_struct *ve)
+{
+}
+
+static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk)
+{
+ read_lock(&tsk->fs->lock);
+ ve->root_path = tsk->fs->root;
+ read_unlock(&tsk->fs->lock);
+ mark_tree_virtual(&ve->root_path);
+}
+
+static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk)
+{
+ /* required for real_setdevperms from register_ve_<fs> above */
+ memcpy(&ve->ve_cap_bset, &tsk->cred->cap_effective, sizeof(kernel_cap_t));
+}
+
+static int ve_list_add(struct ve_struct *ve)
+{
+ write_lock_irq(&ve_list_lock);
+ if (__find_ve_by_id(ve->veid) != NULL)
+ goto err_exists;
+
+ list_add(&ve->ve_list, &ve_list_head);
+ nr_ve++;
+ write_unlock_irq(&ve_list_lock);
+ return 0;
+
+err_exists:
+ write_unlock_irq(&ve_list_lock);
+ return -EEXIST;
+}
+
+static void ve_list_del(struct ve_struct *ve)
+{
+ write_lock_irq(&ve_list_lock);
+ list_del(&ve->ve_list);
+ nr_ve--;
+ write_unlock_irq(&ve_list_lock);
+}
+
+static void set_task_ve_caps(struct ve_struct *ve, struct cred *new)
+{
+ const struct cred *cur;
+ kernel_cap_t bset;
+
+ bset = ve->ve_cap_bset;
+ cur = current_cred();
+ new->cap_effective = cap_intersect(cur->cap_effective, bset);
+ new->cap_inheritable = cap_intersect(cur->cap_inheritable, bset);
+ new->cap_permitted = cap_intersect(cur->cap_permitted, bset);
+ new->cap_bset = cap_intersect(cur->cap_bset, bset);
+
+ if (commit_creds(new))
+ /* too late to rollback, but commit currently just works */
+ BUG();
+}
+
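+/*
+ * Move the current task (a single-threaded group leader) into a new VE:
+ * apply the VE capability bounding set, drop OOM protection, relink the
+ * task onto the new VE's task list under tasklist_lock, fix up the per-VE
+ * process counters and refcounts, and switch its css_set.
+ */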
+void ve_move_task(struct task_struct *tsk, struct ve_struct *new, struct cred *new_creds)
+{
+ struct ve_struct *old;
+
+ might_sleep();
+ BUG_ON(tsk != current);
+ BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk)));
+
+	/* this prohibits ptracing of a task entered into a VE from the host system */
+ if (tsk->mm)
+ tsk->mm->vps_dumpable = 0;
+ /* setup capabilities before enter */
+ set_task_ve_caps(new, new_creds);
+
+ /* Drop OOM protection. */
+ if (tsk->signal->oom_adj == OOM_DISABLE)
+ tsk->signal->oom_adj = 0;
+
+ old = tsk->ve_task_info.owner_env;
+ tsk->ve_task_info.owner_env = new;
+ tsk->ve_task_info.exec_env = new;
+
+ write_lock_irq(&tasklist_lock);
+ list_del_rcu(&tsk->ve_task_info.vetask_list);
+ write_unlock_irq(&tasklist_lock);
+
+ synchronize_rcu();
+
+ write_lock_irq(&tasklist_lock);
+ list_add_tail_rcu(&tsk->ve_task_info.vetask_list,
+ &new->vetask_lh);
+ write_unlock_irq(&tasklist_lock);
+
+ atomic_dec(&old->pcounter);
+ real_put_ve(old);
+
+ atomic_inc(&new->pcounter);
+ get_ve(new);
+
+ cgroup_set_task_css(tsk, new->ve_css_set);
+
+ new->user_ns = get_user_ns(new_creds->user->user_ns);
+}
+
+EXPORT_SYMBOL(ve_move_task);
+
+#ifdef CONFIG_VE_IPTABLES
+
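+/*
+ * Derive the effective iptables mask from what userspace passed in: the
+ * IPv6 and module bits old vzctl does not supply are cleared first, then
+ * IPv6 filtering/mangling, NAT and conntrack are enabled whenever the
+ * corresponding IPv4 feature is allowed by the mask.
+ */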
+static __u64 setup_iptables_mask(__u64 init_mask)
+{
+	/* Remove once userspace starts supplying IPv6-related bits. */
+ init_mask &= ~VE_IP_IPTABLES6;
+ init_mask &= ~VE_IP_FILTER6;
+ init_mask &= ~VE_IP_MANGLE6;
+ init_mask &= ~VE_IP_IPTABLE_NAT_MOD;
+ init_mask &= ~VE_NF_CONNTRACK_MOD;
+
+ if (mask_ipt_allow(init_mask, VE_IP_IPTABLES))
+ init_mask |= VE_IP_IPTABLES6;
+ if (mask_ipt_allow(init_mask, VE_IP_FILTER))
+ init_mask |= VE_IP_FILTER6;
+ if (mask_ipt_allow(init_mask, VE_IP_MANGLE))
+ init_mask |= VE_IP_MANGLE6;
+ if (mask_ipt_allow(init_mask, VE_IP_NAT))
+ init_mask |= VE_IP_IPTABLE_NAT;
+ if (mask_ipt_allow(init_mask, VE_IP_CONNTRACK))
+ init_mask |= VE_NF_CONNTRACK;
+
+ return init_mask;
+}
+
+#endif
+
+static inline int init_ve_cpustats(struct ve_struct *ve)
+{
+ ve->cpu_stats = alloc_percpu(struct ve_cpu_stats);
+ if (ve->cpu_stats == NULL)
+ return -ENOMEM;
+ ve->sched_lat_ve.cur = alloc_percpu(struct kstat_lat_pcpu_snap_struct);
+	if (ve->sched_lat_ve.cur == NULL)
+ goto fail;
+ return 0;
+
+fail:
+ free_percpu(ve->cpu_stats);
+ return -ENOMEM;
+}
+
+static inline void free_ve_cpustats(struct ve_struct *ve)
+{
+ free_percpu(ve->cpu_stats);
+ ve->cpu_stats = NULL;
+ free_percpu(ve->sched_lat_ve.cur);
+ ve->sched_lat_ve.cur = NULL;
+}
+
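+/*
+ * Return 1 if no other task shares this task's process group or session
+ * (checked under tasklist_lock), 0 otherwise.
+ */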
+static int alone_in_pgrp(struct task_struct *tsk)
+{
+ struct task_struct *p;
+ int alone = 0;
+
+ read_lock(&tasklist_lock);
+ do_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p) {
+ if (p != tsk)
+ goto out;
+ } while_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p);
+ do_each_pid_task(task_pid(tsk), PIDTYPE_SID, p) {
+ if (p != tsk)
+ goto out;
+ } while_each_pid_task(task_pid(tsk), PIDTYPE_SID, p);
+ alone = 1;
+out:
+ read_unlock(&tasklist_lock);
+ return alone;
+}
+
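+/*
+ * Create and start a new VE: check that the caller is a lone group/session
+ * leader without a controlling tty, allocate the ve_struct and set up the
+ * per-VE subsystems (cpu stats, scheduler node, sysfs, namespaces, proc,
+ * netns, cgroups, ttys, shmem, devpts, meminfo), then move the calling
+ * task inside. On any failure the subsystems are torn down in reverse order.
+ */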
+static int do_env_create(envid_t veid, unsigned int flags, u32 class_id,
+ env_create_param_t *data, int datalen)
+{
+ struct task_struct *tsk;
+ struct cred *new_creds;
+ struct ve_struct *old;
+ struct ve_struct *old_exec;
+ struct ve_struct *ve;
+ __u64 init_mask;
+ int err;
+ struct nsproxy *old_ns, *old_ns_net;
+ DECLARE_COMPLETION_ONSTACK(sysfs_completion);
+
+ tsk = current;
+ old = VE_TASK_INFO(tsk)->owner_env;
+
+ if (!thread_group_leader(tsk) || !thread_group_empty(tsk))
+ return -EINVAL;
+
+ if (tsk->signal->tty) {
+ printk("ERR: CT init has controlling terminal\n");
+ return -EINVAL;
+ }
+ if (task_pgrp(tsk) != task_pid(tsk) ||
+ task_session(tsk) != task_pid(tsk)) {
+ int may_setsid;
+
+ read_lock(&tasklist_lock);
+ may_setsid = !tsk->signal->leader &&
+ !pid_task(find_pid_ns(task_pid_nr(tsk), &init_pid_ns), PIDTYPE_PGID);
+ read_unlock(&tasklist_lock);
+
+ if (!may_setsid) {
+ printk("ERR: CT init is process group leader\n");
+ return -EINVAL;
+ }
+ }
+ /* Check that the process is not a leader of non-empty group/session.
+ * If it is, we cannot virtualize its PID and must fail. */
+ if (!alone_in_pgrp(tsk)) {
+ printk("ERR: CT init is not alone in process group\n");
+ return -EINVAL;
+ }
+
+
+ VZTRACE("%s: veid=%d classid=%d pid=%d\n",
+ __FUNCTION__, veid, class_id, current->pid);
+
+ err = -ENOMEM;
+ ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL);
+ if (ve == NULL)
+ goto err_struct;
+
+ init_ve_struct(ve, veid, class_id, data, datalen);
+ __module_get(THIS_MODULE);
+ down_write(&ve->op_sem);
+ if (flags & VE_LOCK)
+ ve->is_locked = 1;
+
+ /*
+	 * This must be done before adding the VE to the list: calc_load_ve
+	 * walks that list and is not prepared to find a VE without cpu stats.
+ */
+ if ((err = init_ve_cpustats(ve)) < 0)
+ goto err_cpu_stats;
+
+ if ((err = ve_list_add(ve)) < 0)
+ goto err_exist;
+
+ /* this should be done before context switching */
+ if ((err = init_printk(ve)) < 0)
+ goto err_log_wait;
+
+ old_exec = set_exec_env(ve);
+
+ if ((err = init_ve_sched(ve)) < 0)
+ goto err_sched;
+
+ set_ve_root(ve, tsk);
+
+ if ((err = init_ve_sysfs(ve)))
+ goto err_sysfs;
+
+ if ((err = init_ve_namespaces(ve, &old_ns)))
+ goto err_ns;
+
+ if ((err = init_ve_proc(ve)))
+ goto err_proc;
+
+
+ init_mask = data ? data->iptables_mask : VE_IP_DEFAULT;
+
+#ifdef CONFIG_VE_IPTABLES
+ /* Set up ipt_mask as it will be used during
+ * net namespace initialization
+ */
+ init_mask = setup_iptables_mask(init_mask);
+ ve->ipt_mask = init_mask;
+#endif
+
+ if ((err = init_ve_netns(ve, &old_ns_net)))
+ goto err_netns;
+
+ if ((err = init_ve_cgroups(ve)))
+ goto err_cgroup;
+
+ if ((err = init_ve_tty_drivers(ve)) < 0)
+ goto err_tty;
+
+ if ((err = init_ve_shmem(ve)))
+ goto err_shmem;
+
+ if ((err = init_ve_devpts(ve)))
+ goto err_devpts;
+
+ if((err = init_ve_meminfo(ve)))
+ goto err_meminf;
+
+ set_ve_caps(ve, tsk);
+
+ if ((err = pid_ns_attach_init(ve->ve_ns->pid_ns, tsk)) < 0)
+ goto err_vpid;
+
+ new_creds = prepare_creds();
+ if (new_creds == NULL)
+ goto err_creds;
+
+ if ((err = create_user_ns(new_creds)) < 0)
+ goto err_uns;
+
+ if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0)
+ goto err_ve_hook;
+
+ put_nsproxy(old_ns);
+ put_nsproxy(old_ns_net);
+
+ /* finally: set vpids and move inside */
+ ve_move_task(tsk, ve, new_creds);
+
+ ve->is_running = 1;
+ up_write(&ve->op_sem);
+
+ printk(KERN_INFO "CT: %d: started\n", veid);
+ return veid;
+
+err_ve_hook:
+ /* creds will put user and user ns */
+err_uns:
+ abort_creds(new_creds);
+err_creds:
+ mntget(ve->proc_mnt);
+err_vpid:
+ fini_venet(ve);
+ fini_ve_meminfo(ve);
+err_meminf:
+ fini_ve_devpts(ve);
+err_devpts:
+ fini_ve_shmem(ve);
+err_shmem:
+ fini_ve_tty_drivers(ve);
+err_tty:
+ fini_ve_cgroups(ve);
+err_cgroup:
+ fini_ve_namespaces(ve, old_ns_net);
+ put_nsproxy(old_ns_net);
+ ve->ve_netns->sysfs_completion = &sysfs_completion;
+ put_net(ve->ve_netns);
+ wait_for_completion(&sysfs_completion);
+err_netns:
+ /*
+	 * If the process hasn't become the VE's init, proc_mnt won't be put
+	 * during pidns death, so an explicit mntput is needed here. If it
+	 * has, the mntget above compensates for it.
+ */
+ mntput(ve->proc_mnt);
+ fini_ve_proc(ve);
+err_proc:
+ /* free_ve_utsname() is called inside real_put_ve() */
+ fini_ve_namespaces(ve, old_ns);
+ put_nsproxy(old_ns);
+ /*
+ * We need to compensate, because fini_ve_namespaces() assumes
+ * ve->ve_ns will continue to be used after, but VE will be freed soon
+ * (in kfree() sense).
+ */
+ put_nsproxy(ve->ve_ns);
+err_ns:
+ fini_ve_sysfs(ve);
+err_sysfs:
+ /* It is safe to restore current->envid here because
+ * ve_fairsched_detach does not use current->envid. */
+	/* Actually, the fairsched code uses current->envid only in
+	 * sys_fairsched_mknod. That is correct when sys_fairsched_mknod is
+	 * called from userspace. When it is called from ve_fairsched_attach,
+	 * node->envid and node->parent_node->envid are explicitly set to
+	 * valid values after the call. */
+ /* FIXME */
+ VE_TASK_INFO(tsk)->owner_env = old;
+ VE_TASK_INFO(tsk)->exec_env = old_exec;
+
+ fini_ve_sched(ve);
+err_sched:
+ (void)set_exec_env(old_exec);
+
+	/* we can reach this point with an incorrect envid */
+ VE_TASK_INFO(tsk)->owner_env = old;
+ fini_printk(ve);
+err_log_wait:
+ /* cpustats will be freed in do_env_free */
+ ve_list_del(ve);
+ up_write(&ve->op_sem);
+
+ real_put_ve(ve);
+err_struct:
+ printk(KERN_INFO "CT: %d: failed to start with err=%d\n", veid, err);
+ return err;
+
+err_exist:
+ free_ve_cpustats(ve);
+err_cpu_stats:
+ kfree(ve);
+ module_put(THIS_MODULE);
+ goto err_struct;
+}
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE start/stop callbacks
+ *
+ **********************************************************************
+ **********************************************************************/
+
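+/*
+ * Entry point for the VZCTL_ENV_CREATE* ioctls: with no flags it simply
+ * reports the caller's VE id; otherwise, depending on flags, it tests for
+ * VE existence (VE_TEST), creates a new VE (VE_CREATE) or enters an
+ * existing one (VE_ENTER).
+ */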
+int real_env_create(envid_t veid, unsigned flags, u32 class_id,
+ env_create_param_t *data, int datalen)
+{
+ int status;
+ struct ve_struct *ve;
+
+ if (!flags) {
+ status = get_exec_env()->veid;
+ goto out;
+ }
+
+ status = -EPERM;
+ if (!capable_setveid())
+ goto out;
+
+ status = -EINVAL;
+ if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE)))
+ goto out;
+
+ status = -EINVAL;
+ ve = get_ve_by_id(veid);
+ if (ve) {
+ if (flags & VE_TEST) {
+ status = 0;
+ goto out_put;
+ }
+ if (flags & VE_EXCLUSIVE) {
+ status = -EACCES;
+ goto out_put;
+ }
+ if (flags & VE_CREATE) {
+ flags &= ~VE_CREATE;
+ flags |= VE_ENTER;
+ }
+ } else {
+ if (flags & (VE_TEST|VE_ENTER)) {
+ status = -ESRCH;
+ goto out;
+ }
+ }
+
+ if (flags & VE_CREATE) {
+ status = do_env_create(veid, flags, class_id, data, datalen);
+ goto out;
+ } else if (flags & VE_ENTER)
+ status = do_env_enter(ve, flags);
+
+ /* else: returning EINVAL */
+
+out_put:
+ real_put_ve(ve);
+out:
+ return status;
+}
+EXPORT_SYMBOL(real_env_create);
+
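+/*
+ * Move the calling task into an already running VE: attach it to the VE's
+ * fairsched node, switch to the VE's namespaces, move it with
+ * ve_move_task(), and attach it to the VE's pid namespace when it is alone
+ * in its group/session and VE_SKIPLOCK is not set.
+ */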
+static int do_env_enter(struct ve_struct *ve, unsigned int flags)
+{
+ struct task_struct *tsk = current;
+ struct cred *new_creds;
+ int err;
+
+ VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid);
+
+ err = -EBUSY;
+ down_read(&ve->op_sem);
+ if (!ve->is_running)
+ goto out_up;
+ if (ve->is_locked && !(flags & VE_SKIPLOCK))
+ goto out_up;
+ err = -EINVAL;
+ if (!thread_group_leader(tsk) || !thread_group_empty(tsk))
+ goto out_up;
+
+ new_creds = prepare_creds();
+ if (new_creds == NULL)
+ goto out_up;
+
+#ifdef CONFIG_VZ_FAIRSCHED
+ err = sys_fairsched_mvpr(task_pid_vnr(current), ve->veid);
+ if (err) {
+ abort_creds(new_creds);
+ goto out_up;
+ }
+#endif
+ ve_sched_attach(ve);
+ switch_ve_namespaces(ve, tsk);
+ ve_move_task(current, ve, new_creds);
+
+ /* Check that the process is not a leader of non-empty group/session.
+ * If it is, we cannot virtualize its PID. Do not fail, just leave
+ * it non-virtual.
+ */
+ if (alone_in_pgrp(tsk) && !(flags & VE_SKIPLOCK))
+ pid_ns_attach_task(ve->ve_ns->pid_ns, tsk);
+
+ /* Unlike VE_CREATE, we do not setsid() in VE_ENTER.
+	 * The process is allowed to stay in an external group/session.
+	 * If the userspace caller wants, it can call setsid() after
+	 * VE_ENTER.
+ */
+ err = VE_TASK_INFO(tsk)->owner_env->veid;
+ tsk->did_ve_enter = 1;
+
+out_up:
+ up_read(&ve->op_sem);
+ return err;
+}
+
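+/*
+ * Tear down a stopped VE: run the fini hooks, shut down its networking and
+ * scheduler node, release devpts/shmem/tty/meminfo/cgroups, drop the
+ * namespaces, wait for the netns sysfs objects to go away, and finally
+ * unlink the VE from the global list.
+ */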
+static void env_cleanup(struct ve_struct *ve)
+{
+ struct ve_struct *old_ve;
+ DECLARE_COMPLETION_ONSTACK(sysfs_completion);
+
+ VZTRACE("real_do_env_cleanup\n");
+
+ down_read(&ve->op_sem);
+ old_ve = set_exec_env(ve);
+
+ ve_hook_iterate_fini(VE_SS_CHAIN, ve);
+
+ fini_venet(ve);
+
+ /* no new packets in flight beyond this point */
+
+ fini_ve_sched(ve);
+
+ fini_ve_devpts(ve);
+ fini_ve_shmem(ve);
+ unregister_ve_tty_drivers(ve);
+ fini_ve_meminfo(ve);
+
+ fini_ve_cgroups(ve);
+
+ fini_ve_namespaces(ve, NULL);
+ ve->ve_netns->sysfs_completion = &sysfs_completion;
+ put_net(ve->ve_netns);
+ wait_for_completion(&sysfs_completion);
+ fini_ve_proc(ve);
+ fini_ve_sysfs(ve);
+
+ (void)set_exec_env(old_ve);
+ fini_printk(ve); /* no printk can happen in ve context anymore */
+
+ ve_list_del(ve);
+ up_read(&ve->op_sem);
+
+ real_put_ve(ve);
+}
+
+static DECLARE_COMPLETION(vzmond_complete);
+static int vzmond_helper(void *arg)
+{
+ char name[18];
+ struct ve_struct *ve;
+
+ ve = (struct ve_struct *)arg;
+ snprintf(name, sizeof(name), "vzmond/%d", ve->veid);
+ daemonize(name);
+ env_cleanup(ve);
+ module_put_and_exit(0);
+}
+
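+/*
+ * Process the queued VE cleanups: each VE taken from ve_cleanup_list is
+ * handed to a dedicated "vzmond/<veid>" helper thread; if the thread
+ * cannot be spawned, the cleanup is done inline.
+ */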
+static void do_pending_env_cleanups(void)
+{
+ int err;
+ struct ve_struct *ve;
+
+ spin_lock(&ve_cleanup_lock);
+ while (1) {
+ if (list_empty(&ve_cleanup_list) || need_resched())
+ break;
+
+ ve = list_first_entry(&ve_cleanup_list,
+ struct ve_struct, cleanup_list);
+ list_del(&ve->cleanup_list);
+ spin_unlock(&ve_cleanup_lock);
+
+ __module_get(THIS_MODULE);
+ err = kernel_thread(vzmond_helper, (void *)ve, 0);
+ if (err < 0) {
+ env_cleanup(ve);
+ module_put(THIS_MODULE);
+ }
+
+ spin_lock(&ve_cleanup_lock);
+ }
+ spin_unlock(&ve_cleanup_lock);
+}
+
+static inline int have_pending_cleanups(void)
+{
+ return !list_empty(&ve_cleanup_list);
+}
+
+static int vzmond(void *arg)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop() || have_pending_cleanups()) {
+ schedule();
+ try_to_freeze();
+ if (signal_pending(current))
+ flush_signals(current);
+
+ do_pending_env_cleanups();
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (have_pending_cleanups())
+ __set_current_state(TASK_RUNNING);
+ }
+
+ __set_task_state(current, TASK_RUNNING);
+ complete_and_exit(&vzmond_complete, 0);
+}
+
+static int __init init_vzmond(void)
+{
+ ve_cleanup_thread = kthread_run(vzmond, NULL, "vzmond");
+ if (IS_ERR(ve_cleanup_thread))
+ return PTR_ERR(ve_cleanup_thread);
+ else
+ return 0;
+}
+
+static void fini_vzmond(void)
+{
+ kthread_stop(ve_cleanup_thread);
+ WARN_ON(!list_empty(&ve_cleanup_list));
+}
+
+static void real_do_env_free(struct ve_struct *ve)
+{
+ VZTRACE("real_do_env_free\n");
+
+ free_ve_tty_drivers(ve);
+ free_ve_filesystems(ve);
+ free_ve_cpustats(ve);
+ printk(KERN_INFO "CT: %d: stopped\n", VEID(ve));
+ kfree(ve);
+
+ module_put(THIS_MODULE);
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE TTY handling
+ *
+ **********************************************************************
+ **********************************************************************/
+
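+/*
+ * Clone a host tty driver for a VE: copy the driver structure and, unless
+ * the driver uses devpts-backed memory (TTY_DRIVER_DEVPTS_MEM), allocate
+ * fresh ttys, termios and termios_locked tables for the copy.
+ */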
+static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base,
+ struct ve_struct *ve)
+{
+ size_t size;
+ struct tty_driver *driver;
+
+	/* FIXME: do this in a proper way (or wait for the mainstream version) */
+
+ driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL_UBC);
+ if (!driver)
+ goto out;
+
+ memcpy(driver, base, sizeof(struct tty_driver));
+
+ driver->driver_state = NULL;
+
+ size = base->num * 3 * sizeof(void *);
+ if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) {
+ void **p;
+ p = kzalloc(size, GFP_KERNEL_UBC);
+ if (!p)
+ goto out_free;
+
+ driver->ttys = (struct tty_struct **)p;
+ driver->termios = (struct ktermios **)(p + driver->num);
+ driver->termios_locked = (struct ktermios **)
+ (p + driver->num * 2);
+ } else {
+ driver->ttys = NULL;
+ driver->termios = NULL;
+ driver->termios_locked = NULL;
+ }
+
+ driver->owner_env = ve;
+ driver->flags |= TTY_DRIVER_INSTALLED;
+ kref_init(&driver->kref);
+
+ return driver;
+
+out_free:
+ kfree(driver);
+out:
+ return NULL;
+}
+
+static void free_ve_tty_driver(struct tty_driver *driver)
+{
+ if (!driver)
+ return;
+
+ clear_termios(driver);
+ kfree(driver->ttys);
+ kfree(driver);
+}
+
+static int alloc_ve_tty_drivers(struct ve_struct* ve)
+{
+#ifdef CONFIG_LEGACY_PTYS
+ /* Traditional BSD devices */
+ ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve);
+ if (!ve->pty_driver)
+ goto out_mem;
+
+ ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve);
+ if (!ve->pty_slave_driver)
+ goto out_mem;
+
+ ve->pty_driver->other = ve->pty_slave_driver;
+ ve->pty_slave_driver->other = ve->pty_driver;
+#endif
+
+#ifdef CONFIG_UNIX98_PTYS
+ ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve);
+ if (!ve->ptm_driver)
+ goto out_mem;
+
+ ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve);
+ if (!ve->pts_driver)
+ goto out_mem;
+
+ ve->ptm_driver->other = ve->pts_driver;
+ ve->pts_driver->other = ve->ptm_driver;
+
+ ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys),
+ GFP_KERNEL_UBC);
+ if (!ve->allocated_ptys)
+ goto out_mem;
+ ida_init(ve->allocated_ptys);
+#endif
+ return 0;
+
+out_mem:
+ free_ve_tty_drivers(ve);
+ return -ENOMEM;
+}
+
+static void free_ve_tty_drivers(struct ve_struct* ve)
+{
+#ifdef CONFIG_LEGACY_PTYS
+ free_ve_tty_driver(ve->pty_driver);
+ free_ve_tty_driver(ve->pty_slave_driver);
+ ve->pty_driver = ve->pty_slave_driver = NULL;
+#endif
+#ifdef CONFIG_UNIX98_PTYS
+ free_ve_tty_driver(ve->ptm_driver);
+ free_ve_tty_driver(ve->pts_driver);
+ if (ve->allocated_ptys)
+ ida_destroy(ve->allocated_ptys);
+ kfree(ve->allocated_ptys);
+ ve->ptm_driver = ve->pts_driver = NULL;
+ ve->allocated_ptys = NULL;
+#endif
+}
+
+static inline void __register_tty_driver(struct tty_driver *driver)
+{
+ list_add(&driver->tty_drivers, &tty_drivers);
+}
+
+static inline void __unregister_tty_driver(struct tty_driver *driver)
+{
+ if (!driver)
+ return;
+ list_del(&driver->tty_drivers);
+}
+
+static int register_ve_tty_drivers(struct ve_struct* ve)
+{
+ mutex_lock(&tty_mutex);
+#ifdef CONFIG_UNIX98_PTYS
+ __register_tty_driver(ve->ptm_driver);
+ __register_tty_driver(ve->pts_driver);
+#endif
+#ifdef CONFIG_LEGACY_PTYS
+ __register_tty_driver(ve->pty_driver);
+ __register_tty_driver(ve->pty_slave_driver);
+#endif
+ mutex_unlock(&tty_mutex);
+
+ return 0;
+}
+
+static void unregister_ve_tty_drivers(struct ve_struct* ve)
+{
+ VZTRACE("unregister_ve_tty_drivers\n");
+
+ mutex_lock(&tty_mutex);
+#ifdef CONFIG_LEGACY_PTYS
+ __unregister_tty_driver(ve->pty_driver);
+ __unregister_tty_driver(ve->pty_slave_driver);
+#endif
+#ifdef CONFIG_UNIX98_PTYS
+ __unregister_tty_driver(ve->ptm_driver);
+ __unregister_tty_driver(ve->pts_driver);
+#endif
+ mutex_unlock(&tty_mutex);
+}
+
+static int init_ve_tty_drivers(struct ve_struct *ve)
+{
+ int err;
+
+ if ((err = alloc_ve_tty_drivers(ve)))
+ goto err_ttyalloc;
+ if ((err = register_ve_tty_drivers(ve)))
+ goto err_ttyreg;
+ return 0;
+
+err_ttyreg:
+ free_ve_tty_drivers(ve);
+err_ttyalloc:
+ return err;
+}
+
+static void fini_ve_tty_drivers(struct ve_struct *ve)
+{
+ unregister_ve_tty_drivers(ve);
+ free_ve_tty_drivers(ve);
+}
+
+/*
+ * Free the termios and termios_locked structures because
+ * we don't want to get memory leaks when modular tty
+ * drivers are removed from the kernel.
+ */
+static void clear_termios(struct tty_driver *driver)
+{
+ int i;
+ struct ktermios *tp;
+
+ if (driver->termios == NULL)
+ return;
+ for (i = 0; i < driver->num; i++) {
+ tp = driver->termios[i];
+ if (tp) {
+ driver->termios[i] = NULL;
+ kfree(tp);
+ }
+ tp = driver->termios_locked[i];
+ if (tp) {
+ driver->termios_locked[i] = NULL;
+ kfree(tp);
+ }
+ }
+}
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Pieces of VE network
+ *
+ **********************************************************************
+ **********************************************************************/
+
+#ifdef CONFIG_NET
+#include <asm/uaccess.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#endif
+
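+/*
+ * Move a network device from the host (init_net) into the target VE's
+ * network namespace; ve_dev_del() below does the reverse.
+ */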
+static int ve_dev_add(envid_t veid, char *dev_name)
+{
+ struct net_device *dev;
+ struct ve_struct *dst_ve;
+ struct net *dst_net;
+ int err = -ESRCH;
+
+ dst_ve = get_ve_by_id(veid);
+ if (dst_ve == NULL)
+ goto out;
+
+ dst_net = dst_ve->ve_netns;
+
+ rtnl_lock();
+ read_lock(&dev_base_lock);
+ dev = __dev_get_by_name(&init_net, dev_name);
+ read_unlock(&dev_base_lock);
+ if (dev == NULL)
+ goto out_unlock;
+
+ err = __dev_change_net_namespace(dev, dst_net, dev_name, get_exec_ub());
+out_unlock:
+ rtnl_unlock();
+ real_put_ve(dst_ve);
+
+ if (dev == NULL)
+ printk(KERN_WARNING "%s: device %s not found\n",
+ __func__, dev_name);
+out:
+ return err;
+}
+
+static int ve_dev_del(envid_t veid, char *dev_name)
+{
+ struct net_device *dev;
+ struct ve_struct *src_ve;
+ struct net *src_net;
+ int err = -ESRCH;
+
+ src_ve = get_ve_by_id(veid);
+ if (src_ve == NULL)
+ goto out;
+
+ src_net = src_ve->ve_netns;
+
+ rtnl_lock();
+
+ read_lock(&dev_base_lock);
+ dev = __dev_get_by_name(src_net, dev_name);
+ read_unlock(&dev_base_lock);
+ if (dev == NULL)
+ goto out_unlock;
+
+ err = __dev_change_net_namespace(dev, &init_net, dev_name,
+ netdev_bc(dev)->owner_ub);
+out_unlock:
+ rtnl_unlock();
+ real_put_ve(src_ve);
+
+ if (dev == NULL)
+ printk(KERN_WARNING "%s: device %s not found\n",
+ __func__, dev_name);
+out:
+ return err;
+}
+
+int real_ve_dev_map(envid_t veid, int op, char *dev_name)
+{
+ if (!capable_setveid())
+ return -EPERM;
+ switch (op) {
+ case VE_NETDEV_ADD:
+ return ve_dev_add(veid, dev_name);
+ case VE_NETDEV_DEL:
+ return ve_dev_del(veid, dev_name);
+ default:
+ return -EINVAL;
+ }
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE information via /proc
+ *
+ **********************************************************************
+ **********************************************************************/
+#ifdef CONFIG_PROC_FS
+#if BITS_PER_LONG == 32
+#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21)
+#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n"
+#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n"
+#else
+#define VESTAT_LINE_WIDTH (12 * 21)
+#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n"
+#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n"
+#endif
+
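+/*
+ * /proc/vz/vestat: print a header for the first entry, then one line per
+ * VE with CPU times, uptime, idle time and scheduling latency statistics
+ * summed over the online CPUs.
+ */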
+static int vestat_seq_show(struct seq_file *m, void *v)
+{
+ struct list_head *entry;
+ struct ve_struct *ve;
+ struct ve_struct *curve;
+ int cpu;
+ unsigned long user_ve, nice_ve, system_ve;
+ unsigned long long uptime;
+ cycles_t uptime_cycles, idle_time, strv_time, used;
+
+ entry = (struct list_head *)v;
+ ve = list_entry(entry, struct ve_struct, ve_list);
+
+ curve = get_exec_env();
+ if (entry == ve_list_head.next ||
+ (!ve_is_super(curve) && ve == curve)) {
+ /* print header */
+ seq_printf(m, "%-*s\n",
+ VESTAT_LINE_WIDTH - 1,
+ "Version: 2.2");
+ seq_printf(m, VESTAT_HEAD_FMT, "VEID",
+ "user", "nice", "system",
+ "uptime", "idle",
+ "strv", "uptime", "used",
+ "maxlat", "totlat", "numsched");
+ }
+
+ if (ve == get_ve0())
+ return 0;
+
+ user_ve = nice_ve = system_ve = 0;
+ idle_time = strv_time = used = 0;
+
+ for_each_online_cpu(cpu) {
+ struct ve_cpu_stats *st;
+
+ st = VE_CPU_STATS(ve, cpu);
+ user_ve += st->user;
+ nice_ve += st->nice;
+ system_ve += st->system;
+ used += st->used_time;
+ idle_time += ve_sched_get_idle_time(ve, cpu);
+ }
+ uptime_cycles = get_cycles() - ve->start_cycles;
+ uptime = get_jiffies_64() - ve->start_jiffies;
+
+ seq_printf(m, VESTAT_LINE_FMT, ve->veid,
+ user_ve, nice_ve, system_ve,
+ (unsigned long long)uptime,
+ (unsigned long long)idle_time,
+ (unsigned long long)strv_time,
+ (unsigned long long)uptime_cycles,
+ (unsigned long long)used,
+ (unsigned long long)ve->sched_lat_ve.last.maxlat,
+ (unsigned long long)ve->sched_lat_ve.last.totlat,
+ ve->sched_lat_ve.last.count);
+ return 0;
+}
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct ve_struct *curve;
+
+ curve = get_exec_env();
+ read_lock(&ve_list_lock);
+ if (!ve_is_super(curve)) {
+ if (*pos != 0)
+ return NULL;
+ return curve;
+ }
+
+ return seq_list_start(&ve_list_head, *pos);
+}
+EXPORT_SYMBOL(ve_seq_start);
+
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ if (!ve_is_super(get_exec_env()))
+ return NULL;
+ else
+ return seq_list_next(v, &ve_list_head, pos);
+}
+EXPORT_SYMBOL(ve_seq_next);
+
+void ve_seq_stop(struct seq_file *m, void *v)
+{
+ read_unlock(&ve_list_lock);
+}
+EXPORT_SYMBOL(ve_seq_stop);
+
+static struct seq_operations vestat_seq_op = {
+ .start = ve_seq_start,
+ .next = ve_seq_next,
+ .stop = ve_seq_stop,
+ .show = vestat_seq_show
+};
+
+static int vestat_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &vestat_seq_op);
+}
+
+static struct file_operations proc_vestat_operations = {
+ .open = vestat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static struct seq_operations devperms_seq_op = {
+ .start = ve_seq_start,
+ .next = ve_seq_next,
+ .stop = ve_seq_stop,
+ .show = devperms_seq_show,
+};
+
+static int devperms_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &devperms_seq_op);
+}
+
+static struct file_operations proc_devperms_ops = {
+ .open = devperms_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int vz_version_show(struct seq_file *file, void* v)
+{
+ static const char ver[] = VZVERSION "\n";
+
+ return seq_puts(file, ver);
+}
+
+static int vz_version_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, vz_version_show, NULL);
+}
+
+static struct file_operations proc_vz_version_operations = {
+ .open = vz_version_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static inline unsigned long ve_used_mem(struct user_beancounter *ub)
+{
+ extern int glob_ve_meminfo;
+ return glob_ve_meminfo ? ub->ub_parms[UB_OOMGUARPAGES].held :
+ ub->ub_parms[UB_PRIVVMPAGES].held ;
+}
+
+static void ve_swapinfo(struct sysinfo *val, struct user_beancounter *ub)
+{
+ unsigned long size, used;
+
+ size = ub->ub_parms[UB_SWAPPAGES].limit;
+ used = ub->ub_parms[UB_SWAPPAGES].held;
+
+ if (size == UB_MAXVALUE)
+ size = 0;
+
+ val->totalswap = size;
+ val->freeswap = size > used ? size - used : 0;
+}
+
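+/*
+ * /proc/meminfo virtualization: depending on the VE's meminfo_val either
+ * leave the host numbers as they are, or rewrite total/free RAM and swap
+ * from the beancounter limits and current usage.
+ */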
+static inline int ve_mi_replace(struct meminfo *mi, int old_ret)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ struct user_beancounter *ub;
+ unsigned long meminfo_val;
+ unsigned long nodettram;
+ unsigned long usedmem;
+
+ meminfo_val = get_exec_env()->meminfo_val;
+ if (meminfo_val == VE_MEMINFO_DEFAULT)
+ return old_ret; /* Default behaviour */
+
+ if (meminfo_val == VE_MEMINFO_SYSTEM)
+ return NOTIFY_DONE | NOTIFY_STOP_MASK; /* No virtualization */
+
+ nodettram = mi->si.totalram;
+ ub = top_beancounter(current->mm->mm_ub);
+ usedmem = ve_used_mem(ub);
+
+ memset(mi, 0, sizeof(*mi));
+
+ mi->si.totalram = (meminfo_val > nodettram) ?
+ nodettram : meminfo_val;
+ mi->si.freeram = (mi->si.totalram > usedmem) ?
+ (mi->si.totalram - usedmem) : 0;
+
+ ve_swapinfo(&mi->si, ub);
+
+ return NOTIFY_OK | NOTIFY_STOP_MASK;
+#else
+ return NOTIFY_DONE;
+#endif
+}
+
+static int meminfo_call(struct vnotifier_block *self,
+ unsigned long event, void *arg, int old_ret)
+{
+ if (event != VIRTINFO_MEMINFO)
+ return old_ret;
+
+ return ve_mi_replace((struct meminfo *)arg, old_ret);
+}
+
+
+static struct vnotifier_block meminfo_notifier_block = {
+ .notifier_call = meminfo_call
+};
+
+/* /proc/vz/veinfo */
+
+static ve_seq_print_t veaddr_seq_print_cb;
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t cb)
+{
+ rcu_assign_pointer(veaddr_seq_print_cb, cb);
+}
+EXPORT_SYMBOL(vzmon_register_veaddr_print_cb);
+
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t cb)
+{
+ rcu_assign_pointer(veaddr_seq_print_cb, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(vzmon_unregister_veaddr_print_cb);
+
+static int veinfo_seq_show(struct seq_file *m, void *v)
+{
+ struct ve_struct *ve;
+ ve_seq_print_t veaddr_seq_print;
+
+ ve = list_entry((struct list_head *)v, struct ve_struct, ve_list);
+
+ seq_printf(m, "%10u %5u %5u", ve->veid,
+ ve->class_id, atomic_read(&ve->pcounter));
+
+ rcu_read_lock();
+ veaddr_seq_print = rcu_dereference(veaddr_seq_print_cb);
+ if (veaddr_seq_print)
+ veaddr_seq_print(m, ve);
+ rcu_read_unlock();
+
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static struct seq_operations veinfo_seq_op = {
+ .start = ve_seq_start,
+ .next = ve_seq_next,
+ .stop = ve_seq_stop,
+ .show = veinfo_seq_show,
+};
+
+static int veinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &veinfo_seq_op);
+}
+
+static struct file_operations proc_veinfo_operations = {
+ .open = veinfo_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init init_vecalls_proc(void)
+{
+ struct proc_dir_entry *de;
+
+ de = proc_create("vestat", S_IFREG | S_IRUSR, proc_vz_dir,
+ &proc_vestat_operations);
+ if (!de)
+ printk(KERN_WARNING "VZMON: can't make vestat proc entry\n");
+
+ de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir,
+ &proc_devperms_ops);
+ if (!de)
+ printk(KERN_WARNING "VZMON: can't make devperms proc entry\n");
+
+ de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir,
+			&proc_vz_version_operations);
+ if (!de)
+ printk(KERN_WARNING "VZMON: can't make version proc entry\n");
+
+ de = proc_create("veinfo", S_IFREG | S_IRUSR, proc_vz_dir,
+ &proc_veinfo_operations);
+ if (!de)
+ printk(KERN_WARNING "VZMON: can't make veinfo proc entry\n");
+
+ virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block);
+ return 0;
+}
+
+static void fini_vecalls_proc(void)
+{
+ remove_proc_entry("version", proc_vz_dir);
+ remove_proc_entry("devperms", proc_vz_dir);
+ remove_proc_entry("vestat", proc_vz_dir);
+ remove_proc_entry("veinfo", proc_vz_dir);
+ virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block);
+}
+#else
+#define init_vecalls_proc() (0)
+#define fini_vecalls_proc() do { } while (0)
+#endif /* CONFIG_PROC_FS */
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * User ctl
+ *
+ **********************************************************************
+ **********************************************************************/
+
+int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ err = -ENOTTY;
+ switch(cmd) {
+ case VZCTL_MARK_ENV_TO_DOWN: {
+ /* Compatibility issue */
+ err = 0;
+ }
+ break;
+ case VZCTL_SETDEVPERMS: {
+ /* Device type was mistakenly declared as dev_t
+ * in the old user-kernel interface.
+ * That's wrong, dev_t is a kernel internal type.
+ * I use `unsigned' not having anything better in mind.
+ * 2001/08/11 SAW */
+ struct vzctl_setdevperms s;
+ err = -EFAULT;
+ if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+ break;
+ err = real_setdevperms(s.veid, s.type,
+ new_decode_dev(s.dev), s.mask);
+ }
+ break;
+#ifdef CONFIG_INET
+ case VZCTL_VE_NETDEV: {
+ struct vzctl_ve_netdev d;
+ char *s;
+ err = -EFAULT;
+ if (copy_from_user(&d, (void __user *)arg, sizeof(d)))
+ break;
+ err = -ENOMEM;
+ s = kmalloc(IFNAMSIZ+1, GFP_KERNEL);
+ if (s == NULL)
+ break;
+ err = -EFAULT;
+ if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) {
+ s[IFNAMSIZ] = 0;
+ err = real_ve_dev_map(d.veid, d.op, s);
+ }
+ kfree(s);
+ }
+ break;
+#endif
+ case VZCTL_ENV_CREATE: {
+ struct vzctl_env_create s;
+ err = -EFAULT;
+ if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+ break;
+ err = real_env_create(s.veid, s.flags, s.class_id,
+ NULL, 0);
+ }
+ break;
+ case VZCTL_ENV_CREATE_DATA: {
+ struct vzctl_env_create_data s;
+ env_create_param_t *data;
+ err = -EFAULT;
+ if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+ break;
+		err = -EINVAL;
+ if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN ||
+ s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN ||
+ s.data == 0)
+ break;
+ err = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ break;
+
+ err = -EFAULT;
+ if (copy_from_user(data, (void __user *)s.data,
+ s.datalen))
+ goto free_data;
+ err = real_env_create(s.veid, s.flags, s.class_id,
+ data, s.datalen);
+free_data:
+ kfree(data);
+ }
+ break;
+ case VZCTL_GET_CPU_STAT: {
+ struct vzctl_cpustatctl s;
+ err = -EFAULT;
+ if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+ break;
+ err = ve_get_cpu_stat(s.veid, s.cpustat);
+ }
+ break;
+ case VZCTL_VE_MEMINFO: {
+ struct vzctl_ve_meminfo s;
+ err = -EFAULT;
+ if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+ break;
+ err = ve_set_meminfo(s.veid, s.val);
+ }
+ break;
+ }
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_vzcalls_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int err;
+
+ switch(cmd) {
+ case VZCTL_GET_CPU_STAT: {
+ /* FIXME */
+ }
+ case VZCTL_COMPAT_ENV_CREATE_DATA: {
+ struct compat_vzctl_env_create_data cs;
+ struct vzctl_env_create_data __user *s;
+
+ s = compat_alloc_user_space(sizeof(*s));
+ err = -EFAULT;
+ if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+ break;
+
+ if (put_user(cs.veid, &s->veid) ||
+ put_user(cs.flags, &s->flags) ||
+ put_user(cs.class_id, &s->class_id) ||
+ put_user(compat_ptr(cs.data), &s->data) ||
+ put_user(cs.datalen, &s->datalen))
+ break;
+ err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA,
+ (unsigned long)s);
+ break;
+ }
+#ifdef CONFIG_NET
+ case VZCTL_COMPAT_VE_NETDEV: {
+ struct compat_vzctl_ve_netdev cs;
+ struct vzctl_ve_netdev __user *s;
+
+ s = compat_alloc_user_space(sizeof(*s));
+ err = -EFAULT;
+ if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+ break;
+
+ if (put_user(cs.veid, &s->veid) ||
+ put_user(cs.op, &s->op) ||
+ put_user(compat_ptr(cs.dev_name), &s->dev_name))
+ break;
+ err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s);
+ break;
+ }
+#endif
+ case VZCTL_COMPAT_VE_MEMINFO: {
+ struct compat_vzctl_ve_meminfo cs;
+ err = -EFAULT;
+ if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+ break;
+ err = ve_set_meminfo(cs.veid, cs.val);
+ break;
+ }
+ default:
+ err = vzcalls_ioctl(file, cmd, arg);
+ break;
+ }
+ return err;
+}
+#endif
+
+static struct vzioctlinfo vzcalls = {
+ .type = VZCTLTYPE,
+ .ioctl = vzcalls_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = compat_vzcalls_ioctl,
+#endif
+ .owner = THIS_MODULE,
+};
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Init/exit stuff
+ *
+ **********************************************************************
+ **********************************************************************/
+
+static inline __init int init_vecalls_ioctls(void)
+{
+ vzioctl_register(&vzcalls);
+ return 0;
+}
+
+static inline void fini_vecalls_ioctls(void)
+{
+ vzioctl_unregister(&vzcalls);
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *table_header;
+
+static ctl_table kernel_table[] = {
+ {
+ .procname = "ve_allow_kthreads",
+ .data = &ve_allow_kthreads,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { 0 }
+};
+
+static ctl_table root_table[] = {
+ {CTL_KERN, "kernel", NULL, 0, 0555, kernel_table},
+ { 0 }
+};
+
+static int init_vecalls_sysctl(void)
+{
+ table_header = register_sysctl_table(root_table);
+ if (!table_header)
+		return -ENOMEM;
+ return 0;
+}
+
+static void fini_vecalls_sysctl(void)
+{
+ unregister_sysctl_table(table_header);
+}
+#else
+static int init_vecalls_sysctl(void) { return 0; }
+static void fini_vecalls_sysctl(void) { ; }
+#endif
+
+static int __init vecalls_init(void)
+{
+ int err;
+
+ err = init_vecalls_sysctl();
+ if (err)
+ goto out_vzmond;
+
+ err = init_vzmond();
+ if (err < 0)
+ goto out_sysctl;
+
+ err = init_vecalls_proc();
+ if (err < 0)
+ goto out_proc;
+
+ err = init_vecalls_ioctls();
+ if (err < 0)
+ goto out_ioctls;
+
+	/* We can safely dereference this hook while a VE is running,
+	 * because in that case the vzmon refcount is > 0
+ */
+ do_ve_enter_hook = do_env_enter;
+ /*
+	 * This one can also be dereferenced safely, since a not yet
+	 * freed VE holds a reference on the module
+ */
+ do_env_free_hook = real_do_env_free;
+
+ return 0;
+
+out_ioctls:
+ fini_vecalls_proc();
+out_proc:
+ fini_vzmond();
+out_sysctl:
+ fini_vecalls_sysctl();
+out_vzmond:
+ return err;
+}
+
+static void vecalls_exit(void)
+{
+ do_env_free_hook = NULL;
+ do_ve_enter_hook = NULL;
+ fini_vecalls_ioctls();
+ fini_vecalls_proc();
+ fini_vzmond();
+ fini_vecalls_sysctl();
+}
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo Control");
+MODULE_LICENSE("GPL v2");
+
+module_init(vecalls_init)
+module_exit(vecalls_exit)
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
new file mode 100644
index 0000000..0726e44
--- /dev/null
+++ b/kernel/ve/veowner.c
@@ -0,0 +1,160 @@
+/*
+ * kernel/ve/veowner.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/ipc.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/inetdevice.h>
+#include <linux/pid_namespace.h>
+#include <linux/xattr.h>
+#include <asm/system.h>
+#include <asm/io.h>
+
+#include <net/tcp.h>
+
+void prepare_ve0_process(struct task_struct *tsk)
+{
+ VE_TASK_INFO(tsk)->exec_env = get_ve0();
+ VE_TASK_INFO(tsk)->owner_env = get_ve0();
+ VE_TASK_INFO(tsk)->sleep_time = 0;
+ VE_TASK_INFO(tsk)->wakeup_stamp = 0;
+ VE_TASK_INFO(tsk)->sched_time = 0;
+ seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock);
+
+ if (tsk->pid) {
+ list_add_rcu(&tsk->ve_task_info.vetask_list,
+ &get_ve0()->vetask_lh);
+ atomic_inc(&get_ve0()->pcounter);
+ }
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * proc entries
+ * ------------------------------------------------------------------------
+ */
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_vz_dir;
+EXPORT_SYMBOL(proc_vz_dir);
+
+struct proc_dir_entry *glob_proc_vz_dir;
+EXPORT_SYMBOL(glob_proc_vz_dir);
+
+static void prepare_proc(void)
+{
+ proc_vz_dir = proc_mkdir("vz", NULL);
+ if (!proc_vz_dir)
+ panic("Can't create /proc/vz dir\n");
+
+ glob_proc_vz_dir = proc_mkdir("vz", &glob_proc_root);
+	if (!glob_proc_vz_dir)
+ panic("Can't create /proc/vz dir\n");
+}
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * OpenVZ sysctl
+ * ------------------------------------------------------------------------
+ */
+int ve_xattr_policy = VE_XATTR_POLICY_ACCEPT;
+extern int ve_area_access_check;
+
+#ifdef CONFIG_INET
+static struct ctl_table vz_ipv4_route_table[] = {
+ {
+ .procname = "src_check",
+ .data = &ip_rt_src_check,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { 0 }
+};
+
+static struct ctl_path net_ipv4_route_path[] = {
+ { .ctl_name = CTL_NET, .procname = "net", },
+ { .ctl_name = NET_IPV4, .procname = "ipv4", },
+ { .ctl_name = NET_IPV4_ROUTE, .procname = "route", },
+ { }
+};
+#endif
+
+static struct ctl_table vz_fs_table[] = {
+ {
+ .procname = "ve-area-access-check",
+ .data = &ve_area_access_check,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "ve-xattr-policy",
+ .data = &ve_xattr_policy,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { 0 }
+};
+
+static struct ctl_path fs_path[] = {
+ { .ctl_name = CTL_FS, .procname = "fs", },
+ { }
+};
+
+static void prepare_sysctl(void)
+{
+#ifdef CONFIG_INET
+ register_sysctl_paths(net_ipv4_route_path, vz_ipv4_route_table);
+#endif
+ register_sysctl_paths(fs_path, vz_fs_table);
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * XXX init_ve_system
+ * ------------------------------------------------------------------------
+ */
+
+void init_ve_system(void)
+{
+ struct task_struct *init_entry;
+ struct ve_struct *ve;
+
+ ve = get_ve0();
+
+ init_entry = init_pid_ns.child_reaper;
+	/* if ve_move_task to VE0 (e.g. in cpt code) occurs,
+	 * ve_cap_bset on VE0 is required */
+ ve->ve_cap_bset = CAP_INIT_EFF_SET;
+
+ read_lock(&init_entry->fs->lock);
+ ve->root_path = init_entry->fs->root;
+ read_unlock(&init_entry->fs->lock);
+
+#ifdef CONFIG_PROC_FS
+ prepare_proc();
+#endif
+ prepare_sysctl();
+}
diff --git a/kernel/ve/vzdev.c b/kernel/ve/vzdev.c
new file mode 100644
index 0000000..cc4b1b7
--- /dev/null
+++ b/kernel/ve/vzdev.c
@@ -0,0 +1,154 @@
+/*
+ * kernel/ve/vzdev.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vzctl.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/vzcalluser.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <linux/device.h>
+#include <linux/smp_lock.h>
+
+#define VZCTL_MAJOR 126
+#define VZCTL_NAME "vzctl"
+
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo Interface");
+MODULE_LICENSE("GPL v2");
+
+static LIST_HEAD(ioctls);
+static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED;
+
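+/*
+ * Look up the registered ioctl handler matching the command's _IOC_TYPE
+ * and take a reference on its module; vzctl_put_handler() drops it.
+ */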
+static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd)
+{
+ struct vzioctlinfo *h;
+
+ spin_lock(&ioctl_lock);
+ list_for_each_entry(h, &ioctls, list) {
+ if (h->type == _IOC_TYPE(cmd))
+ goto found;
+ }
+ h = NULL;
+found:
+ if (h && !try_module_get(h->owner))
+ h = NULL;
+ spin_unlock(&ioctl_lock);
+ return h;
+}
+
+static void vzctl_put_handler(struct vzioctlinfo *h)
+{
+ if (!h)
+ return;
+
+ module_put(h->owner);
+}
+
+long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct vzioctlinfo *h;
+ int err;
+
+ err = -ENOTTY;
+ h = vzctl_get_handler(cmd);
+ if (h && h->ioctl)
+ err = (*h->ioctl)(file, cmd, arg);
+ vzctl_put_handler(h);
+
+ return err;
+}
+
+long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct vzioctlinfo *h;
+ int err;
+
+ err = -ENOIOCTLCMD;
+ h = vzctl_get_handler(cmd);
+ if (h && h->compat_ioctl)
+ err = (*h->compat_ioctl)(file, cmd, arg);
+ vzctl_put_handler(h);
+
+ return err;
+}
+
+void vzioctl_register(struct vzioctlinfo *inf)
+{
+ spin_lock(&ioctl_lock);
+ list_add(&inf->list, &ioctls);
+ spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_register);
+
+void vzioctl_unregister(struct vzioctlinfo *inf)
+{
+ spin_lock(&ioctl_lock);
+ list_del_init(&inf->list);
+ spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_unregister);
+
+/*
+ * Init/exit stuff.
+ */
+static struct file_operations vzctl_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = vzctl_ioctl,
+ .compat_ioctl = compat_vzctl_ioctl,
+};
+
+static struct class *vzctl_class;
+
+static void __exit vzctl_exit(void)
+{
+ device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0));
+ class_destroy(vzctl_class);
+ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+}
+
+static int __init vzctl_init(void)
+{
+ int ret;
+ struct device *class_err;
+
+ ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops);
+ if (ret < 0)
+ goto out;
+
+ vzctl_class = class_create(THIS_MODULE, "vzctl");
+ if (IS_ERR(vzctl_class)) {
+ ret = PTR_ERR(vzctl_class);
+ goto out_cleandev;
+ }
+
+ class_err = device_create(vzctl_class, NULL,
+ MKDEV(VZCTL_MAJOR, 0), NULL, VZCTL_NAME);
+ if (IS_ERR(class_err)) {
+ ret = PTR_ERR(class_err);
+ goto out_rmclass;
+ }
+
+ goto out;
+
+out_rmclass:
+ class_destroy(vzctl_class);
+out_cleandev:
+ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+out:
+ return ret;
+}
+
+module_init(vzctl_init)
+module_exit(vzctl_exit);
diff --git a/kernel/ve/vzevent.c b/kernel/ve/vzevent.c
new file mode 100644
index 0000000..554f169
--- /dev/null
+++ b/kernel/ve/vzevent.c
@@ -0,0 +1,125 @@
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/errno.h>
+#include <linux/ve_proto.h>
+#include <linux/vzevent.h>
+
+#define NETLINK_UEVENT 31
+#define VZ_EVGRP_ALL 0x01
+
+/*
+ * NOTE: the original idea was to send events via kobject_uevent(),
+ * however, it turned out to have negative consequences, such as starting
+ * /sbin/hotplug, which tries to react to our events in an inadequate manner.
+ */
+
+static struct sock *vzev_sock;
+
+static char *action_to_string(int action)
+{
+ switch (action) {
+ case KOBJ_MOUNT:
+ return "ve-mount";
+ case KOBJ_UMOUNT:
+ return "ve-umount";
+ case KOBJ_START:
+ return "ve-start";
+ case KOBJ_STOP:
+ return "ve-stop";
+ default:
+ return NULL;
+ }
+}
+
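+/*
+ * Broadcast an "<action>@<attrs>" message for the given event to the
+ * VZ_EVGRP_ALL netlink group.
+ */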
+static int do_vzevent_send(int event, char *msg, int len)
+{
+ struct sk_buff *skb;
+ char *buf, *action;
+ int alen;
+
+	action = action_to_string(event);
+	if (!action)
+		return -EINVAL;
+	alen = strlen(action);
+
+ skb = alloc_skb(len + 1 + alen, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ buf = skb_put(skb, len + 1 + alen);
+ memcpy(buf, action, alen);
+ buf[alen] = '@';
+ memcpy(buf + alen + 1, msg, len);
+ (void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL);
+ return 0;
+}
+
+int vzevent_send(int event, const char *attrs_fmt, ...)
+{
+ va_list args;
+ int len, err;
+ struct ve_struct *ve;
+ char *page;
+
+ err = -ENOMEM;
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ goto out;
+
+ va_start(args, attrs_fmt);
+ len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args);
+ va_end(args);
+
+ ve = set_exec_env(get_ve0());
+ err = do_vzevent_send(event, page, len);
+ (void)set_exec_env(ve);
+ free_page((unsigned long)page);
+out:
+ return err;
+}
+EXPORT_SYMBOL(vzevent_send);
+
+static int ve_start(void *data)
+{
+ struct ve_struct *ve;
+
+ ve = (struct ve_struct *)data;
+ vzevent_send(KOBJ_START, "%d", ve->veid);
+ return 0;
+}
+
+static void ve_stop(void *data)
+{
+ struct ve_struct *ve;
+
+ ve = (struct ve_struct *)data;
+ vzevent_send(KOBJ_STOP, "%d", ve->veid);
+}
+
+static struct ve_hook ve_start_stop_hook = {
+ .init = ve_start,
+ .fini = ve_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_AFTERALL,
+};
+
+static int __init init_vzevent(void)
+{
+ vzev_sock = netlink_kernel_create(NETLINK_UEVENT, 0, NULL, THIS_MODULE);
+ if (vzev_sock == NULL)
+ return -ENOMEM;
+ ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook);
+ return 0;
+}
+
+static void __exit exit_vzevent(void)
+{
+ ve_hook_unregister(&ve_start_stop_hook);
+ sock_release(vzev_sock->sk_socket);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_vzevent);
+module_exit(exit_vzevent);
diff --git a/kernel/ve/vzwdog.c b/kernel/ve/vzwdog.c
new file mode 100644
index 0000000..7cbef81
--- /dev/null
+++ b/kernel/ve/vzwdog.c
@@ -0,0 +1,322 @@
+/*
+ * kernel/ve/vzwdog.c
+ *
+ * Copyright (C) 2000-2005 SWsoft
+ * All rights reserved.
+ *
+ * Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/kobject.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/ve.h>
+#include <linux/vzstat.h>
+#include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/* Stuff regarding the kernel thread polling VE validity */
+static int sleep_timeout = 60;
+static struct task_struct *wdog_thread_tsk;
+
+extern void show_mem(void);
+
+static struct file *intr_file;
+static char page[PAGE_SIZE];
+
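+/*
+ * Print the /proc/interrupts snapshot read into 'page', skipping interrupt
+ * lines whose counters are all zero.
+ */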
+static void parse_irq_list(int len)
+{
+ int i, k, skip;
+ for (i = 0; i < len; ) {
+ k = i;
+ while (i < len && page[i] != '\n' && page[i] != ':')
+ i++;
+ skip = 0;
+ if (i < len && page[i] != '\n') {
+ i++; /* skip ':' */
+ while (i < len && (page[i] == ' ' || page[i] == '0'))
+ i++;
+ skip = (i < len && (page[i] < '0' || page[i] > '9'));
+ while (i < len && page[i] != '\n')
+ i++;
+ }
+ if (!skip)
+ printk("%.*s\n", i - k, page + k);
+ if (i < len)
+ i++; /* skip '\n' */
+ }
+}
+
+extern loff_t vfs_llseek(struct file *file, loff_t, int);
+extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *);
+extern struct file *filp_open(const char *filename, int flags, int mode);
+extern int filp_close(struct file *filp, fl_owner_t id);
+static void show_irq_list(void)
+{
+ mm_segment_t fs;
+ int r;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ vfs_llseek(intr_file, 0, 0);
+ r = vfs_read(intr_file, (void __user *)page, sizeof(page),
+ &intr_file->f_pos);
+ set_fs(fs);
+
+ if (r > 0)
+ parse_irq_list(r);
+}
+
+static void show_alloc_latency(void)
+{
+ static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
+ "A0",
+ "L0",
+ "H0",
+ "L1",
+ "H1"
+ };
+ int i;
+
+ printk("lat: ");
+ for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
+ struct kstat_lat_struct *p;
+ cycles_t maxlat, avg0, avg1, avg2;
+
+ p = &kstat_glob.alloc_lat[i];
+ spin_lock_irq(&kstat_glb_lock);
+ maxlat = p->last.maxlat;
+ avg0 = p->avg[0];
+ avg1 = p->avg[1];
+ avg2 = p->avg[2];
+ spin_unlock_irq(&kstat_glb_lock);
+
+ printk("%s %Lu (%Lu %Lu %Lu)",
+ alloc_descr[i],
+ (unsigned long long)maxlat,
+ (unsigned long long)avg0,
+ (unsigned long long)avg1,
+ (unsigned long long)avg2);
+ }
+ printk("\n");
+}
+
+static void show_schedule_latency(void)
+{
+ struct kstat_lat_pcpu_struct *p;
+ cycles_t maxlat, totlat, avg0, avg1, avg2;
+ unsigned long count;
+
+ p = &kstat_glob.sched_lat;
+ spin_lock_irq(&kstat_glb_lock);
+ maxlat = p->last.maxlat;
+ totlat = p->last.totlat;
+ count = p->last.count;
+ avg0 = p->avg[0];
+ avg1 = p->avg[1];
+ avg2 = p->avg[2];
+ spin_unlock_irq(&kstat_glb_lock);
+
+ printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n",
+ (unsigned long long)maxlat,
+ (unsigned long long)totlat,
+ count,
+ (unsigned long long)avg0,
+ (unsigned long long)avg1,
+ (unsigned long long)avg2);
+}
+
+static void show_header(void)
+{
+ struct timeval tv;
+
+ do_gettimeofday(&tv);
+ preempt_disable();
+ printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n",
+ tv.tv_sec, (long)tv.tv_usec,
+ (unsigned long long)get_jiffies_64(),
+ smp_processor_id());
+#ifdef CONFIG_FAIRSCHED
+ printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n",
+ cycles_per_jiffy, HZ);
+#else
+ printk("*** jiffies_per_second %u ***\n", HZ);
+#endif
+ preempt_enable();
+}
+
+static void show_pgdatinfo(void)
+{
+ pg_data_t *pgdat;
+
+ printk("pgdat:");
+ for_each_online_pgdat(pgdat) {
+ printk(" %d: %lu,%lu,%lu",
+ pgdat->node_id,
+ pgdat->node_start_pfn,
+ pgdat->node_present_pages,
+ pgdat->node_spanned_pages);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ printk(",%p", pgdat->node_mem_map);
+#endif
+ }
+ printk("\n");
+}
+
+static int show_partitions_io(struct gendisk *gp)
+{
+ struct disk_part_iter piter;
+ struct hd_struct *hd;
+ char buf[BDEVNAME_SIZE];
+ int cpu;
+
+ /*
+ if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
+ seq_puts(seqf, "major minor name"
+ " rio rmerge rsect ruse wio wmerge "
+ "wsect wuse running use aveq"
+ "\n\n");
+ */
+
+ disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
+ while ((hd = disk_part_iter_next(&piter))) {
+ cpu = part_stat_lock();
+ part_round_stats(cpu, hd);
+ part_stat_unlock();
+ printk("%4d %7d %s %lu %lu %llu "
+ "%u %lu %lu %llu %u %u %u %u\n",
+ MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
+ disk_name(gp, hd->partno, buf),
+ part_stat_read(hd, ios[0]),
+ part_stat_read(hd, merges[0]),
+ (unsigned long long)part_stat_read(hd, sectors[0]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[0])),
+ part_stat_read(hd, ios[1]),
+ part_stat_read(hd, merges[1]),
+ (unsigned long long)part_stat_read(hd, sectors[1]),
+ jiffies_to_msecs(part_stat_read(hd, ticks[1])),
+ part_in_flight(hd),
+ jiffies_to_msecs(part_stat_read(hd, io_ticks)),
+ jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+ );
+ }
+ disk_part_iter_exit(&piter);
+
+ return 0;
+}
+
+static int show_one_disk_io(struct device *dev, void *x)
+{
+ char *name;
+ char buf[BDEVNAME_SIZE];
+ struct gendisk *gd;
+
+ gd = dev_to_disk(dev);
+
+ name = disk_name(gd, 0, buf);
+ if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) &&
+ isdigit(name[4]))
+ return 0;
+
+ if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) &&
+ isdigit(name[3]))
+ return 0;
+
+ show_partitions_io(gd);
+
+ return 0;
+}
+
+static void show_diskio(void)
+{
+ printk("disk_io: ");
+ class_for_each_device(&block_class, NULL, NULL, show_one_disk_io);
+ printk("\n");
+}
+
+static void show_nrprocs(void)
+{
+ unsigned long _nr_running, _nr_sleeping,
+ _nr_unint, _nr_zombie, _nr_dead, _nr_stopped;
+
+ _nr_running = nr_running();
+ _nr_unint = nr_uninterruptible();
+ _nr_sleeping = nr_sleeping();
+ _nr_zombie = nr_zombie;
+ _nr_dead = atomic_read(&nr_dead);
+ _nr_stopped = nr_stopped();
+
+ printk("VEnum: %d, proc R %lu, S %lu, D %lu, "
+ "Z %lu, X %lu, T %lu (tot %d)\n",
+ nr_ve, _nr_running, _nr_sleeping, _nr_unint,
+ _nr_zombie, _nr_dead, _nr_stopped, nr_threads);
+}
+
+static void wdog_print(void)
+{
+ show_header();
+ show_irq_list();
+ show_pgdatinfo();
+ show_mem();
+ show_diskio();
+ show_schedule_latency();
+ show_alloc_latency();
+ show_nrprocs();
+}
+
+static int wdog_loop(void* data)
+{
+ while (1) {
+ wdog_print();
+ try_to_freeze();
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+ schedule_timeout(sleep_timeout*HZ);
+ }
+ return 0;
+}
+
+static int __init wdog_init(void)
+{
+ struct file *file;
+
+ file = filp_open("/proc/interrupts", 0, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ intr_file = file;
+
+ wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog");
+ if (IS_ERR(wdog_thread_tsk)) {
+ filp_close(intr_file, NULL);
+ return -EBUSY;
+ }
+ return 0;
+}
+
+static void __exit wdog_exit(void)
+{
+ kthread_stop(wdog_thread_tsk);
+ filp_close(intr_file, NULL);
+}
+
+module_param(sleep_timeout, int, 0660);
+MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
+MODULE_DESCRIPTION("Virtuozzo WDOG");
+MODULE_LICENSE("GPL v2");
+
+module_init(wdog_init)
+module_exit(wdog_exit)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 234ceb1..789f5b9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -136,6 +136,15 @@ config DEBUG_SECTION_MISMATCH
- Enable verbose reporting from modpost to help solving
the section mismatches reported.
+config SYSRQ_DEBUG
+ bool "Debugging via sysrq keys"
+ depends on MAGIC_SYSRQ
+ default y
+ help
+	  Say Y if you want to extend the functionality of the magic SysRq
+	  key. It provides debugging facilities such as dumping and writing
+	  memory, resolving symbols, and more.
+
config DEBUG_KERNEL
bool "Kernel debugging"
help
diff --git a/lib/is_single_threaded.c b/lib/is_single_threaded.c
index bd2bea9..397a3ce 100644
--- a/lib/is_single_threaded.c
+++ b/lib/is_single_threaded.c
@@ -30,7 +30,7 @@ bool current_is_single_threaded(void)
ret = false;
rcu_read_lock();
- for_each_process(p) {
+ for_each_process_ve(p) {
if (unlikely(p->flags & PF_KTHREAD))
continue;
if (unlikely(p == task->group_leader))
@@ -48,7 +48,7 @@ bool current_is_single_threaded(void)
* forked before exiting.
*/
smp_rmb();
- } while_each_thread(p, t);
+ } while_each_thread_ve(p, t);
}
ret = true;
found:
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 920a3ca..ff954e7 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -38,6 +38,8 @@ static const char *kobject_actions[] = {
[KOBJ_REMOVE] = "remove",
[KOBJ_CHANGE] = "change",
[KOBJ_MOVE] = "move",
+ [KOBJ_START] = "start",
+ [KOBJ_STOP] = "stop",
[KOBJ_ONLINE] = "online",
[KOBJ_OFFLINE] = "offline",
};
diff --git a/lib/nlattr.c b/lib/nlattr.c
index c4706eb..fcb694e 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -196,7 +196,7 @@ int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len,
}
if (unlikely(rem > 0))
- printk(KERN_WARNING "netlink: %d bytes leftover after parsing "
+ ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing "
"attributes.\n", rem);
err = 0;
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 238e72a..57c038d 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/quicklist.h>
+#include <linux/module.h>
void show_mem(void)
{
@@ -61,3 +62,4 @@ void show_mem(void)
quicklist_total_size());
#endif
}
+EXPORT_SYMBOL_GPL(show_mem);
diff --git a/mm/filemap.c b/mm/filemap.c
index 8e96c90..9189743 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,6 +42,7 @@
#include <linux/buffer_head.h> /* for try_to_free_buffers */
#include <asm/mman.h>
+#include <bc/io_acct.h>
/*
* Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -121,6 +122,7 @@ void __remove_from_page_cache(struct page *page)
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
+ ub_io_release_debug(page);
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d..c2f4508 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
+#include <bc/vmpages.h>
/*
* We do use our own empty page to avoid interference with other users
@@ -194,6 +195,8 @@ retry:
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush_notify(vma, address, pte);
page_remove_rmap(page);
+ pb_remove_ref(page, mm);
+ ub_unused_privvm_inc(mm, vma);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85a..92d38a6 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -21,6 +21,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <bc/vmpages.h>
+
#include "internal.h"
static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -38,6 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte_dirty(pte))
set_page_dirty(page);
page_remove_rmap(page);
+ pb_remove_ref(page, mm);
page_cache_release(page);
update_hiwater_rss(mm);
dec_mm_counter(mm, file_rss);
@@ -64,8 +67,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte)
goto out;
- if (!pte_none(*pte))
+ if (!pte_none(*pte)) {
zap_pte(mm, vma, addr, pte);
+ ub_unused_privvm_inc(mm, vma);
+ }
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
/*
@@ -222,7 +227,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
* drop PG_Mlocked flag for over-mapped range
*/
unsigned int saved_flags = vma->vm_flags;
- munlock_vma_pages_range(vma, start, start + size);
+ __munlock_vma_pages_range(vma, start, start + size, 0);
vma->vm_flags = saved_flags;
}
@@ -258,3 +263,4 @@ out:
return err;
}
+EXPORT_SYMBOL_GPL(sys_remap_file_pages);
diff --git a/mm/internal.h b/mm/internal.h
index 17bc0df..6a5669b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -66,8 +66,14 @@ static inline unsigned long page_order(struct page *page)
#ifdef CONFIG_HAVE_MLOCK
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
-extern void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
+extern void __munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int acct);
+static inline void munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ __munlock_vma_pages_range(vma, start, end, 1);
+}
+
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
{
munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dacc641..9d28f5c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -226,7 +226,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
goto out;
- for_each_process (tsk) {
+ for_each_process_all (tsk) {
if (!task_early_kill(tsk))
continue;
list_for_each_entry (vma, &av->head, anon_vma_node) {
@@ -263,7 +263,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
read_lock(&tasklist_lock);
spin_lock(&mapping->i_mmap_lock);
- for_each_process(tsk) {
+ for_each_process_all(tsk) {
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
if (!task_early_kill(tsk))
diff --git a/mm/memory.c b/mm/memory.c
index 4e59455..c5108b1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -42,6 +42,9 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
+#include <linux/virtinfo.h>
+#include <linux/sched.h>
+#include <linux/vzstat.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
@@ -57,6 +60,11 @@
#include <linux/swapops.h>
#include <linux/elf.h>
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/kmem.h>
+#include <bc/vmpages.h>
+
#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -94,7 +102,7 @@ EXPORT_SYMBOL(high_memory);
* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
* as ancient (libc5 based) binaries can segfault. )
*/
-int randomize_va_space __read_mostly =
+int _randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
1;
#else
@@ -132,18 +140,21 @@ void pgd_clear_bad(pgd_t *pgd)
pgd_ERROR(*pgd);
pgd_clear(pgd);
}
+EXPORT_SYMBOL_GPL(pgd_clear_bad);
void pud_clear_bad(pud_t *pud)
{
pud_ERROR(*pud);
pud_clear(pud);
}
+EXPORT_SYMBOL_GPL(pud_clear_bad);
void pmd_clear_bad(pmd_t *pmd)
{
pmd_ERROR(*pmd);
pmd_clear(pmd);
}
+EXPORT_SYMBOL_GPL(pmd_clear_bad);
/*
* Note: this doesn't free the actual pages themselves. That
@@ -356,6 +367,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
pte_free(mm, new);
return 0;
}
+EXPORT_SYMBOL_GPL(__pte_alloc);
int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
@@ -565,6 +577,7 @@ check_pfn:
out:
return pfn_to_page(pfn);
}
+EXPORT_SYMBOL_GPL(vm_normal_page);
/*
* copy one vm_area from one task to the other. Assumes the page tables
@@ -575,7 +588,7 @@ out:
static inline void
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr, int *rss)
+ unsigned long addr, int *rss, struct page_beancounter **pbc)
{
unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
@@ -630,6 +643,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (page) {
get_page(page);
page_dup_rmap(page);
+ pb_dup_ref(page, dst_mm, pbc);
rss[PageAnon(page)]++;
}
@@ -637,21 +651,36 @@ out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
}
+#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1)))
+#ifdef CONFIG_BEANCOUNTERS
+#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub)
+#else
+#define same_ub(mm1, mm2) 1
+#endif
+
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
- int rss[2];
+ int rss[2], rss_tot;
+ struct page_beancounter *pbc;
+ int err;
+ err = -ENOMEM;
+ pbc = same_ub(src_mm, dst_mm) ? PBC_COPY_SAME : NULL;
again:
+ if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr)))
+ goto out;
rss[1] = rss[0] = 0;
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
- return -ENOMEM;
+ goto out;
src_pte = pte_offset_map_nested(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -674,23 +703,32 @@ again:
progress++;
continue;
}
- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss,
+ &pbc);
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap_nested(orig_src_pte);
+ rss_tot = rss[0] + rss[1];
+ ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot);
add_mm_rss(dst_mm, rss[0], rss[1]);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
if (addr != end)
goto again;
- return 0;
+
+ err = 0;
+out:
+ pb_free_list(&pbc);
+ return err;
}
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+ pud_t *dst_pud, pud_t *src_pud,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pmd_t *src_pmd, *dst_pmd;
@@ -705,14 +743,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
- vma, addr, next))
+ dst_vma, vma, addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+ pgd_t *dst_pgd, pgd_t *src_pgd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pud_t *src_pud, *dst_pud;
@@ -727,19 +767,21 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
- vma, addr, next))
+ dst_vma, vma, addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
}
-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- struct vm_area_struct *vma)
+int __copy_page_range(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *vma,
+ unsigned long addr, size_t size)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = vma->vm_mm;
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
- unsigned long addr = vma->vm_start;
- unsigned long end = vma->vm_end;
+ unsigned long end = addr + size;
int ret;
/*
@@ -783,7 +825,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (pgd_none_or_clear_bad(src_pgd))
continue;
if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, addr, next))) {
+ dst_vma, vma, addr, next))) {
ret = -ENOMEM;
break;
}
@@ -794,6 +836,17 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
vma->vm_start, end);
return ret;
}
+EXPORT_SYMBOL_GPL(__copy_page_range);
+
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *dst_vma, struct vm_area_struct *vma)
+{
+ if (dst_vma->vm_mm != dst)
+ BUG();
+ if (vma->vm_mm != src)
+ BUG();
+ return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start);
+}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
@@ -805,6 +858,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
int file_rss = 0;
int anon_rss = 0;
+ int rss;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -860,6 +914,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
file_rss--;
}
page_remove_rmap(page);
+ pb_remove_ref(page, mm);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
tlb_remove_page(tlb, page);
@@ -880,6 +935,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+ rss = -(file_rss + anon_rss);
+ ub_unused_privvm_add(mm, vma, rss);
add_mm_rss(mm, file_rss, anon_rss);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
@@ -1994,6 +2051,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
int reuse = 0, ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
+ struct page_beancounter *pbc;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
@@ -2100,6 +2158,8 @@ reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (old_page)
+ ClearPageCheckpointed(old_page);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
@@ -2113,6 +2173,9 @@ reuse:
gotten:
pte_unmap_unlock(page_table, ptl);
+ if (unlikely(pb_alloc(&pbc)))
+ goto oom_nopb;
+
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2147,12 +2210,15 @@ gotten:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
+ pb_remove_ref(old_page, mm);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
}
- } else
+ } else {
+ ub_unused_privvm_dec(mm, vma);
inc_mm_counter(mm, anon_rss);
+ }
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2164,6 +2230,7 @@ gotten:
*/
ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
+ pb_add_ref(new_page, mm, &pbc);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -2207,6 +2274,7 @@ gotten:
page_cache_release(new_page);
if (old_page)
page_cache_release(old_page);
+ pb_free(&pbc);
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
@@ -2246,6 +2314,8 @@ unlock:
oom_free_new:
page_cache_release(new_page);
oom:
+ pb_free(&pbc);
+oom_nopb:
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
@@ -2502,10 +2572,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t pte;
struct mem_cgroup *ptr = NULL;
int ret = 0;
+ struct page_beancounter *pbc;
+ cycles_t start;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- goto out;
+ goto out_nostat;
+ if (unlikely(pb_alloc(&pbc)))
+ return VM_FAULT_OOM;
+
+ start = get_cycles();
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
@@ -2580,6 +2656,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
inc_mm_counter(mm, anon_rss);
+ ub_percpu_inc(mm->mm_ub, swapin);
pte = mk_pte(page, vma->vm_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2588,11 +2665,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
+ pb_add_ref(page, mm, &pbc);
+ ub_unused_privvm_dec(mm, vma);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)
+ || swap_readonly(page))
try_to_free_swap(page);
unlock_page(page);
@@ -2608,6 +2688,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unlock:
pte_unmap_unlock(page_table, ptl);
out:
+ pb_free(&pbc);
+ spin_lock_irq(&kstat_glb_lock);
+ KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
+ spin_unlock_irq(&kstat_glb_lock);
+out_nostat:
return ret;
out_nomap:
mem_cgroup_cancel_charge_swapin(ptr);
@@ -2615,6 +2700,7 @@ out_nomap:
out_page:
unlock_page(page);
out_release:
+ pb_free(&pbc);
page_cache_release(page);
return ret;
}
@@ -2631,6 +2717,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
spinlock_t *ptl;
pte_t entry;
+ struct page_beancounter *pbc = NULL;
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
@@ -2645,6 +2732,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Allocate our own private page. */
pte_unmap(page_table);
+ if (unlikely(pb_alloc(&pbc)))
+ goto oom_nopb;
+
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
@@ -2665,12 +2755,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);
+ pb_add_ref(page, mm, &pbc);
+ ub_unused_privvm_dec(mm, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, entry);
unlock:
+ pb_free(&pbc);
pte_unmap_unlock(page_table, ptl);
return 0;
release:
@@ -2680,6 +2773,8 @@ release:
oom_free_page:
page_cache_release(page);
oom:
+ pb_free(&pbc);
+oom_nopb:
return VM_FAULT_OOM;
}
@@ -2707,6 +2802,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int anon = 0;
int charged = 0;
struct page *dirty_page = NULL;
+ struct page_beancounter *pbc;
struct vm_fault vmf;
int ret;
int page_mkwrite = 0;
@@ -2716,9 +2812,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vmf.flags = flags;
vmf.page = NULL;
+ ret = VM_FAULT_OOM;
+ if (unlikely(pb_alloc(&pbc)))
+ goto oom_nopb;
+
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
- return ret;
+ goto out_fault;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
@@ -2812,6 +2912,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
/* Only go through if we didn't race with anybody else... */
if (likely(pte_same(*page_table, orig_pte))) {
+ struct user_beancounter *ub;
+
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
@@ -2828,6 +2930,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
}
set_pte_at(mm, address, page_table, entry);
+ ub = page_ub(page);
+ if (ub != NULL &&
+#ifdef CONFIG_BC_IO_ACCOUNTING
+ !((unsigned long)ub & PAGE_IO_MARK) &&
+#endif
+ ub->ub_magic == UB_MAGIC) {
+ /*
+			 * WOW: the page was already charged as page_ub. This may
+			 * happen, for example, when some driver exports its low
+			 * memory pages to user space. We can't account a page as
+			 * page_ub and page_bp at the same time, so uncharge the
+			 * page from the UB counter.
+ */
+ WARN_ON_ONCE(1);
+ ub_page_uncharge(page, 0);
+ }
+
+ pb_add_ref(page, mm, &pbc);
+ ub_unused_privvm_dec(mm, vma);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
@@ -2867,6 +2988,9 @@ out:
page_cache_release(vmf.page);
}
+out_fault:
+ pb_free(&pbc);
+oom_nopb:
return ret;
unwritable_page:
@@ -2994,6 +3118,27 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd;
pte_t *pte;
+#ifdef CONFIG_VZ_GENCALLS
+ do {
+ int ret;
+#ifdef CONFIG_BEANCOUNTERS
+ struct task_beancounter *tbc;
+
+ tbc = &current->task_bc;
+ if (!test_bit(UB_AFLAG_NOTIF_PAGEIN, &mm->mm_ub->ub_aflags) &&
+ tbc->pgfault_allot) {
+ tbc->pgfault_allot--;
+ break; /* skip notifier */
+ }
+#endif
+ ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_PAGEIN,
+ (void *)1);
+ if (ret & NOTIFY_FAIL)
+ return VM_FAULT_SIGBUS;
+ if (ret & NOTIFY_OK)
+ return VM_FAULT_MINOR; /* retry */
+ } while (0);
+#endif
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
@@ -3038,6 +3183,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
}
#endif /* __PAGETABLE_PUD_FOLDED */
+EXPORT_SYMBOL_GPL(__pud_alloc);
+
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
@@ -3068,6 +3215,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
+EXPORT_SYMBOL_GPL(__pmd_alloc);
+
int make_pages_present(unsigned long addr, unsigned long end)
{
int ret, len, write;
@@ -3087,6 +3236,8 @@ int make_pages_present(unsigned long addr, unsigned long end)
return ret == len ? 0 : -EFAULT;
}
+EXPORT_SYMBOL(make_pages_present);
+
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a3bc3d..cafc267 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -77,6 +77,8 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
init_waitqueue_head(&pool->wait);
pool->alloc = alloc_fn;
pool->free = free_fn;
+ if (alloc_fn == mempool_alloc_slab)
+ kmem_mark_nocharge((struct kmem_cache *)pool_data);
/*
* First pre-allocate the guaranteed number of buffers.
@@ -118,6 +120,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
unsigned long flags;
BUG_ON(new_min_nr <= 0);
+ gfp_mask &= ~__GFP_UBC;
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
@@ -211,6 +214,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
+ gfp_mask &= ~__GFP_UBC;
gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
diff --git a/mm/mlock.c b/mm/mlock.c
index 2e05c97..1ebf6e1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -18,6 +18,7 @@
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
+#include <bc/vmpages.h>
#include "internal.h"
@@ -309,12 +310,14 @@ no_mlock:
* and re-mlocked by try_to_{munlock|unmap} before we unmap and
* free them. This will result in freeing mlocked pages.
*/
-void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+void __munlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int acct)
{
unsigned long addr;
lru_add_drain();
+ if (acct)
+ ub_locked_uncharge(vma->vm_mm, end - start);
vma->vm_flags &= ~VM_LOCKED;
for (addr = start; addr < end; addr += PAGE_SIZE) {
@@ -374,6 +377,12 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
goto out; /* don't set VM_LOCKED, don't count */
}
+ if (newflags & VM_LOCKED) {
+ ret = ub_locked_charge(mm, end - start);
+ if (ret < 0)
+ goto out;
+ }
+
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
@@ -385,13 +394,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
if (start != vma->vm_start) {
ret = split_vma(mm, vma, start, 1);
if (ret)
- goto out;
+ goto out_uncharge;
}
if (end != vma->vm_end) {
ret = split_vma(mm, vma, end, 0);
if (ret)
- goto out;
+ goto out_uncharge;
}
success:
@@ -421,6 +430,11 @@ success:
out:
*prev = vma;
return ret;
+
+out_uncharge:
+ if (newflags & VM_LOCKED)
+ ub_locked_uncharge(mm, end - start);
+ goto out;
}
static int do_mlock(unsigned long start, size_t len, int on)
@@ -499,6 +513,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
up_write(&current->mm->mmap_sem);
return error;
}
+EXPORT_SYMBOL_GPL(sys_mlock);
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
@@ -511,6 +526,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
up_write(&current->mm->mmap_sem);
return ret;
}
+EXPORT_SYMBOL_GPL(sys_munlock);
static int do_mlockall(int flags)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index ae19746..a5dd0bf 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
+#include <linux/virtinfo.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -41,10 +42,13 @@
#define arch_mmap_check(addr, len, flags) (0)
#endif
+#include <bc/vmpages.h>
+
#ifndef arch_rebalance_pgtables
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft);
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -110,6 +114,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
vm_acct_memory(pages);
+#ifdef CONFIG_BEANCOUNTERS
+ switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM,
+ (void *)pages)
+ & (NOTIFY_OK | NOTIFY_FAIL)) {
+ case NOTIFY_OK:
+ return 0;
+ case NOTIFY_FAIL:
+ vm_unacct_memory(pages);
+ return -ENOMEM;
+ }
+#endif
+
/*
* Sometimes we want to use more memory than we have
*/
@@ -231,6 +247,9 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
struct vm_area_struct *next = vma->vm_next;
might_sleep();
+
+ ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start,
+ vma->vm_flags, vma->vm_file);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file) {
@@ -288,7 +307,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out;
/* Ok, looks good - let it rip. */
- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+ if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk)
goto out;
set_brk:
mm->brk = brk;
@@ -1106,6 +1125,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+ unsigned long ub_charged = 0;
/* Clear old maps */
error = -ENOMEM;
@@ -1145,6 +1165,11 @@ munmap_back:
vm_flags |= VM_ACCOUNT;
}
+ if (ub_memory_charge(mm, len, vm_flags, file,
+ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD)))
+ goto charge_error;
+ ub_charged = 1;
+
/*
* Can we just expand an old mapping?
*/
@@ -1157,7 +1182,8 @@ munmap_back:
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL |
+ (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0));
if (!vma) {
error = -ENOMEM;
goto unacct_error;
@@ -1187,6 +1213,19 @@ munmap_back:
goto unmap_and_free_vma;
if (vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
+ if (vm_flags != vma->vm_flags) {
+ /*
+ * ->vm_flags has been changed in f_op->mmap method.
+ * We have to recharge ub memory.
+ */
+ ub_memory_uncharge(mm, len, vm_flags, file);
+ if (ub_memory_charge(mm, len, vma->vm_flags, file,
+ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) {
+ ub_charged = 0;
+ error = -ENOMEM;
+ goto unmap_and_free_vma;
+ }
+ }
/* Can addr have changed??
*
@@ -1240,6 +1279,9 @@ unmap_and_free_vma:
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
+ if (ub_charged)
+ ub_memory_uncharge(mm, len, vm_flags, file);
+charge_error:
if (charged)
vm_unacct_memory(charged);
return error;
@@ -1570,12 +1612,16 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
if (is_hugepage_only_range(vma->vm_mm, new_start, size))
return -EFAULT;
+ if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags,
+ vma->vm_file, UB_SOFT))
+ goto fail_charge;
+
/*
* Overcommit.. This must be the final test, as it will
* update security statistics.
*/
if (security_vm_enough_memory_mm(mm, grow))
- return -ENOMEM;
+ goto fail_sec;
/* Ok, everything looks good - let it rip */
mm->total_vm += grow;
@@ -1583,6 +1629,11 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
return 0;
+
+fail_sec:
+ ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file);
+fail_charge:
+ return -ENOMEM;
}
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
@@ -1869,6 +1920,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
return 0;
}
+EXPORT_SYMBOL_GPL(split_vma);
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
@@ -1976,7 +2028,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
* anonymous maps. eventually we may be able to do some
* brk-specific accounting here.
*/
-unsigned long do_brk(unsigned long addr, unsigned long len)
+static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
@@ -2036,8 +2088,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
+ if (ub_memory_charge(mm, len, flags, NULL, soft))
+ goto fail_charge;
+
if (security_vm_enough_memory(len >> PAGE_SHIFT))
- return -ENOMEM;
+ goto fail_sec;
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
@@ -2048,11 +2103,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
/*
* create a vma struct for an anonymous mapping
*/
- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
- }
+ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL |
+ (soft == UB_SOFT ? __GFP_SOFT_UBC : 0));
+ if (!vma)
+ goto fail_alloc;
vma->vm_mm = mm;
vma->vm_start = addr;
@@ -2068,8 +2122,19 @@ out:
mm->locked_vm += (len >> PAGE_SHIFT);
}
return addr;
+
+fail_alloc:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+fail_sec:
+ ub_memory_uncharge(mm, len, flags, NULL);
+fail_charge:
+ return -ENOMEM;
}
+unsigned long do_brk(unsigned long addr, unsigned long len)
+{
+ return __do_brk(addr, len, UB_SOFT);
+}
EXPORT_SYMBOL(do_brk);
/* Release all mmaps. */
@@ -2262,10 +2327,11 @@ static void special_mapping_close(struct vm_area_struct *vma)
{
}
-static const struct vm_operations_struct special_mapping_vmops = {
+const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
};
+EXPORT_SYMBOL_GPL(special_mapping_vmops);
/*
* Called with mm->mmap_sem held for writing.
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d17..ee9dfe1 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -14,6 +14,7 @@ struct pglist_data *first_online_pgdat(void)
{
return NODE_DATA(first_online_node);
}
+EXPORT_SYMBOL_GPL(first_online_pgdat);
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
@@ -23,6 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
return NULL;
return NODE_DATA(nid);
}
+EXPORT_SYMBOL_GPL(next_online_pgdat);
/*
* next_zone - helper magic for for_each_zone()
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8bc969d..3978aa8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -9,6 +9,7 @@
*/
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/shm.h>
@@ -29,6 +30,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <bc/vmpages.h>
+
#ifndef pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
@@ -142,6 +145,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long charged = 0;
pgoff_t pgoff;
int error;
+ unsigned long ch_size;
+ int ch_dir;
int dirty_accountable = 0;
if (newflags == oldflags) {
@@ -149,6 +154,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
return 0;
}
+ error = -ENOMEM;
+ ch_size = nrpages - pages_in_vma_range(vma, start, end);
+ ch_dir = ub_protected_charge(mm, ch_size, newflags, vma);
+ if (ch_dir == PRIVVM_ERROR)
+ goto fail_ch;
+
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
@@ -160,7 +171,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
if (security_vm_enough_memory(charged))
- return -ENOMEM;
+ goto fail_sec;
newflags |= VM_ACCOUNT;
}
}
@@ -212,10 +223,16 @@ success:
mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ if (ch_dir == PRIVVM_TO_SHARED)
+ __ub_unused_privvm_dec(mm, ch_size);
return 0;
fail:
vm_unacct_memory(charged);
+fail_sec:
+ if (ch_dir == PRIVVM_TO_PRIVATE)
+ __ub_unused_privvm_dec(mm, ch_size);
+fail_ch:
return error;
}
@@ -318,3 +335,4 @@ out:
up_write(&current->mm->mmap_sem);
return error;
}
+EXPORT_SYMBOL_GPL(sys_mprotect);
diff --git a/mm/mremap.c b/mm/mremap.c
index 8451908..8e19c85 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -27,6 +27,8 @@
#include "internal.h"
+#include <bc/vmpages.h>
+
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
@@ -177,12 +179,16 @@ static unsigned long move_vma(struct vm_area_struct *vma,
int split = 0;
int err;
+ if (ub_memory_charge(mm, new_len, vm_flags,
+ vma->vm_file, UB_HARD))
+ goto err;
+
/*
* We'd prefer to avoid failure later on in do_munmap:
* which may split one vma into three before unmapping.
*/
if (mm->map_count >= sysctl_max_map_count - 3)
- return -ENOMEM;
+ goto err_nomem;
/*
* Advise KSM to break any KSM pages in the area to be moved:
@@ -194,12 +200,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
err = ksm_madvise(vma, old_addr, old_addr + old_len,
MADV_UNMERGEABLE, &vm_flags);
if (err)
- return err;
+ goto err_nomem;
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
if (!new_vma)
- return -ENOMEM;
+ goto err_nomem;
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
if (moved_len < old_len) {
@@ -258,7 +264,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
new_addr + new_len);
}
- return new_addr;
+ if (new_addr != -ENOMEM)
+ return new_addr;
+
+err_nomem:
+ ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file);
+err:
+ return -ENOMEM;
}
static struct vm_area_struct *vma_to_resize(unsigned long addr,
@@ -458,7 +470,13 @@ unsigned long do_mremap(unsigned long addr,
if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
- int pages = (new_len - old_len) >> PAGE_SHIFT;
+ unsigned long len = (new_len - old_len);
+ int pages = len >> PAGE_SHIFT;
+
+ ret = -ENOMEM;
+ if (ub_memory_charge(mm, len, vma->vm_flags,
+ vma->vm_file, UB_HARD))
+ goto out;
vma_adjust(vma, vma->vm_start,
addr + new_len, vma->vm_pgoff, NULL);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 83cd9bb..c54799e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -19,6 +19,8 @@
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/virtinfo.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
@@ -28,6 +30,9 @@
#include <linux/memcontrol.h>
#include <linux/security.h>
+#include <bc/beancounter.h>
+#include <bc/oom_kill.h>
+
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks;
@@ -224,16 +229,16 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned long *ppoints,
+struct task_struct *select_bad_process(struct user_beancounter *ub,
struct mem_cgroup *mem)
{
struct task_struct *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
- *ppoints = 0;
+ unsigned long chosen_points = 0;
do_posix_clock_monotonic_gettime(&uptime);
- for_each_process(p) {
+ for_each_process_all(p) {
unsigned long points;
/*
@@ -247,6 +252,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
continue;
if (mem && !task_in_mem_cgroup(p, mem))
continue;
+ if (ub_oom_task_skip(ub, p))
+ continue;
/*
* This task already has access to memory reserves and is
@@ -275,16 +282,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
return ERR_PTR(-1UL);
chosen = p;
- *ppoints = ULONG_MAX;
+ chosen_points = ULONG_MAX;
}
if (p->signal->oom_adj == OOM_DISABLE)
continue;
points = badness(p, uptime.tv_sec);
- if (points > *ppoints || !chosen) {
+ if (points > chosen_points || !chosen) {
chosen = p;
- *ppoints = points;
+ chosen_points = points;
}
}
@@ -310,7 +317,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
"name\n");
- do_each_thread(g, p) {
+ do_each_thread_all(g, p) {
struct mm_struct *mm;
if (mem && !task_in_mem_cgroup(p, mem))
@@ -334,7 +341,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
p->comm);
task_unlock(p);
- } while_each_thread(g, p);
+ } while_each_thread_all(g, p);
}
/*
@@ -369,10 +376,22 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
+ ub_oom_task_killed(p);
}
static int oom_kill_task(struct task_struct *p)
{
+ struct user_beancounter *ub;
+
+ task_lock(p);
+ if (p->mm == NULL) {
+ task_unlock(p);
+ return 1;
+ }
+
+ ub = get_beancounter(mm_ub(p->mm));
+ task_unlock(p);
+
/* WARNING: mm may not be dereferenced since we did not obtain its
* value from get_task_mm(p). This is OK since all we need to do is
* compare mm to q->mm below.
@@ -381,17 +400,18 @@ static int oom_kill_task(struct task_struct *p)
* change to NULL at any time since we do not hold task_lock(p).
* However, this is of no concern to us.
*/
- if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
+ if (p->signal->oom_adj == OOM_DISABLE)
return 1;
__oom_kill_task(p, 1);
+ ub_oom_mm_killed(ub);
+ put_beancounter(ub);
return 0;
}
-static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned long points, struct mem_cgroup *mem,
- const char *message)
+int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ struct mem_cgroup *mem, const char *message)
{
struct task_struct *c;
@@ -419,8 +439,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
return 0;
}
- printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n",
- message, task_pid_nr(p), p->comm, points);
+ printk(KERN_ERR "%s: kill process %d (%s) or a child\n",
+ message, task_pid_nr(p), p->comm);
/* Try to kill a child first */
list_for_each_entry(c, &p->children, sibling) {
@@ -449,7 +469,7 @@ retry:
if (!p)
p = current;
- if (oom_kill_process(p, gfp_mask, 0, points, mem,
+ if (oom_kill_process(p, gfp_mask, 0, mem,
"Memory cgroup out of memory"))
goto retry;
out:
@@ -527,31 +547,39 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
static void __out_of_memory(gfp_t gfp_mask, int order)
{
struct task_struct *p;
- unsigned long points;
+ struct user_beancounter *ub = NULL;
if (sysctl_oom_kill_allocating_task)
- if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
+ if (!oom_kill_process(current, gfp_mask, order, NULL,
"Out of memory (oom_kill_allocating_task)"))
return;
retry:
+ put_beancounter(ub);
+
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
- p = select_bad_process(&points, NULL);
+ ub = ub_oom_select_worst();
+ p = select_bad_process(ub, NULL);
if (PTR_ERR(p) == -1UL)
return;
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
+ if (ub != NULL)
+ goto retry;
+
read_unlock(&tasklist_lock);
+ ub_oom_unlock();
panic("Out of memory and no killable processes...\n");
}
- if (oom_kill_process(p, gfp_mask, order, points, NULL,
- "Out of memory"))
+ if (oom_kill_process(p, gfp_mask, order, NULL, "Out of memory"))
goto retry;
+
+ put_beancounter(ub);
}
/*
@@ -577,10 +605,27 @@ void pagefault_out_of_memory(void)
if (sysctl_panic_on_oom)
panic("out of memory from page fault. panic_on_oom is selected.\n");
+ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL)
+ & (NOTIFY_OK | NOTIFY_FAIL))
+ return;
+
+ if (ub_oom_lock())
+ goto rest_and_return;
+
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "%s invoked PF oom-killer: oomkilladj=%d\n",
+ current->comm, current->signal->oom_adj);
+ dump_stack();
+ show_mem();
+ show_slab_info();
+ }
+
read_lock(&tasklist_lock);
__out_of_memory(0, 0); /* unknown gfp_mask and order */
read_unlock(&tasklist_lock);
+ ub_oom_unlock();
+
/*
* Give "p" a good chance of killing itself before we
* retry to allocate memory.
@@ -614,6 +659,23 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
if (sysctl_panic_on_oom == 2)
panic("out of memory. Compulsory panic_on_oom is selected.\n");
+ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL)
+ & (NOTIFY_OK | NOTIFY_FAIL))
+ return;
+
+ if (ub_oom_lock())
+ goto out_oom_lock;
+
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "%s invoked oom-killer: "
+ "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+ current->comm, gfp_mask, order,
+ current->signal->oom_adj);
+ dump_stack();
+ show_mem();
+ show_slab_info();
+ }
+
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
@@ -623,7 +685,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
switch (constraint) {
case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, gfp_mask, order, 0, NULL,
+ oom_kill_process(current, gfp_mask, order, NULL,
"No available memory (MPOL_BIND)");
break;
@@ -637,7 +699,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
}
read_unlock(&tasklist_lock);
+ ub_oom_unlock();
+out_oom_lock:
/*
* Give "p" a good chance of killing itself before we
* retry to allocate memory unless "p" is current
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c5d792..45641b9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -35,6 +35,8 @@
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <bc/io_acct.h>
+
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
@@ -1069,6 +1071,7 @@ int write_one_page(struct page *page, int wait)
} else {
unlock_page(page);
}
+
return ret;
}
EXPORT_SYMBOL(write_one_page);
@@ -1087,14 +1090,15 @@ int __set_page_dirty_no_writeback(struct page *page)
* Helper function for set_page_dirty family.
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+int account_page_dirtied(struct page *page, struct address_space *mapping)
{
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
task_dirty_inc(current);
- task_io_account_write(PAGE_CACHE_SIZE);
+ return 1;
}
+ return 0;
}
/*
@@ -1114,6 +1118,9 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
*/
int __set_page_dirty_nobuffers(struct page *page)
{
+ int acct;
+
+ acct = 0;
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
@@ -1121,16 +1128,19 @@ int __set_page_dirty_nobuffers(struct page *page)
if (!mapping)
return 1;
+ acct = 0;
spin_lock_irq(&mapping->tree_lock);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping);
+ acct = account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irq(&mapping->tree_lock);
+ if (acct)
+ task_io_account_write(page, PAGE_CACHE_SIZE, 0);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1268,6 +1278,7 @@ int clear_page_dirty_for_io(struct page *page)
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_RECLAIMABLE);
+ ub_io_release_context(page, PAGE_CACHE_SIZE);
return 1;
}
return 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 36992b6..cd0501c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,9 @@
#include <asm/div64.h>
#include "internal.h"
+#include <bc/kmem.h>
+#include <bc/io_acct.h>
+
/*
* Array of node states.
*/
@@ -105,6 +108,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
32,
};
+EXPORT_SYMBOL(nr_swap_pages);
EXPORT_SYMBOL(totalram_pages);
static char * const zone_names[MAX_NR_ZONES] = {
@@ -510,6 +514,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
+ ub_io_release_debug(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -601,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
+ ub_page_uncharge(page, order);
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
@@ -1102,6 +1108,7 @@ static void free_hot_cold_page(struct page *page, int cold)
pcp = &zone_pcp(zone, get_cpu())->pcp;
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
+ ub_page_uncharge(page, 0);
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
@@ -1783,6 +1790,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
return alloc_flags;
}
+int alloc_fail_warn;
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -1904,7 +1913,7 @@ rebalance:
}
nopage:
- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
+ if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n",
p->comm, order, gfp_mask);
@@ -1919,6 +1928,29 @@ got_pg:
}
+extern unsigned long cycles_per_jiffy;
+static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
+ struct page *page, cycles_t time)
+{
+#ifdef CONFIG_VE
+ int ind;
+ unsigned long flags;
+
+ time = (jiffies - time) * cycles_per_jiffy;
+ if (!(gfp_mask & __GFP_WAIT))
+ ind = 0;
+ else if (!(gfp_mask & __GFP_HIGHMEM))
+ ind = (order > 0 ? 2 : 1);
+ else
+ ind = (order > 0 ? 4 : 3);
+ spin_lock_irqsave(&kstat_glb_lock, flags);
+ KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time);
+ if (!page)
+ kstat_glob.alloc_fails[ind]++;
+ spin_unlock_irqrestore(&kstat_glb_lock, flags);
+#endif
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -1930,6 +1962,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zone *preferred_zone;
struct page *page;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ cycles_t start;
gfp_mask &= gfp_allowed_mask;
@@ -1953,6 +1986,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (!preferred_zone)
return NULL;
+ start = jiffies;
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
@@ -1962,6 +1996,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ __alloc_collect_stats(gfp_mask, order, page, start);
+ if (page && ub_page_charge(page, order, gfp_mask)) {
+ __free_pages(page, order);
+ page = NULL;
+ }
+
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index dd43373..c9752f0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,9 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+
#include <asm/tlbflush.h>
#include "internal.h"
@@ -133,6 +136,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
}
return 0;
}
+EXPORT_SYMBOL_GPL(anon_vma_prepare);
void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
{
@@ -158,6 +162,7 @@ void anon_vma_link(struct vm_area_struct *vma)
spin_unlock(&anon_vma->lock);
}
}
+EXPORT_SYMBOL_GPL(anon_vma_link);
void anon_vma_unlink(struct vm_area_struct *vma)
{
@@ -189,7 +194,7 @@ static void anon_vma_ctor(void *data)
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, anon_vma_ctor);
}
/*
@@ -215,12 +220,14 @@ out:
rcu_read_unlock();
return NULL;
}
+EXPORT_SYMBOL_GPL(page_lock_anon_vma);
void page_unlock_anon_vma(struct anon_vma *anon_vma)
{
spin_unlock(&anon_vma->lock);
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(page_unlock_anon_vma);
/*
* At what user virtual address is page expected in @vma?
@@ -738,6 +745,12 @@ void page_remove_rmap(struct page *page)
page_clear_dirty(page);
set_page_dirty(page);
}
+ /*
+	 * Well, when a page is unmapped, we cannot keep the PG_checkpointed
+	 * flag: the page is not accessible via process VM and we have no
+	 * way to reset its state.
+ */
+ ClearPageCheckpointed(page);
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -851,6 +864,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
page_remove_rmap(page);
+ ub_unused_privvm_inc(mm, vma);
+ ub_percpu_inc(mm->mm_ub, unmap);
+ pb_remove_ref(page, mm);
page_cache_release(page);
out_unmap:
@@ -966,6 +982,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
set_page_dirty(page);
page_remove_rmap(page);
+ ub_percpu_inc(mm->mm_ub, unmap);
+ pb_remove_ref(page, mm);
+ ub_unused_privvm_inc(mm, vma);
page_cache_release(page);
dec_mm_counter(mm, file_rss);
(*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index 356dd99..bc74e50 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,7 +31,11 @@
#include <linux/swap.h>
#include <linux/ima.h>
+#ifdef CONFIG_VE
+#define shm_mnt (get_exec_env()->shmem_mnt)
+#else
static struct vfsmount *shm_mnt;
+#endif
#ifdef CONFIG_SHMEM
/*
@@ -60,6 +64,8 @@ static struct vfsmount *shm_mnt;
#include <linux/seq_file.h>
#include <linux/magic.h>
+#include <bc/vmpages.h>
+
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>
@@ -107,14 +113,31 @@ enum sgp_type {
};
#ifdef CONFIG_TMPFS
+
+#include <linux/virtinfo.h>
+
+static unsigned long tmpfs_ram_pages(void)
+{
+ struct meminfo mi;
+
+ if (ve_is_super(get_exec_env()))
+ return totalram_pages;
+
+ memset(&mi, 0, sizeof(mi));
+ si_meminfo(&mi.si);
+ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) & NOTIFY_FAIL)
+ return 0;
+ return mi.si.totalram;
+}
+
static unsigned long shmem_default_max_blocks(void)
{
- return totalram_pages / 2;
+ return tmpfs_ram_pages() / 2;
}
static unsigned long shmem_default_max_inodes(void)
{
- return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+ return min(totalram_pages - totalhigh_pages, tmpfs_ram_pages() / 2);
}
#endif
@@ -214,7 +237,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
-static const struct file_operations shmem_file_operations;
+const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
@@ -277,7 +300,7 @@ static void shmem_free_inode(struct super_block *sb)
*
* It has to be called with the spinlock held.
*/
-static void shmem_recalc_inode(struct inode *inode)
+static void shmem_recalc_inode(struct inode *inode, long swp_freed)
{
struct shmem_inode_info *info = SHMEM_I(inode);
long freed;
@@ -287,6 +310,8 @@ static void shmem_recalc_inode(struct inode *inode)
info->alloced -= freed;
shmem_unacct_blocks(info->flags, freed);
shmem_free_blocks(inode, freed);
+ if (freed > swp_freed)
+ ub_tmpfs_respages_sub(info, freed - swp_freed);
}
}
@@ -391,6 +416,11 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
struct page *page = kmap_atomic_to_page(entry);
set_page_private(page, page_private(page) + incdec);
}
+
+ if (incdec == 1)
+ ub_tmpfs_respages_dec(info);
+ else
+ ub_tmpfs_respages_inc(info);
}
/**
@@ -407,14 +437,24 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct page *page = NULL;
swp_entry_t *entry;
+ unsigned long ub_val;
if (sgp != SGP_WRITE &&
((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return ERR_PTR(-EINVAL);
+ ub_val = 0;
+ if (info->next_index <= index) {
+ ub_val = index + 1 - info->next_index;
+ if (ub_shmpages_charge(info, ub_val))
+ return ERR_PTR(-ENOSPC);
+ }
+
while (!(entry = shmem_swp_entry(info, index, &page))) {
- if (sgp == SGP_READ)
- return shmem_swp_map(ZERO_PAGE(0));
+ if (sgp == SGP_READ) {
+ entry = shmem_swp_map(ZERO_PAGE(0));
+ goto out;
+ }
/*
* Test free_blocks against 1 not 0, since we have 1 data
* page (and perhaps indirect index pages) yet to allocate:
@@ -424,7 +464,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks <= 1) {
spin_unlock(&sbinfo->stat_lock);
- return ERR_PTR(-ENOSPC);
+ entry = ERR_PTR(-ENOSPC);
+ goto out;
}
sbinfo->free_blocks--;
inode->i_blocks += BLOCKS_PER_PAGE;
@@ -432,31 +473,43 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
}
spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) |
+ __GFP_UBC);
if (page)
set_page_private(page, 0);
spin_lock(&info->lock);
if (!page) {
- shmem_free_blocks(inode, 1);
- return ERR_PTR(-ENOMEM);
+ entry = ERR_PTR(-ENOMEM);
+ goto out_block;
}
if (sgp != SGP_WRITE &&
((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
entry = ERR_PTR(-EINVAL);
- break;
+ goto out_dir;
}
- if (info->next_index <= index)
+ if (info->next_index <= index) {
+ ub_val = 0;
info->next_index = index + 1;
+ }
}
if (page) {
/* another task gave its page, or truncated the file */
shmem_free_blocks(inode, 1);
shmem_dir_free(page);
}
- if (info->next_index <= index && !IS_ERR(entry))
+ if (info->next_index <= index)
info->next_index = index + 1;
return entry;
+
+out_dir:
+ shmem_dir_free(page);
+out_block:
+ shmem_free_blocks(inode, 1);
+out:
+ if (ub_val)
+ ub_shmpages_uncharge(info, ub_val);
+ return entry;
}
/**
@@ -564,6 +617,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
return;
spin_lock(&info->lock);
+ ub_shmpages_uncharge(info, info->next_index - idx);
info->flags |= SHMEM_TRUNCATE;
if (likely(end == (loff_t) -1)) {
limit = info->next_index;
@@ -750,7 +804,7 @@ done2:
info->swapped -= nr_swaps_freed;
if (nr_pages_to_free)
shmem_free_blocks(inode, nr_pages_to_free);
- shmem_recalc_inode(inode);
+ shmem_recalc_inode(inode, nr_swaps_freed);
spin_unlock(&info->lock);
/*
@@ -833,6 +887,7 @@ static void shmem_delete_inode(struct inode *inode)
}
}
BUG_ON(inode->i_blocks);
+ shmi_ub_put(info);
shmem_free_inode(inode->i_sb);
clear_inode(inode);
}
@@ -1020,6 +1075,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
out: return found; /* 0 or 1 or -ENOMEM */
}
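+
+/*
+ * Pick a swap entry on behalf of the beancounter that owns this shmem
+ * inode; without CONFIG_BEANCOUNTERS no beancounter is passed and the
+ * entry comes from the global pool.
+ */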
+#ifdef CONFIG_BEANCOUNTERS
+#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub))
+#else
+#define shm_get_swap_page(info) (get_swap_page(NULL))
+#endif
+
/*
* Move the page from the page cache to the swap cache.
*/
@@ -1051,7 +1112,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
* discarded.
*/
if (wbc->for_reclaim)
- swap = get_swap_page();
+ swap = shm_get_swap_page(info);
else
swap.val = 0;
@@ -1069,7 +1130,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
free_swap_and_cache(*entry);
shmem_swp_set(info, entry, 0);
}
- shmem_recalc_inode(inode);
+ shmem_recalc_inode(inode, 0);
if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
remove_from_page_cache(page);
@@ -1252,7 +1313,7 @@ repeat:
}
spin_lock(&info->lock);
- shmem_recalc_inode(inode);
+ shmem_recalc_inode(inode, 0);
entry = shmem_swp_alloc(info, idx, sgp);
if (IS_ERR(entry)) {
spin_unlock(&info->lock);
@@ -1455,6 +1516,7 @@ repeat:
clear_highpage(filepage);
flush_dcache_page(filepage);
SetPageUptodate(filepage);
+ ub_tmpfs_respages_inc(info);
if (sgp == SGP_DIRTY)
set_page_dirty(filepage);
}
@@ -1512,20 +1574,27 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
spin_lock(&info->lock);
if (lock && !(info->flags & VM_LOCKED)) {
+ if (ub_lockedshm_charge(info, inode->i_size) < 0)
+ goto out_ch;
+
if (!user_shm_lock(inode->i_size, user))
goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
if (!lock && (info->flags & VM_LOCKED) && user) {
+ ub_lockedshm_uncharge(info, inode->i_size);
user_shm_unlock(inode->i_size, user);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
scan_mapping_unevictable_pages(file->f_mapping);
}
- retval = 0;
+ spin_unlock(&info->lock);
+ return 0;
out_nomem:
+ ub_lockedshm_uncharge(info, inode->i_size);
+out_ch:
spin_unlock(&info->lock);
return retval;
}
@@ -1559,6 +1628,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
inode->i_generation = get_seconds();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
+ shmi_ub_set(info, get_exec_ub());
spin_lock_init(&info->lock);
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
@@ -2182,7 +2252,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
size = memparse(value,&rest);
if (*rest == '%') {
size <<= PAGE_SHIFT;
- size *= totalram_pages;
+ size *= tmpfs_ram_pages();
do_div(size, 100);
rest++;
}
@@ -2424,7 +2494,7 @@ static const struct address_space_operations shmem_aops = {
.error_remove_page = generic_error_remove_page,
};
-static const struct file_operations shmem_file_operations = {
+const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = generic_file_llseek,
@@ -2437,6 +2507,7 @@ static const struct file_operations shmem_file_operations = {
.splice_write = generic_file_splice_write,
#endif
};
+EXPORT_SYMBOL_GPL(shmem_file_operations);
static const struct inode_operations shmem_inode_operations = {
.truncate = shmem_truncate,
@@ -2506,6 +2577,10 @@ static const struct vm_operations_struct shmem_vm_ops = {
#endif
};
+int is_shmem_mapping(struct address_space *map)
+{
+ return (map != NULL && map->a_ops == &shmem_aops);
+}
static int shmem_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
@@ -2513,12 +2588,13 @@ static int shmem_get_sb(struct file_system_type *fs_type,
return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
}
-static struct file_system_type tmpfs_fs_type = {
+struct file_system_type tmpfs_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
.get_sb = shmem_get_sb,
.kill_sb = kill_litter_super,
};
+EXPORT_SYMBOL(tmpfs_fs_type);
int __init init_tmpfs(void)
{
@@ -2608,6 +2684,36 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
/* common code */
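+
+/*
+ * Charge the shmem pages for the whole file up front: allocating the
+ * swp entry for the last page makes shmem_swp_alloc() run the UBC
+ * checks for everything up to i_size before the file is handed out.
+ */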
+static inline int shm_charge_ahead(struct inode *inode)
+{
+#ifdef CONFIG_BEANCOUNTERS
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned long idx;
+ swp_entry_t *entry;
+
+ if (!inode->i_size)
+ return 0;
+ idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	/*
+	 * Just touch info to allocate space for the entry and
+	 * perform all the UBC checks
+	 */
+ spin_lock(&info->lock);
+ entry = shmem_swp_alloc(info, idx, SGP_CACHE);
+ if (IS_ERR(entry))
+ goto err;
+ shmem_swp_unmap(entry);
+ spin_unlock(&info->lock);
+ return 0;
+
+err:
+ spin_unlock(&info->lock);
+ return PTR_ERR(entry);
+#else
+ return 0;
+#endif
+}
+
/**
* shmem_file_setup - get an unlinked file living in tmpfs
* @name: name for dentry (to be seen in /proc/<pid>/maps
@@ -2653,6 +2759,9 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0; /* It is unlinked */
+ error = shm_charge_ahead(inode);
+ if (error)
+ goto close_file;
init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
&shmem_file_operations);
@@ -2689,6 +2798,8 @@ int shmem_zero_setup(struct vm_area_struct *vma)
if (vma->vm_file)
fput(vma->vm_file);
+ else if (vma->vm_flags & VM_WRITE)
+ __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
return 0;
diff --git a/mm/slab.c b/mm/slab.c
index 5d1a782..f23819e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,30 +115,14 @@
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
+#include <linux/nmi.h>
+#include <linux/vzstat.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
-/*
- * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
- * 0 for faster, smaller code (especially in the critical paths).
- *
- * STATS - 1 to collect stats for /proc/slabinfo.
- * 0 for faster, smaller code (especially in the critical paths).
- *
- * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
- */
-
-#ifdef CONFIG_DEBUG_SLAB
-#define DEBUG 1
-#define STATS 1
-#define FORCED_DEBUG 1
-#else
-#define DEBUG 0
-#define STATS 0
-#define FORCED_DEBUG 0
-#endif
+#include <bc/kmem.h>
/* Shouldn't this be in a header file somewhere? */
#define BYTES_PER_WORD sizeof(void *)
@@ -173,19 +157,21 @@
#endif
/* Legal flag mask for kmem_cache_create(). */
-#if DEBUG
+#if SLAB_DEBUG
# define CREATE_MASK (SLAB_RED_ZONE | \
SLAB_POISON | SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_STORE_USER | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_UBC | SLAB_NO_CHARGE | \
SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#else
# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
SLAB_CACHE_DMA | \
SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
+ SLAB_UBC | SLAB_NO_CHARGE | \
SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
#endif
@@ -389,12 +375,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
#define REAPTIMEOUT_CPUC (2*HZ)
#define REAPTIMEOUT_LIST3 (4*HZ)
-#if STATS
+#if SLAB_STATS
+#define STATS_INC_GROWN(x) ((x)->grown++)
+#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
+#define STATS_INC_SHRUNK(x) ((x)->shrunk++)
+
#define STATS_INC_ACTIVE(x) ((x)->num_active++)
#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
-#define STATS_INC_GROWN(x) ((x)->grown++)
-#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
#define STATS_SET_HIGH(x) \
do { \
if ((x)->num_active > (x)->high_mark) \
@@ -414,11 +402,12 @@ static void kmem_list3_init(struct kmem_list3 *parent)
#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
#else
+#define STATS_INC_GROWN(x) do { } while (0)
+#define STATS_ADD_REAPED(x,y) do { } while (0)
+#define STATS_INC_SHRUNK(x) do { } while (0)
#define STATS_INC_ACTIVE(x) do { } while (0)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
-#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -431,7 +420,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
#define STATS_INC_FREEMISS(x) do { } while (0)
#endif
-#if DEBUG
+#if SLAB_DEBUG
/*
* memory layout of objects:
@@ -571,6 +560,8 @@ struct cache_sizes malloc_sizes[] = {
#define CACHE(x) { .cs_size = (x) },
#include <linux/kmalloc_sizes.h>
CACHE(ULONG_MAX)
+#include <linux/kmalloc_sizes.h>
+ CACHE(ULONG_MAX)
#undef CACHE
};
EXPORT_SYMBOL(malloc_sizes);
@@ -584,10 +575,17 @@ struct cache_names {
static struct cache_names __initdata cache_names[] = {
#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
#include <linux/kmalloc_sizes.h>
+ {NULL,},
+#undef CACHE
+#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" },
+#include <linux/kmalloc_sizes.h>
{NULL,}
#undef CACHE
};
+int malloc_cache_num;
+EXPORT_SYMBOL(malloc_cache_num);
+
static struct arraycache_init initarray_cache __initdata =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
@@ -663,7 +661,8 @@ static inline void init_lock_keys(void)
* Guard access to the cache-chain.
*/
static DEFINE_MUTEX(cache_chain_mutex);
-static struct list_head cache_chain;
+static LIST_HEAD(cache_chain);
+static DEFINE_SPINLOCK(cache_chain_lock);
/*
* chicken and egg problem: delay the per-cpu array allocation
@@ -697,7 +696,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
{
struct cache_sizes *csizep = malloc_sizes;
-#if DEBUG
+ if (gfpflags & __GFP_UBC)
+ csizep += malloc_cache_num;
+#if SLAB_DEBUG
/* This happens if someone tries to call
* kmem_cache_create(), or __kmalloc(), before
* the generic caches are initialized.
@@ -727,9 +728,102 @@ static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
return __find_general_cachep(size, gfpflags);
}
-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+{
+ return (kmem_bufctl_t *) (slabp + 1);
+}
+
+#ifdef CONFIG_BEANCOUNTERS
+#define init_slab_ubps(cachep, slabp) do { \
+ if (!((cachep)->flags & SLAB_UBC)) \
+ break; \
+ memset(slab_ubcs(cachep, slabp), 0, \
+ (cachep)->num * sizeof(void *)); \
+ } while (0)
+
+#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1)
+#define UB_EXTRA(flags) (flags & SLAB_UBC ? sizeof(void *) : 0)
+#define set_cache_objuse(cachep) do { \
+ (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \
+ (cachep)->num - 1) / (cachep)->num; \
+ if (!OFF_SLAB(cachep)) \
+ break; \
+ (cachep)->objuse += ((cachep)->slabp_cache->objuse + \
+ (cachep)->num - 1) / (cachep)->num; \
+ } while (0)
+
+void kmem_mark_nocharge(struct kmem_cache *cachep)
+{
+ cachep->flags |= SLAB_NO_CHARGE;
+}
+
+int kmem_cache_objuse(struct kmem_cache *cachep)
+{
+ return cachep->objuse;
+}
+
+EXPORT_SYMBOL(kmem_cache_objuse);
+
+int kmem_obj_objuse(void *obj)
+{
+ return virt_to_cache(obj)->objuse;
+}
+
+int kmem_dname_objuse(void *obj)
+{
+ return virt_to_cache(obj)->objuse;
+}
+
+unsigned long ub_cache_growth(struct kmem_cache *cachep)
+{
+#if SLAB_STATS
+ return (cachep->grown - cachep->reaped - cachep->shrunk)
+ << cachep->gfporder;
+#else
+ return 0;
+#endif
+}
+
+#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\
+ (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\
+ sizeof(void *))))
+
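+/*
+ * Return the location of the user_beancounter pointer kept for @obj:
+ * the UB pointers are stored right after the bufctl array of the slab
+ * that holds the object.
+ */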
+struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj)
+{
+ struct slab *slabp;
+ int objnr;
+
+ BUG_ON(!(cachep->flags & SLAB_UBC));
+ slabp = virt_to_slab(obj);
+ objnr = (obj - slabp->s_mem) / cachep->buffer_size;
+ return slab_ubcs(cachep, slabp) + objnr;
+}
+
+struct user_beancounter *slab_ub(void *obj)
+{
+ return *ub_slab_ptr(virt_to_cache(obj), obj);
+}
+
+EXPORT_SYMBOL(slab_ub);
+
+#else
+#define UB_ALIGN(flags) 1
+#define UB_EXTRA(flags) 0
+#define set_cache_objuse(c) do { } while (0)
+#define init_slab_ubps(c, s) do { } while (0)
+#endif
+
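+/*
+ * Management area of a slab: struct slab, the bufctl array and, for
+ * SLAB_UBC caches, one pointer-aligned user_beancounter slot per object.
+ */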
+static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags)
+{
+ size_t size_noub;
+
+ size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t);
+ return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags);
+}
+
+static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags)
{
- return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
+ return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align);
}
/*
@@ -774,20 +868,23 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* into account.
*/
nr_objs = (slab_size - sizeof(struct slab)) /
- (buffer_size + sizeof(kmem_bufctl_t));
+ (buffer_size + sizeof(kmem_bufctl_t) +
+ UB_EXTRA(flags));
/*
* This calculated number will be either the right
* amount, or one greater than what we want.
*/
- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
- > slab_size)
+ if (slab_mgmt_size(nr_objs, align, flags) +
+ nr_objs * buffer_size > slab_size)
nr_objs--;
+ BUG_ON(slab_mgmt_size(nr_objs, align, flags) +
+ nr_objs * buffer_size > slab_size);
if (nr_objs > SLAB_LIMIT)
nr_objs = SLAB_LIMIT;
- mgmt_size = slab_mgmt_size(nr_objs, align);
+ mgmt_size = slab_mgmt_size(nr_objs, align, flags);
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -1338,6 +1435,7 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
MAKE_ALL_LISTS(cachep, ptr, nodeid);
cachep->nodelists[nodeid] = ptr;
}
+static int offslab_limit;
/*
* For setting up all the kmem_list3s for cache whose buffer_size is same as
@@ -1408,7 +1506,6 @@ void __init kmem_cache_init(void)
node = numa_node_id();
/* 1) create the cache_cache */
- INIT_LIST_HEAD(&cache_chain);
list_add(&cache_cache.next, &cache_chain);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
@@ -1420,7 +1517,7 @@ void __init kmem_cache_init(void)
*/
cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
nr_node_ids * sizeof(struct kmem_list3 *);
-#if DEBUG
+#if SLAB_DEBUG
cache_cache.obj_size = cache_cache.buffer_size;
#endif
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
@@ -1467,6 +1564,7 @@ void __init kmem_cache_init(void)
slab_early_init = 0;
+ for (i = 0; i < 2; i++) {
while (sizes->cs_size != ULONG_MAX) {
/*
* For performance, all the general caches are L1 aligned.
@@ -1479,21 +1577,30 @@ void __init kmem_cache_init(void)
sizes->cs_cachep = kmem_cache_create(names->name,
sizes->cs_size,
ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+ ARCH_KMALLOC_FLAGS|SLAB_PANIC|
+ (i ? SLAB_UBC : 0)|SLAB_NO_CHARGE,
NULL);
}
+ if (!(OFF_SLAB(sizes->cs_cachep)))
+ offslab_limit = sizes->cs_size;
#ifdef CONFIG_ZONE_DMA
- sizes->cs_dmacachep = kmem_cache_create(
- names->name_dma,
+ sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
sizes->cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
+ (i ? SLAB_UBC : 0) | SLAB_NO_CHARGE|
SLAB_PANIC,
NULL);
#endif
sizes++;
names++;
}
+
+ sizes++;
+ names++;
+ if (!i)
+ malloc_cache_num = sizes - malloc_sizes;
+ }
/* 4) Replace the bootstrap head arrays */
{
struct array_cache *ptr;
@@ -1674,7 +1781,7 @@ static void kmem_rcu_free(struct rcu_head *head)
kmem_cache_free(cachep->slabp_cache, slab_rcu);
}
-#if DEBUG
+#if SLAB_DEBUG
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1751,7 +1858,7 @@ static void dump_line(char *data, int offset, int limit)
}
#endif
-#if DEBUG
+#if SLAB_DEBUG
static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
{
@@ -1844,7 +1951,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
}
#endif
-#if DEBUG
+#if SLAB_DEBUG
static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
int i;
@@ -1944,7 +2051,6 @@ static void __kmem_cache_destroy(struct kmem_cache *cachep)
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
- unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
@@ -1957,15 +2063,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
continue;
if (flags & CFLGS_OFF_SLAB) {
- /*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
- */
- offslab_limit = size - sizeof(struct slab);
- offslab_limit /= sizeof(kmem_bufctl_t);
+ int slab_size;
- if (num > offslab_limit)
+ slab_size = slab_mgmt_size_noalign(num, flags);
+ if (slab_size > offslab_limit)
break;
}
@@ -2133,9 +2234,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
}
-#if DEBUG
+#if SLAB_DEBUG
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
-#if FORCED_DEBUG
+#if SLAB_FORCED_DEBUG
/*
* Enable redzoning and last user accounting, except for caches with
* large objects, if the increased size would increase the object size
@@ -2225,7 +2326,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (!cachep)
goto oops;
-#if DEBUG
+#if SLAB_DEBUG
cachep->obj_size = size;
/*
@@ -2247,7 +2348,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
else
size += BYTES_PER_WORD;
}
-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
+#if SLAB_FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
&& cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
cachep->obj_offset += PAGE_SIZE - size;
@@ -2279,8 +2380,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep = NULL;
goto oops;
}
- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
- + sizeof(struct slab), align);
+ slab_size = slab_mgmt_size(cachep->num, align, flags);
/*
* If the slab has been placed off-slab, and we have enough space then
@@ -2293,8 +2393,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- slab_size =
- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
+ slab_size = slab_mgmt_size_noalign(cachep->num, flags);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
@@ -2340,7 +2439,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
/* cache setup completed, link it into the list */
+ spin_lock(&cache_chain_lock);
list_add(&cachep->next, &cache_chain);
+ spin_unlock(&cache_chain_lock);
+ set_cache_objuse(cachep);
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
@@ -2353,7 +2455,7 @@ oops:
}
EXPORT_SYMBOL(kmem_cache_create);
-#if DEBUG
+#if SLAB_DEBUG
static void check_irq_off(void)
{
BUG_ON(!irqs_disabled());
@@ -2449,10 +2551,11 @@ static int drain_freelist(struct kmem_cache *cache,
}
slabp = list_entry(p, struct slab, list);
-#if DEBUG
+#if SLAB_DEBUG
BUG_ON(slabp->inuse);
#endif
list_del(&slabp->list);
+ STATS_INC_SHRUNK(cache);
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
@@ -2535,10 +2638,14 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
/*
* the chain is never empty, cache_cache is never destroyed
*/
+ spin_lock(&cache_chain_lock);
list_del(&cachep->next);
+ spin_unlock(&cache_chain_lock);
if (__cache_shrink(cachep)) {
slab_error(cachep, "Can't free all objects");
+ spin_lock(&cache_chain_lock);
list_add(&cachep->next, &cache_chain);
+ spin_unlock(&cache_chain_lock);
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
return;
@@ -2547,6 +2654,8 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
rcu_barrier();
+
+ ub_kmemcache_free(cachep);
__kmem_cache_destroy(cachep);
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
@@ -2573,7 +2682,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
slabp = kmem_cache_alloc_node(cachep->slabp_cache,
- local_flags, nodeid);
+ (local_flags & ~__GFP_UBC), nodeid);
/*
* If the first object in the slab is leaked (it's allocated
* but no one has a reference to it), we want to make sure
@@ -2593,14 +2702,10 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
slabp->s_mem = objp + colour_off;
slabp->nodeid = nodeid;
slabp->free = 0;
+ init_slab_ubps(cachep, slabp);
return slabp;
}
-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
-{
- return (kmem_bufctl_t *) (slabp + 1);
-}
-
static void cache_init_objs(struct kmem_cache *cachep,
struct slab *slabp)
{
@@ -2608,7 +2713,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, slabp, i);
-#if DEBUG
+#if SLAB_DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
poison_obj(cachep, objp, POISON_FREE);
@@ -2666,7 +2771,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
slabp->inuse++;
next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
+#if SLAB_DEBUG
slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
WARN_ON(slabp->nodeid != nodeid);
#endif
@@ -2680,7 +2785,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
{
unsigned int objnr = obj_to_index(cachep, slabp, objp);
-#if DEBUG
+#if SLAB_DEBUG
/* Verify that the slab belongs to the intended node */
WARN_ON(slabp->nodeid != nodeid);
@@ -2768,7 +2873,7 @@ static int cache_grow(struct kmem_cache *cachep,
* 'nodeid'.
*/
if (!objp)
- objp = kmem_getpages(cachep, local_flags, nodeid);
+ objp = kmem_getpages(cachep, local_flags & ~__GFP_UBC, nodeid);
if (!objp)
goto failed;
@@ -2801,7 +2906,7 @@ failed:
return 0;
}
-#if DEBUG
+#if SLAB_DEBUG
/*
* Perform extra freeing checks:
@@ -3014,12 +3119,12 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
gfp_t flags)
{
might_sleep_if(flags & __GFP_WAIT);
-#if DEBUG
+#if SLAB_DEBUG
kmem_flagcheck(cachep, flags);
#endif
}
-#if DEBUG
+#if SLAB_DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, void *caller)
{
@@ -3389,11 +3494,16 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
- local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
flags);
prefetchw(objp);
+ if (objp && should_charge(cachep->flags, flags) &&
+ ub_slab_charge(cachep, objp, flags)) {
+ kmem_cache_free(cachep, objp);
+ objp = NULL;
+ }
+ local_irq_restore(save_flags);
if (likely(objp))
kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
@@ -3430,6 +3540,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
/* fixup slab chains */
if (slabp->inuse == 0) {
if (l3->free_objects > l3->free_limit) {
+ STATS_INC_SHRUNK(cachep);
l3->free_objects -= cachep->num;
/* No need to drop any previously held
* lock here, even if we have a off-slab slab
@@ -3458,7 +3569,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
int node = numa_node_id();
batchcount = ac->batchcount;
-#if DEBUG
+#if SLAB_DEBUG
BUG_ON(!batchcount || batchcount > ac->avail);
#endif
check_irq_off();
@@ -3479,7 +3590,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
free_block(cachep, ac->entry, batchcount, node);
free_done:
-#if STATS
+#if SLAB_STATS
{
int i = 0;
struct list_head *p;
@@ -3516,6 +3627,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
kmemcheck_slab_free(cachep, objp, obj_size(cachep));
+ if (should_uncharge(cachep->flags))
+ ub_slab_uncharge(cachep, objp);
+
/*
* Skip calling cache_free_alien() when the platform is not numa.
* This will avoid cache misses that happen while accessing slabp (which
@@ -3970,7 +4084,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
shared = 8;
-#if DEBUG
+#if SLAB_DEBUG
/*
* With debugging enabled, large batchcount lead to excessively long
* periods with disabled local interrupts. Limit the batchcount
@@ -4037,6 +4151,7 @@ static void cache_reap(struct work_struct *w)
/* Give up. Setup the next iteration. */
goto out;
+ {KSTAT_PERF_ENTER(cache_reap)
list_for_each_entry(searchp, &cache_chain, next) {
check_irq_on();
@@ -4077,6 +4192,7 @@ next:
check_irq_on();
mutex_unlock(&cache_chain_mutex);
next_reap_node();
+ KSTAT_PERF_LEAVE(cache_reap)}
out:
/* Set up the next iteration */
schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
@@ -4090,7 +4206,7 @@ static void print_slabinfo_header(struct seq_file *m)
* Output format version, so at least we can change it
* without _too_ many complaints.
*/
-#if STATS
+#if SLAB_STATS
seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
#else
seq_puts(m, "slabinfo - version: 2.1\n");
@@ -4099,14 +4215,82 @@ static void print_slabinfo_header(struct seq_file *m)
"<objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
-#if STATS
+#if SLAB_STATS
seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow> <shrunk>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
}
+#define SHOW_TOP_SLABS 10
+
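+/*
+ * Total footprint of a cache in bytes: every slab on the full, partial
+ * and free lists plus the off-slab management objects, if any.
+ */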
+static unsigned long get_cache_size(struct kmem_cache *cachep)
+{
+ unsigned long flags;
+ unsigned long slabs;
+ struct kmem_list3 *l3;
+ struct list_head *lh;
+ int node;
+
+ slabs = 0;
+
+ for_each_online_node (node) {
+ l3 = cachep->nodelists[node];
+ if (l3 == NULL)
+ continue;
+
+ spin_lock_irqsave(&l3->list_lock, flags);
+ list_for_each (lh, &l3->slabs_full)
+ slabs++;
+ list_for_each (lh, &l3->slabs_partial)
+ slabs++;
+ list_for_each (lh, &l3->slabs_free)
+ slabs++;
+ spin_unlock_irqrestore(&l3->list_lock, flags);
+ }
+
+ return slabs * (PAGE_SIZE << cachep->gfporder) +
+ (OFF_SLAB(cachep) ?
+ cachep->slabp_cache->buffer_size * slabs : 0);
+}
+
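+/*
+ * Print the SHOW_TOP_SLABS caches with the largest footprint to the
+ * kernel log; handy when hunting kernel memory hogs.
+ */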
+void show_slab_info(void)
+{
+ int i, j;
+ unsigned long size;
+ struct kmem_cache *ptr;
+ unsigned long sizes[SHOW_TOP_SLABS];
+ struct kmem_cache *top[SHOW_TOP_SLABS];
+
+ memset(top, 0, sizeof(top));
+ memset(sizes, 0, sizeof(sizes));
+
+ printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+ spin_lock(&cache_chain_lock);
+ list_for_each_entry (ptr, &cache_chain, next) {
+ size = get_cache_size(ptr);
+
+ j = 0;
+ for (i = 1; i < SHOW_TOP_SLABS; i++)
+ if (sizes[i] < sizes[j])
+ j = i;
+
+ if (size > sizes[j]) {
+ sizes[j] = size;
+ top[j] = ptr;
+ }
+ }
+
+ for (i = 0; i < SHOW_TOP_SLABS; i++)
+ if (top[i])
+ printk("%-21s: size %10lu objsize %10u\n",
+ top[i]->name, sizes[i],
+ top[i]->buffer_size);
+ spin_unlock(&cache_chain_lock);
+}
+
static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
@@ -4185,19 +4369,20 @@ static int s_show(struct seq_file *m, void *p)
if (error)
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
- seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+ seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d",
name, active_objs, num_objs, cachep->buffer_size,
cachep->num, (1 << cachep->gfporder));
seq_printf(m, " : tunables %4u %4u %4u",
cachep->limit, cachep->batchcount, cachep->shared);
seq_printf(m, " : slabdata %6lu %6lu %6lu",
active_slabs, num_slabs, shared_avail);
-#if STATS
+#if SLAB_STATS
{ /* list3 stats */
unsigned long high = cachep->high_mark;
unsigned long allocs = cachep->num_allocations;
unsigned long grown = cachep->grown;
unsigned long reaped = cachep->reaped;
+ unsigned long shrunk = cachep->shrunk;
unsigned long errors = cachep->errors;
unsigned long max_freeable = cachep->max_freeable;
unsigned long node_allocs = cachep->node_allocs;
@@ -4205,9 +4390,10 @@ static int s_show(struct seq_file *m, void *p)
unsigned long overflows = cachep->node_overflow;
seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
- %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
+ %4lu %4lu %4lu %4lu %4lu %4lu",
+ allocs, high, grown,
reaped, errors, max_freeable, node_allocs,
- node_frees, overflows);
+ node_frees, overflows, shrunk);
}
/* cpu stats */
{
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc7..22ae4a8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -29,6 +29,8 @@
#include <linux/math64.h>
#include <linux/fault-inject.h>
+#include <bc/kmem.h>
+
/*
* Lock order:
* 1. slab_lock(page)
@@ -149,9 +151,11 @@
/*
* Set of flags that will prevent slab merging
+ *
 * FIXME - work out how to allow merging of accountable (SLAB_UBC) caches
*/
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
+ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | SLAB_UBC)
#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -201,6 +205,8 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
+static DEFINE_SPINLOCK(cache_chain_lock);
+
#ifdef CONFIG_SLUB_DEBUG
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
@@ -321,6 +327,90 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
return x.x & OO_MASK;
}
+#ifdef CONFIG_BEANCOUNTERS
+static inline void inc_cache_grown(struct kmem_cache *s)
+{
+ atomic_inc(&s->grown);
+}
+
+static inline void dec_cache_grown(struct kmem_cache *s)
+{
+ atomic_dec(&s->grown);
+}
+
+unsigned long ub_cache_growth(struct kmem_cache *cachep)
+{
+ return atomic_read(&cachep->grown) << cachep->oo.x; /* XXX huh? */
+}
+
+static void __flush_cpu_slab(struct kmem_cache *s, int cpu);
+
+int kmem_cache_objuse(struct kmem_cache *cachep)
+{
+ return cachep->objuse;
+}
+
+EXPORT_SYMBOL(kmem_cache_objuse);
+
+int kmem_obj_objuse(void *obj)
+{
+ return kmem_cache_objuse(virt_to_head_page(obj)->slab);
+}
+
+EXPORT_SYMBOL(kmem_obj_objuse);
+
+int kmem_dname_objuse(void *obj)
+{
+ struct kmem_cache *s;
+
+ /*
+ * Allocations larger than PAGE_SIZE/2 go directly through
+ * __get_free_pages() and aren't associated with any cache.
+ */
+ s = virt_to_head_page(obj)->slab;
+ if (!s)
+ return PAGE_SIZE;
+ return kmem_cache_objuse(s);
+}
+
+#define page_ubs(pg) (pg->bc.slub_ubs)
+
+struct user_beancounter **ub_slab_ptr(struct kmem_cache *s, void *obj)
+{
+ struct page *pg;
+
+ BUG_ON(!(s->flags & SLAB_UBC));
+ pg = virt_to_head_page(obj);
+ return page_ubs(pg) + slab_index(obj, s, page_address(pg));
+}
+
+EXPORT_SYMBOL(ub_slab_ptr);
+
+struct user_beancounter *slab_ub(void *obj)
+{
+ struct page *pg;
+
+ pg = virt_to_head_page(obj);
+ BUG_ON(!(pg->slab->flags & SLAB_UBC));
+ return page_ubs(pg)[slab_index(obj, pg->slab, page_address(pg))];
+}
+
+EXPORT_SYMBOL(slab_ub);
+
+void kmem_mark_nocharge(struct kmem_cache *cachep)
+{
+ cachep->flags |= SLAB_NO_CHARGE;
+}
+#else
+static inline void inc_cache_grown(struct kmem_cache *s)
+{
+}
+
+static inline void dec_cache_grown(struct kmem_cache *s)
+{
+}
+#endif
+
#ifdef CONFIG_SLUB_DEBUG
/*
* Debug settings:
@@ -1105,6 +1195,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
+ flags &= ~__GFP_UBC;
flags |= s->allocflags;
/*
@@ -1149,9 +1240,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1 << oo_order(oo));
+ inc_cache_grown(s);
return page;
}
+static void __free_slab(struct kmem_cache *s, struct page *page);
+
static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
@@ -1174,6 +1268,18 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
goto out;
+#ifdef CONFIG_BEANCOUNTERS
+ if (s->flags & SLAB_UBC) {
+ BUG_ON(page_ubs(page) != NULL);
+ page_ubs(page) = kzalloc(page->objects * sizeof(void *),
+ flags & ~__GFP_UBC);
+ if (page_ubs(page) == NULL) {
+ __free_slab(s, page);
+ page = NULL;
+ goto out;
+ }
+ }
+#endif
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
page->flags |= 1 << PG_slab;
@@ -1225,6 +1331,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlab(page);
reset_page_mapcount(page);
+#ifdef CONFIG_BEANCOUNTERS
+ if (page_ubs(page) != NULL) {
+ BUG_ON(!(s->flags & SLAB_UBC));
+ kfree(page_ubs(page));
+ page_ubs(page) = NULL;
+ }
+#endif
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
__free_pages(page, order);
@@ -1249,6 +1362,8 @@ static void free_slab(struct kmem_cache *s, struct page *page)
call_rcu(head, rcu_free_slab);
} else
__free_slab(s, page);
+
+ dec_cache_grown(s);
}
static void discard_slab(struct kmem_cache *s, struct page *page)
@@ -1733,6 +1848,13 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
c->freelist = object[c->offset];
stat(c, ALLOC_FASTPATH);
}
+
+ if (object && should_charge(s->flags, gfpflags) &&
+ ub_slab_charge(s, object, gfpflags)) {
+ kmem_cache_free(s, object);
+ object = NULL;
+ }
+
local_irq_restore(flags);
if (unlikely((gfpflags & __GFP_ZERO) && object))
@@ -1875,6 +1997,9 @@ static __always_inline void slab_free(struct kmem_cache *s,
c = get_cpu_slab(s, smp_processor_id());
kmemcheck_slab_free(s, object, c->objsize);
debug_check_no_locks_freed(object, c->objsize);
+
+ if (should_uncharge(s->flags))
+ ub_slab_uncharge(s, x);
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(object, c->objsize);
if (likely(page == c->page && c->node >= 0)) {
@@ -2497,6 +2622,9 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
+#ifdef CONFIG_BEANCOUNTERS
+ s->objuse = s->size + (sizeof(struct page) / oo_objects(s->oo));
+#endif
if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
goto error;
@@ -2630,9 +2758,11 @@ static inline int kmem_cache_close(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
down_write(&slub_lock);
+ spin_lock(&cache_chain_lock);
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ spin_unlock(&cache_chain_lock);
up_write(&slub_lock);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
@@ -2642,8 +2772,10 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
sysfs_slab_remove(s);
- } else
+ } else {
+ spin_unlock(&cache_chain_lock);
up_write(&slub_lock);
+ }
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2653,6 +2785,10 @@ EXPORT_SYMBOL(kmem_cache_destroy);
struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
EXPORT_SYMBOL(kmalloc_caches);
+#ifdef CONFIG_BEANCOUNTERS
+struct kmem_cache ub_kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
+EXPORT_SYMBOL(ub_kmalloc_caches);
+#endif
static int __init setup_slub_min_order(char *str)
{
@@ -2695,6 +2831,11 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
{
unsigned int flags = 0;
+ if (gfp_flags & __GFP_UBC) {
+ flags = SLAB_UBC | SLAB_NO_CHARGE;
+ gfp_flags &= ~__GFP_UBC;
+ }
+
if (gfp_flags & SLUB_DMA)
flags = SLAB_CACHE_DMA;
@@ -2706,7 +2847,9 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
flags, NULL))
goto panic;
+ spin_lock(&cache_chain_lock);
list_add(&s->list, &slab_caches);
+ spin_unlock(&cache_chain_lock);
if (sysfs_slab_add(s))
goto panic;
@@ -2779,7 +2922,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
goto unlock_out;
}
+ spin_lock(&cache_chain_lock);
list_add(&s->list, &slab_caches);
+ spin_unlock(&cache_chain_lock);
kmalloc_caches_dma[index] = s;
if (slab_state >= SYSFS)
@@ -2843,11 +2988,14 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
index = fls(size - 1);
#ifdef CONFIG_ZONE_DMA
- if (unlikely((flags & SLUB_DMA)))
+ if (unlikely((flags & SLUB_DMA))) {
+ BUG_ON(flags & __GFP_UBC);
return dma_kmalloc_cache(index, flags);
+ }
#endif
- return &kmalloc_caches[index];
+
+ return __kmalloc_cache(flags, index);
}
void *__kmalloc(size_t size, gfp_t flags)
@@ -3187,6 +3335,11 @@ void __init kmem_cache_init(void)
create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
sizeof(struct kmem_cache_node), GFP_NOWAIT);
kmalloc_caches[0].refcount = -1;
+#ifdef CONFIG_BEANCOUNTERS
+ create_kmalloc_cache(&ub_kmalloc_caches[0], "kmem_cache_node_ubc",
+ sizeof(struct kmem_cache_node), GFP_NOWAIT | __GFP_UBC);
+ ub_kmalloc_caches[0].refcount = -1;
+#endif
caches++;
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
@@ -3199,17 +3352,29 @@ void __init kmem_cache_init(void)
if (KMALLOC_MIN_SIZE <= 32) {
create_kmalloc_cache(&kmalloc_caches[1],
"kmalloc-96", 96, GFP_NOWAIT);
+#ifdef CONFIG_BEANCOUNTERS
+ create_kmalloc_cache(&ub_kmalloc_caches[1],
+ "kmalloc-96-ubc", 96, GFP_NOWAIT | __GFP_UBC);
+#endif
caches++;
}
if (KMALLOC_MIN_SIZE <= 64) {
create_kmalloc_cache(&kmalloc_caches[2],
"kmalloc-192", 192, GFP_NOWAIT);
+#ifdef CONFIG_BEANCOUNTERS
+ create_kmalloc_cache(&ub_kmalloc_caches[2],
+ "kmalloc-192-ubc", 192, GFP_NOWAIT | __GFP_UBC);
+#endif
caches++;
}
for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
create_kmalloc_cache(&kmalloc_caches[i],
"kmalloc", 1 << i, GFP_NOWAIT);
+#ifdef CONFIG_BEANCOUNTERS
+ create_kmalloc_cache(&ub_kmalloc_caches[i],
+ "kmalloc-ubc", 1 << i, GFP_NOWAIT | __GFP_UBC);
+#endif
caches++;
}
@@ -3255,9 +3420,14 @@ void __init kmem_cache_init(void)
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
kmalloc_caches[i]. name =
kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+#ifdef CONFIG_BEANCOUNTERS
+ ub_kmalloc_caches[i].name =
+ kasprintf(GFP_NOWAIT | __GFP_UBC, "kmalloc-%d-ubc", 1 << i);
+#endif
+ }
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
@@ -3383,11 +3553,15 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
if (s) {
if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
+ spin_lock(&cache_chain_lock);
list_add(&s->list, &slab_caches);
+ spin_unlock(&cache_chain_lock);
up_write(&slub_lock);
if (sysfs_slab_add(s)) {
down_write(&slub_lock);
+ spin_lock(&cache_chain_lock);
list_del(&s->list);
+ spin_unlock(&cache_chain_lock);
up_write(&slub_lock);
kfree(s);
goto err;
@@ -4555,6 +4729,8 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'a';
if (s->flags & SLAB_DEBUG_FREE)
*p++ = 'F';
+ if (s->flags & SLAB_UBC)
+ *p++ = 'b';
if (!(s->flags & SLAB_NOTRACK))
*p++ = 't';
if (p != name + 1)
@@ -4707,6 +4883,76 @@ static void print_slabinfo_header(struct seq_file *m)
seq_putc(m, '\n');
}
+#define SHOW_TOP_SLABS 10
+
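+/*
+ * Approximate footprint of a SLUB cache: one slab per online cpu (the
+ * per-cpu active slabs) plus everything on the per-node full and
+ * partial lists.
+ */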
+static unsigned long get_cache_size(struct kmem_cache *cache)
+{
+ unsigned long flags;
+ unsigned long slabs;
+ struct kmem_cache_node *n;
+ struct list_head *lh;
+ int cpu, node;
+
+ slabs = 0;
+
+ for_each_online_cpu(cpu)
+ slabs++;
+
+ for_each_online_node(node) {
+ n = get_node(cache, node);
+ if (!n)
+ continue;
+ spin_lock_irqsave(&n->list_lock, flags);
+#ifdef CONFIG_SLUB_DEBUG
+ list_for_each(lh, &n->full)
+ slabs++;
+#endif
+ list_for_each(lh, &n->partial)
+ slabs++;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
+
+ return slabs * (PAGE_SIZE << oo_order(cache->oo));
+}
+
+void show_slab_info(void)
+{
+ int i, j;
+ unsigned long size;
+ struct kmem_cache *ptr;
+ unsigned long sizes[SHOW_TOP_SLABS];
+ struct kmem_cache *top[SHOW_TOP_SLABS];
+
+ memset(top, 0, sizeof(top));
+ memset(sizes, 0, sizeof(sizes));
+
+ printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+ spin_lock(&cache_chain_lock);
+ list_for_each_entry(ptr, &slab_caches, list) {
+ size = get_cache_size(ptr);
+
+ j = 0;
+ for (i = 1; i < SHOW_TOP_SLABS; i++) {
+ if (sizes[i] < sizes[j])
+ j = i;
+ }
+ if (size > sizes[j]) {
+ sizes[j] = size;
+ top[j] = ptr;
+ }
+ }
+
+ for (i = 0; i < SHOW_TOP_SLABS; i++) {
+ if (top[i])
+ printk("%-21s: size %10lu objsize %10u\n",
+ top[i]->name, sizes[i],
+ top[i]->size);
+ }
+
+ spin_unlock(&cache_chain_lock);
+}
+
static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d1daeb..8e4805b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,6 +21,9 @@
#include <asm/pgtable.h>
+#include <bc/vmpages.h>
+#include <bc/io_acct.h>
+
/*
* swapper_space is a fiction, retained to simplify the path through
* vmscan's shrink_page_list, to make sync_page look nicer, and to allow
@@ -46,6 +49,7 @@ struct address_space swapper_space = {
.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
.backing_dev_info = &swap_backing_dev_info,
};
+EXPORT_SYMBOL(swapper_space);
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -70,7 +74,7 @@ void show_swap_cache_info(void)
* __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
@@ -119,6 +123,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
return error;
}
+EXPORT_SYMBOL(add_to_swap_cache);
+
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
@@ -148,11 +154,18 @@ int add_to_swap(struct page *page)
{
swp_entry_t entry;
int err;
+ struct user_beancounter *ub;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageUptodate(page));
- entry = get_swap_page();
+
+ ub = pb_grab_page_ub(page);
+ if (IS_ERR(ub))
+ return 0;
+
+ entry = get_swap_page(ub);
+ put_beancounter(ub);
if (!entry.val)
return 0;
@@ -348,6 +361,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+EXPORT_SYMBOL(read_swap_cache_async);
+
/**
* swapin_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c590ee..9ce0143 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -35,6 +35,8 @@
#include <linux/swapops.h>
#include <linux/page_cgroup.h>
+#include <bc/vmpages.h>
+
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
@@ -47,9 +49,13 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct swap_info[MAX_SWAPFILES];
+struct swap_info_struct swap_info[MAX_SWAPFILES];
+EXPORT_SYMBOL(total_swap_pages);
+EXPORT_SYMBOL(swap_lock);
+EXPORT_SYMBOL(swap_list);
+EXPORT_SYMBOL(swap_info);
static DEFINE_MUTEX(swapon_mutex);
@@ -454,7 +460,7 @@ no_page:
return 0;
}
-swp_entry_t get_swap_page(void)
+swp_entry_t get_swap_page(struct user_beancounter *ub)
{
struct swap_info_struct *si;
pgoff_t offset;
@@ -475,6 +481,8 @@ swp_entry_t get_swap_page(void)
wrapped++;
}
+ if (si->flags & SWP_READONLY)
+ continue;
if (!si->highest_bit)
continue;
if (!(si->flags & SWP_WRITEOK))
@@ -485,6 +493,7 @@ swp_entry_t get_swap_page(void)
offset = scan_swap_map(si, SWAP_CACHE);
if (offset) {
spin_unlock(&swap_lock);
+ ub_swapentry_inc(si, offset, ub);
return swp_entry(type, offset);
}
next = swap_list.next;
@@ -496,6 +505,8 @@ noswap:
return (swp_entry_t) {0};
}
+EXPORT_SYMBOL(get_swap_page);
+
/* The only caller of this function is now susupend routine */
swp_entry_t get_swap_page_of_type(int type)
{
@@ -504,7 +515,7 @@ swp_entry_t get_swap_page_of_type(int type)
spin_lock(&swap_lock);
si = swap_info + type;
- if (si->flags & SWP_WRITEOK) {
+ if (si->flags & SWP_WRITEOK && !(si->flags & SWP_READONLY)) {
nr_swap_pages--;
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, SWAP_MAP);
@@ -577,6 +588,7 @@ static int swap_entry_free(struct swap_info_struct *p,
count = p->swap_map[offset];
/* free if no reference */
if (!count) {
+ ub_swapentry_dec(p, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -606,6 +618,8 @@ void swap_free(swp_entry_t entry)
}
}
+EXPORT_SYMBOL(swap_free);
+
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
@@ -690,6 +704,25 @@ int try_to_free_swap(struct page *page)
return 1;
}
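+
+/*
+ * Return 1 if the swap device holding this page's entry is marked
+ * SWP_READONLY or is no longer writable, 0 for a normal writable
+ * swap device.
+ */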
+int swap_readonly(struct page *page)
+{
+ swp_entry_t entry;
+ struct swap_info_struct *p;
+
+ entry.val = page_private(page);
+ p = swap_info_get(entry);
+ if (p == NULL)
+ return 0;
+
+ spin_unlock(&swap_lock);
+ if ((p->flags & (SWP_USED|SWP_WRITEOK|SWP_READONLY)) ==
+ (SWP_USED|SWP_WRITEOK))
+ return 0;
+
+ return 1;
+}
+
+
/*
* Free the swap entry like above, but also try to
* free the page cache entry if it is the last user.
@@ -728,6 +761,7 @@ int free_swap_and_cache(swp_entry_t entry)
}
return p != NULL;
}
+EXPORT_SYMBOL(free_swap_and_cache);
#ifdef CONFIG_HIBERNATION
/*
@@ -811,12 +845,14 @@ unsigned int count_swap_pages(int type, int free)
* force COW, vm_page_prot omits write permission from any private vma.
*/
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, swp_entry_t entry, struct page *page)
+ unsigned long addr, swp_entry_t entry, struct page *page,
+ struct page_beancounter **pb)
{
struct mem_cgroup *ptr = NULL;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
+ struct mm_struct *mm = vma->vm_mm;
if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
ret = -ENOMEM;
@@ -831,9 +867,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
- inc_mm_counter(vma->vm_mm, anon_rss);
+ inc_mm_counter(mm, anon_rss);
+ ub_unused_privvm_dec(mm, vma);
+ pb_add_ref(page, mm, pb);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
+ set_pte_at(mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
page_add_anon_rmap(page, vma, addr);
mem_cgroup_commit_charge_swapin(page, ptr);
@@ -851,7 +889,8 @@ out_nolock:
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ swp_entry_t entry, struct page *page,
+ struct page_beancounter **pb)
{
pte_t swp_pte = swp_entry_to_pte(entry);
pte_t *pte;
@@ -874,7 +913,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (unlikely(pte_same(*pte, swp_pte))) {
pte_unmap(pte);
- ret = unuse_pte(vma, pmd, addr, entry, page);
+ ret = unuse_pte(vma, pmd, addr, entry, page, pb);
if (ret)
goto out;
pte = pte_offset_map(pmd, addr);
@@ -887,7 +926,8 @@ out:
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ swp_entry_t entry, struct page *page,
+ struct page_beancounter **pb)
{
pmd_t *pmd;
unsigned long next;
@@ -898,7 +938,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ ret = unuse_pte_range(vma, pmd, addr, next, entry, page, pb);
if (ret)
return ret;
} while (pmd++, addr = next, addr != end);
@@ -907,7 +947,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ swp_entry_t entry, struct page *page,
+ struct page_beancounter **pb)
{
pud_t *pud;
unsigned long next;
@@ -918,7 +959,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+ ret = unuse_pmd_range(vma, pud, addr, next, entry, page, pb);
if (ret)
return ret;
} while (pud++, addr = next, addr != end);
@@ -926,7 +967,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
}
static int unuse_vma(struct vm_area_struct *vma,
- swp_entry_t entry, struct page *page)
+ swp_entry_t entry, struct page *page,
+ struct page_beancounter **pb)
{
pgd_t *pgd;
unsigned long addr, end, next;
@@ -948,7 +990,7 @@ static int unuse_vma(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+ ret = unuse_pud_range(vma, pgd, addr, next, entry, page, pb);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
@@ -956,7 +998,8 @@ static int unuse_vma(struct vm_area_struct *vma,
}
static int unuse_mm(struct mm_struct *mm,
- swp_entry_t entry, struct page *page)
+ swp_entry_t entry, struct page *page,
+ struct page_beancounter **pb)
{
struct vm_area_struct *vma;
int ret = 0;
@@ -972,7 +1015,7 @@ static int unuse_mm(struct mm_struct *mm,
lock_page(page);
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
+ if (vma->anon_vma && (ret = unuse_vma(vma, entry, page, pb)))
break;
}
up_read(&mm->mmap_sem);
@@ -1034,6 +1077,7 @@ static int try_to_unuse(unsigned int type)
int retval = 0;
int reset_overflow = 0;
int shmem;
+ struct page_beancounter *pb;
/*
* When searching mms for an entry, a good strategy is to
@@ -1086,6 +1130,13 @@ static int try_to_unuse(unsigned int type)
break;
}
+ pb = NULL;
+ if (pb_alloc_all(&pb)) {
+ page_cache_release(page);
+ retval = -ENOMEM;
+ break;
+ }
+
/*
* Don't hold on to start_mm if it looks like exiting.
*/
@@ -1108,6 +1159,20 @@ static int try_to_unuse(unsigned int type)
lock_page(page);
wait_on_page_writeback(page);
+	/* If the read failed we cannot map a not-uptodate page into
+	 * user space. Actually, we are in serious trouble: we do not
+	 * even know which process to kill. So the only option left is
+	 * to stop swapoff() and let someone kill processes in order
+	 * to zap the invalid pages.
+	 */
+ if (unlikely(!PageUptodate(page))) {
+ pb_free_list(&pb);
+ unlock_page(page);
+ page_cache_release(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* Remove all references to entry.
* Whenever we reach init_mm, there's no address space
@@ -1119,7 +1184,7 @@ static int try_to_unuse(unsigned int type)
if (start_mm == &init_mm)
shmem = shmem_unuse(entry, page);
else
- retval = unuse_mm(start_mm, entry, page);
+ retval = unuse_mm(start_mm, entry, page, &pb);
}
if (swap_count(*swap_map)) {
int set_start_mm = (*swap_map >= swcount);
@@ -1149,7 +1214,7 @@ static int try_to_unuse(unsigned int type)
set_start_mm = 1;
shmem = shmem_unuse(entry, page);
} else
- retval = unuse_mm(mm, entry, page);
+ retval = unuse_mm(mm, entry, page, &pb);
if (set_start_mm && *swap_map < swcount) {
mmput(new_start_mm);
@@ -1171,6 +1236,8 @@ static int try_to_unuse(unsigned int type)
retval = shmem;
break;
}
+
+ pb_free_list(&pb);
if (retval) {
unlock_page(page);
page_cache_release(page);
@@ -1518,6 +1585,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
int i, type, prev;
int err;
+	/* The VE admin check is just to be on the safe side: the admin may
+	 * affect swaps only if he has access to the special file, i.e. if he
+	 * has been granted access to the block device or if the swap file
+	 * lies in an area visible to him. */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1627,6 +1698,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ ub_swap_fini(p);
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);
@@ -1649,6 +1721,8 @@ out:
return err;
}
+EXPORT_SYMBOL(sys_swapoff);
+
#ifdef CONFIG_PROC_FS
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
@@ -1729,21 +1803,55 @@ static const struct seq_operations swaps_op = {
.show = swap_show
};
+#include <linux/virtinfo.h>
+
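+/*
+ * Inside a container /proc/swaps shows a single synthetic entry whose
+ * size and usage come from the VE's virtualized meminfo instead of the
+ * host swap devices.
+ */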
+static int swap_show_ve(struct seq_file *swap, void *v)
+{
+ struct meminfo mi;
+
+ memset(&mi, 0, sizeof(mi));
+ si_swapinfo(&mi.si);
+ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi)
+ & NOTIFY_FAIL)
+ goto out;
+
+ seq_printf(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ if (!mi.si.totalswap)
+ goto out;
+ seq_printf(swap, "%-40s%s\t%lu\t%lu\t%d\n",
+ "/dev/null",
+ "partition",
+ mi.si.totalswap << (PAGE_SHIFT - 10),
+ (mi.si.totalswap - mi.si.freeswap) << (PAGE_SHIFT - 10),
+ -1);
+out:
+ return 0;
+}
+
static int swaps_open(struct inode *inode, struct file *file)
{
+ if (!ve_is_super(get_exec_env()))
+ return single_open(file, &swap_show_ve, NULL);
return seq_open(file, &swaps_op);
}
+static int swaps_release(struct inode *inode, struct file *file)
+{
+ if (!ve_is_super(file->owner_env))
+ return single_release(inode, file);
+ return seq_release(inode, file);
+}
+
static const struct file_operations proc_swaps_operations = {
.open = swaps_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = swaps_release,
};
static int __init procswaps_init(void)
{
- proc_create("swaps", 0, NULL, &proc_swaps_operations);
+ proc_create("swaps", 0, &glob_proc_root, &proc_swaps_operations);
return 0;
}
__initcall(procswaps_init);
@@ -1973,6 +2081,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
}
+ if (ub_swap_init(p, maxpages)) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
p->flags |= SWP_SOLIDSTATE;
@@ -1991,6 +2104,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->prio = --least_priority;
p->swap_map = swap_map;
p->flags |= SWP_WRITEOK;
+ if (swap_flags & SWAP_FLAG_READONLY)
+ p->flags |= SWP_READONLY;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;
@@ -2049,6 +2164,8 @@ out:
return error;
}
+EXPORT_SYMBOL(sys_swapon);
+
void si_swapinfo(struct sysinfo *val)
{
unsigned int i;
@@ -2146,6 +2263,8 @@ void swap_duplicate(swp_entry_t entry)
__swap_duplicate(entry, SWAP_MAP);
}
+EXPORT_SYMBOL(swap_duplicate);
+
/*
* @entry: swap entry for which we allocate swap cache.
*
diff --git a/mm/truncate.c b/mm/truncate.c
index 258bda7..a09fa8c 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -78,6 +78,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
BDI_RECLAIMABLE);
if (account_size)
task_io_account_cancelled_write(account_size);
+ ub_io_release_context(page, account_size);
}
}
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c228731..b579d8d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,9 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#include <bc/kmem.h>
+#include <bc/debug.h>
+
/*** Page table manipulation functions ***/
@@ -1349,7 +1352,7 @@ struct vm_struct *remove_vm_area(const void *addr)
return NULL;
}
-static void __vunmap(const void *addr, int deallocate_pages)
+static void __vunmap(const void *addr, int deallocate_pages, int uncharge)
{
struct vm_struct *area;
@@ -1374,6 +1377,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
if (deallocate_pages) {
int i;
+ if (uncharge)
+ dec_vmalloc_charged(area);
for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];
@@ -1407,7 +1412,7 @@ void vfree(const void *addr)
kmemleak_free(addr);
- __vunmap(addr, 1);
+ __vunmap(addr, 1, 1);
}
EXPORT_SYMBOL(vfree);
@@ -1424,7 +1429,7 @@ void vunmap(const void *addr)
{
BUG_ON(in_interrupt());
might_sleep();
- __vunmap(addr, 0);
+ __vunmap(addr, 0, 0);
}
EXPORT_SYMBOL(vunmap);
@@ -1511,10 +1516,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (map_vm_area(area, prot, &pages))
goto fail;
+
+ inc_vmalloc_charged(area, gfp_mask);
return area->addr;
fail:
- vfree(area->addr);
+ __vunmap(area->addr, 1, 0);
return NULL;
}
@@ -1599,6 +1606,26 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+void *ub_vmalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+EXPORT_SYMBOL(ub_vmalloc);
+
+void *vmalloc_best(unsigned long size)
+{
+ return vmalloc(size);
+}
+
+EXPORT_SYMBOL(vmalloc_best);
+
+void *ub_vmalloc_best(unsigned long size)
+{
+ return ub_vmalloc(size);
+}
+
+EXPORT_SYMBOL(ub_vmalloc_best);
+
/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
@@ -1640,6 +1667,13 @@ void *vmalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vmalloc_node);
+void *ub_vmalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node(size, 1, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL,
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ub_vmalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -2335,6 +2369,40 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
kfree(vms);
}
+void vprintstat(void)
+{
+ struct vm_struct *p, *last_p = NULL;
+ unsigned long addr, size, free_size, max_free_size;
+ int num;
+
+ addr = VMALLOC_START;
+ size = max_free_size = 0;
+ num = 0;
+
+ read_lock(&vmlist_lock);
+ for (p = vmlist; p; p = p->next) {
+ free_size = (unsigned long)p->addr - addr;
+ if (free_size > max_free_size)
+ max_free_size = free_size;
+ addr = (unsigned long)p->addr + p->size;
+ size += p->size;
+ ++num;
+ last_p = p;
+ }
+ if (last_p) {
+ free_size = VMALLOC_END -
+ ((unsigned long)last_p->addr + last_p->size);
+ if (free_size > max_free_size)
+ max_free_size = free_size;
+ }
+ read_unlock(&vmlist_lock);
+
+ printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n"
+ " Max_Free: %luKB Start: %lx End: %lx\n",
+ size/1024, (VMALLOC_END - VMALLOC_START)/1024, num,
+ max_free_size/1024, VMALLOC_START, VMALLOC_END);
+}
+
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 692807f..a1bd5b6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,10 +41,14 @@
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <bc/oom_kill.h>
+#include <bc/io_acct.h>
+
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
+#include <linux/vzstat.h>
#include "internal.h"
@@ -210,6 +214,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
+ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+ return 1;
+
if (!down_read_trylock(&shrinker_rwsem))
return 1; /* Assume we'll be able to shrink next time */
@@ -245,6 +252,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
int shrink_ret;
int nr_before;
+ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+ goto done;
+
nr_before = (*shrinker->shrink)(0, gfp_mask);
shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
if (shrink_ret == -1)
@@ -259,6 +269,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
shrinker->nr += total_scan;
}
+done:
up_read(&shrinker_rwsem);
return ret;
}
@@ -376,6 +387,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
*/
if (page_has_private(page)) {
if (try_to_free_buffers(page)) {
+ ub_io_release_context(page, 0);
ClearPageDirty(page);
printk("%s: orphaned page\n", __func__);
return PAGE_CLEAN;
@@ -1321,6 +1333,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
unsigned long nr_rotated = 0;
+ {KSTAT_PERF_ENTER(refill_inact)
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
@@ -1394,6 +1407,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
LRU_BASE + file * LRU_FILE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
+ KSTAT_PERF_LEAVE(refill_inact)}
}
static int inactive_anon_is_low_global(struct zone *zone)
@@ -1636,6 +1650,8 @@ static void shrink_zone(int priority, struct zone *zone,
nr_reclaimed += shrink_list(l, nr_to_scan,
zone, sc, priority);
}
+ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+ return;
}
/*
* On large memory systems, scan >> priority can become
@@ -1714,6 +1730,9 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
}
shrink_zone(priority, zone, sc);
+
+ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+ break;
}
}
@@ -1745,10 +1764,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
+ KSTAT_PERF_ENTER(ttfp);
delayacct_freepages_start();
if (scanning_global_lru(sc))
count_vm_event(ALLOCSTALL);
+
+ ub_oom_start();
/*
* mem_cgroup will not do shrink_slab.
*/
@@ -1797,6 +1819,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->may_writepage = 1;
}
+ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) {
+ ret = 1;
+ goto out;
+ }
+
/* Take a nap, wait for some writeback to complete */
if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1828,6 +1855,7 @@ out:
delayacct_freepages_end();
+ KSTAT_PERF_LEAVE(ttfp);
return ret;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c81321f..44bf18f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -15,6 +15,7 @@
#include <linux/cpu.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
+#include <linux/virtinfo.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -35,6 +36,20 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
}
}
+unsigned long vm_events(enum vm_event_item i)
+{
+ int cpu;
+ unsigned long sum;
+ struct vm_event_state *st;
+
+ sum = 0;
+ for_each_online_cpu(cpu) {
+ st = &per_cpu(vm_event_states, cpu);
+ sum += st->event[i];
+ }
+
+ return (sum < 0 ? 0 : sum);
+}
/*
* Accumulate the vm event counters across all CPUs.
* The result is unavoidably approximate - it can change
@@ -800,30 +815,40 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
unsigned long *v;
#ifdef CONFIG_VM_EVENT_COUNTERS
unsigned long *e;
+#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \
+ sizeof(struct vm_event_state))
+#else
+#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long))
#endif
int i;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
-#ifdef CONFIG_VM_EVENT_COUNTERS
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
- + sizeof(struct vm_event_state), GFP_KERNEL);
-#else
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
- GFP_KERNEL);
-#endif
+ v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- v[i] = global_page_state(i);
+
+ if (ve_is_super(get_exec_env())) {
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ v[i] = global_page_state(i);
#ifdef CONFIG_VM_EVENT_COUNTERS
- e = v + NR_VM_ZONE_STAT_ITEMS;
- all_vm_events(e);
- e[PGPGIN] /= 2; /* sectors -> kbytes */
- e[PGPGOUT] /= 2;
+ e = v + NR_VM_ZONE_STAT_ITEMS;
+ all_vm_events(e);
+ e[PGPGIN] /= 2; /* sectors -> kbytes */
+ e[PGPGOUT] /= 2;
#endif
+ } else
+ memset(v, 0, VMSTAT_BUFSIZE);
+
+ if (virtinfo_notifier_call(VITYPE_GENERAL,
+ VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) {
+ kfree(v);
+ m->private = NULL;
+ return ERR_PTR(-ENOMSG);
+ }
+
return v + *pos;
}
@@ -942,7 +967,7 @@ static int __init setup_vmstat(void)
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
- proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
+ proc_create("vmstat", S_IRUGO, &glob_proc_root, &proc_vmstat_file_operations);
proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
#endif
return 0;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a29c5ab..d56a1ea 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/sched.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <net/p8022.h>
@@ -105,7 +106,7 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev)
{
struct vlan_group *grp;
- grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL);
+ grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC);
if (!grp)
return NULL;
@@ -127,7 +128,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, u16 vlan_id)
return 0;
size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
- array = kzalloc(size, GFP_KERNEL);
+ array = kzalloc(size, GFP_KERNEL_UBC);
if (array == NULL)
return -ENOBUFS;
@@ -147,6 +148,7 @@ void unregister_vlan_dev(struct net_device *dev)
const struct net_device_ops *ops = real_dev->netdev_ops;
struct vlan_group *grp;
u16 vlan_id = vlan->vlan_id;
+ struct ve_struct *env;
ASSERT_RTNL();
@@ -164,7 +166,9 @@ void unregister_vlan_dev(struct net_device *dev)
synchronize_net();
+ env = set_exec_env(dev->owner_env);
unregister_netdevice(dev);
+ set_exec_env(env);
/* If the group is now empty, kill off the group. */
if (grp->nr_vlans == 0) {
@@ -551,6 +555,17 @@ static struct notifier_block vlan_notifier_block __read_mostly = {
.notifier_call = vlan_device_event,
};
+static inline int vlan_check_caps(void)
+{
+ if (capable(CAP_NET_ADMIN))
+ return 1;
+#ifdef CONFIG_VE
+ if (capable(CAP_VE_NET_ADMIN))
+ return 1;
+#endif
+ return 0;
+}
+
/*
* VLAN IOCTL handler.
* o execute requested action or pass command to the device driver
@@ -592,7 +607,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
switch (args.cmd) {
case SET_VLAN_INGRESS_PRIORITY_CMD:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!vlan_check_caps())
break;
vlan_dev_set_ingress_priority(dev,
args.u.skb_priority,
@@ -602,7 +617,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
case SET_VLAN_EGRESS_PRIORITY_CMD:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!vlan_check_caps())
break;
err = vlan_dev_set_egress_priority(dev,
args.u.skb_priority,
@@ -611,7 +626,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
case SET_VLAN_FLAG_CMD:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!vlan_check_caps())
break;
err = vlan_dev_change_flags(dev,
args.vlan_qos ? args.u.flag : 0,
@@ -620,7 +635,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
case SET_VLAN_NAME_TYPE_CMD:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!vlan_check_caps())
break;
if ((args.u.name_type >= 0) &&
(args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
@@ -636,14 +651,14 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
case ADD_VLAN_CMD:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!vlan_check_caps())
break;
err = register_vlan_device(dev, args.u.VID);
break;
case DEL_VLAN_CMD:
err = -EPERM;
- if (!capable(CAP_NET_ADMIN))
+ if (!vlan_check_caps())
break;
unregister_vlan_dev(dev);
err = 0;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4198ec5..c8f0ca2 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -24,6 +24,7 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
+#include <linux/sched.h>
#include <linux/ethtool.h>
#include <net/arp.h>
@@ -291,6 +292,7 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
struct net_device *dev)
{
+ struct ve_struct *env;
int i = skb_get_queue_mapping(skb);
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
@@ -324,7 +326,10 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
skb->dev = vlan_dev_info(dev)->real_dev;
len = skb->len;
+ skb->owner_env = skb->dev->owner_env;
+ env = set_exec_env(skb->owner_env);
ret = dev_queue_xmit(skb);
+ set_exec_env(env);
if (likely(ret == NET_XMIT_SUCCESS)) {
txq->tx_packets++;
@@ -338,6 +343,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
struct net_device *dev)
{
+ struct ve_struct *env;
int i = skb_get_queue_mapping(skb);
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
u16 vlan_tci;
@@ -350,7 +356,10 @@ static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
skb->dev = vlan_dev_info(dev)->real_dev;
len = skb->len;
+ skb->owner_env = skb->dev->owner_env;
+ env = set_exec_env(skb->owner_env);
ret = dev_queue_xmit(skb);
+ set_exec_env(env);
if (likely(ret == NET_XMIT_SUCCESS)) {
txq->tx_packets++;
@@ -829,4 +838,6 @@ void vlan_setup(struct net_device *dev)
dev->ethtool_ops = &vlan_ethtool_ops;
memset(dev->broadcast, 0, ETH_ALEN);
+ if (!ve_is_super(get_exec_env()))
+ dev->features |= NETIF_F_VIRTUAL;
}
diff --git a/net/bridge/br.c b/net/bridge/br.c
index e1241c7..455a4e6 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -62,8 +62,11 @@ static int __init br_init(void)
if (err)
goto err_out4;
+ get_ve0()->features |= VE_FEATURE_BRIDGE;
+
brioctl_set(br_ioctl_deviceless_stub);
br_handle_frame_hook = br_handle_frame;
+ br_hard_xmit_hook = br_xmit;
#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
br_fdb_test_addr_hook = br_fdb_test_addr;
@@ -101,6 +104,7 @@ static void __exit br_deinit(void)
#endif
br_handle_frame_hook = NULL;
+ br_hard_xmit_hook = NULL;
br_fdb_fini();
}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 07a0777..ce33b80 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -32,16 +32,47 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
+ skb->brmark = BR_ALREADY_SEEN;
+
if (dest[0] & 1)
br_flood_deliver(br, skb);
else if ((dst = __br_fdb_get(br, dest)) != NULL)
- br_deliver(dst->dst, skb);
+ br_deliver(dst->dst, skb, 1);
else
br_flood_deliver(br, skb);
return NETDEV_TX_OK;
}
+int br_xmit(struct sk_buff *skb, struct net_bridge_port *port)
+{
+ struct net_bridge *br = port->br;
+ const unsigned char *dest = skb->data;
+ struct net_bridge_fdb_entry *dst;
+
+ if (!br->via_phys_dev)
+ return 0;
+
+ br->dev->stats.tx_packets++;
+ br->dev->stats.tx_bytes += skb->len;
+
+ skb_reset_mac_header(skb);
+ skb_pull(skb, ETH_HLEN);
+
+ skb->brmark = BR_ALREADY_SEEN;
+
+ if (dest[0] & 1)
+ br_xmit_deliver(br, port, skb);
+ else if ((dst = __br_fdb_get(br, dest)) != NULL)
+ br_deliver(dst->dst, skb, 0);
+ else
+ br_xmit_deliver(br, port, skb);
+
+ skb_push(skb, ETH_HLEN);
+
+ return 0;
+}
+
static int br_dev_open(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index bc1704a..2c9f7f1 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -82,14 +82,24 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
}
/* called with rcu_read_lock */
-void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb, int free)
{
if (should_deliver(to, skb)) {
+ if (!free) {
+ struct sk_buff *skb2;
+
+ if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ to->dev->stats.tx_dropped++;
+ return;
+ }
+ skb = skb2;
+ }
__br_deliver(to, skb);
return;
}
- kfree_skb(skb);
+ if (free)
+ kfree_skb(skb);
}
/* called with rcu_read_lock */
@@ -105,6 +115,7 @@ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
/* called under bridge lock */
static void br_flood(struct net_bridge *br, struct sk_buff *skb,
+ int free,
void (*__packet_hook)(const struct net_bridge_port *p,
struct sk_buff *skb))
{
@@ -136,18 +147,41 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
return;
}
- kfree_skb(skb);
+ if (free)
+ kfree_skb(skb);
}
/* called with rcu_read_lock */
void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb)
{
- br_flood(br, skb, __br_deliver);
+ br_flood(br, skb, 1, __br_deliver);
+}
+
+/* called with rcu_read_lock */
+void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port,
+ struct sk_buff *skb)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry_rcu(p, &br->port_list, list) {
+ if (p == port)
+ continue;
+ if (should_deliver(p, skb)) {
+ struct sk_buff *skb2;
+
+ if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ br->dev->stats.tx_dropped++;
+ return;
+ }
+ __br_deliver(p, skb2);
+ }
+ }
}
/* called under bridge lock */
void br_flood_forward(struct net_bridge *br, struct sk_buff *skb)
{
- br_flood(br, skb, __br_forward);
+ skb->brmark = BR_ALREADY_SEEN;
+ br_flood(br, skb, 1, __br_forward);
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 4a9f527..c8d499e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -12,6 +12,7 @@
*/
#include <linux/kernel.h>
+#include <linux/nsproxy.h>
#include <linux/netdevice.h>
#include <linux/ethtool.h>
#include <linux/if_arp.h>
@@ -158,6 +159,11 @@ static void del_br(struct net_bridge *br)
{
struct net_bridge_port *p, *n;
+ if (br->master_dev) {
+ dev_put(br->master_dev);
+ br->master_dev = NULL;
+ }
+
list_for_each_entry_safe(p, n, &br->port_list, list) {
del_nbp(p);
}
@@ -423,6 +429,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
(br->dev->flags & IFF_UP))
br_stp_enable_port(p);
+ if (!(dev->features & NETIF_F_VIRTUAL) && !br->master_dev) {
+ dev_hold(dev);
+ br->master_dev = dev;
+ }
spin_unlock_bh(&br->lock);
br_ifinfo_notify(RTM_NEWLINK, p);
@@ -458,6 +468,16 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
spin_lock_bh(&br->lock);
br_stp_recalculate_bridge_id(br);
br_features_recompute(br);
+ if (br->master_dev == dev) {
+ br->master_dev = NULL;
+ dev_put(dev);
+ list_for_each_entry(p, &br->port_list, list)
+ if (!(p->dev->features & NETIF_F_VIRTUAL)) {
+ dev_hold(p->dev);
+ br->master_dev = p->dev;
+ break;
+ }
+ }
spin_unlock_bh(&br->lock);
return 0;
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 5ee1a36..0fc65bf 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -28,7 +28,13 @@ static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb)
brdev->stats.rx_bytes += skb->len;
indev = skb->dev;
- skb->dev = brdev;
+ if (!br->via_phys_dev)
+ skb->dev = brdev;
+ else {
+ skb->brmark = BR_ALREADY_SEEN;
+ if (br->master_dev)
+ skb->dev = br->master_dev;
+ }
NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
netif_receive_skb);
@@ -56,7 +62,7 @@ int br_handle_frame_finish(struct sk_buff *skb)
/* The packet skb2 goes to the local host (NULL to skip). */
skb2 = NULL;
- if (br->dev->flags & IFF_PROMISC)
+ if ((br->dev->flags & IFF_PROMISC) && !br->via_phys_dev)
skb2 = skb;
dst = NULL;
@@ -147,6 +153,8 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb)
forward:
switch (p->state) {
+ struct net_device *out;
+
case BR_STATE_FORWARDING:
rhook = rcu_dereference(br_should_route_hook);
if (rhook != NULL) {
@@ -156,7 +164,12 @@ forward:
}
/* fall through */
case BR_STATE_LEARNING:
- if (!compare_ether_addr(p->br->dev->dev_addr, dest))
+ if (skb->brmark == BR_ALREADY_SEEN)
+ return skb;
+
+ out = p->br->via_phys_dev ? p->br->master_dev : p->br->dev;
+
+ if (out && !compare_ether_addr(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 6a6433d..5a12508 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/if_bridge.h>
#include <linux/netdevice.h>
+#include <linux/nsproxy.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <asm/uaccess.h>
@@ -140,6 +141,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
b.root_port = br->root_port;
b.stp_enabled = (br->stp_enabled != BR_NO_STP);
+ b.via_phys_dev = br->via_phys_dev;
b.ageing_time = jiffies_to_clock_t(br->ageing_time);
b.hello_timer_value = br_timer_value(&br->hello_timer);
b.tcn_timer_value = br_timer_value(&br->tcn_timer);
@@ -262,6 +264,13 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
br_stp_set_enabled(br, args[1]);
return 0;
+ case BRCTL_SET_VIA_ORIG_DEV:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ br->via_phys_dev = args[1] ? 1 : 0;
+ return 0;
+
case BRCTL_SET_BRIDGE_PRIORITY:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -371,6 +380,9 @@ static int old_deviceless(struct net *net, void __user *uarg)
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
+ if (!(get_exec_env()->features & VE_FEATURE_BRIDGE))
+ return -ENOTTY;
+
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 2114e45..fd8d422 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -17,6 +17,10 @@
#include <linux/if_bridge.h>
#include <net/route.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzcalluser.h>
+
#define BR_HASH_BITS 8
#define BR_HASH_SIZE (1 << BR_HASH_BITS)
@@ -91,6 +95,8 @@ struct net_bridge
spinlock_t lock;
struct list_head port_list;
struct net_device *dev;
+ struct net_device *master_dev;
+ unsigned char via_phys_dev;
spinlock_t hash_lock;
struct hlist_head hash[BR_HASH_SIZE];
struct list_head age_list;
@@ -145,6 +151,7 @@ static inline int br_is_root_bridge(const struct net_bridge *br)
extern void br_dev_setup(struct net_device *dev);
extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
struct net_device *dev);
+extern netdev_tx_t br_xmit(struct sk_buff *skb, struct net_bridge_port *port);
/* br_fdb.c */
extern int br_fdb_init(void);
@@ -169,12 +176,13 @@ extern void br_fdb_update(struct net_bridge *br,
/* br_forward.c */
extern void br_deliver(const struct net_bridge_port *to,
- struct sk_buff *skb);
+ struct sk_buff *skb, int free);
extern int br_dev_queue_push_xmit(struct sk_buff *skb);
extern void br_forward(const struct net_bridge_port *to,
struct sk_buff *skb);
extern int br_forward_finish(struct sk_buff *skb);
extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb);
+extern void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb);
extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb);
/* br_if.c */
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index ee4820a..d45a726 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -182,6 +182,28 @@ static ssize_t store_stp_state(struct device *d,
static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state,
store_stp_state);
+static ssize_t show_via_phys_dev_state(struct device *cd,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_bridge *br = to_bridge(cd);
+ return sprintf(buf, "%d\n", br->via_phys_dev);
+}
+
+static int set_via_phys_dev_state(struct net_bridge *br, unsigned long val)
+{
+ br->via_phys_dev = val ? 1 : 0;
+ return 0;
+}
+
+static ssize_t store_via_phys_dev_state(struct device *cd,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ return store_bridge_parm(cd, buf, len, set_via_phys_dev_state);
+}
+
+static DEVICE_ATTR(via_phys_dev, S_IRUGO | S_IWUSR, show_via_phys_dev_state,
+ store_via_phys_dev_state);
+
static ssize_t show_priority(struct device *d, struct device_attribute *attr,
char *buf)
{
@@ -351,6 +373,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_max_age.attr,
&dev_attr_ageing_time.attr,
&dev_attr_stp_state.attr,
+ &dev_attr_via_phys_dev.attr,
&dev_attr_priority.attr,
&dev_attr_bridge_id.attr,
&dev_attr_root_id.attr,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 4ade301..9732b07 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -57,6 +57,8 @@
#include <net/tcp_states.h>
#include <trace/events/skb.h>
+#include <bc/net.h>
+
/*
* Is a socket 'connection oriented' ?
*/
@@ -723,6 +725,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
{
struct sock *sk = sock->sk;
unsigned int mask;
+ int no_ubc_space;
sock_poll_wait(file, sk->sk_sleep, wait);
mask = 0;
@@ -732,8 +735,14 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
mask |= POLLERR;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP;
- if (sk->sk_shutdown == SHUTDOWN_MASK)
+ if (sk->sk_shutdown == SHUTDOWN_MASK) {
+ no_ubc_space = 0;
mask |= POLLHUP;
+ } else {
+ no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
+ if (no_ubc_space)
+ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
+ }
/* readable? */
if (!skb_queue_empty(&sk->sk_receive_queue) ||
@@ -750,7 +759,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
}
/* writable? */
- if (sock_writeable(sk))
+ if (!no_ubc_space && sock_writeable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
diff --git a/net/core/dev.c b/net/core/dev.c
index 74d0cce..ee00d53 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,9 @@
#include "net-sysfs.h"
+#include <bc/beancounter.h>
+#include <bc/kmem.h>
+
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8
@@ -193,20 +196,6 @@ static struct list_head ptype_all __read_mostly; /* Taps */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
-#define NETDEV_HASHBITS 8
-#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
-
-static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
-{
- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
- return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
-}
-
-static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
-{
- return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
-}
-
/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
@@ -1697,6 +1686,24 @@ static int dev_gso_segment(struct sk_buff *skb)
return 0;
}
+#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
+int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port);
+EXPORT_SYMBOL(br_hard_xmit_hook);
+static __inline__ int bridge_hard_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct net_bridge_port *port;
+
+ if (((port = rcu_dereference(dev->br_port)) == NULL) ||
+ (skb->brmark == BR_ALREADY_SEEN))
+ return 0;
+
+ return br_hard_xmit_hook(skb, port);
+}
+#else
+#define bridge_hard_start_xmit(skb, dev) (0)
+#endif
+
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
@@ -1721,6 +1728,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
+ bridge_hard_start_xmit(skb, dev);
+
rc = ops->ndo_start_xmit(skb, dev);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
@@ -1747,6 +1756,9 @@ gso:
skb->next = nskb->next;
nskb->next = NULL;
+
+ bridge_hard_start_xmit(skb, dev);
+
rc = ops->ndo_start_xmit(nskb, dev);
if (unlikely(rc != NETDEV_TX_OK)) {
nskb->next = skb->next;
@@ -2288,6 +2300,7 @@ int netif_receive_skb(struct sk_buff *skb)
struct net_device *null_or_orig;
int ret = NET_RX_DROP;
__be16 type;
+ struct ve_struct *old_ve;
if (!skb->tstamp.tv64)
net_timestamp(skb);
@@ -2317,6 +2330,16 @@ int netif_receive_skb(struct sk_buff *skb)
skb_reset_transport_header(skb);
skb->mac_len = skb->network_header - skb->mac_header;
+#ifdef CONFIG_VE
+ /*
+	 * Skb might be allocated in a different VE context than the one its device works in.
+ * So, set the correct owner_env.
+ */
+ skb->owner_env = skb->dev->owner_env;
+ BUG_ON(skb->owner_env == NULL);
+#endif
+ old_ve = set_exec_env(skb->owner_env);
+
pt_prev = NULL;
rcu_read_lock();
@@ -2375,6 +2398,7 @@ ncls:
out:
rcu_read_unlock();
+ (void)set_exec_env(old_ve);
return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
@@ -3394,8 +3418,13 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
return -EOVERFLOW;
}
}
- if (dev->flags != old_flags) {
- printk(KERN_INFO "device %s %s promiscuous mode\n",
+ /*
+	 * Promiscuous mode on LOOPBACK/POINTTOPOINT devices does
+ * not mean anything
+ */
+ if ((dev->flags != old_flags) &&
+ !(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) {
+ ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n",
dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
"left");
if (audit_enabled) {
@@ -4547,16 +4576,25 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
* - require strict serialization.
* - do not return a value
*/
+ case SIOCSIFMTU:
+ case SIOCSIFHWADDR:
case SIOCSIFFLAGS:
+ case SIOCSIFTXQLEN:
+ if (!capable(CAP_NET_ADMIN) &&
+ !capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+ dev_load(net, ifr.ifr_name);
+ rtnl_lock();
+ ret = dev_ifsioc(net, &ifr, cmd);
+ rtnl_unlock();
+ return ret;
+
case SIOCSIFMETRIC:
- case SIOCSIFMTU:
case SIOCSIFMAP:
- case SIOCSIFHWADDR:
case SIOCSIFSLAVE:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCSIFHWBROADCAST:
- case SIOCSIFTXQLEN:
case SIOCSMIIREG:
case SIOCBONDENSLAVE:
case SIOCBONDRELEASE:
@@ -4619,12 +4657,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
*/
static int dev_new_index(struct net *net)
{
- static int ifindex;
for (;;) {
- if (++ifindex <= 0)
- ifindex = 1;
- if (!__dev_get_by_index(net, ifindex))
- return ifindex;
+ if (++net->ifindex <= 0)
+ net->ifindex = 1;
+ if (!__dev_get_by_index(net, net->ifindex))
+ return net->ifindex;
}
}
@@ -4779,6 +4816,10 @@ int register_netdevice(struct net_device *dev)
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
+ ret = -EPERM;
+ if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev))
+ goto out;
+
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
netdev_init_queue_locks(dev);
@@ -4849,6 +4890,10 @@ int register_netdevice(struct net_device *dev)
set_bit(__LINK_STATE_PRESENT, &dev->state);
+ dev->owner_env = get_exec_env();
+ netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub());
+ netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub());
+
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
@@ -5029,12 +5074,14 @@ static void netdev_wait_allrefs(struct net_device *dev)
void netdev_run_todo(void)
{
struct list_head list;
+ struct ve_struct *old_ve;
/* Snapshot list, allow later requests */
list_replace_init(&net_todo_list, &list);
__rtnl_unlock();
+ old_ve = get_exec_env();
while (!list_empty(&list)) {
struct net_device *dev
= list_entry(list.next, struct net_device, todo_list);
@@ -5047,6 +5094,7 @@ void netdev_run_todo(void)
continue;
}
+ (void)set_exec_env(dev->owner_env);
dev->reg_state = NETREG_UNREGISTERED;
on_each_cpu(flush_backlog, dev, 1);
@@ -5059,12 +5107,21 @@ void netdev_run_todo(void)
WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr);
+ put_beancounter(netdev_bc(dev)->exec_ub);
+ put_beancounter(netdev_bc(dev)->owner_ub);
+ netdev_bc(dev)->exec_ub = NULL;
+ netdev_bc(dev)->owner_ub = NULL;
+
+ /* It must be the very last action,
+ * after this 'dev' may point to freed up memory.
+ */
if (dev->destructor)
dev->destructor(dev);
/* Free network device */
kobject_put(&dev->dev.kobj);
}
+ (void)set_exec_env(old_ve);
}
/**
@@ -5147,13 +5204,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
- p = kzalloc(alloc_size, GFP_KERNEL);
+ p = kzalloc(alloc_size, GFP_KERNEL_UBC);
if (!p) {
printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
return NULL;
}
- tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
+ tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL_UBC);
if (!tx) {
printk(KERN_ERR "alloc_netdev: Unable to allocate "
"tx qdiscs.\n");
@@ -5296,11 +5353,18 @@ EXPORT_SYMBOL(unregister_netdev);
* Callers must hold the rtnl semaphore.
*/
-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat,
+ struct user_beancounter *exec_ub)
{
char buf[IFNAMSIZ];
const char *destname;
int err;
+ struct user_beancounter *tmp_ub;
+#ifdef CONFIG_VE
+ struct ve_struct *cur_ve = get_exec_env();
+ struct ve_struct *src_ve = dev->owner_env;
+ struct ve_struct *dst_ve = net->owner_ve;
+#endif
ASSERT_RTNL();
@@ -5360,6 +5424,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
err = -ENODEV;
unlist_netdevice(dev);
+ dev->owner_env = dst_ve;
+ tmp_ub = netdev_bc(dev)->exec_ub;
+ netdev_bc(dev)->exec_ub = get_beancounter(exec_ub);
+ put_beancounter(tmp_ub);
+
synchronize_net();
/* Shutdown queueing discipline. */
@@ -5368,7 +5437,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* Notify protocols, that we are about to destroy
this device. They should clean all the things.
*/
+ set_exec_env(src_ve);
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ (void)set_exec_env(cur_ve);
/*
* Flush the unicast and multicast chains
@@ -5376,7 +5447,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
dev_unicast_flush(dev);
dev_addr_discard(dev);
+ set_exec_env(src_ve);
netdev_unregister_kobject(dev);
+ set_exec_env(cur_ve);
/* Actually switch the network namespace */
dev_net_set(dev, net);
@@ -5394,14 +5467,18 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
}
/* Fixup kobjects */
+ set_exec_env(dst_ve);
err = netdev_register_kobject(dev);
+ set_exec_env(cur_ve);
WARN_ON(err);
/* Add the device back in the hashes */
list_netdevice(dev);
/* Notify protocols, that a new device appeared. */
+ set_exec_env(dst_ve);
call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ (void)set_exec_env(cur_ve);
/*
* Prevent userspace races by waiting until the network
@@ -5416,6 +5493,14 @@ out:
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+{
+ struct user_beancounter *ub = get_exec_ub();
+
+ return __dev_change_net_namespace(dev, net, pat, ub);
+}
+EXPORT_SYMBOL(__dev_change_net_namespace);
+
static int dev_cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *ocpu)
@@ -5507,7 +5592,7 @@ static struct hlist_head *netdev_create_hash(void)
int i;
struct hlist_head *hash;
- hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
+ hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_UBC);
if (hash != NULL)
for (i = 0; i < NETDEV_HASHENTRIES; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -5701,3 +5786,32 @@ static int __init initialize_hashrnd(void)
late_initcall_sync(initialize_hashrnd);
+static LIST_HEAD(dev_cpt_operations);
+
+void register_netdev_rst(struct netdev_rst *ops)
+{
+ rtnl_lock();
+ list_add_tail(&ops->list, &dev_cpt_operations);
+ __rtnl_unlock();
+}
+EXPORT_SYMBOL(register_netdev_rst);
+
+void unregister_netdev_rst(struct netdev_rst *ops)
+{
+ rtnl_lock();
+ list_del(&ops->list);
+ __rtnl_unlock();
+}
+EXPORT_SYMBOL(unregister_netdev_rst);
+
+struct netdev_rst *netdev_find_rst(int cpt_object, struct netdev_rst *ops)
+{
+ ops = list_prepare_entry(ops, &dev_cpt_operations, list);
+
+ list_for_each_entry_continue(ops, &dev_cpt_operations, list)
+ if (ops->cpt_object == cpt_object)
+ return ops;
+
+ return NULL;
+}
+EXPORT_SYMBOL(netdev_find_rst);
diff --git a/net/core/dst.c b/net/core/dst.c
index cb1b348..696ecd8 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -313,6 +313,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void
switch (event) {
case NETDEV_UNREGISTER:
case NETDEV_DOWN:
+ dst_gc_task(NULL);
mutex_lock(&dst_gc_mutex);
for (dst = dst_busy_list; dst; dst = dst->next) {
last = dst;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4c12ddb..59dfa3e 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -954,7 +954,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GRXCLSRLALL:
break;
default:
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
return -EPERM;
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bd30938..093b0c4 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -20,7 +20,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
{
struct fib_rule *r;
- r = kzalloc(ops->rule_size, GFP_KERNEL);
+ r = kzalloc(ops->rule_size, GFP_KERNEL_UBC);
if (r == NULL)
return -ENOMEM;
@@ -238,7 +238,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (err < 0)
goto errout;
- rule = kzalloc(ops->rule_size, GFP_KERNEL);
+ rule = kzalloc(ops->rule_size, GFP_KERNEL_UBC);
if (rule == NULL) {
err = -ENOMEM;
goto errout;
diff --git a/net/core/filter.c b/net/core/filter.c
index d1d779c..d5a7f15 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -497,7 +497,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (fprog->filter == NULL)
return -EINVAL;
- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+ fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC);
if (!fp)
return -ENOMEM;
if (copy_from_user(fp->insns, fprog->filter, fsize)) {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e587e68..705e8ea 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -21,6 +21,8 @@
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/ve.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -35,6 +37,7 @@
#include <linux/random.h>
#include <linux/string.h>
#include <linux/log2.h>
+#include <bc/beancounter.h>
#define NEIGH_DEBUG 1
@@ -264,6 +267,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
int entries;
entries = atomic_inc_return(&tbl->entries) - 1;
+ n = ERR_PTR(-ENOBUFS);
if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
@@ -274,7 +278,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
if (!n)
- goto out_entries;
+ goto out_nomem;
skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
@@ -291,6 +295,8 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
out:
return n;
+out_nomem:
+ n = ERR_PTR(-ENOMEM);
out_entries:
atomic_dec(&tbl->entries);
goto out;
@@ -409,12 +415,11 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
u32 hash_val;
int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct neighbour *n1, *rc, *n;
- if (!n) {
- rc = ERR_PTR(-ENOBUFS);
+ rc = n = neigh_alloc(tbl);
+ if (IS_ERR(n))
goto out;
- }
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
@@ -734,10 +739,21 @@ static void neigh_periodic_work(struct work_struct *work)
if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(jiffies, n->used + n->parms->gc_staletime))) {
+ struct net_device *dev = n->dev;
+ struct ve_struct *ve;
+ struct user_beancounter *ub;
+
*np = n->next;
n->dead = 1;
write_unlock(&n->lock);
+
+ ve = set_exec_env(dev->owner_env);
+ ub = set_exec_ub(netdev_bc(dev)->owner_ub);
+
neigh_cleanup_and_release(n);
+
+ set_exec_ub(ub);
+ set_exec_env(ve);
continue;
}
write_unlock(&n->lock);
@@ -800,6 +816,11 @@ static void neigh_timer_handler(unsigned long arg)
struct neighbour *neigh = (struct neighbour *)arg;
unsigned state;
int notify = 0;
+ struct ve_struct *env;
+ struct user_beancounter *ub;
+
+ env = set_exec_env(neigh->dev->owner_env);
+ ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub);
write_lock(&neigh->lock);
@@ -885,6 +906,8 @@ out:
neigh_update_notify(neigh);
neigh_release(neigh);
+ (void)set_exec_ub(ub);
+ (void)set_exec_env(env);
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
@@ -1273,9 +1296,16 @@ static void neigh_proxy_process(unsigned long arg)
if (tdif <= 0) {
struct net_device *dev = skb->dev;
__skb_unlink(skb, &tbl->proxy_queue);
- if (tbl->proxy_redo && netif_running(dev))
+ if (tbl->proxy_redo && netif_running(dev)) {
+ struct ve_struct *ve;
+ struct user_beancounter *ub;
+
+ ve = set_exec_env(dev->owner_env);
+ ub = set_exec_ub(netdev_bc(dev)->owner_ub);
tbl->proxy_redo(skb);
- else
+ set_exec_ub(ub);
+ set_exec_env(ve);
+ } else
kfree_skb(skb);
dev_put(dev);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d5617d4..c70f2a2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -268,6 +268,27 @@ static struct device_attribute net_class_attributes[] = {
{}
};
+#ifdef CONFIG_VE
+struct device_attribute ve_net_class_attributes[] = {
+ __ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
+ __ATTR(iflink, S_IRUGO, show_iflink, NULL),
+ __ATTR(ifindex, S_IRUGO, show_ifindex, NULL),
+ __ATTR(features, S_IRUGO, show_features, NULL),
+ __ATTR(type, S_IRUGO, show_type, NULL),
+ __ATTR(link_mode, S_IRUGO, show_link_mode, NULL),
+ __ATTR(address, S_IRUGO, show_address, NULL),
+ __ATTR(broadcast, S_IRUGO, show_broadcast, NULL),
+ __ATTR(carrier, S_IRUGO, show_carrier, NULL),
+ __ATTR(dormant, S_IRUGO, show_dormant, NULL),
+ __ATTR(operstate, S_IRUGO, show_operstate, NULL),
+ __ATTR(mtu, S_IRUGO, show_mtu, NULL),
+ __ATTR(flags, S_IRUGO, show_flags, NULL),
+ __ATTR(tx_queue_len, S_IRUGO, show_tx_queue_len, NULL),
+ {}
+};
+EXPORT_SYMBOL(ve_net_class_attributes);
+#endif
+
/* Show a given an attribute in the statistics group */
static ssize_t netstat_show(const struct device *d,
struct device_attribute *attr, char *buf,
@@ -462,7 +483,7 @@ static void netdev_release(struct device *d)
kfree((char *)dev - dev->padded);
}
-static struct class net_class = {
+struct class net_class = {
.name = "net",
.dev_release = netdev_release,
#ifdef CONFIG_SYSFS
@@ -472,6 +493,13 @@ static struct class net_class = {
.dev_uevent = netdev_uevent,
#endif
};
+EXPORT_SYMBOL(net_class);
+
+#ifndef CONFIG_VE
+#define visible_net_class net_class
+#else
+#define visible_net_class (*get_exec_env()->net_class)
+#endif
/* Delete sysfs entries but hold kobject reference until after all
* netdev references are gone.
@@ -494,7 +522,7 @@ int netdev_register_kobject(struct net_device *net)
struct device *dev = &(net->dev);
const struct attribute_group **groups = net->sysfs_groups;
- dev->class = &net_class;
+ dev->class = &visible_net_class;
dev->platform_data = net;
dev->groups = groups;
@@ -509,9 +537,6 @@ int netdev_register_kobject(struct net_device *net)
#endif
#endif /* CONFIG_SYSFS */
- if (dev_net(net) != &init_net)
- return 0;
-
return device_add(dev);
}
@@ -534,7 +559,15 @@ void netdev_initialize_kobject(struct net_device *net)
device_initialize(device);
}
+void prepare_sysfs_netdev(void)
+{
+#ifdef CONFIG_VE
+ get_ve0()->net_class = &net_class;
+#endif
+}
+
int netdev_kobject_init(void)
{
+ prepare_sysfs_netdev();
return class_register(&net_class);
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1c1af27..39d8c2e 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1,6 +1,7 @@
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
+#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
@@ -36,6 +37,10 @@ static __net_init int setup_net(struct net *net)
struct pernet_operations *ops;
int error = 0;
+#ifdef CONFIG_VE
+ net->owner_ve = get_exec_env();
+#endif
+
atomic_set(&net->count, 1);
#ifdef NETNS_REFCNT_DEBUG
@@ -106,6 +111,8 @@ out_free:
static void net_free(struct net *net)
{
+ struct completion *sysfs_completion;
+
#ifdef NETNS_REFCNT_DEBUG
if (unlikely(atomic_read(&net->use_count) != 0)) {
printk(KERN_EMERG "network namespace not free! Usage: %d\n",
@@ -113,8 +120,11 @@ static void net_free(struct net *net)
return;
}
#endif
+ sysfs_completion = net->sysfs_completion;
kfree(net->gen);
kmem_cache_free(net_cachep, net);
+ if (sysfs_completion)
+ complete(sysfs_completion);
}
static struct net *net_create(void)
@@ -151,6 +161,7 @@ static void cleanup_net(struct work_struct *work)
{
struct pernet_operations *ops;
struct net *net;
+ struct ve_struct *old_ve;
net = container_of(work, struct net, work);
@@ -168,11 +179,13 @@ static void cleanup_net(struct work_struct *work)
*/
synchronize_rcu();
+ old_ve = set_exec_env(net->owner_ve);
/* Run all of the network namespace exit methods */
list_for_each_entry_reverse(ops, &pernet_list, list) {
if (ops->exit)
ops->exit(net);
}
+ (void)set_exec_env(old_ve);
mutex_unlock(&net_mutex);
@@ -259,6 +272,16 @@ static int __init net_ns_init(void)
pure_initcall(net_ns_init);
#ifdef CONFIG_NET_NS
+
+#include <linux/netdevice.h>
+
+static inline void set_net_context(struct net *net)
+{
+ set_exec_env(net->owner_ve);
+ if (net->loopback_dev)
+ set_exec_ub(netdev_bc(net->loopback_dev)->exec_ub);
+}
+
static int register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
@@ -268,7 +291,9 @@ static int register_pernet_operations(struct list_head *list,
list_add_tail(&ops->list, list);
if (ops->init) {
for_each_net(net) {
+ set_net_context(net);
error = ops->init(net);
+ set_net_context(&init_net);
if (error)
goto out_undo;
}
@@ -282,7 +307,10 @@ out_undo:
for_each_net(undo_net) {
if (undo_net == net)
goto undone;
+
+ set_net_context(undo_net);
ops->exit(undo_net);
+ set_net_context(&init_net);
}
}
undone:
@@ -295,8 +323,11 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
list_del(&ops->list);
if (ops->exit)
- for_each_net(net)
+ for_each_net(net) {
+ set_net_context(net);
ops->exit(net);
+ set_net_context(&init_net);
+ }
}
#else
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d4fd895..1c96da6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1206,6 +1206,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
if (rtnl_msg_handlers[idx] == NULL ||
rtnl_msg_handlers[idx][type].dumpit == NULL)
continue;
+ if (vz_security_family_check(idx))
+ continue;
if (idx > s_idx)
memset(&cb->args[0], 0, sizeof(cb->args));
if (rtnl_msg_handlers[idx][type].dumpit(skb, cb))
@@ -1267,13 +1269,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return 0;
family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
- if (family >= NPROTO)
+ if (family >= NPROTO || vz_security_family_check(family))
return -EAFNOSUPPORT;
sz_idx = type>>2;
kind = type&3;
- if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN))
+ if (kind != 2 && security_netlink_recv(skb, CAP_VE_NET_ADMIN))
return -EPERM;
if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
diff --git a/net/core/scm.c b/net/core/scm.c
index 9b26463..944126f 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -36,6 +36,7 @@
#include <net/compat.h>
#include <net/scm.h>
+#include <bc/kmem.h>
/*
* Only allow a user to send credentials, that they could set with
@@ -46,7 +47,9 @@ static __inline__ int scm_check_creds(struct ucred *creds)
{
const struct cred *cred = current_cred();
- if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
+ if ((creds->pid == task_tgid_vnr(current) ||
+ creds->pid == current->tgid ||
+ capable(CAP_VE_SYS_ADMIN)) &&
((creds->uid == cred->uid || creds->uid == cred->euid ||
creds->uid == cred->suid) || capable(CAP_SETUID)) &&
((creds->gid == cred->gid || creds->gid == cred->egid ||
@@ -73,7 +76,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (!fpl)
{
- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_UBC);
if (!fpl)
return -ENOMEM;
*fplp = fpl;
@@ -302,7 +305,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (!fpl)
return NULL;
- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
+ new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL_UBC);
if (new_fpl) {
for (i=fpl->count-1; i>=0; i--)
get_file(fpl->fp[i]);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ec85681..b8865de 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -67,6 +67,7 @@
#include <asm/uaccess.h>
#include <asm/system.h>
#include <trace/events/skb.h>
+#include <bc/net.h>
#include "kmap_skb.h"
@@ -184,6 +185,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
if (!skb)
goto out;
+ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA))
+ goto nobc;
+
+ /* Get the DATA. Size must match skb_add_mtu(). */
size = SKB_DATA_ALIGN(size);
data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
gfp_mask, node);
@@ -202,6 +207,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->data = data;
skb_reset_tail_pointer(skb);
skb->end = skb->tail + size;
+ skb->owner_env = get_exec_env();
kmemcheck_annotate_bitfield(skb, flags1);
kmemcheck_annotate_bitfield(skb, flags2);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
@@ -234,6 +240,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
out:
return skb;
nodata:
+ ub_skb_free_bc(skb);
+nobc:
kmem_cache_free(cache, skb);
skb = NULL;
goto out;
@@ -362,6 +370,7 @@ static void kfree_skbmem(struct sk_buff *skb)
struct sk_buff *other;
atomic_t *fclone_ref;
+ ub_skb_free_bc(skb);
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
kmem_cache_free(skbuff_head_cache, skb);
@@ -394,6 +403,7 @@ static void skb_release_head_state(struct sk_buff *skb)
#ifdef CONFIG_XFRM
secpath_put(skb->sp);
#endif
+ ub_skb_uncharge(skb);
if (skb->destructor) {
WARN_ON(in_irq());
skb->destructor(skb);
@@ -560,6 +570,11 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#endif
new->vlan_tci = old->vlan_tci;
+#ifdef CONFIG_VE
+ new->accounted = old->accounted;
+ new->redirected = old->redirected;
+#endif
+ skb_copy_brmark(new, old);
skb_copy_secmark(new, old);
}
@@ -581,6 +596,10 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
n->cloned = 1;
n->nohdr = 0;
+ C(owner_env);
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+ C(brmark);
+#endif
n->destructor = NULL;
C(tail);
C(end);
@@ -589,6 +608,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
C(truesize);
atomic_set(&n->users, 1);
+#ifdef CONFIG_VE
+ C(accounted);
+ C(redirected);
+#endif
+
atomic_inc(&(skb_shinfo(skb)->dataref));
skb->cloned = 1;
@@ -647,6 +671,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
+ if (ub_skb_alloc_bc(n, gfp_mask)) {
+ kmem_cache_free(skbuff_head_cache, n);
+ return NULL;
+ }
return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6605e75..b236993 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -124,6 +124,9 @@
#include <net/xfrm.h>
#include <linux/ipsec.h>
+#include <bc/net.h>
+#include <bc/beancounter.h>
+
#include <linux/filter.h>
#ifdef CONFIG_INET
@@ -254,7 +257,7 @@ static void sock_warn_obsolete_bsdism(const char *name)
static char warncomm[TASK_COMM_LEN];
if (strcmp(warncomm, current->comm) && warned < 5) {
strcpy(warncomm, current->comm);
- printk(KERN_WARNING "process `%s' is using obsolete "
+ ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete "
"%s SO_BSDCOMPAT\n", warncomm, name);
warned++;
}
@@ -290,7 +293,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (err)
goto out;
- if (!sk_rmem_schedule(sk, skb->truesize)) {
+ if (!sk_rmem_schedule(sk, skb)) {
err = -ENOBUFS;
goto out;
}
@@ -1006,6 +1009,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
slab = prot->slab;
security_sk_free(sk);
+ ub_sock_uncharge(sk);
if (slab != NULL)
kmem_cache_free(slab, sk);
else
@@ -1034,6 +1038,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
+ sk->owner_env = get_exec_env();
sock_net_set(sk, get_net(net));
atomic_set(&sk->sk_wmem_alloc, 1);
}
@@ -1146,14 +1151,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
if (filter != NULL)
sk_filter_charge(newsk, filter);
- if (unlikely(xfrm_sk_clone_policy(newsk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- newsk = NULL;
- goto out;
- }
+ if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0)
+ goto out_err;
+
+ if (unlikely(xfrm_sk_clone_policy(newsk)))
+ goto out_err;
newsk->sk_err = 0;
newsk->sk_priority = 0;
@@ -1186,13 +1188,22 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
net_enable_timestamp();
}
-out:
return newsk;
+
+out_err:
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ sock_reset_flag(newsk, SOCK_TIMESTAMP);
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ return NULL;
}
EXPORT_SYMBOL_GPL(sk_clone);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
+ extern int sysctl_tcp_use_sg;
+
__sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
if (sk->sk_route_caps & NETIF_F_GSO)
@@ -1205,6 +1216,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_gso_max_size = dst->dev->gso_max_size;
}
}
+ if (!sysctl_tcp_use_sg)
+ sk->sk_route_caps &= ~NETIF_F_SG;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -1382,9 +1395,8 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
/*
* Generic send/receive buffer handlers
*/
-
-struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
- unsigned long data_len, int noblock,
+struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size,
+ unsigned long size2, int noblock,
int *errcode)
{
struct sk_buff *skb;
@@ -1406,46 +1418,35 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk->sk_shutdown & SEND_SHUTDOWN)
goto failure;
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- skb = alloc_skb(header_len, gfp_mask);
- if (skb) {
- int npages;
- int i;
-
- /* No pages, we're done... */
- if (!data_len)
- break;
-
- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
- skb->truesize += data_len;
- skb_shinfo(skb)->nr_frags = npages;
- for (i = 0; i < npages; i++) {
- struct page *page;
- skb_frag_t *frag;
-
- page = alloc_pages(sk->sk_allocation, 0);
- if (!page) {
- err = -ENOBUFS;
- skb_shinfo(skb)->nr_frags = i;
- kfree_skb(skb);
- goto failure;
- }
-
- frag = &skb_shinfo(skb)->frags[i];
- frag->page = page;
- frag->page_offset = 0;
- frag->size = (data_len >= PAGE_SIZE ?
- PAGE_SIZE :
- data_len);
- data_len -= PAGE_SIZE;
- }
+ if (ub_sock_getwres_other(sk, skb_charge_size(size))) {
+ if (size2 < size) {
+ size = size2;
+ continue;
+ }
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+ if (!timeo)
+ goto failure;
+ if (signal_pending(current))
+ goto interrupted;
+ timeo = ub_sock_wait_for_space(sk, timeo,
+ skb_charge_size(size));
+ continue;
+ }
+ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ skb = alloc_skb(size, gfp_mask);
+ if (skb)
/* Full success... */
break;
- }
+ ub_sock_retwres_other(sk, skb_charge_size(size),
+ SOCK_MIN_UBCSPACE_CH);
err = -ENOBUFS;
goto failure;
}
+ ub_sock_retwres_other(sk,
+ skb_charge_size(size),
+ SOCK_MIN_UBCSPACE_CH);
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
@@ -1456,6 +1457,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
timeo = sock_wait_for_wmem(sk, timeo);
}
+ ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF);
skb_set_owner_w(skb, sk);
return skb;
@@ -1465,12 +1467,12 @@ failure:
*errcode = err;
return NULL;
}
-EXPORT_SYMBOL(sock_alloc_send_pskb);
+EXPORT_SYMBOL(sock_alloc_send_skb2);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
int noblock, int *errcode)
{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
+ return sock_alloc_send_skb2(sk, size, size, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
@@ -1904,21 +1906,24 @@ void lock_sock_nested(struct sock *sk, int subclass)
__lock_sock(sk);
sk->sk_lock.owned = 1;
spin_unlock(&sk->sk_lock.slock);
+#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE)
/*
* The sk_lock has mutex_lock() semantics here:
*/
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+#endif
local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
void release_sock(struct sock *sk)
{
+#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE)
/*
* The sk_lock has mutex_unlock() semantics:
*/
mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
-
+#endif
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
@@ -2194,7 +2199,7 @@ int proto_register(struct proto *prot, int alloc_slab)
{
if (alloc_slab) {
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN | prot->slab_flags,
+ SLAB_HWCACHE_ALIGN | SLAB_UBC | prot->slab_flags,
NULL);
if (prot->slab == NULL) {
@@ -2213,7 +2218,7 @@ int proto_register(struct proto *prot, int alloc_slab)
sprintf(prot->rsk_prot->slab_name, mask, prot->name);
prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
prot->rsk_prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL);
if (prot->rsk_prot->slab == NULL) {
printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
@@ -2235,7 +2240,7 @@ int proto_register(struct proto *prot, int alloc_slab)
kmem_cache_create(prot->twsk_prot->twsk_slab_name,
prot->twsk_prot->twsk_obj_size,
0,
- SLAB_HWCACHE_ALIGN |
+ SLAB_HWCACHE_ALIGN | SLAB_UBC |
prot->slab_flags,
NULL);
if (prot->twsk_prot->twsk_slab == NULL)
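For readers tracing the beancounter change to sock_alloc_send_skb2() above: the loop first reserves UB write space for the requested size, retries with the smaller size2 or sleeps when the reservation fails, and only then attempts the skb allocation, giving the reservation back if the allocation itself fails. Below is a minimal userspace-style sketch of that control flow only; every helper in it (ub_charge, wait_for_space, try_alloc) is a made-up stub for illustration, not the kernel's ub_sock_*/skb_charge_size() API.

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

/* Made-up stubs, standing in for the accounting and allocation calls. */
static bool ub_charge(size_t bytes)      { (void)bytes; return true; }
static void ub_uncharge(size_t bytes)    { (void)bytes; }
static bool wait_for_space(size_t bytes) { (void)bytes; return true; }
static void *try_alloc(size_t bytes)     { return malloc(bytes); }

/*
 * Sketch of the charge-then-allocate retry loop: reserve accounting
 * space first, fall back to the smaller request or sleep when the
 * reservation fails, and give the reservation back if the allocation
 * itself still fails.
 */
static void *alloc_with_charge(size_t want, size_t fallback)
{
    for (;;) {
        if (!ub_charge(want)) {
            if (fallback < want) {       /* retry with the smaller size */
                want = fallback;
                continue;
            }
            if (!wait_for_space(want))   /* out of patience: give up */
                return NULL;
            continue;
        }
        void *obj = try_alloc(want);
        if (obj)
            return obj;                  /* reservation stays with the object */
        ub_uncharge(want);               /* allocation failed: uncharge */
        return NULL;
    }
}

The real code additionally distinguishes blocking from non-blocking callers and still honours the socket's own write-memory limit, which the sketch leaves out.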
diff --git a/net/core/stream.c b/net/core/stream.c
index a37debf..af5873a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -112,8 +112,10 @@ EXPORT_SYMBOL(sk_stream_wait_close);
* sk_stream_wait_memory - Wait for more memory for a socket
* @sk: socket to wait for memory
* @timeo_p: for how long
+ * @amount: amount of memory to wait for (in UB space!)
*/
-int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+int __sk_stream_wait_memory(struct sock *sk, long *timeo_p,
+ unsigned long amount)
{
int err = 0;
long vm_wait = 0;
@@ -135,7 +137,10 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
if (signal_pending(current))
goto do_interrupted;
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
- if (sk_stream_memory_free(sk) && !vm_wait)
+ if (amount == 0) {
+ if (sk_stream_memory_free(sk) && !vm_wait)
+ break;
+ } else if (!ub_sock_sndqueueadd_tcp(sk, amount))
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -145,6 +150,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
sk_stream_memory_free(sk) &&
vm_wait);
sk->sk_write_pending--;
+ if (amount > 0)
+ ub_sock_sndqueuedel(sk);
if (vm_wait) {
vm_wait -= current_timeo;
@@ -171,6 +178,10 @@ do_interrupted:
goto out;
}
+int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
+{
+ return __sk_stream_wait_memory(sk, timeo_p, 0);
+}
EXPORT_SYMBOL(sk_stream_wait_memory);
int sk_stream_error(struct sock *sk, int flags, int err)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index e48ca5d..3b3cbec 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -588,6 +588,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
__ip6_dst_store(newsk, dst, NULL, NULL);
newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
NETIF_F_TSO);
+ if (!sysctl_tcp_use_sg)
+ newsk->sk_route_caps &= ~NETIF_F_SG;
newdp6 = (struct dccp6_sock *)newsk;
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 5ca49ce..e12a0d1 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -19,6 +19,8 @@
#include <net/xfrm.h>
#include <net/inet_timewait_sock.h>
+#include <bc/sock_orphan.h>
+
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
@@ -46,7 +48,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
- if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
+ if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets &&
+ ub_timewait_check(sk, &dccp_death_row))
tw = inet_twsk_alloc(sk, state);
if (tw != NULL) {
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 6d2bd32..45567e3 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
return;
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
+ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
RCV_SKB_FAIL(-EPERM);
/* Eventually we might send routing messages too */
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 57737b8..3559d3b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -115,6 +115,7 @@
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
+#include <bc/net.h>
/* The inetsw table contains everything that inet_create needs to
@@ -324,6 +325,10 @@ lookup_protocol:
goto out_rcu_unlock;
}
+ err = vz_security_protocol_check(answer->protocol);
+ if (err < 0)
+ goto out_rcu_unlock;
+
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
@@ -345,6 +350,13 @@ lookup_protocol:
if (sk == NULL)
goto out;
+ err = -ENOBUFS;
+ if (ub_sock_charge(sk, PF_INET, sock->type))
+ goto out_sk_free;
+ /* If the charge was successful, sock_init_data() MUST be called to
+ * set sk->sk_type; otherwise sk will be uncharged against the wrong resource
+ */
+
err = 0;
sk->sk_no_check = answer_no_check;
if (INET_PROTOSW_REUSE & answer_flags)
@@ -402,6 +414,9 @@ out:
out_rcu_unlock:
rcu_read_unlock();
goto out;
+out_sk_free:
+ sk_free(sk);
+ return err;
}
@@ -416,6 +431,9 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+ struct ve_struct *saved_env;
+
+ saved_env = set_exec_env(sk->owner_env);
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -433,6 +451,8 @@ int inet_release(struct socket *sock)
timeout = sk->sk_lingertime;
sock->sk = NULL;
sk->sk_prot->close(sk, timeout);
+
+ (void)set_exec_env(saved_env);
}
return 0;
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4e80f33..01deb6a 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1136,7 +1136,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCDARP:
case SIOCSARP:
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN) &&
+ !capable(CAP_VE_NET_ADMIN))
return -EPERM;
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cc35645..6450b63 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -110,10 +110,11 @@ static inline void devinet_sysctl_unregister(struct in_device *idev)
/* Locks all the inet devices. */
-static struct in_ifaddr *inet_alloc_ifa(void)
+struct in_ifaddr *inet_alloc_ifa(void)
{
- return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+ return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_UBC);
}
+EXPORT_SYMBOL_GPL(inet_alloc_ifa);
static void inet_rcu_free_ifa(struct rcu_head *head)
{
@@ -146,7 +147,7 @@ void in_dev_finish_destroy(struct in_device *idev)
}
}
-static struct in_device *inetdev_init(struct net_device *dev)
+struct in_device *inetdev_init(struct net_device *dev)
{
struct in_device *in_dev;
@@ -182,6 +183,7 @@ out_kfree:
in_dev = NULL;
goto out;
}
+EXPORT_SYMBOL_GPL(inetdev_init);
static void in_dev_rcu_put(struct rcu_head *head)
{
@@ -375,7 +377,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
return 0;
}
-static int inet_insert_ifa(struct in_ifaddr *ifa)
+int inet_insert_ifa(struct in_ifaddr *ifa)
{
return __inet_insert_ifa(ifa, NULL, 0);
}
@@ -426,6 +428,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
} endfor_ifa(in_dev);
return NULL;
}
+EXPORT_SYMBOL_GPL(inet_insert_ifa);
static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
{
@@ -624,7 +627,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSIFFLAGS:
ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_VE_NET_ADMIN))
goto out;
break;
case SIOCSIFADDR: /* Set interface address (and family) */
@@ -632,7 +635,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSIFDSTADDR: /* Set the destination address */
case SIOCSIFNETMASK: /* Set the netmask for the interface */
ret = -EACCES;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_VE_NET_ADMIN))
goto out;
ret = -EINVAL;
if (sin->sin_family != AF_INET)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 29391ee..87bedd4 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -262,7 +262,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
net = dev_net(dev);
if (fib_lookup(net, &fl, &res))
goto last_resort;
- if (res.type != RTN_UNICAST)
+ if (res.type != RTN_UNICAST &&
+ (!(dev->features & NETIF_F_VENET) || res.type != RTN_LOCAL))
goto e_inval_res;
*spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
@@ -464,7 +465,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
switch (cmd) {
case SIOCADDRT: /* Add a route */
case SIOCDELRT: /* Delete a route */
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_VE_NET_ADMIN))
return -EPERM;
if (copy_from_user(&rt, arg, sizeof(rt)))
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index ecd3945..baf9314 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -769,10 +769,10 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin
void __init fib_hash_init(void)
{
fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
- 0, SLAB_PANIC, NULL);
+ 0, SLAB_PANIC | SLAB_UBC, NULL);
fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
- 0, SLAB_PANIC, NULL);
+ 0, SLAB_PANIC | SLAB_UBC, NULL);
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 537731b..5a90bdd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,9 @@
#include <net/tcp_states.h>
#include <net/xfrm.h>
+#include <bc/net.h>
+#include <bc/sock_orphan.h>
+
#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
@@ -165,6 +168,8 @@ have_snum:
goto tb_not_found;
tb_found:
if (!hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse > 1)
+ goto success;
if (tb->fastreuse > 0 &&
sk->sk_reuse && sk->sk_state != TCP_LISTEN &&
smallest_size == -1) {
@@ -618,7 +623,7 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
- percpu_counter_dec(sk->sk_prot->orphan_count);
+ ub_dec_orphan_count(sk);
sock_put(sk);
}
@@ -698,7 +703,7 @@ void inet_csk_listen_stop(struct sock *sk)
sock_orphan(child);
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ ub_inc_orphan_count(sk);
inet_csk_destroy_sock(child);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a706a47..f707cea 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -702,6 +702,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
const struct inet_diag_handler *handler;
struct inet_hashinfo *hashinfo;
+ struct ve_struct *ve = get_exec_env();
handler = inet_diag_lock_handler(cb->nlh->nlmsg_type);
if (IS_ERR(handler))
@@ -727,6 +728,8 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
sk_nulls_for_each(sk, node, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
+ if (!ve_accessible(sk->owner_env, ve))
+ continue;
if (num < s_num) {
num++;
continue;
@@ -793,6 +796,8 @@ skip_listen_ht:
sk_nulls_for_each(sk, node, &head->chain) {
struct inet_sock *inet = inet_sk(sk);
+ if (!ve_accessible(sk->owner_env, ve))
+ continue;
if (num < s_num)
goto next_normal;
if (!(r->idiag_states & (1 << sk->sk_state)))
@@ -817,6 +822,8 @@ next_normal:
inet_twsk_for_each(tw, node,
&head->twchain) {
+ if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve)))
+ continue;
if (num < s_num)
goto next_dying;
if (r->id.idiag_sport != tw->tw_sport &&
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index eaf3e2c..27b45e4 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -19,6 +19,7 @@
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+#include <linux/sched.h>
#include <net/inet_frag.h>
@@ -249,6 +250,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
spin_lock_init(&q->lock);
atomic_set(&q->refcnt, 1);
q->net = nf;
+#ifdef CONFIG_VE
+ q->owner_ve = get_exec_env();
+#endif
return q;
}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 13f0781..374bc0d 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -14,6 +14,8 @@
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
+#include <bc/sock_orphan.h>
+
/* Must be called with locally disabled BHs. */
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
@@ -115,9 +117,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
{
- struct inet_timewait_sock *tw =
- kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
- GFP_ATOMIC);
+ struct user_beancounter *ub;
+ struct inet_timewait_sock *tw;
+
+ ub = set_exec_ub(sock_bc(sk)->ub);
+ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+ GFP_ATOMIC);
+ (void)set_exec_ub(ub);
+
if (tw != NULL) {
const struct inet_sock *inet = inet_sk(sk);
@@ -169,6 +176,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
rescan:
inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
__inet_twsk_del_dead_node(tw);
+ ub_timewait_dec(tw, twdr);
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
#ifdef CONFIG_NET_NS
@@ -269,6 +277,7 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
{
spin_lock(&twdr->death_lock);
if (inet_twsk_del_dead_node(tw)) {
+ ub_timewait_dec(tw, twdr);
inet_twsk_put(tw);
if (--twdr->tw_count == 0)
del_timer(&twdr->tw_timer);
@@ -315,9 +324,10 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
spin_lock(&twdr->death_lock);
/* Unlink it, if it was scheduled */
- if (inet_twsk_del_dead_node(tw))
+ if (inet_twsk_del_dead_node(tw)) {
+ ub_timewait_dec(tw, twdr);
twdr->tw_count--;
- else
+ } else
atomic_inc(&tw->tw_refcnt);
if (slot >= INET_TWDR_RECYCLE_SLOTS) {
@@ -353,6 +363,7 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
hlist_add_head(&tw->tw_death_node, list);
+ ub_timewait_inc(tw, twdr);
if (twdr->tw_count++ == 0)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
spin_unlock(&twdr->death_lock);
@@ -387,6 +398,7 @@ void inet_twdr_twcal_tick(unsigned long data)
&twdr->twcal_row[slot]) {
__inet_twsk_del_dead_node(tw);
__inet_twsk_kill(tw, twdr->hashinfo);
+ ub_timewait_dec(tw, twdr);
#ifdef CONFIG_NET_NS
NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
#endif
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index a2991bc..e9b5916 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -94,6 +94,24 @@ int ip_forward(struct sk_buff *skb)
goto drop;
}
+ /*
+ * We try to optimize forwarding of VE packets:
+ * do not decrement the TTL (and thus avoid skb_cow)
+ * when forwarding outgoing packets from a VE.
+ * For incoming packets we still decrement the TTL,
+ * since such an skb is not cloned and does not require
+ * an actual cow. So there is at least one place in the
+ * packet path with a mandatory TTL decrement, which is
+ * sufficient to prevent routing loops.
+ */
+ iph = ip_hdr(skb);
+ if (
+#ifdef CONFIG_IP_ROUTE_NAT
+ (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */
+#endif /* and */
+ (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */
+ goto no_ttl_decr;
+
/* We are about to mangle packet. Copy it! */
if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
goto drop;
@@ -102,6 +120,8 @@ int ip_forward(struct sk_buff *skb)
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
+no_ttl_decr:
+
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d3fe10b..fa1c137 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -186,10 +186,13 @@ static void ip_evictor(struct net *net)
*/
static void ip_expire(unsigned long arg)
{
+ struct inet_frag_queue *q = (struct inet_frag_queue *)arg;
struct ipq *qp;
struct net *net;
+ struct ve_struct *old_ve;
- qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+ qp = container_of(q, struct ipq, q);
+ old_ve = set_exec_env(q->owner_ve);
net = container_of(qp->q.net, struct net, ipv4.frags);
spin_lock(&qp->q.lock);
@@ -214,6 +217,8 @@ static void ip_expire(unsigned long arg)
out:
spin_unlock(&qp->q.lock);
ipq_put(qp);
+
+ (void)set_exec_env(old_ve);
}
/* Find the correct entry in the "incomplete datagrams" queue for
@@ -525,6 +530,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone->csum = 0;
clone->ip_summed = head->ip_summed;
atomic_add(clone->truesize, &qp->q.net->mem);
+ clone->owner_env = head->owner_env;
}
skb_shinfo(head)->frag_list = head->next;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1433338..6efea1a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -50,6 +50,9 @@
#include <net/ip6_route.h>
#endif
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
/*
Problems & solutions
--------------------
@@ -1202,6 +1205,8 @@ static int ipgre_close(struct net_device *dev)
#endif
+static void ipgre_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx);
static const struct net_device_ops ipgre_netdev_ops = {
.ndo_init = ipgre_tunnel_init,
.ndo_uninit = ipgre_tunnel_uninit,
@@ -1212,6 +1217,7 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_start_xmit = ipgre_tunnel_xmit,
.ndo_do_ioctl = ipgre_tunnel_ioctl,
.ndo_change_mtu = ipgre_tunnel_change_mtu,
+ .ndo_cpt = ipgre_cpt,
};
static void ipgre_tunnel_setup(struct net_device *dev)
@@ -1297,6 +1303,112 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign)
}
}
+static void ipgre_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx)
+{
+ struct cpt_tunnel_image v;
+ struct ip_tunnel *t;
+ struct ipgre_net *ign;
+
+ t = netdev_priv(dev);
+ ign = net_generic(get_exec_env()->ve_netns, ipgre_net_id);
+ BUG_ON(ign == NULL);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ /* mark fb dev */
+ v.cpt_tnl_flags = CPT_TUNNEL_GRE;
+ if (dev == ign->fb_tunnel_dev)
+ v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV;
+
+ v.cpt_i_flags = t->parms.i_flags;
+ v.cpt_o_flags = t->parms.o_flags;
+ v.cpt_i_key = t->parms.i_key;
+ v.cpt_o_key = t->parms.o_key;
+ v.cpt_i_seqno = t->i_seqno;
+ v.cpt_o_seqno = t->o_seqno;
+
+ BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+ memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph));
+
+ ops->write(&v, sizeof(v), ctx);
+}
+
+static int ipgre_rst(loff_t start, struct cpt_netdev_image *di,
+ struct rst_ops *ops, struct cpt_context *ctx)
+{
+ int err = -ENODEV;
+ struct cpt_tunnel_image v;
+ struct net_device *dev;
+ struct ip_tunnel *t;
+ loff_t pos;
+ int fbdev;
+ struct ipgre_net *ign;
+
+ ign = net_generic(get_exec_env()->ve_netns, ipgre_net_id);
+ if (ign == NULL)
+ return -EOPNOTSUPP;
+
+ pos = start + di->cpt_hdrlen;
+ err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL,
+ pos, &v, sizeof(v), ctx);
+ if (err)
+ return err;
+
+ /* some sanity */
+ if (v.cpt_content != CPT_CONTENT_VOID)
+ return -EINVAL;
+
+ if (!(v.cpt_tnl_flags & CPT_TUNNEL_GRE))
+ return 1;
+
+ if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) {
+ fbdev = 1;
+ err = 0;
+ dev = ign->fb_tunnel_dev;
+ } else {
+ fbdev = 0;
+ err = -ENOMEM;
+ dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name,
+ ipgre_tunnel_setup);
+ if (!dev)
+ goto out;
+ }
+
+ t = netdev_priv(dev);
+ t->parms.i_flags = v.cpt_i_flags;
+ t->parms.o_flags = v.cpt_o_flags;
+ t->parms.i_key = v.cpt_i_key;
+ t->parms.o_key = v.cpt_o_key;
+ t->i_seqno = v.cpt_i_seqno;
+ t->o_seqno = v.cpt_o_seqno;
+
+ BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+ memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph));
+
+ if (!fbdev) {
+ ipgre_tunnel_init(dev);
+ err = register_netdevice(dev);
+ if (err) {
+ free_netdev(dev);
+ goto out;
+ }
+
+ dev_hold(dev);
+ ipgre_tunnel_link(ign, t);
+ }
+out:
+ return err;
+}
+
+static struct netdev_rst ipgre_netdev_rst = {
+ .cpt_object = CPT_OBJ_NET_IPIP_TUNNEL,
+ .ndo_rst = ipgre_rst,
+};
+
static int ipgre_init_net(struct net *net)
{
int err;
@@ -1682,6 +1794,7 @@ static int __init ipgre_init(void)
if (err < 0)
goto tap_ops_failed;
+ register_netdev_rst(&ipgre_netdev_rst);
out:
return err;
@@ -1696,6 +1809,7 @@ gen_device_failed:
static void __exit ipgre_fini(void)
{
+ unregister_netdev_rst(&ipgre_netdev_rst);
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 6c98b43..7e2f617 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -193,6 +193,8 @@ static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ if (skb->destructor)
+ skb_orphan(skb);
__skb_pull(skb, ip_hdrlen(skb));
/* Point into the IP datagram, just past the header. */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4d50daa..1f681c7 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1362,12 +1362,13 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
char data[40];
} replyopts;
struct ipcm_cookie ipc;
- __be32 daddr;
+ __be32 saddr, daddr;
struct rtable *rt = skb_rtable(skb);
if (ip_options_echo(&replyopts.opt, skb))
return;
+ saddr = ip_hdr(skb)->daddr;
daddr = ipc.addr = rt->rt_src;
ipc.opt = NULL;
ipc.shtx.flags = 0;
@@ -1383,7 +1384,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
struct flowi fl = { .oif = arg->bound_dev_if,
.nl_u = { .ip4_u =
{ .daddr = daddr,
- .saddr = rt->rt_spec_dst,
+ .saddr = saddr,
.tos = RT_TOS(ip_hdr(skb)->tos) } },
/* Not quite clean, but right. */
.uli_u = { .ports =
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f8d04c2..e70ffc0 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -192,19 +192,20 @@ static int __init ic_open_devs(void)
struct ic_device *d, **last;
struct net_device *dev;
unsigned short oflags;
+ struct net *net = get_exec_env()->ve_netns;
last = &ic_first_dev;
rtnl_lock();
/* bring loopback device up first */
- for_each_netdev(&init_net, dev) {
+ for_each_netdev(net, dev) {
if (!(dev->flags & IFF_LOOPBACK))
continue;
if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
}
- for_each_netdev(&init_net, dev) {
+ for_each_netdev(net, dev) {
if (dev->flags & IFF_LOOPBACK)
continue;
if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
@@ -459,9 +460,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
unsigned char *sha, *tha; /* s for "source", t for "target" */
struct ic_device *d;
- if (!net_eq(dev_net(dev), &init_net))
- goto drop;
-
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
return NET_RX_DROP;
@@ -885,9 +883,6 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
struct ic_device *d;
int len, ext_len;
- if (!net_eq(dev_net(dev), &init_net))
- goto drop;
-
/* Perform verifications before taking the lock. */
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ae40ed1..b763d8a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -106,6 +106,7 @@
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
+#include <linux/vzcalluser.h>
#include <net/sock.h>
#include <net/ip.h>
@@ -116,6 +117,9 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
#define HASH_SIZE 16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
@@ -144,6 +148,9 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
struct ip_tunnel *t;
struct ipip_net *ipn = net_generic(net, ipip_net_id);
+ if (ipn == NULL)
+ return NULL;
+
for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
@@ -686,11 +693,14 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
+static void ipip_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx);
static const struct net_device_ops ipip_netdev_ops = {
.ndo_uninit = ipip_tunnel_uninit,
.ndo_start_xmit = ipip_tunnel_xmit,
.ndo_do_ioctl = ipip_tunnel_ioctl,
.ndo_change_mtu = ipip_tunnel_change_mtu,
+ .ndo_cpt = ipip_cpt,
};
@@ -762,11 +772,116 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn)
}
}
+static void ipip_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx)
+{
+ struct cpt_tunnel_image v;
+ struct ip_tunnel *t;
+ struct ipip_net *ipn;
+
+ t = netdev_priv(dev);
+ ipn = net_generic(get_exec_env()->ve_netns, ipip_net_id);
+ BUG_ON(ipn == NULL);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ /* mark fb dev */
+ v.cpt_tnl_flags = 0;
+ if (dev == ipn->fb_tunnel_dev)
+ v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV;
+
+ v.cpt_i_flags = t->parms.i_flags;
+ v.cpt_o_flags = t->parms.o_flags;
+ v.cpt_i_key = t->parms.i_key;
+ v.cpt_o_key = t->parms.o_key;
+
+ BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+ memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph));
+
+ ops->write(&v, sizeof(v), ctx);
+}
+
+static int ipip_rst(loff_t start, struct cpt_netdev_image *di,
+ struct rst_ops *ops, struct cpt_context *ctx)
+{
+ int err = -ENODEV;
+ struct cpt_tunnel_image v;
+ struct net_device *dev;
+ struct ip_tunnel *t;
+ loff_t pos;
+ int fbdev;
+ struct ipip_net *ipn;
+
+ ipn = net_generic(get_exec_env()->ve_netns, ipip_net_id);
+ if (ipn == NULL)
+ return -EOPNOTSUPP;
+
+ pos = start + di->cpt_hdrlen;
+ err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL,
+ pos, &v, sizeof(v), ctx);
+ if (err)
+ return err;
+
+ /* some sanity */
+ if (v.cpt_content != CPT_CONTENT_VOID)
+ return -EINVAL;
+
+ if (v.cpt_tnl_flags & (~CPT_TUNNEL_FBDEV))
+ return 1;
+
+ if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) {
+ fbdev = 1;
+ err = 0;
+ dev = ipn->fb_tunnel_dev;
+ } else {
+ fbdev = 0;
+ err = -ENOMEM;
+ dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name,
+ ipip_tunnel_setup);
+ if (!dev)
+ goto out;
+ }
+
+ t = netdev_priv(dev);
+ t->parms.i_flags = v.cpt_i_flags;
+ t->parms.o_flags = v.cpt_o_flags;
+ t->parms.i_key = v.cpt_i_key;
+ t->parms.o_key = v.cpt_o_key;
+
+ BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+ memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph));
+
+ if (!fbdev) {
+ ipip_tunnel_init(dev);
+ err = register_netdevice(dev);
+ if (err) {
+ free_netdev(dev);
+ goto out;
+ }
+
+ dev_hold(dev);
+ ipip_tunnel_link(ipn, t);
+ }
+out:
+ return err;
+}
+
+static struct netdev_rst ipip_netdev_rst = {
+ .cpt_object = CPT_OBJ_NET_IPIP_TUNNEL,
+ .ndo_rst = ipip_rst,
+};
+
static int ipip_init_net(struct net *net)
{
int err;
struct ipip_net *ipn;
+ if (!(get_exec_env()->features & VE_FEATURE_IPIP))
+ return 0;
+
err = -ENOMEM;
ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
if (ipn == NULL)
@@ -812,6 +927,9 @@ static void ipip_exit_net(struct net *net)
struct ipip_net *ipn;
ipn = net_generic(net, ipip_net_id);
+ if (ipn == NULL) /* no VE_FEATURE_IPIP */
+ return;
+
rtnl_lock();
ipip_destroy_tunnels(ipn);
unregister_netdevice(ipn->fb_tunnel_dev);
@@ -838,12 +956,15 @@ static int __init ipip_init(void)
err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
if (err)
xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
+ else
+ register_netdev_rst(&ipip_netdev_rst);
return err;
}
static void __exit ipip_fini(void)
{
+ unregister_netdev_rst(&ipip_netdev_rst);
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
printk(KERN_INFO "ipip close: can't deregister tunnel\n");
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c156db2..3d4e78f 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -437,7 +437,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
if (type <= IPQM_BASE)
return;
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
+ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
RCV_SKB_FAIL(-EPERM);
write_lock_bh(&queue_lock);
@@ -467,8 +467,12 @@ __ipq_rcv_skb(struct sk_buff *skb)
static void
ipq_rcv_skb(struct sk_buff *skb)
{
+ struct ve_struct *old_ve;
+
mutex_lock(&ipqnl_mutex);
+ old_ve = set_exec_env(skb->owner_env);
__ipq_rcv_skb(skb);
+ (void)set_exec_env(old_ve);
mutex_unlock(&ipqnl_mutex);
}
@@ -478,9 +482,6 @@ ipq_rcv_dev_event(struct notifier_block *this,
{
struct net_device *dev = ptr;
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
/* Drop any packets associated with the downed device */
if (event == NETDEV_DOWN)
ipq_dev_drop(dev->ifindex);
@@ -500,7 +501,7 @@ ipq_rcv_nl_event(struct notifier_block *this,
if (event == NETLINK_URELEASE &&
n->protocol == NETLINK_FIREWALL && n->pid) {
write_lock_bh(&queue_lock);
- if ((n->net == &init_net) && (n->pid == peer_pid))
+ if (n->pid == peer_pid)
__ipq_reset();
write_unlock_bh(&queue_lock);
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 62aff31..c135650 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -321,6 +321,9 @@ ipt_do_table(struct sk_buff *skb,
struct xt_match_param mtpar;
struct xt_target_param tgpar;
+ if (ve_xt_table_forbidden(table))
+ return NF_ACCEPT;
+
/* Initialization */
ip = ip_hdr(skb);
indev = in ? in->name : nulldevname;
@@ -466,8 +469,8 @@ mark_source_chains(struct xt_table_info *newinfo,
int visited = e->comefrom & (1 << hook);
if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
- printk("iptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ ve_printk(VE_LOG, "iptables: loop hook %u pos "
+ "%u %08X.\n", hook, pos, e->comefrom);
return 0;
}
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
@@ -950,7 +953,7 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = ub_vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1217,7 +1220,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
void *loc_cpu_old_entry;
ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
@@ -1381,7 +1384,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
if (len != size + num_counters * sizeof(struct xt_counters))
return -EINVAL;
- paddc = vmalloc_node(len - size, numa_node_id());
+ paddc = ub_vmalloc_node(len - size, numa_node_id());
if (!paddc)
return -ENOMEM;
@@ -1855,13 +1858,15 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}
+static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int);
+
static int
compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
unsigned int len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1874,8 +1879,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
break;
default:
- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
- ret = -EINVAL;
+ ret = do_ipt_set_ctl(sk, cmd, user, len);
}
return ret;
@@ -1972,7 +1976,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -1994,7 +1998,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2019,7 +2023,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2072,7 +2076,7 @@ struct xt_table *ipt_register_table(struct net *net,
int ret;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ = { 0, 0, 0, 0, { 0 }, { 0 }, { } };
void *loc_cpu_entry;
struct xt_table *new_table;
@@ -2218,11 +2222,22 @@ static struct xt_match icmp_matchstruct __read_mostly = {
static int __net_init ip_tables_net_init(struct net *net)
{
- return xt_proto_init(net, NFPROTO_IPV4);
+ int res;
+
+ if (!net_ipt_permitted(net, VE_IP_IPTABLES))
+ return 0;
+
+ res = xt_proto_init(net, NFPROTO_IPV4);
+ if (!res)
+ net_ipt_module_set(net, VE_IP_IPTABLES);
+ return res;
}
static void __net_exit ip_tables_net_exit(struct net *net)
{
+ if (!net_is_ipt_module_set(net, VE_IP_IPTABLES))
+ return;
+
xt_proto_fini(net, NFPROTO_IPV4);
}
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2e4f98b..a1619a1 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -20,6 +20,7 @@
#include <linux/icmp.h>
#include <linux/if_arp.h>
#include <linux/seq_file.h>
+#include <linux/nsproxy.h>
#include <linux/netfilter_arp.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
@@ -383,7 +384,8 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
return false;
}
- dev = dev_get_by_name(&init_net, e->ip.iniface);
+ dev = dev_get_by_name(get_exec_env()->ve_netns,
+ e->ip.iniface);
if (!dev) {
printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
return false;
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index acc44c6..fbef754 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -47,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
if (ih == NULL) {
- printk("TRUNCATED");
+ ve_printk(VE_LOG, "TRUNCATED");
return;
}
/* Important fields:
* TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- printk("SRC=%pI4 DST=%pI4 ",
+ ve_printk(VE_LOG, "SRC=%pI4 DST=%pI4 ",
&ih->saddr, &ih->daddr);
/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+ ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
/* Max length: 6 "CE DF MF " */
if (ntohs(ih->frag_off) & IP_CE)
- printk("CE ");
+ ve_printk(VE_LOG, "CE ");
if (ntohs(ih->frag_off) & IP_DF)
- printk("DF ");
+ ve_printk(VE_LOG, "DF ");
if (ntohs(ih->frag_off) & IP_MF)
- printk("MF ");
+ ve_printk(VE_LOG, "MF ");
/* Max length: 11 "FRAG:65535 " */
if (ntohs(ih->frag_off) & IP_OFFSET)
- printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+ ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
if ((logflags & IPT_LOG_IPOPT)
&& ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
op = skb_header_pointer(skb, iphoff+sizeof(_iph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ ve_printk(VE_LOG, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ ve_printk(VE_LOG, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ ve_printk(VE_LOG, "%02X", op[i]);
+ ve_printk(VE_LOG, ") ");
}
switch (ih->protocol) {
@@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct tcphdr *th;
/* Max length: 10 "PROTO=TCP " */
- printk("PROTO=TCP ");
+ ve_printk(VE_LOG, "PROTO=TCP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -110,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_tcph), &_tcph);
if (th == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u ",
+ ve_printk(VE_LOG, "SPT=%u DPT=%u ",
ntohs(th->source), ntohs(th->dest));
/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
if (logflags & IPT_LOG_TCPSEQ)
- printk("SEQ=%u ACK=%u ",
+ ve_printk(VE_LOG, "SEQ=%u ACK=%u ",
ntohl(th->seq), ntohl(th->ack_seq));
/* Max length: 13 "WINDOW=65535 " */
- printk("WINDOW=%u ", ntohs(th->window));
+ ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window));
/* Max length: 9 "RES=0x3F " */
- printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+ ve_printk(VE_LOG, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
if (th->cwr)
- printk("CWR ");
+ ve_printk(VE_LOG, "CWR ");
if (th->ece)
- printk("ECE ");
+ ve_printk(VE_LOG, "ECE ");
if (th->urg)
- printk("URG ");
+ ve_printk(VE_LOG, "URG ");
if (th->ack)
- printk("ACK ");
+ ve_printk(VE_LOG, "ACK ");
if (th->psh)
- printk("PSH ");
+ ve_printk(VE_LOG, "PSH ");
if (th->rst)
- printk("RST ");
+ ve_printk(VE_LOG, "RST ");
if (th->syn)
- printk("SYN ");
+ ve_printk(VE_LOG, "SYN ");
if (th->fin)
- printk("FIN ");
+ ve_printk(VE_LOG, "FIN ");
/* Max length: 11 "URGP=65535 " */
- printk("URGP=%u ", ntohs(th->urg_ptr));
+ ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr));
if ((logflags & IPT_LOG_TCPOPT)
&& th->doff * 4 > sizeof(struct tcphdr)) {
@@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
iphoff+ih->ihl*4+sizeof(_tcph),
optsize, _opt);
if (op == NULL) {
- printk("TRUNCATED");
+ ve_printk(VE_LOG, "TRUNCATED");
return;
}
/* Max length: 127 "OPT (" 15*4*2chars ") " */
- printk("OPT (");
+ ve_printk(VE_LOG, "OPT (");
for (i = 0; i < optsize; i++)
- printk("%02X", op[i]);
- printk(") ");
+ ve_printk(VE_LOG, "%02X", op[i]);
+ ve_printk(VE_LOG, ") ");
}
break;
}
@@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
if (ih->protocol == IPPROTO_UDP)
/* Max length: 10 "PROTO=UDP " */
- printk("PROTO=UDP " );
+ ve_printk(VE_LOG, "PROTO=UDP " );
else /* Max length: 14 "PROTO=UDPLITE " */
- printk("PROTO=UDPLITE ");
+ ve_printk(VE_LOG, "PROTO=UDPLITE ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_udph), &_udph);
if (uh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 20 "SPT=65535 DPT=65535 " */
- printk("SPT=%u DPT=%u LEN=%u ",
+ ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ",
ntohs(uh->source), ntohs(uh->dest),
ntohs(uh->len));
break;
@@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
[ICMP_ADDRESSREPLY] = 12 };
/* Max length: 11 "PROTO=ICMP " */
- printk("PROTO=ICMP ");
+ ve_printk(VE_LOG, "PROTO=ICMP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
sizeof(_icmph), &_icmph);
if (ich == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Max length: 18 "TYPE=255 CODE=255 " */
- printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+ ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code);
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
if (ich->type <= NR_ICMP_TYPES
&& required_len[ich->type]
&& skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- printk("INCOMPLETE [%u bytes] ",
+ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
@@ -250,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
case ICMP_ECHOREPLY:
case ICMP_ECHO:
/* Max length: 19 "ID=65535 SEQ=65535 " */
- printk("ID=%u SEQ=%u ",
+ ve_printk(VE_LOG, "ID=%u SEQ=%u ",
ntohs(ich->un.echo.id),
ntohs(ich->un.echo.sequence));
break;
case ICMP_PARAMETERPROB:
/* Max length: 14 "PARAMETER=255 " */
- printk("PARAMETER=%u ",
+ ve_printk(VE_LOG, "PARAMETER=%u ",
ntohl(ich->un.gateway) >> 24);
break;
case ICMP_REDIRECT:
/* Max length: 24 "GATEWAY=255.255.255.255 " */
- printk("GATEWAY=%pI4 ", &ich->un.gateway);
+ ve_printk(VE_LOG, "GATEWAY=%pI4 ", &ich->un.gateway);
/* Fall through */
case ICMP_DEST_UNREACH:
case ICMP_SOURCE_QUENCH:
case ICMP_TIME_EXCEEDED:
/* Max length: 3+maxlen */
if (!iphoff) { /* Only recurse once. */
- printk("[");
+ ve_printk(VE_LOG, "[");
dump_packet(info, skb,
iphoff + ih->ihl*4+sizeof(_icmph));
- printk("] ");
+ ve_printk(VE_LOG, "] ");
}
/* Max length: 10 "MTU=65535 " */
if (ich->type == ICMP_DEST_UNREACH
&& ich->code == ICMP_FRAG_NEEDED)
- printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+ ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu));
}
break;
}
@@ -291,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
break;
/* Max length: 9 "PROTO=AH " */
- printk("PROTO=AH ");
+ ve_printk(VE_LOG, "PROTO=AH ");
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_ahdr), &_ahdr);
if (ah == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(ah->spi));
+ ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi));
break;
}
case IPPROTO_ESP: {
@@ -311,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
const struct ip_esp_hdr *eh;
/* Max length: 10 "PROTO=ESP " */
- printk("PROTO=ESP ");
+ ve_printk(VE_LOG, "PROTO=ESP ");
if (ntohs(ih->frag_off) & IP_OFFSET)
break;
@@ -320,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
sizeof(_esph), &_esph);
if (eh == NULL) {
- printk("INCOMPLETE [%u bytes] ",
+ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
}
/* Length: 15 "SPI=0xF1234567 " */
- printk("SPI=0x%x ", ntohl(eh->spi));
+ ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi));
break;
}
/* Max length: 10 "PROTO 255 " */
default:
- printk("PROTO=%u ", ih->protocol);
+ ve_printk(VE_LOG, "PROTO=%u ", ih->protocol);
}
/* Max length: 15 "UID=4294967295 " */
if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->sk->sk_socket && skb->sk->sk_socket->file)
- printk("UID=%u GID=%u ",
+ ve_printk(VE_LOG, "UID=%u GID=%u ",
skb->sk->sk_socket->file->f_cred->fsuid,
skb->sk->sk_socket->file->f_cred->fsgid);
read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -386,7 +386,7 @@ ipt_log_packet(u_int8_t pf,
loginfo = &default_loginfo;
spin_lock_bh(&log_lock);
- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+ ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
prefix,
in ? in->name : "",
out ? out->name : "");
@@ -397,30 +397,30 @@ ipt_log_packet(u_int8_t pf,
physindev = skb->nf_bridge->physindev;
if (physindev && in != physindev)
- printk("PHYSIN=%s ", physindev->name);
+ ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name);
physoutdev = skb->nf_bridge->physoutdev;
if (physoutdev && out != physoutdev)
- printk("PHYSOUT=%s ", physoutdev->name);
+ ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name);
}
#endif
if (in && !out) {
/* MAC logging for input chain only. */
- printk("MAC=");
+ ve_printk(VE_LOG, "MAC=");
if (skb->dev && skb->dev->hard_header_len
&& skb->mac_header != skb->network_header) {
int i;
const unsigned char *p = skb_mac_header(skb);
for (i = 0; i < skb->dev->hard_header_len; i++,p++)
- printk("%02x%c", *p,
+ ve_printk(VE_LOG, "%02x%c", *p,
i==skb->dev->hard_header_len - 1
? ' ':':');
} else
- printk(" ");
+ ve_printk(VE_LOG, " ");
}
dump_packet(loginfo, skb, 0);
- printk("\n");
+ ve_printk(VE_LOG, "\n");
spin_unlock_bh(&log_lock);
}
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index dada086..00e6d49 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -88,6 +88,7 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC);
}
+#if 0
static int
device_cmp(struct nf_conn *i, void *ifindex)
{
@@ -134,6 +135,7 @@ static struct notifier_block masq_dev_notifier = {
static struct notifier_block masq_inet_notifier = {
.notifier_call = masq_inet_event,
};
+#endif
static struct xt_target masquerade_tg_reg __read_mostly = {
.name = "MASQUERADE",
@@ -152,12 +154,16 @@ static int __init masquerade_tg_init(void)
ret = xt_register_target(&masquerade_tg_reg);
+#if 0
+/* These notifiers are unnecessary and may
+ lead to an oops in virtual environments */
if (ret == 0) {
/* Register for device down reports */
register_netdevice_notifier(&masq_dev_notifier);
/* Register IP address change reports */
register_inetaddr_notifier(&masq_inet_notifier);
}
+#endif
return ret;
}
@@ -165,8 +171,8 @@ static int __init masquerade_tg_init(void)
static void __exit masquerade_tg_exit(void)
{
xt_unregister_target(&masquerade_tg_reg);
- unregister_netdevice_notifier(&masq_dev_notifier);
- unregister_inetaddr_notifier(&masq_inet_notifier);
+/* unregister_netdevice_notifier(&masq_dev_notifier);
+ unregister_inetaddr_notifier(&masq_inet_notifier);*/
}
module_init(masquerade_tg_init);
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 698e5e7..be6125e 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -67,8 +67,13 @@ redirect_tg(struct sk_buff *skb, const struct xt_target_param *par)
rcu_read_lock();
indev = __in_dev_get_rcu(skb->dev);
- if (indev && (ifa = indev->ifa_list))
+ if (indev && (ifa = indev->ifa_list)) {
+ /* because of venet device specific, we should use
+ * second ifa in the list */
+ if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next)
+ ifa = ifa->ifa_next;
newdst = ifa->ifa_local;
+ }
rcu_read_unlock();
if (!newdst)
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index c93ae44..764a19c 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -180,13 +180,13 @@ static bool reject_tg_check(const struct xt_tgchk_param *par)
const struct ipt_entry *e = par->entryinfo;
if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
- printk("ipt_REJECT: ECHOREPLY no longer supported.\n");
+ ve_printk(VE_LOG, "ipt_REJECT: ECHOREPLY no longer supported.\n");
return false;
} else if (rejinfo->with == IPT_TCP_RESET) {
/* Must specify that it's a TCP packet */
if (e->ip.proto != IPPROTO_TCP
|| (e->ip.invflags & XT_INV_PROTO)) {
- printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
+ ve_printk(VE_LOG, "ipt_REJECT: TCP_RESET invalid for non-tcp\n");
return false;
}
}
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index df566cb..b129326 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -128,16 +128,24 @@ module_param(forward, bool, 0000);
static int __net_init iptable_filter_net_init(struct net *net)
{
+ if (!net_ipt_permitted(net, VE_IP_FILTER))
+ return 0;
+
/* Register table */
net->ipv4.iptable_filter =
ipt_register_table(net, &packet_filter, &initial_table.repl);
if (IS_ERR(net->ipv4.iptable_filter))
return PTR_ERR(net->ipv4.iptable_filter);
+
+ net_ipt_module_set(net, VE_IP_FILTER);
return 0;
}
static void __net_exit iptable_filter_net_exit(struct net *net)
{
+ if (!net_is_ipt_module_set(net, VE_IP_FILTER))
+ return;
+
ipt_unregister_table(net->ipv4.iptable_filter);
}
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 036047f..47730f3 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -198,16 +198,24 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = {
static int __net_init iptable_mangle_net_init(struct net *net)
{
+ if (!net_ipt_permitted(net, VE_IP_MANGLE))
+ return 0;
+
/* Register table */
net->ipv4.iptable_mangle =
ipt_register_table(net, &packet_mangler, &initial_table.repl);
if (IS_ERR(net->ipv4.iptable_mangle))
return PTR_ERR(net->ipv4.iptable_mangle);
+
+ net_ipt_module_set(net, VE_IP_MANGLE);
return 0;
}
static void __net_exit iptable_mangle_net_exit(struct net *net)
{
+ if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
+ return;
+
ipt_unregister_table(net->ipv4.iptable_mangle);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 1032a15..8765b1b 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -10,6 +10,7 @@
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
+#include <net/net_namespace.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/icmp.h>
@@ -367,6 +368,30 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
.me = THIS_MODULE,
};
+static int nf_conntrack_l3proto_ipv4_init_net(struct net *net)
+{
+ if (!net_ipt_permitted(net, VE_IP_CONNTRACK))
+ return 0;
+ /*
+ * FIXME:
+ * Need to virtualize per-net sysctls
+ */
+
+ net_ipt_module_set(net, VE_IP_CONNTRACK);
+ return 0;
+}
+
+static void nf_conntrack_l3proto_ipv4_fini_net(struct net *net)
+{
+ if (!net_is_ipt_module_set(net, VE_IP_CONNTRACK))
+ return;
+}
+
+static struct pernet_operations nf_conntrack_ipv4_net_ops = {
+ .init = nf_conntrack_l3proto_ipv4_init_net,
+ .exit = nf_conntrack_l3proto_ipv4_fini_net,
+};
+
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
&nf_conntrack_htable_size, 0600);
@@ -381,6 +406,12 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
need_conntrack();
nf_defrag_ipv4_enable();
+ ret = register_pernet_subsys(&nf_conntrack_ipv4_net_ops);
+ if (ret) {
+ printk(KERN_ERR "nf_conntrack_ipv4: Unable to register pernet operations\n");
+ return ret;
+ }
+
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
printk(KERN_ERR "Unable to register netfilter socket option\n");
@@ -452,6 +483,7 @@ static void __exit nf_conntrack_l3proto_ipv4_fini(void)
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4);
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
nf_unregister_sockopt(&so_getorigdst);
+ unregister_pernet_subsys(&nf_conntrack_ipv4_net_ops);
}
module_init(nf_conntrack_l3proto_ipv4_init);
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 26066a2..2c77ffe 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -275,6 +275,22 @@ out:
rcu_read_unlock();
}
+void nf_nat_hash_conntrack(struct net *net, struct nf_conn *ct)
+{
+ unsigned int srchash;
+ struct nf_conn_nat *nat;
+
+ srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ spin_lock_bh(&nf_nat_lock);
+ /* nf_conntrack_alter_reply might re-allocate extension area */
+ nat = nfct_nat(ct);
+ nat->ct = ct;
+ hlist_add_head_rcu(&nat->bysource,
+ &net->ipv4.nat_bysource[srchash]);
+ spin_unlock_bh(&nf_nat_lock);
+}
+EXPORT_SYMBOL_GPL(nf_nat_hash_conntrack);
+
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range,
@@ -324,18 +340,8 @@ nf_nat_setup_info(struct nf_conn *ct,
}
/* Place in source hash if this is the first time. */
- if (have_to_hash) {
- unsigned int srchash;
-
- srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
- spin_lock_bh(&nf_nat_lock);
- /* nf_conntrack_alter_reply might re-allocate exntension aera */
- nat = nfct_nat(ct);
- nat->ct = ct;
- hlist_add_head_rcu(&nat->bysource,
- &net->ipv4.nat_bysource[srchash]);
- spin_unlock_bh(&nf_nat_lock);
- }
+ if (have_to_hash)
+ nf_nat_hash_conntrack(net, ct);
/* It's done. */
if (maniptype == IP_NAT_MANIP_DST)
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 9e81e0d..e6798d6 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -186,15 +186,24 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
static int __net_init nf_nat_rule_net_init(struct net *net)
{
+ if (!net_ipt_permitted(net, VE_IP_IPTABLE_NAT))
+ return 0;
+
net->ipv4.nat_table = ipt_register_table(net, &nat_table,
&nat_initial_table.repl);
if (IS_ERR(net->ipv4.nat_table))
return PTR_ERR(net->ipv4.nat_table);
+
+ net_ipt_module_set(net, VE_IP_IPTABLE_NAT);
+
return 0;
}
static void __net_exit nf_nat_rule_net_exit(struct net *net)
{
+ if (!net_is_ipt_module_set(net, VE_IP_IPTABLE_NAT))
+ return;
+
ipt_unregister_table(net->ipv4.nat_table);
}
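All of the iptables-related pernet init/exit pairs touched above (ip_tables, iptable_filter, iptable_mangle, nf_conntrack IPv4, nf_nat_rule) follow the same per-container gating pattern: init registers the table only if the container is allowed to use that module and records that it did; exit tears down only what init actually set up. A compact sketch of that guard shape, with placeholder names instead of the real net_ipt_permitted()/net_ipt_module_set() helpers:

#include <stdbool.h>

/* Placeholder bookkeeping, standing in for the per-netns module bits. */
static bool permitted(int module)       { (void)module; return true; }
static bool is_registered(int module)   { (void)module; return true; }
static void mark_registered(int module) { (void)module; }
static int  do_register(int module)     { (void)module; return 0; }
static void do_unregister(int module)   { (void)module; }

/* init: register only when this container is permitted, and remember it. */
static int guarded_init(int module)
{
    int err;

    if (!permitted(module))
        return 0;            /* silently skip the module in this container */

    err = do_register(module);
    if (!err)
        mark_registered(module);
    return err;
}

/* exit: tear down only what init actually set up. */
static void guarded_exit(int module)
{
    if (!is_registered(module))
        return;
    do_unregister(module);
}

Checking a separate "registered" bit on exit, rather than re-checking the permission, presumably keeps teardown correct even if the container's permission mask changes during its lifetime.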
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f25542c..879462c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,7 +54,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
int orphans, sockets;
local_bh_disable();
- orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+ orphans = percpu_counter_sum_positive(&get_exec_ub()->ub_orphan_count);
sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
local_bh_enable();
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5b1050a..db496b6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -69,6 +69,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/nsproxy.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
@@ -115,6 +116,7 @@
#define RT_GC_TIMEOUT (300*HZ)
+int ip_rt_src_check = 1;
static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
@@ -1420,6 +1422,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
rt->u.dst.xfrm = NULL;
#endif
rt->rt_genid = rt_genid(net);
+#ifdef CONFIG_VE
+ rt->fl.owner_env = get_exec_env();
+#endif
rt->rt_flags |= RTCF_REDIRECTED;
/* Gateway is different ... */
@@ -1876,9 +1881,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#ifdef CONFIG_NET_CLS_ROUTE
rth->u.dst.tclassid = itag;
#endif
+#ifdef CONFIG_VE
+ rth->fl.owner_env = get_exec_env();
+#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
- rth->u.dst.dev = init_net.loopback_dev;
+ rth->u.dst.dev = get_exec_env()->ve_netns->loopback_dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->fl.oif = 0;
@@ -2014,6 +2022,9 @@ static int __mkroute_input(struct sk_buff *skb,
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
rth->rt_gateway = daddr;
+#ifdef CONFIG_VE
+ rth->fl.owner_env = get_exec_env();
+#endif
rth->rt_iif =
rth->fl.iif = in_dev->dev->ifindex;
rth->u.dst.dev = (out_dev)->dev;
@@ -2208,6 +2219,9 @@ local_input:
rth->idev = in_dev_get(rth->u.dst.dev);
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
+#ifdef CONFIG_VE
+ rth->fl.owner_env = get_exec_env();
+#endif
rth->u.dst.input= ip_local_deliver;
rth->rt_flags = flags|RTCF_LOCAL;
if (res.type == RTN_UNREACHABLE) {
@@ -2401,6 +2415,9 @@ static int __mkroute_output(struct rtable **result,
rth->fl.mark = oldflp->mark;
rth->rt_dst = fl->fl4_dst;
rth->rt_src = fl->fl4_src;
+#ifdef CONFIG_VE
+ rth->fl.owner_env = get_exec_env();
+#endif
rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
/* get references to the devices that are to be hold by the routing
cache entry */
@@ -2541,7 +2558,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
goto make_route;
}
- if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+ if (!(oldflp->flags & FLOWI_FLAG_ANYSRC) && ip_rt_src_check) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = ip_dev_find(net, oldflp->fl4_src);
if (dev_out == NULL)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2dcf04d..e739067 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -26,6 +26,9 @@ static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
+int sysctl_tcp_use_sg = 1;
+EXPORT_SYMBOL(sysctl_tcp_use_sg);
+
/* Update system visible IP port range */
static void set_local_port_range(int range[2])
{
@@ -796,6 +799,27 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_max_tw_kmem_fraction",
+ .data = &sysctl_tcp_max_tw_kmem_fraction,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_max_tw_buckets_ub",
+ .data = &sysctl_tcp_max_tw_buckets_ub,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tcp_use_sg",
+ .data = &sysctl_tcp_use_sg,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.ctl_name = CTL_UNNUMBERED,
.procname = "rt_cache_rebuild_count",
.data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f1813bc..f2d3769 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -272,6 +272,10 @@
#include <net/netdma.h>
#include <net/sock.h>
+#include <bc/sock_orphan.h>
+#include <bc/net.h>
+#include <bc/tcp.h>
+
#include <asm/uaccess.h>
#include <asm/ioctls.h>
@@ -375,6 +379,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
unsigned int mask;
struct sock *sk = sock->sk;
struct tcp_sock *tp = tcp_sk(sk);
+ int check_send_space;
sock_poll_wait(file, sk->sk_sleep, wait);
if (sk->sk_state == TCP_LISTEN)
@@ -389,6 +394,21 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk->sk_err)
mask = POLLERR;
+ check_send_space = 1;
+#ifdef CONFIG_BEANCOUNTERS
+ if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) {
+ unsigned long size;
+ size = MAX_TCP_HEADER + tp->mss_cache;
+ if (size > SOCK_MIN_UBCSPACE)
+ size = SOCK_MIN_UBCSPACE;
+ size = skb_charge_size(size);
+ if (ub_sock_makewres_tcp(sk, size)) {
+ check_send_space = 0;
+ ub_sock_sndqueueadd_tcp(sk, size);
+ }
+ }
+#endif
+
/*
* POLLHUP is certainly not done right. But poll() doesn't
* have a notion of HUP in just one direction, and for a
@@ -436,7 +456,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->rcv_nxt - tp->copied_seq >= target)
mask |= POLLIN | POLLRDNORM;
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) {
if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
@@ -684,7 +704,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
if (skb) {
- if (sk_wmem_schedule(sk, skb->truesize)) {
+ if (sk_wmem_schedule(sk, skb->truesize, skb)) {
/*
* Make sure that we have exactly size bytes
* available to the caller, no more, no less.
@@ -770,15 +790,23 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
int copy, i, can_coalesce;
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);
+ unsigned long chargesize = 0;
if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
+ chargesize = 0;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
+ chargesize = skb_charge_size(MAX_TCP_HEADER +
+ tp->mss_cache);
+ if (ub_sock_getwres_tcp(sk, chargesize) < 0)
+ goto wait_for_ubspace;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
if (!skb)
goto wait_for_memory;
+ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+ chargesize = 0;
skb_entail(sk, skb);
copy = size_goal;
@@ -793,7 +821,7 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (!sk_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy, skb))
goto wait_for_memory;
if (can_coalesce) {
@@ -834,10 +862,15 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
+ ub_sock_retwres_tcp(sk, chargesize,
+ skb_charge_size(MAX_TCP_HEADER + tp->mss_cache));
+ chargesize = 0;
+wait_for_ubspace:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = __sk_stream_wait_memory(sk, &timeo, chargesize);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -873,12 +906,8 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
return res;
}
-#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
-#define TCP_OFF(sk) (sk->sk_sndmsg_off)
-
-static inline int select_size(struct sock *sk)
+static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
- struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
if (sk->sk_route_caps & NETIF_F_SG) {
@@ -936,6 +965,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
while (--iovlen >= 0) {
int seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
+ unsigned long chargesize = 0;
iov++;
@@ -951,17 +981,27 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
}
if (copy <= 0) {
+ unsigned long size;
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
*/
+ chargesize = 0;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- skb = sk_stream_alloc_skb(sk, select_size(sk),
+ size = select_size(sk, tp);
+ chargesize = skb_charge_size(MAX_TCP_HEADER +
+ size);
+ if (ub_sock_getwres_tcp(sk, chargesize) < 0)
+ goto wait_for_ubspace;
+ skb = sk_stream_alloc_skb(sk, size,
sk->sk_allocation);
if (!skb)
goto wait_for_memory;
+ ub_skb_set_charge(skb, sk, chargesize,
+ UB_TCPSNDBUF);
+ chargesize = 0;
/*
* Check whether we can use HW checksum.
@@ -1008,6 +1048,7 @@ new_segment:
} else if (page) {
if (off == PAGE_SIZE) {
put_page(page);
+ ub_sock_tcp_detachpage(sk);
TCP_PAGE(sk) = page = NULL;
off = 0;
}
@@ -1017,10 +1058,13 @@ new_segment:
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
- if (!sk_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy, skb))
goto wait_for_memory;
if (!page) {
+ chargesize = PAGE_SIZE;
+ if (ub_sock_tcp_chargepage(sk) < 0)
+ goto wait_for_ubspace;
/* Allocate new cache page. */
if (!(page = sk_stream_alloc_page(sk)))
goto wait_for_memory;
@@ -1052,7 +1096,8 @@ new_segment:
} else if (off + copy < PAGE_SIZE) {
get_page(page);
TCP_PAGE(sk) = page;
- }
+ } else
+ ub_sock_tcp_detachpage(sk);
}
TCP_OFF(sk) = off + copy;
@@ -1083,10 +1128,15 @@ new_segment:
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
+ ub_sock_retwres_tcp(sk, chargesize,
+ skb_charge_size(MAX_TCP_HEADER+tp->mss_cache));
+ chargesize = 0;
+wait_for_ubspace:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = __sk_stream_wait_memory(sk, &timeo, chargesize);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1184,8 +1234,10 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
- KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
- tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+ KERN_INFO "cleanup rbuf bug (%d/%s): copied %X seq %X/%X rcvnxt %X\n",
+ VEID(get_exec_env()), current->comm,
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq,
+ TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
#endif
if (inet_csk_ack_scheduled(sk)) {
@@ -1446,8 +1498,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
goto found_ok_skb;
if (tcp_hdr(skb)->fin)
goto found_fin_ok;
- WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: "
+ WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2 (%d/%s): "
"copied %X seq %X rcvnxt %X fl %X\n",
+ VEID(get_exec_env()), current->comm,
*seq, TCP_SKB_CB(skb)->seq,
tp->rcv_nxt, flags);
}
@@ -1510,8 +1563,19 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
tp->ucopy.len = len;
- WARN_ON(tp->copied_seq != tp->rcv_nxt &&
- !(flags & (MSG_PEEK | MSG_TRUNC)));
+ if (WARN_ON(tp->copied_seq != tp->rcv_nxt &&
+ !(flags & (MSG_PEEK | MSG_TRUNC)))) {
+ printk("KERNEL: assertion: tp->copied_seq == "
+ "tp->rcv_nxt || ...\n");
+ printk("VE%u pid %d comm %.16s\n",
+ (get_exec_env() ?
+ VEID(get_exec_env()) : 0),
+ current->pid, current->comm);
+ printk("flags=0x%x, len=%d, copied_seq=%d, "
+ "rcv_nxt=%d\n", flags,
+ (int)len, tp->copied_seq,
+ tp->rcv_nxt);
+ }
/* Ugly... If prequeue is not empty, we have to
* process it before releasing socket, otherwise
@@ -1935,7 +1999,7 @@ adjudge_to_death:
bh_lock_sock(sk);
WARN_ON(sock_owned_by_user(sk));
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ ub_inc_orphan_count(sk);
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -1975,14 +2039,19 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- int orphan_count = percpu_counter_read_positive(
- sk->sk_prot->orphan_count);
+ int orphans = ub_get_orphan_count(sk);
sk_mem_reclaim(sk);
- if (tcp_too_many_orphans(sk, orphan_count)) {
- if (net_ratelimit())
+ if (ub_too_many_orphans(sk, orphans)) {
+ if (net_ratelimit()) {
+ int ubid = 0;
+#ifdef CONFIG_BEANCOUNTERS
+ ubid = sock_has_ubc(sk) ?
+ top_beancounter(sock_bc(sk)->ub)->ub_uid : 0;
+#endif
printk(KERN_INFO "TCP: too many of orphaned "
- "sockets\n");
+ "sockets (%d in CT%d)\n", orphans, ubid);
+ }
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(sock_net(sk),
@@ -2059,6 +2128,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->bytes_acked = 0;
+ tp->advmss = 65535;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
@@ -2886,10 +2956,11 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0);
percpu_counter_init(&tcp_orphan_count, 0);
+ percpu_counter_init(&get_ub0()->ub_orphan_count, 0);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
@@ -2958,6 +3029,11 @@ void __init tcp_init(void)
sysctl_tcp_mem[1] = limit;
sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+ if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096)
+ sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096;
+ if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096)
+ sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096;
+
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
max_share = min(4UL*1024*1024, limit);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2433bcd..0eb9c17 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -72,6 +72,8 @@
#include <asm/unaligned.h>
#include <net/netdma.h>
+#include <bc/tcp.h>
+
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
@@ -307,7 +309,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
- !tcp_memory_pressure) {
+ ub_tcp_rmem_allows_expand(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -377,6 +379,8 @@ static void tcp_init_buffer_space(struct sock *sk)
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_time_stamp;
+
+ ub_tcp_update_maxadvmss(sk);
}
/* 5. Recalculate window clamp after socket hit its memory bounds. */
@@ -389,7 +393,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !tcp_memory_pressure &&
+ !ub_tcp_memory_pressure(sk) &&
atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
@@ -4268,19 +4272,19 @@ static void tcp_ofo_queue(struct sock *sk)
static int tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static inline int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_rmem_schedule(sk, size)) {
+ !sk_rmem_schedule(sk, skb)) {
if (tcp_prune_queue(sk) < 0)
return -1;
- if (!sk_rmem_schedule(sk, size)) {
+ if (!sk_rmem_schedule(sk, skb)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
- if (!sk_rmem_schedule(sk, size))
+ if (!sk_rmem_schedule(sk, skb))
return -1;
}
}
@@ -4332,8 +4336,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb->truesize))
- goto drop;
+ tcp_try_rmem_schedule(sk, skb))
+ goto drop_part;
skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
@@ -4377,6 +4381,12 @@ out_of_window:
drop:
__kfree_skb(skb);
return;
+
+drop_part:
+ if (after(tp->copied_seq, tp->rcv_nxt))
+ tp->rcv_nxt = tp->copied_seq;
+ __kfree_skb(skb);
+ return;
}
/* Out of window. F.e. zero window probe. */
@@ -4403,7 +4413,7 @@ drop:
TCP_ECN_check_ce(tp, skb);
- if (tcp_try_rmem_schedule(sk, skb->truesize))
+ if (tcp_try_rmem_schedule(sk, skb))
goto drop;
/* Disable header prediction. */
@@ -4589,6 +4599,10 @@ restart:
nskb = alloc_skb(copy + header, GFP_ATOMIC);
if (!nskb)
return;
+ if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) {
+ kfree_skb(nskb);
+ return;
+ }
skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
skb_set_network_header(nskb, (skb_network_header(skb) -
@@ -4717,7 +4731,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
- else if (tcp_memory_pressure)
+ else if (ub_tcp_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
@@ -4783,7 +4797,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
return 0;
/* If we are under global TCP memory pressure, do not expand. */
- if (tcp_memory_pressure)
+ if (ub_tcp_memory_pressure(sk))
return 0;
/* If we are under soft global TCP memory pressure, do not expand. */
@@ -5286,6 +5300,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
+ /* It is OK not to try to free memory here.
+ * Do it below on the slow path. Den */
+ if (ub_tcprcvbuf_charge(sk, skb) < 0)
+ goto step5;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b..e141833 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -72,6 +72,8 @@
#include <net/xfrm.h>
#include <net/netdma.h>
+#include <bc/tcp.h>
+
#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
@@ -715,7 +717,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
- tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+ tcptw->tw_rcv_wnd >>
+ (tw->tw_rcv_wscale & TW_WSCALE_MASK),
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
tcp_twsk_md5_key(tcptw),
@@ -1194,6 +1197,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
};
+EXPORT_SYMBOL_GPL(tcp_request_sock_ops);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
@@ -1495,6 +1499,10 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *rsk;
+ struct user_beancounter *ub;
+
+ ub = set_exec_ub(sock_bc(sk)->ub);
+
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
@@ -1513,7 +1521,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
goto reset;
}
TCP_CHECK_TIMER(sk);
- return 0;
+ goto restore_context;
}
if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1529,7 +1537,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
rsk = nsk;
goto reset;
}
- return 0;
+ goto restore_context;
}
}
@@ -1539,6 +1547,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
goto reset;
}
TCP_CHECK_TIMER(sk);
+
+restore_context:
+ (void)set_exec_ub(ub);
return 0;
reset:
@@ -1550,7 +1561,7 @@ discard:
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
- return 0;
+ goto restore_context;
csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
@@ -1812,6 +1823,8 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = 536;
+ tp->advmss = 65535; /* max value */
+
tp->reordering = sysctl_tcp_reordering;
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -1875,6 +1888,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
* If sendmsg cached page exists, toss it.
*/
if (sk->sk_sndmsg_page) {
+ /* queue is empty, uncharge */
+ ub_sock_tcp_detachpage(sk);
__free_page(sk->sk_sndmsg_page);
sk->sk_sndmsg_page = NULL;
}
@@ -2478,6 +2493,93 @@ void __init tcp_v4_init(void)
panic("Failed to create the TCP control socket.\n");
}
+#ifdef CONFIG_VE
+static void tcp_kill_ve_onesk(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* Check the assumed state of the socket. */
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ printk(KERN_WARNING "Killing sk: dead %d, state %d, "
+ "wrseq %u unseq %u, wrqu %d.\n",
+ sock_flag(sk, SOCK_DEAD), sk->sk_state,
+ tp->write_seq, tp->snd_una,
+ !skb_queue_empty(&sk->sk_write_queue));
+ sk->sk_err = ECONNRESET;
+ sk->sk_error_report(sk);
+ }
+
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ switch (sk->sk_state) {
+ case TCP_FIN_WAIT1:
+ case TCP_CLOSING:
+ /* In these 2 states the peer may want us to retransmit
+ * some data and/or FIN. Entering "resetting mode"
+ * instead.
+ */
+ tcp_time_wait(sk, TCP_CLOSE, 0);
+ break;
+ case TCP_FIN_WAIT2:
+ /* For some reason the socket may stay in this state
+ * without turning into a TW bucket. Fix it.
+ */
+ tcp_time_wait(sk, TCP_FIN_WAIT2, 0);
+ break;
+ default:
+ /* Just jump into CLOSED state. */
+ tcp_done(sk);
+ break;
+ }
+}
+
+void tcp_v4_kill_ve_sockets(struct ve_struct *envid)
+{
+ struct inet_ehash_bucket *head;
+ int i, retry;
+
+ /* alive */
+again:
+ retry = 0;
+ local_bh_disable();
+ head = tcp_hashinfo.ehash;
+ for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+ struct sock *sk;
+ struct hlist_nulls_node *node;
+ spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
+more_work:
+ spin_lock(lock);
+ sk_nulls_for_each(sk, node, &head[i].chain) {
+ if (ve_accessible_strict(sk->owner_env, envid)) {
+ sock_hold(sk);
+ spin_unlock(lock);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ retry = 1;
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ break;
+ }
+ /* sk might have disappeared from the hash before
+ * we got the lock */
+ if (sk->sk_state != TCP_CLOSE)
+ tcp_kill_ve_onesk(sk);
+ bh_unlock_sock(sk);
+ sock_put(sk);
+ goto more_work;
+ }
+ }
+ spin_unlock(lock);
+ }
+ local_bh_enable();
+ if (retry) {
+ schedule_timeout_interruptible(HZ);
+ goto again;
+ }
+}
+EXPORT_SYMBOL(tcp_v4_kill_ve_sockets);
+#endif
+
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4c03598..fecf8f0 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -26,6 +26,9 @@
#include <net/inet_common.h>
#include <net/xfrm.h>
+#include <bc/net.h>
+#include <bc/sock_orphan.h>
+
#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
@@ -36,6 +39,11 @@ int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
EXPORT_SYMBOL(sysctl_tcp_syncookies);
int sysctl_tcp_abort_on_overflow __read_mostly;
+int sysctl_tcp_max_tw_kmem_fraction __read_mostly = 384;
+int sysctl_tcp_max_tw_buckets_ub __read_mostly = 16536;
+
+EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction);
+EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub);
struct inet_timewait_death_row tcp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
@@ -51,6 +59,7 @@ struct inet_timewait_death_row tcp_death_row = {
.twcal_hand = -1,
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&tcp_death_row),
+ .ub_managed = 1,
};
EXPORT_SYMBOL_GPL(tcp_death_row);
@@ -280,7 +289,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
recycle_ok = icsk->icsk_af_ops->remember_stamp(sk);
- if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
+ if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets &&
+ ub_timewait_check(sk, &tcp_death_row))
tw = inet_twsk_alloc(sk, state);
if (tw != NULL) {
@@ -293,6 +303,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
+ if (sk->sk_user_data != NULL)
+ tw->tw_rcv_wscale |= TW_WSCALE_SPEC;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
if (tw->tw_family == PF_INET6) {
@@ -327,6 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
} while (0);
#endif
+ tw->tw_owner_env = VEID(sk->owner_env);
/* Linkage updates. */
__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
@@ -347,11 +360,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
TCP_TIMEWAIT_LEN);
inet_twsk_put(tw);
} else {
+ int ubid = 0;
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n");
+#ifdef CONFIG_BEANCOUNTERS
+ if (sock_has_ubc(sk))
+ ubid = top_beancounter(sock_bc(sk)->ub)->ub_uid;
+#endif
+ LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow (CT%d)\n", ubid);
}
tcp_update_metrics(sk);
@@ -392,6 +410,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
struct tcp_sock *newtp;
/* Now setup tcp_sock */
+ newsk->owner_env = sk->owner_env;
+
newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a..723de21 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -39,6 +39,9 @@
#include <linux/compiler.h>
#include <linux/module.h>
+#include <bc/net.h>
+#include <bc/tcp.h>
+
/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;
@@ -353,11 +356,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
TCP_SKB_CB(skb)->end_seq = seq;
}
-static inline int tcp_urg_mode(const struct tcp_sock *tp)
-{
- return tp->snd_una != tp->snd_up;
-}
-
#define OPTION_SACK_ADVERTISE (1 << 0)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
@@ -598,6 +596,13 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
return size;
}
+static int skb_header_size(struct sock *sk, int tcp_hlen)
+{
+ struct ip_options *opt = inet_sk(sk)->opt;
+ return tcp_hlen + sizeof(struct iphdr) +
+ (opt ? opt->optlen : 0) + ETH_HLEN /* For hard header */;
+}
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -622,6 +627,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
__u8 *md5_hash_location;
struct tcphdr *th;
int err;
+ int header_size;
BUG_ON(!skb || !tcp_skb_pcount(skb));
@@ -652,6 +658,20 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
&md5);
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
+ /* Unfortunately, we can get an skb from the outside world here
+ * with insufficient room for the header. It is impossible to
+ * guess this when we queue the skb, so the decision has to be made
+ * here. Den
+ */
+ header_size = skb_header_size(sk, tcp_header_size);
+ if (skb->data - header_size < skb->head) {
+ int delta = header_size - skb_headroom(skb);
+ err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta),
+ 0, GFP_ATOMIC);
+ if (err)
+ return err;
+ }
+
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
@@ -824,15 +844,21 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;
- if (skb_cloned(skb) &&
- skb_is_nonlinear(skb) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
- return -ENOMEM;
+ if (skb_cloned(skb) && skb_is_nonlinear(skb)) {
+ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ return -ENOMEM;
+ ub_skb_uncharge(skb);
+ ub_tcpsndbuf_charge_forced(sk, skb);
+ }
/* Get a new skb... force flag on. */
buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
+ if (ub_tcpsndbuf_charge(sk, buff) < 0) {
+ kfree_skb(buff);
+ return -ENOMEM;
+ }
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
@@ -1299,6 +1325,11 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (unlikely(buff == NULL))
return -ENOMEM;
+ if (ub_tcpsndbuf_charge(sk, buff) < 0) {
+ kfree_skb(buff);
+ return -ENOMEM;
+ }
+
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
@@ -1728,7 +1759,7 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
- if (tcp_memory_pressure)
+ if (ub_tcp_shrink_rcvbuf(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
@@ -2142,6 +2173,7 @@ void tcp_send_fin(struct sock *sk)
break;
yield();
}
+ ub_tcpsndbuf_charge_forced(sk, skb);
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
@@ -2201,6 +2233,10 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
if (nskb == NULL)
return -ENOMEM;
+ if (ub_tcpsndbuf_charge(sk, nskb) < 0) {
+ kfree_skb(nskb);
+ return -ENOMEM;
+ }
tcp_unlink_write_queue(skb, sk);
skb_header_release(nskb);
__tcp_add_write_queue_head(sk, nskb);
@@ -2310,6 +2346,7 @@ static void tcp_connect_init(struct sock *sk)
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ static int once = 0;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -2329,11 +2366,25 @@ static void tcp_connect_init(struct sock *sk)
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
+ if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) {
+ once = 1;
+
+ printk("Oops in connect_init! dst->advmss=%d\n",
+ dst_metric(dst, RTAX_ADVMSS));
+ printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU));
+ printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, "
+ "advmss=%d, user_mss=%d\n",
+ sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss,
+ tp->mss_cache, tp->advmss, tp->rx_opt.user_mss);
+ }
+
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
tp->advmss = dst_metric(dst, RTAX_ADVMSS);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
tp->advmss = tp->rx_opt.user_mss;
+ if (tp->advmss == 0)
+ tp->advmss = 1460;
tcp_initialize_rcv_mss(sk);
@@ -2374,6 +2425,10 @@ int tcp_connect(struct sock *sk)
buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
if (unlikely(buff == NULL))
return -ENOBUFS;
+ if (ub_tcpsndbuf_charge(sk, buff) < 0) {
+ kfree_skb(buff);
+ return -ENOBUFS;
+ }
/* Reserve space for headers. */
skb_reserve(buff, MAX_TCP_HEADER);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index cdb2ca7..78846e4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -20,6 +20,8 @@
#include <linux/module.h>
#include <net/tcp.h>
+#include <bc/sock_orphan.h>
+#include <bc/tcp.h>
int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
@@ -65,7 +67,8 @@ static void tcp_write_err(struct sock *sk)
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
struct tcp_sock *tp = tcp_sk(sk);
- int orphans = percpu_counter_read_positive(&tcp_orphan_count);
+ int orphans = ub_get_orphan_count(sk);
+ int orph = orphans;
/* If peer does not open window for long time, or did not transmit
* anything for long time, penalize it. */
@@ -76,10 +79,16 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
if (sk->sk_err_soft)
orphans <<= 1;
- if (tcp_too_many_orphans(sk, orphans)) {
- if (net_ratelimit())
- printk(KERN_INFO "Out of socket memory\n");
-
+ if (ub_too_many_orphans(sk, orphans)) {
+ if (net_ratelimit()) {
+ int ubid = 0;
+#ifdef CONFIG_BEANCOUNTERS
+ ubid = sock_has_ubc(sk) ?
+ top_beancounter(sock_bc(sk)->ub)->ub_uid : 0;
+#endif
+ printk(KERN_INFO "Orphaned socket dropped "
+ "(%d,%d in CT%d)\n", orph, orphans, ubid);
+ }
/* Catch exceptional cases, when connection requires reset.
* 1. Last segment was sent recently. */
if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
@@ -177,6 +186,9 @@ static void tcp_delack_timer(unsigned long data)
struct sock *sk = (struct sock *)data;
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct ve_struct *ve;
+
+ ve = set_exec_env(sk->owner_env);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
@@ -231,6 +243,8 @@ out:
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
+
+ (void)set_exec_env(ve);
}
static void tcp_probe_timer(struct sock *sk)
@@ -238,10 +252,13 @@ static void tcp_probe_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
+ struct ve_struct *ve;
+
+ ve = set_exec_env(sk->owner_env);
if (tp->packets_out || !tcp_send_head(sk)) {
icsk->icsk_probes_out = 0;
- return;
+ goto out;
}
/* *WARNING* RFC 1122 forbids this
@@ -267,7 +284,7 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = tcp_orphan_retries(sk, alive);
if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes))
- return;
+ goto out;
}
if (icsk->icsk_probes_out > max_probes) {
@@ -276,6 +293,9 @@ static void tcp_probe_timer(struct sock *sk)
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
}
+
+out:
+ (void)set_exec_env(ve);
}
/*
@@ -286,6 +306,9 @@ void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct ve_struct *ve;
+
+ ve = set_exec_env(sk->owner_env);
if (!tp->packets_out)
goto out;
@@ -391,7 +414,8 @@ out_reset_timer:
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
__sk_dst_reset(sk);
-out:;
+out:
+ (void)set_exec_env(ve);
}
static void tcp_write_timer(unsigned long data)
@@ -399,6 +423,9 @@ static void tcp_write_timer(unsigned long data)
struct sock *sk = (struct sock *)data;
struct inet_connection_sock *icsk = inet_csk(sk);
int event;
+ struct ve_struct *ve;
+
+ ve = set_exec_env(sk->owner_env);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
@@ -433,6 +460,8 @@ out:
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
+
+ (void)set_exec_env(ve);
}
/*
@@ -463,6 +492,9 @@ static void tcp_keepalive_timer (unsigned long data)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u32 elapsed;
+ struct ve_struct *ve;
+
+ ve = set_exec_env(sk->owner_env);
/* Only process if socket is not in use. */
bh_lock_sock(sk);
@@ -534,4 +566,5 @@ death:
out:
bh_unlock_sock(sk);
sock_put(sk);
+ (void)set_exec_env(ve);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c322f44..2c1435a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -138,6 +138,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
sk2 != sk &&
(bitmap || sk2->sk_hash == num) &&
(!sk2->sk_reuse || !sk->sk_reuse) &&
+ sk->sk_reuse != 2 &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(*saddr_comp)(sk, sk2)) {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d1f77cc..7fc4efd 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -407,9 +407,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
dev->type == ARPHRD_TUNNEL6 ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_NONE) {
- printk(KERN_INFO
- "%s: Disabled Privacy Extensions\n",
- dev->name);
+ ADBG((KERN_INFO "%s: Disabled Privacy Extensions\n",
+ dev->name));
ndev->cnf.use_tempaddr = -1;
} else {
in6_dev_hold(ndev);
@@ -626,7 +625,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
goto out;
}
- ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
+ ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC_UBC);
if (ifa == NULL) {
ADBG(("ipv6_add_addr: malloc failed\n"));
@@ -2085,7 +2084,7 @@ err_exit:
/*
* Manual configuration of address on an interface
*/
-static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
+int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
__u32 valid_lft)
{
@@ -2157,6 +2156,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
return PTR_ERR(ifp);
}
+EXPORT_SYMBOL_GPL(inet6_addr_add);
static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx,
unsigned int plen)
@@ -2188,7 +2188,8 @@ static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx,
disable IPv6 on this interface.
*/
if (idev->addr_list == NULL)
- addrconf_ifdown(idev->dev, 1);
+ addrconf_ifdown(idev->dev,
+ !(idev->dev->flags & IFF_LOOPBACK));
return 0;
}
}
@@ -2202,7 +2203,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
struct in6_ifreq ireq;
int err;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_VE_NET_ADMIN))
return -EPERM;
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
@@ -2221,7 +2222,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg)
struct in6_ifreq ireq;
int err;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_VE_NET_ADMIN))
return -EPERM;
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
@@ -2731,6 +2732,9 @@ static int addrconf_ifdown(struct net_device *dev, int how)
static void addrconf_rs_timer(unsigned long data)
{
struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+ struct ve_struct *old_env;
+
+ old_env = set_exec_env(ifp->idev->dev->owner_env);
if (ifp->idev->cnf.forwarding)
goto out;
@@ -2765,6 +2769,7 @@ static void addrconf_rs_timer(unsigned long data)
out:
in6_ifa_put(ifp);
+ (void)set_exec_env(old_env);
}
/*
@@ -2801,6 +2806,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
idev->cnf.accept_dad < 1 ||
!(ifp->flags&IFA_F_TENTATIVE) ||
+ dev->owner_env->disable_net ||
ifp->flags & IFA_F_NODAD) {
ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
spin_unlock_bh(&ifp->lock);
@@ -2841,7 +2847,9 @@ static void addrconf_dad_timer(unsigned long data)
struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
struct inet6_dev *idev = ifp->idev;
struct in6_addr mcaddr;
+ struct ve_struct *old_env;
+ old_env = set_exec_env(ifp->idev->dev->owner_env);
read_lock_bh(&idev->lock);
if (idev->dead) {
read_unlock_bh(&idev->lock);
@@ -2872,6 +2880,7 @@ static void addrconf_dad_timer(unsigned long data)
ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any);
out:
in6_ifa_put(ifp);
+ (void)set_exec_env(old_env);
}
static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
@@ -3093,6 +3102,7 @@ static void addrconf_verify(unsigned long foo)
struct inet6_ifaddr *ifp;
unsigned long now, next;
int i;
+ struct ve_struct *old_env;
spin_lock_bh(&addrconf_verify_lock);
now = jiffies;
@@ -3113,6 +3123,8 @@ restart:
if (ifp->flags & IFA_F_PERMANENT)
continue;
+ old_env = set_exec_env(ifp->idev->dev->owner_env);
+
spin_lock(&ifp->lock);
age = (now - ifp->tstamp) / HZ;
@@ -3128,9 +3140,11 @@ restart:
in6_ifa_hold(ifp);
read_unlock(&addrconf_hash_lock);
ipv6_del_addr(ifp);
+ (void)set_exec_env(old_env);
goto restart;
} else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
spin_unlock(&ifp->lock);
+ set_exec_env(old_env);
continue;
} else if (age >= ifp->prefered_lft) {
/* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
@@ -3152,6 +3166,7 @@ restart:
ipv6_ifa_notify(0, ifp);
in6_ifa_put(ifp);
+ (void)set_exec_env(old_env);
goto restart;
}
#ifdef CONFIG_IPV6_PRIVACY
@@ -3173,6 +3188,7 @@ restart:
ipv6_create_tempaddr(ifpub, ifp);
in6_ifa_put(ifpub);
in6_ifa_put(ifp);
+ (void)set_exec_env(old_env);
goto restart;
}
} else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
@@ -3185,6 +3201,7 @@ restart:
next = ifp->tstamp + ifp->prefered_lft * HZ;
spin_unlock(&ifp->lock);
}
+ (void)set_exec_env(old_env);
}
read_unlock(&addrconf_hash_lock);
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e127a32..6a7d270 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -57,6 +57,10 @@
#ifdef CONFIG_IPV6_TUNNEL
#include <net/ip6_tunnel.h>
#endif
+#ifdef CONFIG_IPV6_MIP6
+#include <net/mip6.h>
+#endif
+#include <bc/net.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -157,6 +161,10 @@ lookup_protocol:
goto out_rcu_unlock;
}
+ err = vz_security_protocol_check(answer->protocol);
+ if (err < 0)
+ goto out_rcu_unlock;
+
err = -EPERM;
if (answer->capability > 0 && !capable(answer->capability))
goto out_rcu_unlock;
@@ -174,6 +182,13 @@ lookup_protocol:
if (sk == NULL)
goto out;
+ err = -ENOBUFS;
+ if (ub_sock_charge(sk, PF_INET6, sock->type))
+ goto out_sk_free;
+ /* If the charge was successful, sock_init_data() MUST be called to
+ * set sk->sk_type. Otherwise sk will be uncharged to the wrong resource.
+ */
+
sock_init_data(sock, sk);
err = 0;
@@ -248,6 +263,9 @@ out:
out_rcu_unlock:
rcu_read_unlock();
goto out;
+out_sk_free:
+ sk_free(sk);
+ return err;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0e93ca5..faa5d81 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -176,11 +176,9 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb)
h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
- /*
- * No protection necessary, this is the only list mutatation
- * operation, tables never disappear once they exist.
- */
+ write_lock_bh(&tb->tb6_lock);
hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
+ write_unlock_bh(&tb->tb6_lock);
}
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -1365,10 +1363,14 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
head = &net->ipv6.fib_table_hash[h];
hlist_for_each_entry_rcu(table, node, head, tb6_hlist) {
+ struct ve_struct *old_env;
+
+ old_env = set_exec_env(table->owner_env);
write_lock_bh(&table->tb6_lock);
fib6_clean_tree(net, &table->tb6_root,
func, prune, arg);
write_unlock_bh(&table->tb6_lock);
+ (void)set_exec_env(old_env);
}
}
rcu_read_unlock();
@@ -1488,6 +1490,9 @@ static int fib6_net_init(struct net *net)
if (!net->ipv6.fib6_main_tbl)
goto out_fib_table_hash;
+#ifdef CONFIG_VE
+ net->ipv6.fib6_main_tbl->owner_env = get_exec_env();
+#endif
net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
@@ -1498,6 +1503,10 @@ static int fib6_net_init(struct net *net)
GFP_KERNEL);
if (!net->ipv6.fib6_local_tbl)
goto out_fib6_main_tbl;
+
+#ifdef CONFIG_VE
+ net->ipv6.fib6_local_tbl->owner_env = get_exec_env();
+#endif
net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
@@ -1543,7 +1552,7 @@ int __init fib6_init(void)
fib6_node_kmem = kmem_cache_create("fib6_nodes",
sizeof(struct fib6_node),
- 0, SLAB_HWCACHE_ALIGN,
+ 0, SLAB_HWCACHE_ALIGN|SLAB_UBC,
NULL);
if (!fib6_node_kmem)
goto out;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index cd48801..15e86e6 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -522,6 +522,20 @@ int ip6_forward(struct sk_buff *skb)
return -EMSGSIZE;
}
+ /*
+ * We try to optimize forwarding of VE packets:
+ * do not decrement the TTL (and so save skb_cow)
+ * when forwarding outgoing packets from a VE.
+ * For incoming packets we still decrement the TTL,
+ * since such an skb is not cloned and does not require
+ * an actual cow. So there is at least one place
+ * in the packet path with a mandatory TTL decrement,
+ * which is sufficient to prevent routing loops.
+ */
+ hdr = ipv6_hdr(skb);
+ if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */
+ goto no_ttl_decr;
+
if (skb_cow(skb, dst->dev->hard_header_len)) {
IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
goto drop;
@@ -533,6 +547,7 @@ int ip6_forward(struct sk_buff *skb)
hdr->hop_limit--;
+no_ttl_decr:
IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
ip6_forward_finish);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index f9fcf69..dfea20f 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -243,6 +243,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
return 0;
}
+EXPORT_SYMBOL_GPL(ipv6_sock_mc_join);
/*
* socket leave on multicast group
@@ -2205,15 +2206,18 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma)
static void mld_gq_timer_expire(unsigned long data)
{
struct inet6_dev *idev = (struct inet6_dev *)data;
+ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
idev->mc_gq_running = 0;
mld_send_report(idev, NULL);
__in6_dev_put(idev);
+ set_exec_env(old_env);
}
static void mld_ifc_timer_expire(unsigned long data)
{
struct inet6_dev *idev = (struct inet6_dev *)data;
+ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
mld_send_cr(idev);
if (idev->mc_ifc_count) {
@@ -2222,6 +2226,7 @@ static void mld_ifc_timer_expire(unsigned long data)
mld_ifc_start_timer(idev, idev->mc_maxdelay);
}
__in6_dev_put(idev);
+ set_exec_env(old_env);
}
static void mld_ifc_event(struct inet6_dev *idev)
@@ -2236,6 +2241,7 @@ static void mld_ifc_event(struct inet6_dev *idev)
static void igmp6_timer_handler(unsigned long data)
{
struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+ struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env);
if (MLD_V1_SEEN(ma->idev))
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
@@ -2247,6 +2253,7 @@ static void igmp6_timer_handler(unsigned long data)
ma->mca_flags &= ~MAF_TIMER_RUNNING;
spin_unlock(&ma->mca_lock);
ma_put(ma);
+ set_exec_env(old_env);
}
/* Device changing type */
diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 1cf3f0c..791bc00 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -439,7 +439,7 @@ __ipq_rcv_skb(struct sk_buff *skb)
if (type <= IPQM_BASE)
return;
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
+ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
RCV_SKB_FAIL(-EPERM);
write_lock_bh(&queue_lock);
@@ -469,8 +469,12 @@ __ipq_rcv_skb(struct sk_buff *skb)
static void
ipq_rcv_skb(struct sk_buff *skb)
{
+ struct ve_struct *old_ve;
+
mutex_lock(&ipqnl_mutex);
+ old_ve = set_exec_env(skb->owner_env);
__ipq_rcv_skb(skb);
+ (void)set_exec_env(old_ve);
mutex_unlock(&ipqnl_mutex);
}
@@ -480,9 +484,6 @@ ipq_rcv_dev_event(struct notifier_block *this,
{
struct net_device *dev = ptr;
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
/* Drop any packets associated with the downed device */
if (event == NETDEV_DOWN)
ipq_dev_drop(dev->ifindex);
@@ -502,7 +503,7 @@ ipq_rcv_nl_event(struct notifier_block *this,
if (event == NETLINK_URELEASE &&
n->protocol == NETLINK_IP6_FW && n->pid) {
write_lock_bh(&queue_lock);
- if ((n->net == &init_net) && (n->pid == peer_pid))
+ if (n->pid == peer_pid)
__ipq_reset();
write_unlock_bh(&queue_lock);
}
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 1de56fd..645d172 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -351,6 +351,9 @@ ip6t_do_table(struct sk_buff *skb,
struct xt_match_param mtpar;
struct xt_target_param tgpar;
+ if (ve_xt_table_forbidden(table))
+ return NF_ACCEPT;
+
/* Initialization */
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
@@ -1896,7 +1899,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_VE_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2007,7 +2010,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
int ret;
- if (!capable(CAP_NET_ADMIN))
+ if (!capable(CAP_VE_NET_ADMIN))
return -EPERM;
switch (cmd) {
@@ -2107,7 +2110,7 @@ struct xt_table *ip6t_register_table(struct net *net,
int ret;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ = { 0, 0, 0, 0, { 0 }, { 0 }, { } };
void *loc_cpu_entry;
struct xt_table *new_table;
@@ -2252,11 +2255,22 @@ static struct xt_match icmp6_matchstruct __read_mostly = {
static int __net_init ip6_tables_net_init(struct net *net)
{
- return xt_proto_init(net, NFPROTO_IPV6);
+ int res;
+
+ if (!net_ipt_permitted(net, VE_IP_IPTABLES6))
+ return 0;
+
+ res = xt_proto_init(net, NFPROTO_IPV6);
+ if (!res)
+ net_ipt_module_set(net, VE_IP_IPTABLES6);
+ return res;
}
static void __net_exit ip6_tables_net_exit(struct net *net)
{
+ if (!net_is_ipt_module_set(net, VE_IP_IPTABLES6))
+ return;
+
xt_proto_fini(net, NFPROTO_IPV6);
}
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 6f4383a..6b9dc0b 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -121,16 +121,24 @@ module_param(forward, bool, 0000);
static int __net_init ip6table_filter_net_init(struct net *net)
{
+ if (!net_ipt_permitted(net, VE_IP_FILTER6))
+ return 0;
+
/* Register table */
net->ipv6.ip6table_filter =
ip6t_register_table(net, &packet_filter, &initial_table.repl);
if (IS_ERR(net->ipv6.ip6table_filter))
return PTR_ERR(net->ipv6.ip6table_filter);
+
+ net_ipt_module_set(net, VE_IP_FILTER6);
return 0;
}
static void __net_exit ip6table_filter_net_exit(struct net *net)
{
+ if (!net_is_ipt_module_set(net, VE_IP_FILTER6))
+ return;
+
ip6t_unregister_table(net->ipv6.ip6table_filter);
}
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index 0ad9143..7960d5e 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -172,16 +172,24 @@ static struct nf_hook_ops ip6t_ops[] __read_mostly = {
static int __net_init ip6table_mangle_net_init(struct net *net)
{
+ if (!net_ipt_permitted(net, VE_IP_MANGLE6))
+ return 0;
+
/* Register table */
net->ipv6.ip6table_mangle =
ip6t_register_table(net, &packet_mangler, &initial_table.repl);
if (IS_ERR(net->ipv6.ip6table_mangle))
return PTR_ERR(net->ipv6.ip6table_mangle);
+
+ net_ipt_module_set(net, VE_IP_MANGLE6);
return 0;
}
static void __net_exit ip6table_mangle_net_exit(struct net *net)
{
+ if (!net_is_ipt_module_set(net, VE_IP_MANGLE6))
+ return;
+
ip6t_unregister_table(net->ipv6.ip6table_mangle);
}
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 0956eba..3690afe 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -210,12 +210,13 @@ static unsigned int ipv6_defrag(unsigned int hooknum,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *reasm;
+ struct net *net = out ? dev_net(out) : dev_net(in);
/* Previously seen (loopback)? */
if (skb->nfct)
return NF_ACCEPT;
- reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
+ reasm = nf_ct_frag6_gather(net, skb, nf_ct6_defrag_user(hooknum, skb));
/* queued */
if (reasm == NULL)
return NF_STOLEN;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index bfc8737..3ce6b2c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -118,11 +118,12 @@ static void nf_skb_free(struct sk_buff *skb)
}
/* Memory Tracking Functions. */
-static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work)
+static inline void frag_kfree_skb(struct netns_frags *nf,
+ struct sk_buff *skb, unsigned int *work)
{
if (work)
*work -= skb->truesize;
- atomic_sub(skb->truesize, &nf_init_frags.mem);
+ atomic_sub(skb->truesize, &nf->mem);
nf_skb_free(skb);
kfree_skb(skb);
}
@@ -142,10 +143,10 @@ static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq)
inet_frag_kill(&fq->q, &nf_frags);
}
-static void nf_ct_frag6_evictor(void)
+static void nf_ct_frag6_evictor(struct netns_frags *nf)
{
local_bh_disable();
- inet_frag_evictor(&nf_init_frags, &nf_frags);
+ inet_frag_evictor(nf, &nf_frags);
local_bh_enable();
}
@@ -171,7 +172,7 @@ out:
/* Creation primitives. */
static __inline__ struct nf_ct_frag6_queue *
-fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst)
+fq_find(struct net *net, __be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst)
{
struct inet_frag_queue *q;
struct ip6_create_arg arg;
@@ -185,7 +186,7 @@ fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst)
read_lock_bh(&nf_frags.lock);
hash = inet6_hash_frag(id, src, dst, nf_frags.rnd);
- q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash);
+ q = inet_frag_find(&net->ipv6.ct_frags, &nf_frags, &arg, hash);
local_bh_enable();
if (q == NULL)
goto oom;
@@ -198,7 +199,8 @@ oom:
}
-static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
+static int nf_ct_frag6_queue(struct net *net, struct nf_ct_frag6_queue *fq,
+ struct sk_buff *skb,
const struct frag_hdr *fhdr, int nhoff)
{
struct sk_buff *prev, *next;
@@ -339,7 +341,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
fq->q.fragments = next;
fq->q.meat -= free_it->len;
- frag_kfree_skb(free_it, NULL);
+ frag_kfree_skb(fq->q.net, free_it, NULL);
}
}
@@ -355,7 +357,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
skb->dev = NULL;
fq->q.stamp = skb->tstamp;
fq->q.meat += skb->len;
- atomic_add(skb->truesize, &nf_init_frags.mem);
+ atomic_add(skb->truesize, &net->ipv6.ct_frags.mem);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
@@ -365,7 +367,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
fq->q.last_in |= INET_FRAG_FIRST_IN;
}
write_lock(&nf_frags.lock);
- list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list);
+ list_move_tail(&fq->q.lru_list, &net->ipv6.ct_frags.lru_list);
write_unlock(&nf_frags.lock);
return 0;
@@ -383,7 +385,8 @@ err:
* the last and the first frames arrived and all the bits are here.
*/
static struct sk_buff *
-nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
+nf_ct_frag6_reasm(struct net *net, struct nf_ct_frag6_queue *fq,
+ struct net_device *dev)
{
struct sk_buff *fp, *op, *head = fq->q.fragments;
int payload_len;
@@ -432,7 +435,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
clone->ip_summed = head->ip_summed;
NFCT_FRAG6_CB(clone)->orig = NULL;
- atomic_add(clone->truesize, &nf_init_frags.mem);
+ atomic_add(clone->truesize, &net->ipv6.ct_frags.mem);
}
/* We have to remove fragment header from datagram and to relocate
@@ -446,7 +449,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
skb_shinfo(head)->frag_list = head->next;
skb_reset_transport_header(head);
skb_push(head, head->data - skb_network_header(head));
- atomic_sub(head->truesize, &nf_init_frags.mem);
+ atomic_sub(head->truesize, &net->ipv6.ct_frags.mem);
for (fp=head->next; fp; fp = fp->next) {
head->data_len += fp->len;
@@ -456,7 +459,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
- atomic_sub(fp->truesize, &nf_init_frags.mem);
+ atomic_sub(fp->truesize, &net->ipv6.ct_frags.mem);
}
head->next = NULL;
@@ -563,7 +566,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
return 0;
}
-struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
+struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
struct sk_buff *clone;
struct net_device *dev = skb->dev;
@@ -606,10 +609,11 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
goto ret_orig;
}
- if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh)
- nf_ct_frag6_evictor();
+ if (atomic_read(&net->ipv6.ct_frags.mem) >
+ net->ipv6.ct_frags.high_thresh)
+ nf_ct_frag6_evictor(&net->ipv6.ct_frags);
- fq = fq_find(fhdr->identification, user, &hdr->saddr, &hdr->daddr);
+ fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr);
if (fq == NULL) {
pr_debug("Can't find and can't create new queue\n");
goto ret_orig;
@@ -617,7 +621,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
spin_lock_bh(&fq->q.lock);
- if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
+ if (nf_ct_frag6_queue(net, fq, clone, fhdr, nhoff) < 0) {
spin_unlock_bh(&fq->q.lock);
pr_debug("Can't insert skb to queue\n");
fq_put(fq);
@@ -626,7 +630,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
fq->q.meat == fq->q.len) {
- ret_skb = nf_ct_frag6_reasm(fq, dev);
+ ret_skb = nf_ct_frag6_reasm(net, fq, dev);
if (ret_skb == NULL)
pr_debug("Can't reassemble fragmented packets\n");
}
@@ -661,8 +665,32 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
nf_conntrack_put_reasm(skb);
}
+static int nf_ct_frag6_init_net(struct net *net)
+{
+ struct netns_frags *frags = &net->ipv6.ct_frags;
+
+ frags->timeout = IPV6_FRAG_TIMEOUT;
+ frags->high_thresh = 256 * 1024;
+ frags->low_thresh = 192 * 1024;
+ inet_frags_init_net(frags);
+
+ return 0; /* FIXME : sysctls */
+}
+
+static void nf_ct_frag6_exit_net(struct net *net)
+{
+ inet_frags_exit_net(&net->ipv6.ct_frags, &nf_frags);
+}
+
+static struct pernet_operations nf_ct_frag6_ops = {
+ .init = nf_ct_frag6_init_net,
+ .exit = nf_ct_frag6_exit_net,
+};
+
int nf_ct_frag6_init(void)
{
+ register_pernet_subsys(&nf_ct_frag6_ops);
+
nf_frags.hashfn = nf_hashfn;
nf_frags.constructor = ip6_frag_init;
nf_frags.destructor = NULL;
@@ -671,10 +699,6 @@ int nf_ct_frag6_init(void)
nf_frags.match = ip6_frag_match;
nf_frags.frag_expire = nf_ct_frag6_expire;
nf_frags.secret_interval = 10 * 60 * HZ;
- nf_init_frags.timeout = IPV6_FRAG_TIMEOUT;
- nf_init_frags.high_thresh = 256 * 1024;
- nf_init_frags.low_thresh = 192 * 1024;
- inet_frags_init_net(&nf_init_frags);
inet_frags_init(&nf_frags);
return 0;
@@ -683,7 +707,5 @@ int nf_ct_frag6_init(void)
void nf_ct_frag6_cleanup(void)
{
inet_frags_fini(&nf_frags);
-
- nf_init_frags.low_thresh = 0;
- nf_ct_frag6_evictor();
+ unregister_pernet_subsys(&nf_ct_frag6_ops);
}
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 4d18699..de5e2ec 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -199,8 +199,10 @@ static void ip6_frag_expire(unsigned long data)
struct frag_queue *fq;
struct net_device *dev = NULL;
struct net *net;
+ struct ve_struct *old_ve;
fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
+ old_ve = set_exec_env(fq->q.owner_ve);
spin_lock(&fq->q.lock);
@@ -235,6 +237,8 @@ out:
dev_put(dev);
spin_unlock(&fq->q.lock);
fq_put(fq);
+
+ (void)set_exec_env(old_ve);
}
static __inline__ struct frag_queue *
@@ -515,6 +519,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
clone->csum = 0;
clone->ip_summed = head->ip_summed;
atomic_add(clone->truesize, &fq->q.net->mem);
+ clone->owner_env = head->owner_env;
}
/* We have to remove fragment header from datagram and to relocate
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index dbd19a7..9fb663a 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -32,6 +32,7 @@
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
+#include <linux/vzcalluser.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -53,6 +54,9 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/cpt_image.h>
+#include <linux/cpt_export.h>
+
/*
This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c
@@ -87,6 +91,9 @@ static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net,
struct ip_tunnel *t;
struct sit_net *sitn = net_generic(net, sit_net_id);
+ if (sitn == NULL)
+ return NULL;
+
for (t = sitn->tunnels_r_l[h0^h1]; t; t = t->next) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
@@ -937,11 +944,14 @@ static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
+static void sit_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx);
static const struct net_device_ops ipip6_netdev_ops = {
.ndo_uninit = ipip6_tunnel_uninit,
.ndo_start_xmit = ipip6_tunnel_xmit,
.ndo_do_ioctl = ipip6_tunnel_ioctl,
.ndo_change_mtu = ipip6_tunnel_change_mtu,
+ .ndo_cpt = sit_cpt,
};
static void ipip6_tunnel_setup(struct net_device *dev)
@@ -1011,11 +1021,116 @@ static void sit_destroy_tunnels(struct sit_net *sitn)
}
}
+static void sit_cpt(struct net_device *dev,
+ struct cpt_ops *ops, struct cpt_context *ctx)
+{
+ struct cpt_tunnel_image v;
+ struct ip_tunnel *t;
+ struct sit_net *sitn;
+
+ t = netdev_priv(dev);
+ sitn = net_generic(get_exec_env()->ve_netns, sit_net_id);
+ BUG_ON(sitn == NULL);
+
+ v.cpt_next = CPT_NULL;
+ v.cpt_object = CPT_OBJ_NET_IPIP_TUNNEL;
+ v.cpt_hdrlen = sizeof(v);
+ v.cpt_content = CPT_CONTENT_VOID;
+
+ /* mark fb dev */
+ v.cpt_tnl_flags = CPT_TUNNEL_SIT;
+ if (dev == sitn->fb_tunnel_dev)
+ v.cpt_tnl_flags |= CPT_TUNNEL_FBDEV;
+
+ v.cpt_i_flags = t->parms.i_flags;
+ v.cpt_o_flags = t->parms.o_flags;
+ v.cpt_i_key = t->parms.i_key;
+ v.cpt_o_key = t->parms.o_key;
+
+ BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+ memcpy(&v.cpt_iphdr, &t->parms.iph, sizeof(t->parms.iph));
+
+ ops->write(&v, sizeof(v), ctx);
+}
+
+static int sit_rst(loff_t start, struct cpt_netdev_image *di,
+ struct rst_ops *ops, struct cpt_context *ctx)
+{
+ int err = -ENODEV;
+ struct cpt_tunnel_image v;
+ struct net_device *dev;
+ struct ip_tunnel *t;
+ loff_t pos;
+ int fbdev;
+ struct sit_net *sitn;
+
+ sitn = net_generic(get_exec_env()->ve_netns, sit_net_id);
+ if (sitn == NULL)
+ return -EOPNOTSUPP;
+
+ pos = start + di->cpt_hdrlen;
+ err = ops->get_object(CPT_OBJ_NET_IPIP_TUNNEL,
+ pos, &v, sizeof(v), ctx);
+ if (err)
+ return err;
+
+ /* some sanity */
+ if (v.cpt_content != CPT_CONTENT_VOID)
+ return -EINVAL;
+
+ if (!(v.cpt_tnl_flags & CPT_TUNNEL_SIT))
+ return 1;
+
+ if (v.cpt_tnl_flags & CPT_TUNNEL_FBDEV) {
+ fbdev = 1;
+ err = 0;
+ dev = sitn->fb_tunnel_dev;
+ } else {
+ fbdev = 0;
+ err = -ENOMEM;
+ dev = alloc_netdev(sizeof(struct ip_tunnel), di->cpt_name,
+ ipip6_tunnel_setup);
+ if (!dev)
+ goto out;
+ }
+
+ t = netdev_priv(dev);
+ t->parms.i_flags = v.cpt_i_flags;
+ t->parms.o_flags = v.cpt_o_flags;
+ t->parms.i_key = v.cpt_i_key;
+ t->parms.o_key = v.cpt_o_key;
+
+ BUILD_BUG_ON(sizeof(v.cpt_iphdr) != sizeof(t->parms.iph));
+ memcpy(&t->parms.iph, &v.cpt_iphdr, sizeof(t->parms.iph));
+
+ if (!fbdev) {
+ ipip6_tunnel_init(dev);
+ err = register_netdevice(dev);
+ if (err) {
+ free_netdev(dev);
+ goto out;
+ }
+
+ dev_hold(dev);
+ ipip6_tunnel_link(sitn, t);
+ }
+out:
+ return err;
+}
+
+static struct netdev_rst sit_netdev_rst = {
+ .cpt_object = CPT_OBJ_NET_IPIP_TUNNEL,
+ .ndo_rst = sit_rst,
+};
+
static int sit_init_net(struct net *net)
{
int err;
struct sit_net *sitn;
+ if (!(get_exec_env()->features & VE_FEATURE_SIT))
+ return 0;
+
err = -ENOMEM;
sitn = kzalloc(sizeof(struct sit_net), GFP_KERNEL);
if (sitn == NULL)
@@ -1061,6 +1176,9 @@ static void sit_exit_net(struct net *net)
struct sit_net *sitn;
sitn = net_generic(net, sit_net_id);
+ if (sitn == NULL) /* no VE_FEATURE_SIT */
+ return;
+
rtnl_lock();
sit_destroy_tunnels(sitn);
unregister_netdevice(sitn->fb_tunnel_dev);
@@ -1075,6 +1193,7 @@ static struct pernet_operations sit_net_ops = {
static void __exit sit_cleanup(void)
{
+ unregister_netdev_rst(&sit_netdev_rst);
xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
unregister_pernet_gen_device(sit_net_id, &sit_net_ops);
@@ -1094,6 +1213,8 @@ static int __init sit_init(void)
err = register_pernet_gen_device(&sit_net_id, &sit_net_ops);
if (err < 0)
xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
+ else
+ register_netdev_rst(&sit_netdev_rst);
return err;
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 21d100b..1c534b7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -61,6 +61,8 @@
#include <net/netdma.h>
#include <net/inet_common.h>
+#include <bc/tcp.h>
+
#include <asm/uaccess.h>
#include <linux/proc_fs.h>
@@ -75,7 +77,7 @@ static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-static const struct inet_connection_sock_af_ops ipv6_mapped;
+const struct inet_connection_sock_af_ops ipv6_mapped;
static const struct inet_connection_sock_af_ops ipv6_specific;
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
@@ -892,6 +894,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
.destructor = tcp_v6_reqsk_destructor,
.send_reset = tcp_v6_send_reset
};
+EXPORT_SYMBOL(tcp6_request_sock_ops);
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
@@ -1496,6 +1499,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_sock *tp;
struct sk_buff *opt_skb = NULL;
+ struct user_beancounter *ub;
/* Imagine: socket is IPv6. IPv4 packet arrives,
goes to IPv4 receive handler and backlogged.
@@ -1508,6 +1512,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
+ ub = set_exec_ub(sock_bc(sk)->ub);
+
#ifdef CONFIG_TCP_MD5SIG
if (tcp_v6_inbound_md5_hash (sk, skb))
goto discard;
@@ -1544,7 +1550,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
TCP_CHECK_TIMER(sk);
if (opt_skb)
goto ipv6_pktoptions;
- return 0;
+ goto restore_context;
}
if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1565,7 +1571,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
goto reset;
if (opt_skb)
__kfree_skb(opt_skb);
- return 0;
+ goto restore_context;
}
}
@@ -1575,6 +1581,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
TCP_CHECK_TIMER(sk);
if (opt_skb)
goto ipv6_pktoptions;
+
+restore_context:
+ (void)set_exec_ub(ub);
return 0;
reset:
@@ -1583,7 +1592,7 @@ discard:
if (opt_skb)
__kfree_skb(opt_skb);
kfree_skb(skb);
- return 0;
+ goto restore_context;
csum_err:
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -1614,7 +1623,7 @@ ipv6_pktoptions:
}
kfree_skb(opt_skb);
- return 0;
+ goto restore_context;
}
static int tcp_v6_rcv(struct sk_buff *skb)
@@ -1793,7 +1802,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
* TCP over IPv4 via INET6 API
*/
-static const struct inet_connection_sock_af_ops ipv6_mapped = {
+const struct inet_connection_sock_af_ops ipv6_mapped = {
.queue_xmit = ip_queue_xmit,
.send_check = tcp_v4_send_check,
.rebuild_header = inet_sk_rebuild_header,
@@ -1812,6 +1821,8 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
#endif
};
+EXPORT_SYMBOL_GPL(ipv6_mapped);
+
#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
.md5_lookup = tcp_v4_md5_lookup,
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 5bb3473..30a2739 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -60,6 +60,8 @@ int nf_register_hook(struct nf_hook_ops *reg)
struct nf_hook_ops *elem;
int err;
+ BUG_ON(!ve_is_super(get_exec_env()));
+
err = mutex_lock_interruptible(&nf_hook_mutex);
if (err < 0)
return err;
@@ -75,6 +77,8 @@ EXPORT_SYMBOL(nf_register_hook);
void nf_unregister_hook(struct nf_hook_ops *reg)
{
+ BUG_ON(!ve_is_super(get_exec_env()));
+
mutex_lock(&nf_hook_mutex);
list_del_rcu(&reg->list);
mutex_unlock(&nf_hook_mutex);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 27c30cf..c5e4424 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1070,7 +1070,7 @@ int __init ip_vs_conn_init(void)
/* Allocate ip_vs_conn slab cache */
ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
sizeof(struct ip_vs_conn), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL);
if (!ip_vs_conn_cachep) {
vfree(ip_vs_conn_tab);
return -ENOMEM;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index e177f0d..1b301df 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
+#include <linux/nsproxy.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
@@ -490,7 +491,8 @@ static int set_mcast_if(struct sock *sk, char *ifname)
struct net_device *dev;
struct inet_sock *inet = inet_sk(sk);
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -511,11 +513,12 @@ static int set_mcast_if(struct sock *sk, char *ifname)
*/
static int set_sync_mesg_maxlen(int sync_state)
{
+ struct net *net = get_exec_env()->ve_netns;
struct net_device *dev;
int num;
if (sync_state == IP_VS_STATE_MASTER) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+ if ((dev = __dev_get_by_name(net, ip_vs_master_mcast_ifn)) == NULL)
return -ENODEV;
num = (dev->mtu - sizeof(struct iphdr) -
@@ -526,7 +529,7 @@ static int set_sync_mesg_maxlen(int sync_state)
IP_VS_DBG(7, "setting the maximum length of sync sending "
"message %d.\n", sync_send_mesg_maxlen);
} else if (sync_state == IP_VS_STATE_BACKUP) {
- if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+ if ((dev = __dev_get_by_name(net, ip_vs_backup_mcast_ifn)) == NULL)
return -ENODEV;
sync_recv_mesg_maxlen = dev->mtu -
@@ -554,7 +557,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
memset(&mreq, 0, sizeof(mreq));
memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
+ if (!dev)
return -ENODEV;
if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
return -EINVAL;
@@ -575,7 +579,8 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
__be32 addr;
struct sockaddr_in sin;
- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+ dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname);
+ if (!dev)
return -ENODEV;
addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 1374179..0692fd2 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -45,6 +45,9 @@
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
+#include <net/sock.h>
+#include <bc/sock.h>
+
#define NF_CONNTRACK_VERSION "0.5.0"
int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
@@ -179,6 +182,11 @@ destroy_conntrack(struct nf_conntrack *nfct)
struct nf_conn *ct = (struct nf_conn *)nfct;
struct net *net = nf_ct_net(ct);
struct nf_conntrack_l4proto *l4proto;
+#ifdef CONFIG_VE_IPTABLES
+ struct ve_struct *old_ve;
+
+ old_ve = set_exec_env(ct->ct_net->owner_ve);
+#endif
pr_debug("destroy_conntrack(%p)\n", ct);
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
@@ -215,6 +223,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
nf_conntrack_free(ct);
+#ifdef CONFIG_VE_IPTABLES
+ (void)set_exec_env(old_ve);
+#endif
}
void nf_ct_delete_from_lists(struct nf_conn *ct)
@@ -538,9 +549,11 @@ static noinline int early_drop(struct net *net, unsigned int hash)
struct nf_conn *nf_conntrack_alloc(struct net *net,
const struct nf_conntrack_tuple *orig,
const struct nf_conntrack_tuple *repl,
+ struct user_beancounter *ub,
gfp_t gfp)
{
struct nf_conn *ct;
+ struct user_beancounter *old_ub;
if (unlikely(!nf_conntrack_hash_rnd_initted)) {
get_random_bytes(&nf_conntrack_hash_rnd,
@@ -568,7 +581,9 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
* Do not use kmem_cache_zalloc(), as this cache uses
* SLAB_DESTROY_BY_RCU.
*/
+ old_ub = set_exec_ub(ub);
ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
+ (void)set_exec_ub(old_ub);
if (ct == NULL) {
pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
atomic_dec(&net->ct.count);
@@ -625,13 +640,20 @@ init_conntrack(struct net *net,
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
struct nf_conntrack_expect *exp;
+ struct user_beancounter *ub = NULL;
if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
pr_debug("Can't invert tuple.\n");
return NULL;
}
- ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC);
+#ifdef CONFIG_BEANCOUNTERS
+ if (skb->dev != NULL) /* received skb */
+ ub = netdev_bc(skb->dev)->exec_ub;
+ else if (skb->sk != NULL) /* sent skb */
+ ub = sock_bc(skb->sk)->ub;
+#endif
+ ct = nf_conntrack_alloc(net, tuple, &repl_tuple, ub, GFP_ATOMIC);
if (IS_ERR(ct)) {
pr_debug("Can't allocate conntrack.\n");
return (struct nf_conntrack_tuple_hash *)ct;
@@ -714,6 +736,8 @@ resolve_normal_ct(struct net *net,
/* look for tuple match */
h = nf_conntrack_find_get(net, &tuple);
if (!h) {
+ if (!mask_ipt_allow(get_exec_env()->ipt_mask, VE_NF_CONNTRACK))
+ return NULL;
h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
if (!h)
return NULL;
@@ -1168,12 +1192,12 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
sz = nr_slots * sizeof(struct hlist_nulls_head);
- hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+ hash = (void *)__get_free_pages(GFP_KERNEL_UBC | __GFP_NOWARN | __GFP_ZERO,
get_order(sz));
if (!hash) {
*vmalloced = 1;
printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
- hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+ hash = __vmalloc(sz, GFP_KERNEL_UBC | __GFP_ZERO, PAGE_KERNEL);
}
if (hash && nulls)
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index e73eb04..bb6f3c4 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -305,7 +305,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);
-static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
@@ -329,6 +329,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
atomic_inc(&exp->use);
NF_CT_STAT_INC(net, expect_create);
}
+EXPORT_SYMBOL_GPL(nf_ct_expect_insert);
/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct nf_conn *master,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d521718..c4b213a 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -46,6 +46,10 @@
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <net/sock.h>
+#include <bc/beancounter.h>
+#include <bc/sock.h>
+
MODULE_LICENSE("GPL");
static char __initdata version[] = "0.93";
@@ -1178,13 +1182,14 @@ static struct nf_conn *
ctnetlink_create_conntrack(const struct nlattr * const cda[],
struct nf_conntrack_tuple *otuple,
struct nf_conntrack_tuple *rtuple,
- u8 u3)
+ u8 u3,
+ struct user_beancounter *ub)
{
struct nf_conn *ct;
int err = -EINVAL;
struct nf_conntrack_helper *helper;
- ct = nf_conntrack_alloc(&init_net, otuple, rtuple, GFP_ATOMIC);
+ ct = nf_conntrack_alloc(&init_net, otuple, rtuple, ub, GFP_ATOMIC);
if (IS_ERR(ct))
return ERR_PTR(-ENOMEM);
@@ -1342,9 +1347,14 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
if (nlh->nlmsg_flags & NLM_F_CREATE) {
struct nf_conn *ct;
enum ip_conntrack_events events;
+ struct user_beancounter *ub = NULL;
+#ifdef CONFIG_BEANCOUNTERS
+ if (skb->sk)
+ ub = sock_bc(skb->sk)->ub;
+#endif
ct = ctnetlink_create_conntrack(cda, &otuple,
- &rtuple, u3);
+ &rtuple, u3, ub);
if (IS_ERR(ct)) {
err = PTR_ERR(ct);
goto out_unlock;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 1a84bf6..5d530dc 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -29,6 +29,10 @@
MODULE_LICENSE("GPL");
+int ip_conntrack_disable_ve0 = 0;
+module_param(ip_conntrack_disable_ve0, int, 0440);
+EXPORT_SYMBOL(ip_conntrack_disable_ve0);
+
#ifdef CONFIG_PROC_FS
int
print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index eedc0c1..8756766 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -133,7 +133,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
const struct nfnetlink_subsystem *ss;
int type, err;
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
+ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
return -EPERM;
/* All the messages must at least contain nfgenmsg */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 7a9dec9..2640c21 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -555,9 +555,6 @@ nfqnl_rcv_dev_event(struct notifier_block *this,
{
struct net_device *dev = ptr;
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
/* Drop any packets associated with the downed device */
if (event == NETDEV_DOWN)
nfqnl_dev_drop(dev->ifindex);
@@ -586,8 +583,7 @@ nfqnl_rcv_nl_event(struct notifier_block *this,
struct hlist_head *head = &instance_table[i];
hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) {
- if ((n->net == &init_net) &&
- (n->pid == inst->peer_pid))
+ if (n->pid == inst->peer_pid)
__instance_destroy(inst);
}
}
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f01955c..40f7121 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -24,6 +24,8 @@
#include <linux/mm.h>
#include <net/net_namespace.h>
+#include <bc/kmem.h>
+
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp.h>
@@ -66,6 +68,46 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
[NFPROTO_IPV6] = "ip6",
};
+#ifdef CONFIG_BEANCOUNTERS
+static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info)
+{
+ struct user_beancounter *ub;
+
+ for (ub = mem_ub(info); ub->parent != NULL; ub = ub->parent);
+ return ub;
+}
+
+static void uncharge_xtables(struct xt_table_info *info, unsigned long size)
+{
+ struct user_beancounter *ub;
+
+ ub = xt_table_ub(info);
+ uncharge_beancounter(ub, UB_NUMXTENT, size);
+}
+
+static int recharge_xtables(int check_ub,
+ struct xt_table_info *new, struct xt_table_info *old)
+{
+ struct user_beancounter *ub;
+ long change;
+
+ ub = xt_table_ub(new);
+ BUG_ON(check_ub && ub != xt_table_ub(old));
+
+ change = (long)new->number - (long)old->number;
+ if (change > 0) {
+ if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT))
+ return -ENOMEM;
+ } else if (change < 0)
+ uncharge_beancounter(ub, UB_NUMXTENT, -change);
+
+ return 0;
+}
+#else
+#define recharge_xtables(c, new, old) (0)
+#define uncharge_xtables(info, s) do { } while (0)
+#endif /* CONFIG_BEANCOUNTERS */
+
/* Registration hooks for targets. */
int
xt_register_target(struct xt_target *target)
@@ -364,14 +406,14 @@ int xt_check_match(struct xt_mtchk_param *par,
* ebt_among is exempt from centralized matchsize checking
* because it uses a dynamic-size data set.
*/
- pr_err("%s_tables: %s match: invalid size %Zu != %u\n",
+ ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: invalid size %Zu != %u\n",
xt_prefix[par->family], par->match->name,
XT_ALIGN(par->match->matchsize), size);
return -EINVAL;
}
if (par->match->table != NULL &&
strcmp(par->match->table, par->table) != 0) {
- pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
+ ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: only valid in %s table, not %s\n",
xt_prefix[par->family], par->match->name,
par->match->table, par->table);
return -EINVAL;
@@ -379,7 +421,7 @@ int xt_check_match(struct xt_mtchk_param *par,
if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
char used[64], allow[64];
- pr_err("%s_tables: %s match: used from hooks %s, but only "
+ ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: used from hooks %s, but only "
"valid from %s\n",
xt_prefix[par->family], par->match->name,
textify_hooks(used, sizeof(used), par->hook_mask),
@@ -387,7 +429,7 @@ int xt_check_match(struct xt_mtchk_param *par,
return -EINVAL;
}
if (par->match->proto && (par->match->proto != proto || inv_proto)) {
- pr_err("%s_tables: %s match: only valid for protocol %u\n",
+ ve_printk(VE_LOG, KERN_ERR "%s_tables: %s match: only valid for protocol %u\n",
xt_prefix[par->family], par->match->name,
par->match->proto);
return -EINVAL;
@@ -620,19 +662,19 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
return NULL;
- newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL);
+ newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL_UBC);
if (!newinfo)
return NULL;
- newinfo->size = size;
+ newinfo->alloc_size = newinfo->size = size;
for_each_possible_cpu(cpu) {
if (size <= PAGE_SIZE)
newinfo->entries[cpu] = kmalloc_node(size,
- GFP_KERNEL,
+ GFP_KERNEL_UBC,
cpu_to_node(cpu));
else
- newinfo->entries[cpu] = vmalloc_node(size,
+ newinfo->entries[cpu] = ub_vmalloc_node(size,
cpu_to_node(cpu));
if (newinfo->entries[cpu] == NULL) {
@@ -650,7 +692,7 @@ void xt_free_table_info(struct xt_table_info *info)
int cpu;
for_each_possible_cpu(cpu) {
- if (info->size <= PAGE_SIZE)
+ if (info->alloc_size <= PAGE_SIZE)
kfree(info->entries[cpu]);
else
vfree(info->entries[cpu]);
@@ -721,6 +763,12 @@ xt_replace_table(struct xt_table *table,
return NULL;
}
+ if (recharge_xtables(num_counters != 0, newinfo, private)) {
+ local_bh_enable();
+ *error = -ENOMEM;
+ return NULL;
+ }
+
table->private = newinfo;
newinfo->initial_entries = private->initial_entries;
@@ -798,6 +846,7 @@ void *xt_unregister_table(struct xt_table *table)
list_del(&table->list);
mutex_unlock(&xt[table->af].mutex);
kfree(table);
+ uncharge_xtables(private, private->number);
return private;
}
diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c
index 5934570..d6e5ab4 100644
--- a/net/netfilter/xt_CONNMARK.c
+++ b/net/netfilter/xt_CONNMARK.c
@@ -36,6 +36,45 @@ MODULE_ALIAS("ip6t_CONNMARK");
#include <net/netfilter/nf_conntrack_ecache.h>
static unsigned int
+connmark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
+{
+ const struct xt_connmark_target_info *markinfo = par->targinfo;
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ u_int32_t diff;
+ u_int32_t mark;
+ u_int32_t newmark;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ switch(markinfo->mode) {
+ case XT_CONNMARK_SET:
+ newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
+ if (newmark != ct->mark) {
+ ct->mark = newmark;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+ break;
+ case XT_CONNMARK_SAVE:
+ newmark = (ct->mark & ~markinfo->mask) |
+ (skb->mark & markinfo->mask);
+ if (ct->mark != newmark) {
+ ct->mark = newmark;
+ nf_conntrack_event_cache(IPCT_MARK, ct);
+ }
+ break;
+ case XT_CONNMARK_RESTORE:
+ mark = skb->mark;
+ diff = (ct->mark ^ mark) & markinfo->mask;
+ skb->mark = mark ^ diff;
+ break;
+ }
+ }
+
+ return XT_CONTINUE;
+}
+
+static unsigned int
connmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
{
const struct xt_connmark_tginfo1 *info = par->targinfo;
@@ -73,6 +112,30 @@ connmark_tg(struct sk_buff *skb, const struct xt_target_param *par)
return XT_CONTINUE;
}
+static bool connmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ const struct xt_connmark_target_info *matchinfo = par->targinfo;
+
+ if (matchinfo->mode == XT_CONNMARK_RESTORE) {
+ if (strcmp(par->table, "mangle") != 0) {
+ printk(KERN_WARNING "CONNMARK: restore can only be "
+ "called from \"mangle\" table, not \"%s\"\n",
+ par->table);
+ return false;
+ }
+ }
+ if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+ return false;
+ }
+ if (nf_ct_l3proto_try_module_get(par->family) < 0) {
+ printk(KERN_WARNING "can't load conntrack support for "
+ "proto=%u\n", par->family);
+ return false;
+ }
+ return true;
+}
+
static bool connmark_tg_check(const struct xt_tgchk_param *par)
{
if (nf_ct_l3proto_try_module_get(par->family) < 0) {
@@ -88,25 +151,74 @@ static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
nf_ct_l3proto_module_put(par->family);
}
-static struct xt_target connmark_tg_reg __read_mostly = {
- .name = "CONNMARK",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .checkentry = connmark_tg_check,
- .target = connmark_tg,
- .targetsize = sizeof(struct xt_connmark_tginfo1),
- .destroy = connmark_tg_destroy,
- .me = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_target_info {
+ compat_ulong_t mark, mask;
+ u_int8_t mode;
+ u_int8_t __pad1;
+ u_int16_t __pad2;
+};
+
+static void connmark_tg_compat_from_user_v0(void *dst, void *src)
+{
+ const struct compat_xt_connmark_target_info *cm = src;
+ struct xt_connmark_target_info m = {
+ .mark = cm->mark,
+ .mask = cm->mask,
+ .mode = cm->mode,
+ };
+ memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_tg_compat_to_user_v0(void __user *dst, void *src)
+{
+ const struct xt_connmark_target_info *m = src;
+ struct compat_xt_connmark_target_info cm = {
+ .mark = m->mark,
+ .mask = m->mask,
+ .mode = m->mode,
+ };
+ return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target connmark_tg_reg[] __read_mostly = {
+ {
+ .name = "CONNMARK",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connmark_tg_check_v0,
+ .destroy = connmark_tg_destroy,
+ .target = connmark_tg_v0,
+ .targetsize = sizeof(struct xt_connmark_target_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_xt_connmark_target_info),
+ .compat_from_user = connmark_tg_compat_from_user_v0,
+ .compat_to_user = connmark_tg_compat_to_user_v0,
+#endif
+ .me = THIS_MODULE
+ },
+ {
+ .name = "CONNMARK",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg,
+ .targetsize = sizeof(struct xt_connmark_tginfo1),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
+ },
};
static int __init connmark_tg_init(void)
{
- return xt_register_target(&connmark_tg_reg);
+ return xt_register_targets(connmark_tg_reg,
+ ARRAY_SIZE(connmark_tg_reg));
}
static void __exit connmark_tg_exit(void)
{
- xt_unregister_target(&connmark_tg_reg);
+ xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
}
module_init(connmark_tg_init);
diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c
index 74ce892..72b469b 100644
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -18,6 +18,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_DSCP.h>
+#include <linux/netfilter_ipv4/ipt_TOS.h>
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification");
@@ -65,13 +66,48 @@ static bool dscp_tg_check(const struct xt_tgchk_param *par)
const struct xt_DSCP_info *info = par->targinfo;
if (info->dscp > XT_DSCP_MAX) {
- printk(KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp);
+ ve_printk(VE_LOG, KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp);
return false;
}
return true;
}
static unsigned int
+tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
+{
+ const struct ipt_tos_target_info *info = par->targinfo;
+ struct iphdr *iph = ip_hdr(skb);
+ u_int8_t oldtos;
+
+ if ((iph->tos & IPTOS_TOS_MASK) != info->tos) {
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ return NF_DROP;
+
+ iph = ip_hdr(skb);
+ oldtos = iph->tos;
+ iph->tos = (iph->tos & IPTOS_PREC_MASK) | info->tos;
+ csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+ }
+
+ return XT_CONTINUE;
+}
+
+static bool tos_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ const struct ipt_tos_target_info *info = par->targinfo;
+ const uint8_t tos = info->tos;
+
+ if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT &&
+ tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST &&
+ tos != IPTOS_NORMALSVC) {
+ printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+ return false;
+ }
+
+ return true;
+}
+
+static unsigned int
tos_tg(struct sk_buff *skb, const struct xt_target_param *par)
{
const struct xt_tos_target_info *info = par->targinfo;
@@ -132,6 +168,16 @@ static struct xt_target dscp_tg_reg[] __read_mostly = {
},
{
.name = "TOS",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .table = "mangle",
+ .target = tos_tg_v0,
+ .targetsize = sizeof(struct ipt_tos_target_info),
+ .checkentry = tos_tg_check_v0,
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "TOS",
.revision = 1,
.family = NFPROTO_IPV4,
.table = "mangle",
diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c
index 225f8d1..67574bc 100644
--- a/net/netfilter/xt_MARK.c
+++ b/net/netfilter/xt_MARK.c
@@ -25,6 +25,39 @@ MODULE_ALIAS("ipt_MARK");
MODULE_ALIAS("ip6t_MARK");
static unsigned int
+mark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par)
+{
+ const struct xt_mark_target_info *markinfo = par->targinfo;
+
+ skb->mark = markinfo->mark;
+ return XT_CONTINUE;
+}
+
+static unsigned int
+mark_tg_v1(struct sk_buff *skb, const struct xt_target_param *par)
+{
+ const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+ int mark = 0;
+
+ switch (markinfo->mode) {
+ case XT_MARK_SET:
+ mark = markinfo->mark;
+ break;
+
+ case XT_MARK_AND:
+ mark = skb->mark & markinfo->mark;
+ break;
+
+ case XT_MARK_OR:
+ mark = skb->mark | markinfo->mark;
+ break;
+ }
+
+ skb->mark = mark;
+ return XT_CONTINUE;
+}
+
+static unsigned int
mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
{
const struct xt_mark_tginfo2 *info = par->targinfo;
@@ -33,23 +66,135 @@ mark_tg(struct sk_buff *skb, const struct xt_target_param *par)
return XT_CONTINUE;
}
-static struct xt_target mark_tg_reg __read_mostly = {
- .name = "MARK",
- .revision = 2,
- .family = NFPROTO_UNSPEC,
- .target = mark_tg,
- .targetsize = sizeof(struct xt_mark_tginfo2),
- .me = THIS_MODULE,
+static bool mark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+ const struct xt_mark_target_info *markinfo = par->targinfo;
+
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return false;
+ }
+ return true;
+}
+
+static bool mark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+ const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+
+ if (markinfo->mode != XT_MARK_SET
+ && markinfo->mode != XT_MARK_AND
+ && markinfo->mode != XT_MARK_OR) {
+ printk(KERN_WARNING "MARK: unknown mode %u\n",
+ markinfo->mode);
+ return false;
+ }
+ if (markinfo->mark > 0xffffffff) {
+ printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+ return false;
+ }
+ return true;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_target_info {
+ compat_ulong_t mark;
+};
+
+static void mark_tg_compat_from_user_v0(void *dst, void *src)
+{
+ const struct compat_xt_mark_target_info *cm = src;
+ struct xt_mark_target_info m = {
+ .mark = cm->mark,
+ };
+ memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v0(void __user *dst, void *src)
+{
+ const struct xt_mark_target_info *m = src;
+ struct compat_xt_mark_target_info cm = {
+ .mark = m->mark,
+ };
+ return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+
+struct compat_xt_mark_target_info_v1 {
+ compat_ulong_t mark;
+ u_int8_t mode;
+ u_int8_t __pad1;
+ u_int16_t __pad2;
+};
+
+static void mark_tg_compat_from_user_v1(void *dst, void *src)
+{
+ const struct compat_xt_mark_target_info_v1 *cm = src;
+ struct xt_mark_target_info_v1 m = {
+ .mark = cm->mark,
+ .mode = cm->mode,
+ };
+ memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v1(void __user *dst, void *src)
+{
+ const struct xt_mark_target_info_v1 *m = src;
+ struct compat_xt_mark_target_info_v1 cm = {
+ .mark = m->mark,
+ .mode = m->mode,
+ };
+ return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target mark_tg_reg[] __read_mostly = {
+ {
+ .name = "MARK",
+ .family = NFPROTO_UNSPEC,
+ .revision = 0,
+ .checkentry = mark_tg_check_v0,
+ .target = mark_tg_v0,
+ .targetsize = sizeof(struct xt_mark_target_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_xt_mark_target_info),
+ .compat_from_user = mark_tg_compat_from_user_v0,
+ .compat_to_user = mark_tg_compat_to_user_v0,
+#endif
+ .table = "mangle",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "MARK",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .checkentry = mark_tg_check_v1,
+ .target = mark_tg_v1,
+ .targetsize = sizeof(struct xt_mark_target_info_v1),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_xt_mark_target_info_v1),
+ .compat_from_user = mark_tg_compat_from_user_v1,
+ .compat_to_user = mark_tg_compat_to_user_v1,
+#endif
+ .table = "mangle",
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "MARK",
+ .revision = 2,
+ .family = NFPROTO_UNSPEC,
+ .target = mark_tg,
+ .targetsize = sizeof(struct xt_mark_tginfo2),
+ .me = THIS_MODULE,
+ },
};
static int __init mark_tg_init(void)
{
- return xt_register_target(&mark_tg_reg);
+ return xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
}
static void __exit mark_tg_exit(void)
{
- xt_unregister_target(&mark_tg_reg);
+ xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
}
module_init(mark_tg_init);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index eda64c1..48d49da 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -67,7 +67,7 @@ tcpmss_mangle_packet(struct sk_buff *skb,
badly. --RR */
if (tcplen != tcph->doff*4) {
if (net_ratelimit())
- printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n",
+ ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n",
skb->len);
return -1;
}
@@ -75,14 +75,14 @@ tcpmss_mangle_packet(struct sk_buff *skb,
if (info->mss == XT_TCPMSS_CLAMP_PMTU) {
if (dst_mtu(skb_dst(skb)) <= minlen) {
if (net_ratelimit())
- printk(KERN_ERR "xt_TCPMSS: "
+ ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: "
"unknown or invalid path-MTU (%u)\n",
dst_mtu(skb_dst(skb)));
return -1;
}
if (in_mtu <= minlen) {
if (net_ratelimit())
- printk(KERN_ERR "xt_TCPMSS: unknown or "
+ ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: unknown or "
"invalid path-MTU (%u)\n", in_mtu);
return -1;
}
@@ -246,13 +246,13 @@ static bool tcpmss_tg4_check(const struct xt_tgchk_param *par)
(par->hook_mask & ~((1 << NF_INET_FORWARD) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_POST_ROUTING))) != 0) {
- printk("xt_TCPMSS: path-MTU clamping only supported in "
+ ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in "
"FORWARD, OUTPUT and POSTROUTING hooks\n");
return false;
}
if (IPT_MATCH_ITERATE(e, find_syn_match))
return true;
- printk("xt_TCPMSS: Only works on TCP SYN packets\n");
+ ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n");
return false;
}
@@ -266,13 +266,13 @@ static bool tcpmss_tg6_check(const struct xt_tgchk_param *par)
(par->hook_mask & ~((1 << NF_INET_FORWARD) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_POST_ROUTING))) != 0) {
- printk("xt_TCPMSS: path-MTU clamping only supported in "
+ ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in "
"FORWARD, OUTPUT and POSTROUTING hooks\n");
return false;
}
if (IP6T_MATCH_ITERATE(e, find_syn_match))
return true;
- printk("xt_TCPMSS: Only works on TCP SYN packets\n");
+ ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n");
return false;
}
#endif
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 122aa8b..86cacab 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -47,6 +47,36 @@ connmark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
return ((ct->mark & info->mask) == info->mark) ^ info->invert;
}
+static bool
+connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+ const struct xt_connmark_info *info = par->matchinfo;
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return false;
+
+ return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static bool connmark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+ const struct xt_connmark_info *cm = par->matchinfo;
+
+ if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+ printk(KERN_WARNING "connmark: only support 32bit mark\n");
+ return false;
+ }
+ if (nf_ct_l3proto_try_module_get(par->family) < 0) {
+ printk(KERN_WARNING "can't load conntrack support for "
+ "proto=%u\n", par->family);
+ return false;
+ }
+ return true;
+}
+
static bool connmark_mt_check(const struct xt_mtchk_param *par)
{
if (nf_ct_l3proto_try_module_get(par->family) < 0) {
@@ -62,25 +92,74 @@ static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
nf_ct_l3proto_module_put(par->family);
}
-static struct xt_match connmark_mt_reg __read_mostly = {
- .name = "connmark",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .checkentry = connmark_mt_check,
- .match = connmark_mt,
- .matchsize = sizeof(struct xt_connmark_mtinfo1),
- .destroy = connmark_mt_destroy,
- .me = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_info {
+ compat_ulong_t mark, mask;
+ u_int8_t invert;
+ u_int8_t __pad1;
+ u_int16_t __pad2;
+};
+
+static void connmark_mt_compat_from_user_v0(void *dst, void *src)
+{
+ const struct compat_xt_connmark_info *cm = src;
+ struct xt_connmark_info m = {
+ .mark = cm->mark,
+ .mask = cm->mask,
+ .invert = cm->invert,
+ };
+ memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_mt_compat_to_user_v0(void __user *dst, void *src)
+{
+ const struct xt_connmark_info *m = src;
+ struct compat_xt_connmark_info cm = {
+ .mark = m->mark,
+ .mask = m->mask,
+ .invert = m->invert,
+ };
+ return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match connmark_mt_reg[] __read_mostly = {
+ {
+ .name = "connmark",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connmark_mt_check_v0,
+ .match = connmark_mt_v0,
+ .destroy = connmark_mt_destroy,
+ .matchsize = sizeof(struct xt_connmark_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_xt_connmark_info),
+ .compat_from_user = connmark_mt_compat_from_user_v0,
+ .compat_to_user = connmark_mt_compat_to_user_v0,
+#endif
+ .me = THIS_MODULE
+ },
+ {
+ .name = "connmark",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connmark_mt_check,
+ .match = connmark_mt,
+ .matchsize = sizeof(struct xt_connmark_mtinfo1),
+ .destroy = connmark_mt_destroy,
+ .me = THIS_MODULE,
+ },
};
static int __init connmark_mt_init(void)
{
- return xt_register_match(&connmark_mt_reg);
+ return xt_register_matches(connmark_mt_reg,
+ ARRAY_SIZE(connmark_mt_reg));
}
static void __exit connmark_mt_exit(void)
{
- xt_unregister_match(&connmark_mt_reg);
+ xt_unregister_matches(connmark_mt_reg, ARRAY_SIZE(connmark_mt_reg));
}
module_init(connmark_mt_init);
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index ae66305..30ca13c 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -25,6 +25,95 @@ MODULE_ALIAS("ipt_conntrack");
MODULE_ALIAS("ip6t_conntrack");
static bool
+conntrack_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+ const struct xt_conntrack_info *sinfo = par->matchinfo;
+ const struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned int statebit;
+
+ ct = nf_ct_get(skb, &ctinfo);
+
+#define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & (invflg)))
+
+ if (ct == &nf_conntrack_untracked)
+ statebit = XT_CONNTRACK_STATE_UNTRACKED;
+ else if (ct)
+ statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+ else
+ statebit = XT_CONNTRACK_STATE_INVALID;
+
+ if (sinfo->flags & XT_CONNTRACK_STATE) {
+ if (ct) {
+ if (test_bit(IPS_SRC_NAT_BIT, &ct->status))
+ statebit |= XT_CONNTRACK_STATE_SNAT;
+ if (test_bit(IPS_DST_NAT_BIT, &ct->status))
+ statebit |= XT_CONNTRACK_STATE_DNAT;
+ }
+ if (FWINV((statebit & sinfo->statemask) == 0,
+ XT_CONNTRACK_STATE))
+ return false;
+ }
+
+ if (ct == NULL) {
+ if (sinfo->flags & ~XT_CONNTRACK_STATE)
+ return false;
+ return true;
+ }
+
+ if (sinfo->flags & XT_CONNTRACK_PROTO &&
+ FWINV(nf_ct_protonum(ct) !=
+ sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum,
+ XT_CONNTRACK_PROTO))
+ return false;
+
+ if (sinfo->flags & XT_CONNTRACK_ORIGSRC &&
+ FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip &
+ sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+ sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip,
+ XT_CONNTRACK_ORIGSRC))
+ return false;
+
+ if (sinfo->flags & XT_CONNTRACK_ORIGDST &&
+ FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip &
+ sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+ sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip,
+ XT_CONNTRACK_ORIGDST))
+ return false;
+
+ if (sinfo->flags & XT_CONNTRACK_REPLSRC &&
+ FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip &
+ sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) !=
+ sinfo->tuple[IP_CT_DIR_REPLY].src.ip,
+ XT_CONNTRACK_REPLSRC))
+ return false;
+
+ if (sinfo->flags & XT_CONNTRACK_REPLDST &&
+ FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip &
+ sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) !=
+ sinfo->tuple[IP_CT_DIR_REPLY].dst.ip,
+ XT_CONNTRACK_REPLDST))
+ return false;
+
+ if (sinfo->flags & XT_CONNTRACK_STATUS &&
+ FWINV((ct->status & sinfo->statusmask) == 0,
+ XT_CONNTRACK_STATUS))
+ return false;
+
+ if (sinfo->flags & XT_CONNTRACK_EXPIRES) {
+ unsigned long expires = timer_pending(&ct->timeout) ?
+ (ct->timeout.expires - jiffies)/HZ : 0;
+
+ if (FWINV(!(expires >= sinfo->expires_min &&
+ expires <= sinfo->expires_max),
+ XT_CONNTRACK_EXPIRES))
+ return false;
+ }
+ return true;
+#undef FWINV
+}
+
+static bool
conntrack_addrcmp(const union nf_inet_addr *kaddr,
const union nf_inet_addr *uaddr,
const union nf_inet_addr *umask, unsigned int l3proto)
@@ -112,6 +201,55 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
return true;
}
+#ifdef CONFIG_COMPAT
+struct compat_xt_conntrack_info
+{
+ compat_uint_t statemask;
+ compat_uint_t statusmask;
+ struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX];
+ struct in_addr sipmsk[IP_CT_DIR_MAX];
+ struct in_addr dipmsk[IP_CT_DIR_MAX];
+ compat_ulong_t expires_min;
+ compat_ulong_t expires_max;
+ u_int8_t flags;
+ u_int8_t invflags;
+};
+
+static void conntrack_mt_compat_from_user_v0(void *dst, void *src)
+{
+ const struct compat_xt_conntrack_info *cm = src;
+ struct xt_conntrack_info m = {
+ .statemask = cm->statemask,
+ .statusmask = cm->statusmask,
+ .expires_min = cm->expires_min,
+ .expires_max = cm->expires_max,
+ .flags = cm->flags,
+ .invflags = cm->invflags,
+ };
+ memcpy(m.tuple, cm->tuple, sizeof(m.tuple));
+ memcpy(m.sipmsk, cm->sipmsk, sizeof(m.sipmsk));
+ memcpy(m.dipmsk, cm->dipmsk, sizeof(m.dipmsk));
+ memcpy(dst, &m, sizeof(m));
+}
+
+static int conntrack_mt_compat_to_user_v0(void __user *dst, void *src)
+{
+ const struct xt_conntrack_info *m = src;
+ struct compat_xt_conntrack_info cm = {
+ .statemask = m->statemask,
+ .statusmask = m->statusmask,
+ .expires_min = m->expires_min,
+ .expires_max = m->expires_max,
+ .flags = m->flags,
+ .invflags = m->invflags,
+ };
+ memcpy(cm.tuple, m->tuple, sizeof(cm.tuple));
+ memcpy(cm.sipmsk, m->sipmsk, sizeof(cm.sipmsk));
+ memcpy(cm.dipmsk, m->dipmsk, sizeof(cm.dipmsk));
+ return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif
+
static bool
conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par,
u16 state_mask, u16 status_mask)
@@ -224,6 +362,21 @@ static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
static struct xt_match conntrack_mt_reg[] __read_mostly = {
{
.name = "conntrack",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = conntrack_mt_v0,
+ .checkentry = conntrack_mt_check,
+ .destroy = conntrack_mt_destroy,
+ .matchsize = sizeof(struct xt_conntrack_info),
+ .me = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_xt_conntrack_info),
+ .compat_from_user = conntrack_mt_compat_from_user_v0,
+ .compat_to_user = conntrack_mt_compat_to_user_v0,
+#endif
+ },
+ {
+ .name = "conntrack",
.revision = 1,
.family = NFPROTO_UNSPEC,
.matchsize = sizeof(struct xt_conntrack_mtinfo1),
diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c
index 0280d3a..c3f8085 100644
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -15,6 +15,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_dscp.h>
+#include <linux/netfilter_ipv4/ipt_tos.h>
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("Xtables: DSCP/TOS field match");
@@ -54,6 +55,14 @@ static bool dscp_mt_check(const struct xt_mtchk_param *par)
return true;
}
+static bool
+tos_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+ const struct ipt_tos_info *info = par->matchinfo;
+
+ return (ip_hdr(skb)->tos == info->tos) ^ info->invert;
+}
+
static bool tos_mt(const struct sk_buff *skb, const struct xt_match_param *par)
{
const struct xt_tos_match_info *info = par->matchinfo;
@@ -85,6 +94,14 @@ static struct xt_match dscp_mt_reg[] __read_mostly = {
},
{
.name = "tos",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = tos_mt_v0,
+ .matchsize = sizeof(struct ipt_tos_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "tos",
.revision = 1,
.family = NFPROTO_IPV4,
.match = tos_mt,
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index dd16e40..72e297b 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -15,6 +15,7 @@
#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/nsproxy.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/mm.h>
@@ -41,8 +42,13 @@ MODULE_ALIAS("ipt_hashlimit");
MODULE_ALIAS("ip6t_hashlimit");
/* need to declare this at the top */
+#ifdef CONFIG_VE_IPTABLES
+#define hashlimit_procdir4 (get_exec_env()->_xt_hashlimit->hashlimit_procdir4)
+#define hashlimit_procdir6 (get_exec_env()->_xt_hashlimit->hashlimit_procdir6)
+#else
static struct proc_dir_entry *hashlimit_procdir4;
static struct proc_dir_entry *hashlimit_procdir6;
+#endif
static const struct file_operations dl_file_ops;
/* hash table crap */
@@ -99,9 +105,16 @@ struct xt_hashlimit_htable {
static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */
static DEFINE_MUTEX(hlimit_mutex); /* additional checkentry protection */
+#ifdef CONFIG_VE_IPTABLES
+#define hashlimit_htables (get_exec_env()->_xt_hashlimit->hashlimit_htables)
+#else
static HLIST_HEAD(hashlimit_htables);
+#endif
static struct kmem_cache *hashlimit_cachep __read_mostly;
+static int init_xt_hashlimit(void);
+static void fini_xt_hashlimit(void);
+
static inline bool dst_cmp(const struct dsthash_ent *ent,
const struct dsthash_dst *b)
{
@@ -687,6 +700,9 @@ static bool hashlimit_mt_check_v0(const struct xt_mtchk_param *par)
if (r->name[sizeof(r->name) - 1] != '\0')
return false;
+ if (init_xt_hashlimit())
+ return false;
+
/* This is the best we've got: We cannot release and re-grab lock,
* since checkentry() is called before x_tables.c grabs xt_mutex.
* We also cannot grab the hashtable spinlock, since htable_create will
@@ -728,6 +744,9 @@ static bool hashlimit_mt_check(const struct xt_mtchk_param *par)
return false;
}
+ if (init_xt_hashlimit())
+ return false;
+
/* This is the best we've got: We cannot release and re-grab lock,
* since checkentry() is called before x_tables.c grabs xt_mutex.
* We also cannot grab the hashtable spinlock, since htable_create will
@@ -750,6 +769,8 @@ hashlimit_mt_destroy_v0(const struct xt_mtdtor_param *par)
const struct xt_hashlimit_info *r = par->matchinfo;
htable_put(r->hinfo);
+ if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables))
+ fini_xt_hashlimit();
}
static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
@@ -757,6 +778,8 @@ static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par)
const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
htable_put(info->hinfo);
+ if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables))
+ fini_xt_hashlimit();
}
#ifdef CONFIG_COMPAT
@@ -957,6 +980,78 @@ static const struct file_operations dl_file_ops = {
.release = seq_release
};
+static inline struct proc_dir_entry *proc_from_netns(void)
+{
+#if defined(CONFIG_VE)
+ return get_exec_env()->ve_netns->proc_net;
+#else
+ return init_net.proc_net;
+#endif
+}
+
+static int init_xt_hashlimit(void)
+{
+ struct proc_dir_entry *proc_net = proc_from_netns();
+
+#if defined(CONFIG_VE_IPTABLES)
+ struct ve_struct *ve = get_exec_env();
+
+ if (ve->_xt_hashlimit)
+ return 0;
+
+ ve->_xt_hashlimit = kzalloc(sizeof(struct ve_xt_hashlimit), GFP_KERNEL);
+ if (!ve->_xt_hashlimit)
+ goto err1;
+#endif
+ INIT_HLIST_HEAD(&hashlimit_htables);
+
+ hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net);
+ if (!hashlimit_procdir4) {
+ printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
+ "entry\n");
+ goto err2;
+ }
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+ hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net);
+ if (!hashlimit_procdir6) {
+ printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
+ "entry\n");
+ goto err3;
+ }
+#endif
+
+ return 0;
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+err3:
+ remove_proc_entry("ipt_hashlimit", proc_net);
+#endif
+err2:
+#if defined(CONFIG_VE_IPTABLES)
+ kfree(ve->_xt_hashlimit);
+ ve->_xt_hashlimit = NULL;
+err1:
+#endif
+ return -ENOMEM;
+}
+
+static void fini_xt_hashlimit(void)
+{
+ struct proc_dir_entry *proc_net = proc_from_netns();
+#ifdef CONFIG_VE_IPTABLES
+ struct ve_struct *ve = get_exec_env();
+#endif
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+ remove_proc_entry("ip6t_hashlimit", proc_net);
+#endif
+ remove_proc_entry("ipt_hashlimit", proc_net);
+
+#if defined(CONFIG_VE_IPTABLES)
+ kfree(ve->_xt_hashlimit);
+ ve->_xt_hashlimit = NULL;
+#endif
+}
+
static int __init hashlimit_mt_init(void)
{
int err;
@@ -974,24 +1069,11 @@ static int __init hashlimit_mt_init(void)
printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n");
goto err2;
}
- hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", init_net.proc_net);
- if (!hashlimit_procdir4) {
- printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
- "entry\n");
+ err = init_xt_hashlimit();
+ if (err)
goto err3;
- }
- err = 0;
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
- hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", init_net.proc_net);
- if (!hashlimit_procdir6) {
- printk(KERN_ERR "xt_hashlimit: unable to create proc dir "
- "entry\n");
- err = -ENOMEM;
- }
-#endif
if (!err)
return 0;
- remove_proc_entry("ipt_hashlimit", init_net.proc_net);
err3:
kmem_cache_destroy(hashlimit_cachep);
err2:
@@ -1003,10 +1085,7 @@ err1:
static void __exit hashlimit_mt_exit(void)
{
- remove_proc_entry("ipt_hashlimit", init_net.proc_net);
-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
- remove_proc_entry("ip6t_hashlimit", init_net.proc_net);
-#endif
+ fini_xt_hashlimit();
kmem_cache_destroy(hashlimit_cachep);
xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
}
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index ffc9638..5450cda 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -14,6 +14,40 @@
#include <linux/ipv6.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_iprange.h>
+#include <linux/netfilter_ipv4/ipt_iprange.h>
+
+static bool
+iprange_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+ const struct ipt_iprange_info *info = par->matchinfo;
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (info->flags & IPRANGE_SRC) {
+ if ((ntohl(iph->saddr) < ntohl(info->src.min_ip)
+ || ntohl(iph->saddr) > ntohl(info->src.max_ip))
+ ^ !!(info->flags & IPRANGE_SRC_INV)) {
+ pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
+ &iph->saddr,
+ info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
+ &info->src.min_ip,
+ &info->src.max_ip);
+ return false;
+ }
+ }
+ if (info->flags & IPRANGE_DST) {
+ if ((ntohl(iph->daddr) < ntohl(info->dst.min_ip)
+ || ntohl(iph->daddr) > ntohl(info->dst.max_ip))
+ ^ !!(info->flags & IPRANGE_DST_INV)) {
+ pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n",
+ &iph->daddr,
+ info->flags & IPRANGE_DST_INV ? "(INV) " : "",
+ &info->dst.min_ip,
+ &info->dst.max_ip);
+ return false;
+ }
+ }
+ return true;
+}
static bool
iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par)
@@ -93,6 +127,14 @@ iprange_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
static struct xt_match iprange_mt_reg[] __read_mostly = {
{
.name = "iprange",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = iprange_mt_v0,
+ .matchsize = sizeof(struct ipt_iprange_info),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "iprange",
.revision = 1,
.family = NFPROTO_IPV4,
.match = iprange_mt4,
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 2773be6..847f081 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -105,7 +105,7 @@ static bool limit_mt_check(const struct xt_mtchk_param *par)
/* Check for overflow. */
if (r->burst == 0
|| user2credits(r->avg * r->burst) < user2credits(r->avg)) {
- printk("Overflow in xt_limit, try lower: %u/%u\n",
+ ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n",
r->avg, r->burst);
return false;
}
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c
index 1db07d8..0c17eca 100644
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -23,6 +23,14 @@ MODULE_ALIAS("ipt_mark");
MODULE_ALIAS("ip6t_mark");
static bool
+mark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+ const struct xt_mark_info *info = par->matchinfo;
+
+ return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static bool
mark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
{
const struct xt_mark_mtinfo1 *info = par->matchinfo;
@@ -30,23 +38,81 @@ mark_mt(const struct sk_buff *skb, const struct xt_match_param *par)
return ((skb->mark & info->mask) == info->mark) ^ info->invert;
}
-static struct xt_match mark_mt_reg __read_mostly = {
- .name = "mark",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .match = mark_mt,
- .matchsize = sizeof(struct xt_mark_mtinfo1),
- .me = THIS_MODULE,
+static bool mark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+ const struct xt_mark_info *minfo = par->matchinfo;
+
+ if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+ printk(KERN_WARNING "mark: only supports 32bit mark\n");
+ return false;
+ }
+ return true;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_info {
+ compat_ulong_t mark, mask;
+ u_int8_t invert;
+ u_int8_t __pad1;
+ u_int16_t __pad2;
+};
+
+static void mark_mt_compat_from_user_v0(void *dst, void *src)
+{
+ const struct compat_xt_mark_info *cm = src;
+ struct xt_mark_info m = {
+ .mark = cm->mark,
+ .mask = cm->mask,
+ .invert = cm->invert,
+ };
+ memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_mt_compat_to_user_v0(void __user *dst, void *src)
+{
+ const struct xt_mark_info *m = src;
+ struct compat_xt_mark_info cm = {
+ .mark = m->mark,
+ .mask = m->mask,
+ .invert = m->invert,
+ };
+ return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match mark_mt_reg[] __read_mostly = {
+ {
+ .name = "mark",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = mark_mt_check_v0,
+ .match = mark_mt_v0,
+ .matchsize = sizeof(struct xt_mark_info),
+#ifdef CONFIG_COMPAT
+ .compatsize = sizeof(struct compat_xt_mark_info),
+ .compat_from_user = mark_mt_compat_from_user_v0,
+ .compat_to_user = mark_mt_compat_to_user_v0,
+#endif
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "mark",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .match = mark_mt,
+ .matchsize = sizeof(struct xt_mark_mtinfo1),
+ .me = THIS_MODULE,
+ },
};
static int __init mark_mt_init(void)
{
- return xt_register_match(&mark_mt_reg);
+ return xt_register_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
}
static void __exit mark_mt_exit(void)
{
- xt_unregister_match(&mark_mt_reg);
+ xt_unregister_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
}
module_init(mark_mt_init);
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index d24c76d..79d6a0b 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -16,6 +16,60 @@
#include <net/sock.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_owner.h>
+#include <linux/netfilter_ipv4/ipt_owner.h>
+#include <linux/netfilter_ipv6/ip6t_owner.h>
+
+static bool
+owner_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+ const struct ipt_owner_info *info = par->matchinfo;
+ const struct file *filp;
+
+ if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+ return false;
+
+ filp = skb->sk->sk_socket->file;
+ if (filp == NULL)
+ return false;
+
+ if (info->match & IPT_OWNER_UID)
+ if ((filp->f_cred->fsuid != info->uid) ^
+ !!(info->invert & IPT_OWNER_UID))
+ return false;
+
+ if (info->match & IPT_OWNER_GID)
+ if ((filp->f_cred->fsgid != info->gid) ^
+ !!(info->invert & IPT_OWNER_GID))
+ return false;
+
+ return true;
+}
+
+static bool
+owner_mt6_v0(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+ const struct ip6t_owner_info *info = par->matchinfo;
+ const struct file *filp;
+
+ if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+ return false;
+
+ filp = skb->sk->sk_socket->file;
+ if (filp == NULL)
+ return false;
+
+ if (info->match & IP6T_OWNER_UID)
+ if ((filp->f_cred->fsuid != info->uid) ^
+ !!(info->invert & IP6T_OWNER_UID))
+ return false;
+
+ if (info->match & IP6T_OWNER_GID)
+ if ((filp->f_cred->fsgid != info->gid) ^
+ !!(info->invert & IP6T_OWNER_GID))
+ return false;
+
+ return true;
+}
static bool
owner_mt(const struct sk_buff *skb, const struct xt_match_param *par)
@@ -52,25 +106,76 @@ owner_mt(const struct sk_buff *skb, const struct xt_match_param *par)
return true;
}
-static struct xt_match owner_mt_reg __read_mostly = {
- .name = "owner",
- .revision = 1,
- .family = NFPROTO_UNSPEC,
- .match = owner_mt,
- .matchsize = sizeof(struct xt_owner_match_info),
- .hooks = (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING),
- .me = THIS_MODULE,
+static bool owner_mt_check_v0(const struct xt_mtchk_param *par)
+{
+ const struct ipt_owner_info *info = par->matchinfo;
+
+ if (info->match & (IPT_OWNER_PID | IPT_OWNER_SID | IPT_OWNER_COMM)) {
+ printk(KERN_WARNING KBUILD_MODNAME
+ ": PID, SID and command matching is not "
+ "supported anymore\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool owner_mt6_check_v0(const struct xt_mtchk_param *par)
+{
+ const struct ip6t_owner_info *info = par->matchinfo;
+
+ if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) {
+ printk(KERN_WARNING KBUILD_MODNAME
+ ": PID and SID matching is not supported anymore\n");
+ return false;
+ }
+
+ return true;
+}
+
+static struct xt_match owner_mt_reg[] __read_mostly = {
+ {
+ .name = "owner",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .match = owner_mt_v0,
+ .matchsize = sizeof(struct ipt_owner_info),
+ .checkentry = owner_mt_check_v0,
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "owner",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .match = owner_mt6_v0,
+ .matchsize = sizeof(struct ip6t_owner_info),
+ .checkentry = owner_mt6_check_v0,
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+ },
+ {
+ .name = "owner",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .match = owner_mt,
+ .matchsize = sizeof(struct xt_owner_match_info),
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING),
+ .me = THIS_MODULE,
+ },
};
static int __init owner_mt_init(void)
{
- return xt_register_match(&owner_mt_reg);
+ return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
}
static void __exit owner_mt_exit(void)
{
- xt_unregister_match(&owner_mt_reg);
+ xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
}
module_init(owner_mt_init);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 2f181aa..3499fb2 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -17,6 +17,8 @@
#include <linux/ipv6.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/string.h>
@@ -58,6 +60,9 @@ MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files");
MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/xt_recent/* files");
MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/xt_recent/* files");
+static int init_ipt_recent(struct ve_struct *ve);
+static void fini_ipt_recent(struct ve_struct *ve);
+
struct recent_entry {
struct list_head list;
struct list_head lru_list;
@@ -78,15 +83,27 @@ struct recent_table {
struct list_head iphash[0];
};
+#if defined(CONFIG_VE_IPTABLES)
+#define tables (get_exec_env()->_ipt_recent->tables)
+#else
static LIST_HEAD(tables);
+#endif
static DEFINE_SPINLOCK(recent_lock);
static DEFINE_MUTEX(recent_mutex);
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
+#if defined(CONFIG_VE_IPTABLES)
+#define proc_old_dir (get_exec_env()->_ipt_recent->proc_old_dir)
+#else
static struct proc_dir_entry *proc_old_dir;
#endif
+#endif
+#if defined(CONFIG_VE_IPTABLES)
+#define recent_proc_dir (get_exec_env()->_ipt_recent->proc_dir)
+#else
static struct proc_dir_entry *recent_proc_dir;
+#endif
static const struct file_operations recent_old_fops, recent_mt_fops;
#endif
@@ -300,6 +317,9 @@ static bool recent_mt_check(const struct xt_mtchk_param *par)
strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN)
return false;
+ if (init_ipt_recent(get_exec_env()))
+ return false;
+
mutex_lock(&recent_mutex);
t = recent_table_lookup(info->name);
if (t != NULL) {
@@ -351,6 +371,13 @@ static void recent_mt_destroy(const struct xt_mtdtor_param *par)
{
const struct xt_recent_mtinfo *info = par->matchinfo;
struct recent_table *t;
+ struct ve_struct *ve;
+
+ ve = get_exec_env();
+#ifdef CONFIG_VE_IPTABLES
+ if (!ve->_ipt_recent)
+ return;
+#endif
mutex_lock(&recent_mutex);
t = recent_table_lookup(info->name);
@@ -368,6 +395,8 @@ static void recent_mt_destroy(const struct xt_mtdtor_param *par)
kfree(t);
}
mutex_unlock(&recent_mutex);
+ if (!ve_is_super(ve) && list_empty(&tables))
+ fini_ipt_recent(ve);
}
#ifdef CONFIG_PROC_FS
@@ -637,19 +666,26 @@ static struct xt_match recent_mt_reg[] __read_mostly = {
},
};
-static int __init recent_mt_init(void)
+static int init_ipt_recent(struct ve_struct *ve)
{
- int err;
+ int err = 0;
- if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
- return -EINVAL;
- ip_list_hash_size = 1 << fls(ip_list_tot);
+#ifdef CONFIG_VE_IPTABLES
+ if (ve->_ipt_recent)
+ return 0;
- err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+ ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL);
+ if (!ve->_ipt_recent) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&tables);
+#endif
#ifdef CONFIG_PROC_FS
if (err)
return err;
- recent_proc_dir = proc_mkdir("xt_recent", init_net.proc_net);
+ recent_proc_dir = proc_mkdir("xt_recent", ve->ve_netns->proc_net);
if (recent_proc_dir == NULL) {
xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
err = -ENOMEM;
@@ -657,7 +693,7 @@ static int __init recent_mt_init(void)
#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
if (err < 0)
return err;
- proc_old_dir = proc_mkdir("ipt_recent", init_net.proc_net);
+ proc_old_dir = proc_mkdir("ipt_recent", ve->ve_netns->proc_net);
if (proc_old_dir == NULL) {
remove_proc_entry("xt_recent", init_net.proc_net);
xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
@@ -665,20 +701,52 @@ static int __init recent_mt_init(void)
}
#endif
#endif
+out:
return err;
+out_mem:
+#ifdef CONFIG_VE_IPTABLES
+ kfree(ve->_ipt_recent);
+#endif
+ goto out;
}
-static void __exit recent_mt_exit(void)
+static void fini_ipt_recent(struct ve_struct *ve)
{
- BUG_ON(!list_empty(&tables));
- xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
- remove_proc_entry("ipt_recent", init_net.proc_net);
+ remove_proc_entry("ipt_recent", ve->ve_netns->proc_net);
+#endif
+ remove_proc_entry("xt_recent", ve->ve_netns->proc_net);
#endif
- remove_proc_entry("xt_recent", init_net.proc_net);
+#ifdef CONFIG_VE_IPTABLES
+ kfree(ve->_ipt_recent);
+ ve->_ipt_recent = NULL;
#endif
}
+static int __init recent_mt_init(void)
+{
+ int err;
+
+ if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255)
+ return -EINVAL;
+ ip_list_hash_size = 1 << fls(ip_list_tot);
+
+ err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+ if (err)
+ return err;
+ err = init_ipt_recent(&ve0);
+ if (err)
+ xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+ return err;
+}
+
+static void __exit recent_mt_exit(void)
+{
+ BUG_ON(!list_empty(&tables));
+ xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg));
+ fini_ipt_recent(&ve0);
+}
+
module_init(recent_mt_init);
module_exit(recent_mt_exit);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 19e9800..c97510c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -60,29 +60,14 @@
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>
+#include <net/netlink_sock.h>
+
+#include <bc/beancounter.h>
+#include <bc/net.h>
#define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
#define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long))
-struct netlink_sock {
- /* struct sock has to be the first member of netlink_sock */
- struct sock sk;
- u32 pid;
- u32 dst_pid;
- u32 dst_group;
- u32 flags;
- u32 subscriptions;
- u32 ngroups;
- unsigned long *groups;
- unsigned long state;
- wait_queue_head_t wait;
- struct netlink_callback *cb;
- struct mutex *cb_mutex;
- struct mutex cb_def_mutex;
- void (*netlink_rcv)(struct sk_buff *skb);
- struct module *module;
-};
-
struct listeners_rcu_head {
struct rcu_head rcu_head;
void *ptr;
@@ -411,6 +396,8 @@ static int __netlink_create(struct net *net, struct socket *sock,
sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
if (!sk)
return -ENOMEM;
+ if (ub_other_sock_charge(sk))
+ goto out_free;
sock_init_data(sock, sk);
@@ -426,6 +413,10 @@ static int __netlink_create(struct net *net, struct socket *sock,
sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
return 0;
+
+out_free:
+ sk_free(sk);
+ return -ENOMEM;
}
static int netlink_create(struct net *net, struct socket *sock, int protocol)
@@ -539,7 +530,7 @@ static int netlink_autobind(struct socket *sock)
struct hlist_head *head;
struct sock *osk;
struct hlist_node *node;
- s32 pid = current->tgid;
+ s32 pid = task_tgid_vnr(current);
int err;
static s32 rover = -4097;
@@ -575,7 +566,7 @@ retry:
static inline int netlink_capable(struct socket *sock, unsigned int flag)
{
return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) ||
- capable(CAP_NET_ADMIN);
+ capable(CAP_VE_NET_ADMIN);
}
static void
@@ -785,12 +776,20 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
long *timeo, struct sock *ssk)
{
struct netlink_sock *nlk;
+ unsigned long chargesize;
+ int no_ubc;
nlk = nlk_sk(sk);
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ chargesize = skb_charge_fullsize(skb);
+ no_ubc = ub_sock_getwres_other(sk, chargesize);
+ if (no_ubc || atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
test_bit(0, &nlk->state)) {
DECLARE_WAITQUEUE(wait, current);
+
+ if (!no_ubc)
+ ub_sock_retwres_other(sk, chargesize,
+ SOCK_MIN_UBCSPACE_CH);
if (!*timeo) {
if (!ssk || netlink_is_kernel(ssk))
netlink_overrun(sk);
@@ -802,13 +801,20 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&nlk->wait, &wait);
+ /* this check can't be moved up because ub_sock_snd_queue_add()
+ * may change the task state to TASK_RUNNING */
+ if (no_ubc)
+ ub_sock_sndqueueadd_other(sk, chargesize);
+
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(0, &nlk->state)) &&
+ test_bit(0, &nlk->state) || no_ubc) &&
!sock_flag(sk, SOCK_DEAD))
*timeo = schedule_timeout(*timeo);
__set_current_state(TASK_RUNNING);
remove_wait_queue(&nlk->wait, &wait);
+ if (no_ubc)
+ ub_sock_sndqueuedel(sk);
sock_put(sk);
if (signal_pending(current)) {
@@ -818,6 +824,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
return 1;
}
skb_set_owner_r(skb, sk);
+ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
return 0;
}
@@ -984,8 +991,13 @@ static inline int do_one_broadcast(struct sock *sk,
!test_bit(p->group - 1, nlk->groups))
goto out;
+ if (!ve_accessible_strict(get_exec_env(), sk->owner_env))
+ goto out;
+
+#ifndef CONFIG_VE
if (!net_eq(sock_net(sk), p->net))
goto out;
+#endif
if (p->failure) {
netlink_overrun(sk);
@@ -1663,6 +1675,10 @@ static int netlink_dump(struct sock *sk)
skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL);
if (!skb)
goto errout;
+ if (ub_nlrcvbuf_charge(skb, sk) < 0) {
+ kfree_skb(skb);
+ return -EACCES;
+ }
mutex_lock(nlk->cb_mutex);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 44ff3f3..ea6f5cb 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -519,7 +519,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EOPNOTSUPP;
if ((ops->flags & GENL_ADMIN_PERM) &&
- security_netlink_recv(skb, CAP_NET_ADMIN))
+ security_netlink_recv(skb, CAP_VE_NET_ADMIN))
return -EPERM;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 41866eb..9dd7d18 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -80,6 +80,8 @@
#include <linux/init.h>
#include <linux/mutex.h>
+#include <bc/net.h>
+
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
@@ -554,6 +556,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
if (dev_net(dev) != sock_net(sk))
goto drop;
+ skb_orphan(skb);
+
skb->dev = dev;
if (dev->header_ops) {
@@ -617,6 +621,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
if (pskb_trim(skb, snaplen))
goto drop_n_acct;
+ if (ub_sockrcvbuf_charge(sk, skb))
+ goto drop_n_acct;
+
skb_set_owner_r(skb, sk);
skb->dev = NULL;
skb_dst_drop(skb);
@@ -676,6 +683,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (dev_net(dev) != sock_net(sk))
goto drop;
+ skb_orphan(skb);
+
if (dev->header_ops) {
if (sk->sk_type != SOCK_DGRAM)
skb_push(skb, skb->data - skb_mac_header(skb));
@@ -725,6 +734,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
snaplen = 0;
}
+ if (copy_skb &&
+ ub_sockrcvbuf_charge(sk, copy_skb)) {
+ spin_lock(&sk->sk_receive_queue.lock);
+ goto ring_is_full;
+ }
+
spin_lock(&sk->sk_receive_queue.lock);
h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
if (!h.raw)
@@ -1369,6 +1384,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
if (sk == NULL)
goto out;
+ if (ub_other_sock_charge(sk))
+ goto out_free;
sock->ops = &packet_ops;
if (sock->type == SOCK_PACKET)
@@ -1408,6 +1425,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sock_prot_inuse_add(net, &packet_proto, 1);
write_unlock_bh(&net->packet.sklist_lock);
return 0;
+
+out_free:
+ sk_free(sk);
out:
return err;
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 5b132c4..6a88cab 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -873,8 +873,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
if (cl->deficit <= 0) {
q->active[prio] = cl;
- cl = cl->next_alive;
cl->deficit += cl->quantum;
+ cl = cl->next_alive;
}
return skb;
@@ -1047,17 +1047,19 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
for (h = 0; h < q->clhash.hashsize; h++) {
hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
+ long mtu;
/* BUGGGG... Beware! This expression suffer of
arithmetic overflows!
*/
if (cl->priority == prio) {
- cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
- q->quanta[prio];
- }
- if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
- printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum);
- cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
+ cl->quantum = (cl->weight * cl->allot) /
+ (q->quanta[prio] / q->nclasses[prio]);
}
+ mtu = qdisc_dev(cl->qdisc)->mtu;
+ if (cl->quantum <= mtu/2)
+ cl->quantum = mtu/2 + 1;
+ else if (cl->quantum > 32*mtu)
+ cl->quantum = 32*mtu;
}
}
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 4ae6aa5..8bc040c 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -179,17 +179,23 @@ static inline int qdisc_restart(struct Qdisc *q)
struct net_device *dev;
spinlock_t *root_lock;
struct sk_buff *skb;
+ int ret;
+ struct ve_struct *old_ve;
/* Dequeue packet */
skb = dequeue_skb(q);
if (unlikely(!skb))
return 0;
+ old_ve = set_exec_env(skb->owner_env);
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- return sch_direct_xmit(skb, q, dev, txq, root_lock);
+ ret = sch_direct_xmit(skb, q, dev, txq, root_lock);
+ (void)set_exec_env(old_ve);
+
+ return ret;
}
void __qdisc_run(struct Qdisc *q)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 5a002c2..7917369 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -178,6 +178,9 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
struct teql_master *m = (struct teql_master*)sch->ops;
struct teql_sched_data *q = qdisc_priv(sch);
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
if (dev->hard_header_len > m->dev->hard_header_len)
return -EINVAL;
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 8b3560f..0c46d61 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -701,7 +701,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
if (rx_count >= asoc->base.sk->sk_rcvbuf) {
if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
- (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
+ (!sk_rmem_schedule(asoc->base.sk, chunk->skb)))
goto fail;
}
diff --git a/net/socket.c b/net/socket.c
index 7565536..f674df3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -85,6 +85,7 @@
#include <linux/kmod.h>
#include <linux/audit.h>
#include <linux/wireless.h>
+#include <linux/in.h>
#include <linux/nsproxy.h>
#include <linux/magic.h>
@@ -162,15 +163,6 @@ static DEFINE_PER_CPU(int, sockets_in_use) = 0;
* divide and look after the messy bits.
*/
-#define MAX_SOCK_ADDR 128 /* 108 for Unix domain -
- 16 for IP, 16 for IPX,
- 24 for IPv6,
- about 80 for AX.25
- must be at least one bigger than
- the AF_UNIX size (see net/unix/af_unix.c
- :unix_mkname()).
- */
-
/**
* move_addr_to_kernel - copy a socket address into kernel space
* @uaddr: Address in user space
@@ -192,6 +184,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr)
return -EFAULT;
return audit_sockaddr(ulen, kaddr);
}
+EXPORT_SYMBOL(move_addr_to_kernel);
/**
* move_addr_to_user - copy an address to user space
@@ -497,6 +490,8 @@ static struct socket *sock_alloc(void)
return sock;
}
+EXPORT_SYMBOL(sock_alloc);
+
/*
* In theory you can't get an open on this inode, but /proc provides
* a back door. Remember to keep it shut otherwise you'll let the
@@ -524,6 +519,9 @@ const struct file_operations bad_sock_fops = {
void sock_release(struct socket *sock)
{
+ if (sock->sk)
+ ub_sock_sndqueuedel(sock->sk);
+
if (sock->ops) {
struct module *owner = sock->ops->owner;
@@ -1140,6 +1138,50 @@ call_kill:
return 0;
}
+int vz_security_family_check(int family)
+{
+#ifdef CONFIG_VE
+ if (ve_is_super(get_exec_env()))
+ return 0;
+
+ switch (family) {
+ case PF_UNSPEC:
+ case PF_PACKET:
+ case PF_NETLINK:
+ case PF_UNIX:
+ case PF_INET:
+ case PF_INET6:
+ case PF_PPPOX:
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+#endif
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vz_security_family_check);
+
+int vz_security_protocol_check(int protocol)
+{
+#ifdef CONFIG_VE
+ if (ve_is_super(get_exec_env()))
+ return 0;
+
+ switch (protocol) {
+ case IPPROTO_IP:
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_RAW:
+ case IPPROTO_DCCP:
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+#endif
+ return 0;
+}
+EXPORT_SYMBOL_GPL(vz_security_protocol_check);
+
static int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
@@ -1170,6 +1212,11 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
family = PF_PACKET;
}
+ /* VZ compatibility layer */
+ err = vz_security_family_check(family);
+ if (err < 0)
+ return err;
+
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
@@ -2419,9 +2466,12 @@ int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
{
mm_segment_t oldfs = get_fs();
int err;
+ struct ve_struct *old_env;
set_fs(KERNEL_DS);
+ old_env = set_exec_env(sock->sk->owner_env);
err = sock->ops->ioctl(sock, cmd, arg);
+ (void)set_exec_env(old_env);
set_fs(oldfs);
return err;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 38829e2..fad3e2b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -33,6 +33,7 @@
#include <linux/utsname.h>
#include <linux/workqueue.h>
#include <linux/in6.h>
+#include <linux/ve_proto.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
@@ -95,6 +96,35 @@ static void rpc_unregister_client(struct rpc_clnt *clnt)
spin_unlock(&rpc_client_lock);
}
+/*
+ * Grand abort timeout (stop the client if it occurs)
+ */
+int xprt_abort_timeout = RPC_MAX_ABORT_TIMEOUT;
+
+static int rpc_abort_hard(struct rpc_task *task)
+{
+ struct rpc_clnt *clnt;
+ clnt = task->tk_client;
+
+ if (clnt->cl_pr_time == 0) {
+ clnt->cl_pr_time = jiffies;
+ return 0;
+ }
+ if (xprt_abort_timeout == RPC_MAX_ABORT_TIMEOUT)
+ return 0;
+ if (time_before(jiffies, clnt->cl_pr_time + xprt_abort_timeout * HZ))
+ return 0;
+
+ clnt->cl_broken = 1;
+ rpc_killall_tasks(clnt);
+ return -ETIMEDOUT;
+}
+
+static void rpc_abort_clear(struct rpc_task *task)
+{
+ task->tk_client->cl_pr_time = 0;
+}
+
static int
rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
{
@@ -200,6 +230,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
clnt->cl_vers = version->number;
clnt->cl_stats = program->stats;
clnt->cl_metrics = rpc_alloc_iostats(clnt);
+ clnt->cl_broken = 0;
err = -ENOMEM;
if (clnt->cl_metrics == NULL)
goto out_no_stats;
@@ -336,8 +367,10 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
xprt->resvport = 0;
clnt = rpc_new_client(args, xprt);
- if (IS_ERR(clnt))
+ if (IS_ERR(clnt)) {
+ put_ve(xprt->owner_env);
return clnt;
+ }
if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
int err = rpc_ping(clnt, RPC_TASK_SOFT);
@@ -558,6 +591,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
{
struct rpc_task *task, *ret;
+ if (task_setup_data->rpc_client->cl_broken)
+ return ERR_PTR(-EIO);
+
task = rpc_new_task(task_setup_data);
if (task == NULL) {
rpc_release_calldata(task_setup_data->callback_ops,
@@ -1034,6 +1070,7 @@ call_bind_status(struct rpc_task *task)
if (task->tk_status >= 0) {
dprint_status(task);
+ rpc_abort_clear(task);
task->tk_status = 0;
task->tk_action = call_connect;
return;
@@ -1057,6 +1094,10 @@ call_bind_status(struct rpc_task *task)
case -ETIMEDOUT:
dprintk("RPC: %5u rpcbind request timed out\n",
task->tk_pid);
+ if (rpc_abort_hard(task)) {
+ status = -EIO;
+ break;
+ }
goto retry_timeout;
case -EPFNOSUPPORT:
/* server doesn't support any rpcbind version we know of */
@@ -1114,7 +1155,8 @@ call_connect_status(struct rpc_task *task)
dprint_status(task);
task->tk_status = 0;
- if (status >= 0 || status == -EAGAIN) {
+ if (status >= 0 ||
+ (status == -EAGAIN && !rpc_abort_hard(task))) {
clnt->cl_stats->netreconn++;
task->tk_action = call_transmit;
return;
@@ -1346,7 +1388,7 @@ call_timeout(struct rpc_task *task)
dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid);
task->tk_timeouts++;
- if (RPC_IS_SOFT(task)) {
+ if (RPC_IS_SOFT(task) || rpc_abort_hard(task)) {
if (clnt->cl_chatty)
printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
clnt->cl_protname, clnt->cl_server);
@@ -1394,6 +1436,7 @@ call_decode(struct rpc_task *task)
task->tk_flags &= ~RPC_CALL_MAJORSEEN;
}
+ rpc_abort_clear(task);
/*
* Ensure that we see all writes made by xprt_complete_rqst()
* before it changed req->rq_reply_bytes_recvd.
@@ -1406,7 +1449,7 @@ call_decode(struct rpc_task *task)
sizeof(req->rq_rcv_buf)) != 0);
if (req->rq_rcv_buf.len < 12) {
- if (!RPC_IS_SOFT(task)) {
+ if (!RPC_IS_SOFT(task) && !rpc_abort_hard(task)) {
task->tk_action = call_bind;
clnt->cl_stats->rpcretrans++;
goto out_retry;
@@ -1754,3 +1797,67 @@ void rpc_show_tasks(void)
spin_unlock(&rpc_client_lock);
}
#endif
+
+#ifdef CONFIG_VE
+static int ve_sunrpc_start(void *data)
+{
+ return 0;
+}
+
+void ve_sunrpc_stop(void *data)
+{
+ struct ve_struct *ve = (struct ve_struct *)data;
+ struct rpc_clnt *clnt;
+ struct rpc_task *rovr;
+
+ dprintk("RPC: killing all tasks for VE %d\n", ve->veid);
+
+ spin_lock(&rpc_client_lock);
+ list_for_each_entry(clnt, &all_clients, cl_clients) {
+ if (clnt->cl_xprt->owner_env != ve)
+ continue;
+
+ spin_lock(&clnt->cl_lock);
+ list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) {
+ if (!RPC_IS_ACTIVATED(rovr))
+ continue;
+ printk(KERN_WARNING "RPC: Killing task %d client %p\n",
+ rovr->tk_pid, clnt);
+
+ rovr->tk_flags |= RPC_TASK_KILLED;
+ rpc_exit(rovr, -EIO);
+ rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr);
+ }
+ schedule_work(&clnt->cl_xprt->task_cleanup);
+ spin_unlock(&clnt->cl_lock);
+ }
+ spin_unlock(&rpc_client_lock);
+
+ flush_scheduled_work();
+}
+
+static struct ve_hook sunrpc_hook = {
+ .init = ve_sunrpc_start,
+ .fini = ve_sunrpc_stop,
+ .owner = THIS_MODULE,
+ .priority = HOOK_PRIO_NET_PRE,
+};
+
+void ve_sunrpc_hook_register(void)
+{
+ ve_hook_register(VE_SS_CHAIN, &sunrpc_hook);
+}
+
+void ve_sunrpc_hook_unregister(void)
+{
+ ve_hook_unregister(&sunrpc_hook);
+}
+#else
+void ve_sunrpc_hook_register(void)
+{
+}
+
+void ve_sunrpc_hook_unregister(void)
+{
+}
+#endif
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 27a2378..93cb0c5 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1028,6 +1028,7 @@ static struct file_system_type rpc_pipe_fs_type = {
.name = "rpc_pipefs",
.get_sb = rpc_get_sb,
.kill_sb = kill_litter_super,
+ .fs_flags = FS_VIRTUALIZED,
};
static void
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cef74ba..fba30fe 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -606,7 +606,9 @@ static void __rpc_execute(struct rpc_task *task)
struct rpc_wait_queue *queue;
int task_is_async = RPC_IS_ASYNC(task);
int status = 0;
+ struct ve_struct *env;
+ env = set_exec_env(task->tk_client->cl_xprt->owner_env);
dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
task->tk_pid, task->tk_flags);
@@ -662,8 +664,10 @@ static void __rpc_execute(struct rpc_task *task)
}
rpc_clear_running(task);
spin_unlock_bh(&queue->lock);
- if (task_is_async)
+ if (task_is_async) {
+ (void)set_exec_env(env);
return;
+ }
/* sync task: sleep here */
dprintk("RPC: %5u sync task going to sleep\n", task->tk_pid);
@@ -690,6 +694,7 @@ static void __rpc_execute(struct rpc_task *task)
task->tk_status);
/* Release all resources associated with the task */
rpc_release_task(task);
+ (void)set_exec_env(env);
}
/*
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 8cce921..9685220 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -24,6 +24,9 @@
extern struct cache_detail ip_map_cache, unix_gid_cache;
+extern void ve_sunrpc_hook_register(void);
+extern void ve_sunrpc_hook_unregister(void);
+
static int __init
init_sunrpc(void)
{
@@ -46,6 +49,7 @@ init_sunrpc(void)
svc_init_xprt_sock(); /* svc sock transport */
init_socket_xprt(); /* clnt sock transport */
rpcauth_init_module();
+ ve_sunrpc_hook_register();
out:
return err;
}
@@ -53,6 +57,7 @@ out:
static void __exit
cleanup_sunrpc(void)
{
+ ve_sunrpc_hook_unregister();
rpcauth_remove_module();
cleanup_socket_xprt();
svc_cleanup_xprt_sock();
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 70b0a22..f66b225 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -229,6 +229,9 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
unsigned long tailoff;
unsigned long headoff;
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+ struct ve_struct *old_env;
+
+ old_env = set_exec_env(sock->sk->owner_env);
if (rqstp->rq_prot == IPPROTO_UDP) {
struct msghdr msg = {
@@ -255,6 +258,8 @@ out:
svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
+ (void)set_exec_env(old_env);
+
return len;
}
@@ -1437,8 +1442,9 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
error = sock_create_kern(family, type, protocol, &sock);
if (error < 0)
- return ERR_PTR(error);
+ return ERR_PTR(-ENOMEM);
+ sk_change_net_get(sock->sk, get_exec_env()->ve_netns);
svc_reclassify_socket(sock);
/*
@@ -1489,6 +1495,8 @@ static void svc_sock_detach(struct svc_xprt *xprt)
dprintk("svc: svc_sock_detach(%p)\n", svsk);
+ /* XXX: serialization? */
+ sk->sk_user_data = NULL;
/* put back the old socket callbacks */
sk->sk_state_change = svsk->sk_ostate;
sk->sk_data_ready = svsk->sk_odata;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index fd46d42..48d4f3e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -598,10 +598,13 @@ static void xprt_autoclose(struct work_struct *work)
{
struct rpc_xprt *xprt =
container_of(work, struct rpc_xprt, task_cleanup);
+ struct ve_struct *ve;
+ ve = set_exec_env(xprt->owner_env);
xprt->ops->close(xprt);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
xprt_release_write(xprt, NULL);
+ (void)set_exec_env(ve);
}
/**
@@ -668,7 +671,9 @@ static void
xprt_init_autodisconnect(unsigned long data)
{
struct rpc_xprt *xprt = (struct rpc_xprt *)data;
+ struct ve_struct *ve;
+ ve = set_exec_env(xprt->owner_env);
spin_lock(&xprt->transport_lock);
if (!list_empty(&xprt->recv) || xprt->shutdown)
goto out_abort;
@@ -677,9 +682,11 @@ xprt_init_autodisconnect(unsigned long data)
spin_unlock(&xprt->transport_lock);
set_bit(XPRT_CONNECTION_CLOSE, &xprt->state);
queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ (void)set_exec_env(ve);
return;
out_abort:
spin_unlock(&xprt->transport_lock);
+ (void)set_exec_env(ve);
}
/**
@@ -1095,6 +1102,7 @@ found:
xprt->last_used = jiffies;
xprt->cwnd = RPC_INITCWND;
xprt->bind_index = 0;
+ xprt->owner_env = get_ve(get_exec_env());
rpc_init_wait_queue(&xprt->binding, "xprt_binding");
rpc_init_wait_queue(&xprt->pending, "xprt_pending");
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 9a63f66..eac6b16 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -269,6 +269,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
kfree(xprt->slot);
xprt->slot = NULL;
+ put_ve(xprt->owner_env);
kfree(xprt);
dprintk("RPC: %s: returning\n", __func__);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 5cdbf7c..a57133e 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -72,6 +72,8 @@ static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE;
static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT;
static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT;
+static int xprt_min_abort_timeout = RPC_MIN_ABORT_TIMEOUT;
+static int xprt_max_abort_timeout = RPC_MAX_ABORT_TIMEOUT;
static struct ctl_table_header *sunrpc_table_header;
@@ -125,6 +127,16 @@ static ctl_table xs_tunables_table[] = {
.extra2 = &xprt_max_resvport_limit
},
{
+ .procname = "abort_timeout",
+ .data = &xprt_abort_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &xprt_min_abort_timeout,
+ .extra2 = &xprt_max_abort_timeout
+ },
+ {
.procname = "tcp_fin_timeout",
.data = &xs_tcp_fin_timeout,
.maxlen = sizeof(xs_tcp_fin_timeout),
@@ -736,16 +748,22 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
static void xs_reset_transport(struct sock_xprt *transport)
{
- struct socket *sock = transport->sock;
- struct sock *sk = transport->inet;
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct socket *sock;
+ struct sock *sk;
- if (sk == NULL)
+ spin_lock_bh(&xprt->transport_lock);
+ if (transport->sock == NULL) {
+ spin_unlock_bh(&xprt->transport_lock);
return;
-
- write_lock_bh(&sk->sk_callback_lock);
+ }
+ sock = transport->sock;
+ sk = transport->inet;
transport->inet = NULL;
transport->sock = NULL;
+ spin_unlock_bh(&xprt->transport_lock);
+ write_lock_bh(&sk->sk_callback_lock);
sk->sk_user_data = NULL;
xs_restore_old_callbacks(transport, sk);
@@ -807,6 +825,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
xs_close(xprt);
xs_free_peer_addresses(xprt);
kfree(xprt->slot);
+ put_ve(xprt->owner_env);
kfree(xprt);
module_put(THIS_MODULE);
}
@@ -1703,7 +1722,12 @@ static void xs_udp_connect_worker4(struct work_struct *work)
struct rpc_xprt *xprt = &transport->xprt;
struct socket *sock = transport->sock;
int err, status = -EIO;
+ struct ve_struct *ve;
+ ve = set_exec_env(xprt->owner_env);
+ down_read(&xprt->owner_env->op_sem);
+ if (!xprt->owner_env->is_running)
+ goto out;
if (xprt->shutdown)
goto out;
@@ -1715,6 +1739,7 @@ static void xs_udp_connect_worker4(struct work_struct *work)
dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
goto out;
}
+ sk_change_net_get(sock->sk, xprt->owner_env->ve_netns);
xs_reclassify_socket4(sock);
if (xs_bind4(transport, sock)) {
@@ -1733,6 +1758,8 @@ static void xs_udp_connect_worker4(struct work_struct *work)
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ up_read(&xprt->owner_env->op_sem);
+ (void)set_exec_env(ve);
}
/**
@@ -1748,7 +1775,12 @@ static void xs_udp_connect_worker6(struct work_struct *work)
struct rpc_xprt *xprt = &transport->xprt;
struct socket *sock = transport->sock;
int err, status = -EIO;
+ struct ve_struct *ve;
+ ve = set_exec_env(xprt->owner_env);
+ down_read(&xprt->owner_env->op_sem);
+ if (!xprt->owner_env->is_running)
+ goto out;
if (xprt->shutdown)
goto out;
@@ -1760,6 +1792,7 @@ static void xs_udp_connect_worker6(struct work_struct *work)
dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
goto out;
}
+ sk_change_net_get(sock->sk, xprt->owner_env->ve_netns);
xs_reclassify_socket6(sock);
if (xs_bind6(transport, sock) < 0) {
@@ -1778,6 +1811,8 @@ static void xs_udp_connect_worker6(struct work_struct *work)
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ up_read(&xprt->owner_env->op_sem);
+ (void)set_exec_env(ve);
}
/*
@@ -1873,7 +1908,12 @@ static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
{
struct socket *sock = transport->sock;
int status = -EIO;
+ struct ve_struct *ve;
+ ve = set_exec_env(xprt->owner_env);
+ down_read(&xprt->owner_env->op_sem);
+ if (!xprt->owner_env->is_running)
+ goto out;
if (xprt->shutdown)
goto out;
@@ -1937,6 +1977,8 @@ out_eagain:
out:
xprt_clear_connecting(xprt);
xprt_wake_pending_tasks(xprt, status);
+ up_read(&xprt->owner_env->op_sem);
+ (void)set_exec_env(ve);
}
static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
@@ -1952,6 +1994,7 @@ static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
-err);
goto out_err;
}
+ sk_change_net_get(sock->sk, xprt->owner_env->ve_netns);
xs_reclassify_socket4(sock);
if (xs_bind4(transport, sock) < 0) {
@@ -1991,6 +2034,7 @@ static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt,
-err);
goto out_err;
}
+ sk_change_net_get(sock->sk, xprt->owner_env->ve_netns);
xs_reclassify_socket6(sock);
if (xs_bind6(transport, sock) < 0) {
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fc820cd..3c3c16d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -115,6 +115,9 @@
#include <net/checksum.h>
#include <linux/security.h>
+#include <bc/net.h>
+#include <bc/beancounter.h>
+
static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
static DEFINE_SPINLOCK(unix_table_lock);
static atomic_t unix_nr_socks = ATOMIC_INIT(0);
@@ -292,9 +295,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
&unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
struct dentry *dentry = unix_sk(s)->dentry;
- if (!net_eq(sock_net(s), net))
- continue;
-
if (dentry && dentry->d_inode == i) {
sock_hold(s);
goto found;
@@ -593,6 +593,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
if (!sk)
goto out;
+ if (ub_other_sock_charge(sk))
+ goto out_sk_free;
sock_init_data(sock, sk);
lockdep_set_class(&sk->sk_receive_queue.lock,
@@ -619,6 +621,9 @@ out:
local_bh_enable();
}
return sk;
+out_sk_free:
+ sk_free(sk);
+ return NULL;
}
static int unix_create(struct net *net, struct socket *sock, int protocol)
@@ -1026,6 +1031,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int st;
int err;
long timeo;
+ unsigned long chargesize;
err = unix_mkname(sunaddr, addr_len, &hash);
if (err < 0)
@@ -1054,6 +1060,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
if (skb == NULL)
goto out;
+ chargesize = skb_charge_fullsize(skb);
+ if (ub_sock_getwres_other(newsk, chargesize) < 0)
+ goto out;
+ ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF);
restart:
/* Find listening sock. */
@@ -1302,7 +1312,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
unix_notinflight(scm->fp->fp[i]);
}
-static void unix_destruct_fds(struct sk_buff *skb)
+void unix_destruct_fds(struct sk_buff *skb)
{
struct scm_cookie scm;
memset(&scm, 0, sizeof(scm));
@@ -1313,6 +1323,7 @@ static void unix_destruct_fds(struct sk_buff *skb)
scm_destroy(&scm);
sock_wfree(skb);
}
+EXPORT_SYMBOL_GPL(unix_destruct_fds);
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
@@ -1538,6 +1549,16 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
size = len-sent;
+ if (msg->msg_flags & MSG_DONTWAIT)
+ ub_sock_makewres_other(sk, skb_charge_size(size));
+ if (sock_bc(sk) != NULL &&
+ sock_bc(sk)->poll_reserv >=
+ SOCK_MIN_UBCSPACE &&
+ skb_charge_size(size) >
+ sock_bc(sk)->poll_reserv)
+ size = skb_charge_datalen(sock_bc(sk)->poll_reserv);
+
+
/* Keep two messages in the pipe so it schedules better */
if (size > ((sk->sk_sndbuf >> 1) - 64))
size = (sk->sk_sndbuf >> 1) - 64;
@@ -1549,8 +1570,9 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
* Grab a buffer
*/
- skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
- &err);
+
+ skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE,
+ msg->msg_flags&MSG_DONTWAIT, &err);
if (skb == NULL)
goto out_err;
@@ -1989,6 +2011,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
{
struct sock *sk = sock->sk;
unsigned int mask;
+ int no_ub_res;
sock_poll_wait(file, sk->sk_sleep, wait);
mask = 0;
@@ -2001,6 +2024,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP;
+ no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
+ if (no_ub_res)
+ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
+
/* readable? */
if (!skb_queue_empty(&sk->sk_receive_queue) ||
(sk->sk_shutdown & RCV_SHUTDOWN))
@@ -2015,7 +2042,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
* we set writable also when the other side has shut down the
* connection. This prevents stuck sockets.
*/
- if (unix_writable(sk))
+ if (!no_ub_res && unix_writable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
return mask;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 19c17e4..686d373 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -81,6 +81,7 @@
#include <linux/proc_fs.h>
#include <linux/mutex.h>
#include <linux/wait.h>
+#include <linux/module.h>
#include <net/sock.h>
#include <net/af_unix.h>
@@ -153,6 +154,7 @@ void unix_notinflight(struct file *fp)
spin_unlock(&unix_gc_lock);
}
}
+EXPORT_SYMBOL_GPL(unix_notinflight);
static inline struct sk_buff *sock_queue_head(struct sock *sk)
{
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b95a2d6..ab7d01d 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2005,7 +2005,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
link = &xfrm_dispatch[type];
/* All operations require privileges, even GET */
- if (security_netlink_recv(skb, CAP_NET_ADMIN))
+ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
return -EPERM;
if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
diff --git a/security/Kconfig b/security/Kconfig
index fb363cd..dbfa601 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -41,7 +41,7 @@ config KEYS_DEBUG_PROC_KEYS
config SECURITY
bool "Enable different security models"
- depends on SYSFS
+ depends on SYSFS && !VE
help
This allows you to choose different security modules to be
configured into your kernel.
diff --git a/security/commoncap.c b/security/commoncap.c
index fe30751..3579774 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -58,6 +58,10 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
int cap_netlink_recv(struct sk_buff *skb, int cap)
{
+ if (likely(cap == CAP_VE_NET_ADMIN) &&
+ cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
+ return 0;
+
if (!cap_raised(NETLINK_CB(skb).eff_cap, cap))
return -EPERM;
return 0;
@@ -618,7 +622,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
if (!strncmp(name, XATTR_SECURITY_PREFIX,
sizeof(XATTR_SECURITY_PREFIX) - 1) &&
- !capable(CAP_SYS_ADMIN))
+ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
return -EPERM;
return 0;
}
@@ -644,7 +648,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
if (!strncmp(name, XATTR_SECURITY_PREFIX,
sizeof(XATTR_SECURITY_PREFIX) - 1) &&
- !capable(CAP_SYS_ADMIN))
+ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
return -EPERM;
return 0;
}
@@ -962,8 +966,9 @@ error:
*/
int cap_syslog(int type)
{
- if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if ((type != 3 && type != 10) &&
+ !capable(CAP_VE_SYS_ADMIN) && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
return 0;
}
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 6cf8fd2..02aeae6 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -12,11 +12,23 @@
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
+#include <linux/ve.h>
+#include <linux/vzcalluser.h>
+#include <linux/major.h>
#define ACC_MKNOD 1
#define ACC_READ 2
#define ACC_WRITE 4
-#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)
+#define ACC_QUOTA 8
+#define ACC_HIDDEN 16
+#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA)
+
+static inline int convert_bits(int acc)
+{
+ /* ...10x <-> ...01x trial: guess why */
+ return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? acc : acc ^06) &
+ (ACC_READ | ACC_WRITE | ACC_QUOTA);
+}
#define DEV_BLOCK 1
#define DEV_CHAR 2
@@ -73,6 +85,38 @@ static int devcgroup_can_attach(struct cgroup_subsys *ss,
/*
* called under devcgroup_mutex
*/
+#ifdef CONFIG_VE
+static struct dev_whitelist_item default_whitelist_items[] = {
+ { ~0, ~0, DEV_ALL, ACC_MKNOD },
+ { UNIX98_PTY_MASTER_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { UNIX98_PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { PTY_MASTER_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { MEM_MAJOR, /* null */ 3, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { MEM_MAJOR, /* zero */ 5, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { MEM_MAJOR, /* full */ 7, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { TTYAUX_MAJOR, /* tty */ 0, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { TTYAUX_MAJOR, /* ptmx */ 2, DEV_CHAR, ACC_READ | ACC_WRITE },
+ { MEM_MAJOR, /* random */ 8, DEV_CHAR, ACC_READ },
+ { MEM_MAJOR, /* urandom */ 9, DEV_CHAR, ACC_READ },
+};
+
+static LIST_HEAD(default_perms);
+#define parent_whitelist(p) (&default_perms)
+static void prepare_def_perms(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(default_whitelist_items); i++) {
+ default_whitelist_items[i].access |= ACC_HIDDEN;
+ list_add(&default_whitelist_items[i].list, &default_perms);
+ }
+}
+#else
+#define prepare_def_perms() do { } while(0)
+#define parent_whitelist(p) (&parent_dev_cgroup->whitelist)
+#endif
+
static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
{
struct dev_whitelist_item *wh, *tmp, *new;
@@ -187,11 +231,13 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss,
wh->type = DEV_ALL;
wh->access = ACC_MASK;
list_add(&wh->list, &dev_cgroup->whitelist);
+
+ prepare_def_perms();
} else {
parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
mutex_lock(&devcgroup_mutex);
ret = dev_whitelist_copy(&dev_cgroup->whitelist,
- &parent_dev_cgroup->whitelist);
+ parent_whitelist(parent_dev_cgroup));
mutex_unlock(&devcgroup_mutex);
if (ret) {
kfree(dev_cgroup);
@@ -266,8 +312,15 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
set_access(acc, wh->access);
set_majmin(maj, wh->major);
set_majmin(min, wh->minor);
- seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
- maj, min, acc);
+
+ if (cft != NULL)
+ seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
+ maj, min, acc);
+ else if (!(wh->access & ACC_HIDDEN))
+ seq_printf(m, "%10u %c %03o %s:%s\n",
+ (unsigned)(unsigned long)m->private,
+ type_to_char(wh->type),
+ convert_bits(wh->access), maj, min);
}
rcu_read_unlock();
@@ -474,38 +527,35 @@ struct cgroup_subsys devices_subsys = {
.subsys_id = devices_subsys_id,
};
-int devcgroup_inode_permission(struct inode *inode, int mask)
+static int __devcgroup_inode_permission(int blk, dev_t device, int mask)
{
struct dev_cgroup *dev_cgroup;
struct dev_whitelist_item *wh;
- dev_t device = inode->i_rdev;
if (!device)
return 0;
- if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
- return 0;
rcu_read_lock();
-
dev_cgroup = task_devcgroup(current);
list_for_each_entry_rcu(wh, &dev_cgroup->whitelist, list) {
if (wh->type & DEV_ALL)
goto found;
- if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode))
+ if ((wh->type & DEV_BLOCK) && !blk)
continue;
- if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode))
+ if ((wh->type & DEV_CHAR) && blk)
continue;
- if (wh->major != ~0 && wh->major != imajor(inode))
+ if (wh->major != ~0 && wh->major != MAJOR(device))
continue;
- if (wh->minor != ~0 && wh->minor != iminor(inode))
+ if (wh->minor != ~0 && wh->minor != MINOR(device))
continue;
-
+found:
if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE))
continue;
if ((mask & MAY_READ) && !(wh->access & ACC_READ))
continue;
-found:
+ if ((mask & MAY_QUOTACTL) && !(wh->access & ACC_QUOTA))
+ continue;
rcu_read_unlock();
return 0;
}
@@ -515,6 +565,15 @@ found:
}
EXPORT_SYMBOL_GPL(devcgroup_inode_permission);
+int devcgroup_inode_permission(struct inode *inode, int mask)
+{
+ if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
+ return 0;
+
+ return __devcgroup_inode_permission(S_ISBLK(inode->i_mode),
+ inode->i_rdev, mask);
+}
+
int devcgroup_inode_mknod(int mode, dev_t dev)
{
struct dev_cgroup *dev_cgroup;
@@ -538,10 +597,9 @@ int devcgroup_inode_mknod(int mode, dev_t dev)
continue;
if (wh->minor != ~0 && wh->minor != MINOR(dev))
continue;
-
+found:
if (!(wh->access & ACC_MKNOD))
continue;
-found:
rcu_read_unlock();
return 0;
}
@@ -550,3 +608,75 @@ found:
return -EPERM;
}
+
+#ifdef CONFIG_VE
+int get_device_perms_ve(int dev_type, dev_t dev, int access_mode)
+{
+ int mask = 0;
+
+ mask |= (access_mode & FMODE_READ ? MAY_READ : 0);
+ mask |= (access_mode & FMODE_WRITE ? MAY_WRITE : 0);
+ mask |= (access_mode & FMODE_QUOTACTL ? MAY_QUOTACTL : 0);
+
+ return __devcgroup_inode_permission(dev_type == S_IFBLK, dev, mask);
+}
+EXPORT_SYMBOL(get_device_perms_ve);
+
+int set_device_perms_ve(struct ve_struct *ve,
+ unsigned type, dev_t dev, unsigned mask)
+{
+ int err = -EINVAL;
+ struct dev_whitelist_item *new;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (new == NULL)
+ return -ENOMEM;
+
+ if ((type & S_IFMT) == S_IFBLK)
+ new->type = DEV_BLOCK;
+ else if ((type & S_IFMT) == S_IFCHR)
+ new->type = DEV_CHAR;
+ else
+ goto out;
+
+ new->access = convert_bits(mask);
+ new->major = new->minor = ~0;
+
+ switch (type & VE_USE_MASK) {
+ default:
+ new->minor = MINOR(dev);
+ case VE_USE_MAJOR:
+ new->major = MAJOR(dev);
+ case 0:
+ ;
+ }
+
+ err = dev_whitelist_add(cgroup_to_devcgroup(ve->ve_cgroup), new);
+out:
+ if (err < 0)
+ kfree(new);
+ return err;
+}
+EXPORT_SYMBOL(set_device_perms_ve);
+
+#ifdef CONFIG_PROC_FS
+int devperms_seq_show(struct seq_file *m, void *v)
+{
+ struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
+
+ if (m->private == (void *)0) {
+ seq_printf(m, "Version: 2.7\n");
+ m->private = (void *)-1;
+ }
+
+ if (ve_is_super(ve)) {
+ seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0);
+ return 0;
+ }
+
+ m->private = (void *)(unsigned long)ve->veid;
+ return devcgroup_seq_read(ve->ve_cgroup, NULL, m);
+}
+EXPORT_SYMBOL(devperms_seq_show);
+#endif
+#endif
diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
index bca1b74..00bbeea 100644
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -1,6 +1,6 @@
config SECURITY_SELINUX
bool "NSA SELinux Support"
- depends on SECURITY_NETWORK && AUDIT && NET && INET
+ depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE
select NETWORK_SECMARK
default n
help